diff --git a/.gitignore b/.gitignore index ed131bdbbad6bd4dad500fa29f40a29fddeb7593..dc0a38edcb563589ce3845803174598ca68ec396 100644 --- a/.gitignore +++ b/.gitignore @@ -63,6 +63,16 @@ test/models/ test/images/ +*.pyc + +# model +*.nb +*.svg +*.dot + +# vim intermediate files +*.swp + # Emacs intermediate files *~ @@ -105,3 +115,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources + +build* diff --git a/CMakeLists.txt b/CMakeLists.txt index 3616823985bffb9d53615a031759c701d4b2ff09..73f223493aa232e73f7b428b2678df8339cff13e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") message(STATUS "AR tools: ${CMAKE_AR}") + +if(WIN32) + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + + if (MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + endif() + + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + add_compile_options(/MP) + message(STATUS "Using parallel compiling (/MP)") + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + +endif() + if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) find_package(CUDA QUIET) endif() @@ -62,15 +87,20 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) +lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) +lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) +lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF) +lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF) lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) -lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE) -lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF) +lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) +lite_option(LITE_WITH_LOG "Enable log printing or not." ON) lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." 
OFF) lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) # publish options @@ -79,6 +109,7 @@ lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_ARM_CLANG "Set it ON when ARM_TARGET_LANG is clang." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed later. if(ANDROID OR IOS OR ARMLINUX) @@ -104,9 +135,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) + if(WIN32) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) + else() + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" FORCE) + endif() endif() message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") @@ -128,12 +166,18 @@ if (LITE_WITH_PYTHON) include(external/pybind11) # download, build, install pybind11 endif() +if(LITE_WITH_RKNPU) + include(device/rknpu) +endif() + # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) message(STATUS "Building the mobile framework") include(cross_compiling/postproject) - include(cross_compiling/npu) # check and prepare NPU DDK + include(device/npu) # check and prepare NPU DDK + include(device/xpu) # check and prepare XPU SDK + include(device/apu) # check and prepare APU SDK # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # So the following third party dependencies are not needed. @@ -174,11 +218,17 @@ endif() ######################################################################################## if(LITE_WITH_XPU) - include(xpu) + include(device/xpu) endif() +if(LITE_WITH_MLU) + include(mlu) +endif() +include(coveralls) + include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package + include(external/libxsmm) # download, build, install libxsmm include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -203,7 +253,9 @@ include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(version) # set PADDLE_VERSION -include(flags) +if(NOT APPLE) + include(flags) +endif() set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/README.md b/README.md index 22b84888294b5ef60c3d91d7a7909aef8f601d81..7094720b498f0a840abc4521f881d53f06b64da8 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ # Paddle Lite -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) Paddle Lite is an updated version of Paddle-Mobile, an open source deep learning framework designed to make it easy to perform inference on mobile, embedded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. -For tutorials, please see [PaddleLite Document](https://paddlepaddle.github.io/Paddle-Lite/). +For tutorials, please see [PaddleLite Document](https://paddle-lite.readthedocs.io/zh/latest/).
## Key Features @@ -61,7 +61,8 @@ For demands of Apple's GPU Metal and web front end inference, please see `./meta Paddle Lite has referenced the following open-source projects: - [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29) -- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. +- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. + ## Feedback and Community Support diff --git a/README_cn.md b/README_cn.md index 11d3967fe8ce88826ca982b71d96268c1a7e5c3a..4f5cd9254d42b4dc02035cb3ecfc8280b0e1c1ac 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,13 +1,13 @@ # Paddle Lite -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) Paddle Lite为Paddle-Mobile的升级版,定位支持包括手机移动端在内更多场景的轻量化高效预测,支持更广泛的硬件和平台,是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外,也兼容支持其他训练框架产出的模型。 -完整使用文档位于 [PaddleLite 文档](https://paddlepaddle.github.io/Paddle-Lite/) 。 +完整使用文档位于 [PaddleLite 文档](https://paddle-lite.readthedocs.io/zh/latest/) 。 ## 特性 diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 752b22461d9d1c36b3ca6a0bfe472a5dcc3ab976..1b0890e0dbf5e741176c293a059d809752c72a43 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -34,6 +34,15 @@ elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() +if(WIN32) + # windows header option for all targets. + add_definitions(-D_XKEYCHECK_H) + + if (NOT MSVC) + message(FATAL "Windows build only support msvc. 
Which was binded by the nvcc compiler of NVIDIA.") + endif(NOT MSVC) +endif(WIN32) + if(LITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) @@ -70,7 +79,7 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) + if(WIN32 OR APPLE) # openmp not support well for now on windows set(OPENMP_FLAGS "") else(WIN32) @@ -122,6 +131,9 @@ if (LITE_WITH_ARM) endif() endif() +if (LITE_WITH_TRAIN) + add_definitions("-DLITE_WITH_TRAIN") +endif() if (WITH_ARM_DOTPROD) add_definitions("-DWITH_ARM_DOTPROD") @@ -131,8 +143,19 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_APU) + add_definitions("-DLITE_WITH_APU") +endif() + +if (LITE_WITH_RKNPU) + add_definitions("-DLITE_WITH_RKNPU") +endif() + if (LITE_WITH_XPU) add_definitions("-DLITE_WITH_XPU") + if (LITE_WITH_XTCL) + add_definitions("-DLITE_WITH_XTCL") + endif() endif() if (LITE_WITH_OPENCL) @@ -147,19 +170,24 @@ if (LITE_WITH_BM) add_definitions("-DLITE_WITH_BM") endif() +if (LITE_WITH_MLU) +add_definitions("-DLITE_WITH_MLU") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") - if (LITE_WITH_PRECISION_PROFILE) - add_definitions("-DLITE_WITH_PRECISION_PROFILE") - endif() +endif() + +if (LITE_WITH_PRECISION_PROFILE) + add_definitions("-DLITE_WITH_PRECISION_PROFILE") endif() if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK") endif() -if (LITE_SHUTDOWN_LOG) - add_definitions("-DLITE_SHUTDOWN_LOG") +if (LITE_WITH_LOG) + add_definitions("-DLITE_WITH_LOG") endif() if (LITE_ON_TINY_PUBLISH) @@ -170,3 +198,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") endif(LITE_ON_MODEL_OPTIMIZE_TOOL) +if (LITE_WITH_PYTHON) + add_definitions("-DLITE_WITH_PYTHON") +endif(LITE_WITH_PYTHON) diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake index ca1471cabb57c0795ee193493d2e60bb5bd9e1cc..fe272ccb525c6fb71f9d44ceeb76eb8d1ba72626 100644 --- a/cmake/coveralls.cmake +++ b/cmake/coveralls.cmake @@ -20,6 +20,9 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH) # will be converted from the format "1;2;3" to "1 2 3". 
set(COVERAGE_SRCS "") foreach (SINGLE_SRC ${_COVERAGE_SRCS}) + if ("${SINGLE_SRC}" MATCHES "/Paddle-Lite/third-party/*") + continue() + endif() set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}") endforeach() @@ -62,7 +65,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH) endfunction() if(WITH_COVERAGE) - set(CMAKE_BUILD_TYPE "Debug") + #set(CMAKE_BUILD_TYPE "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") @@ -95,9 +98,11 @@ if(WITH_COVERAGE) set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}") endforeach() + set(COVERALLS_UPLOAD ON) code_coverage( "${PADDLE_SRCS}" ${COVERALLS_UPLOAD} "${PROJECT_SOURCE_DIR}/cmake" ) endif() + diff --git a/cmake/cross_compiling/findar.cmake b/cmake/cross_compiling/findar.cmake index bcb0dc70fd811a5041244dedb4a4bcf5b540dc3a..0f86231e49cdca274da27b596305144251a65f4b 100644 --- a/cmake/cross_compiling/findar.cmake +++ b/cmake/cross_compiling/findar.cmake @@ -23,7 +23,7 @@ endif() get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH) -find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH}) +find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH} NO_DEFAULT_PATH) if(NOT AR_TOOL) message(ERROR "Failed to find AR_TOOL in ${AR_PATH}") diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 7466b3e6d438277ad31020f76665bf689df436f5..3db715ba74945d9e501637af5ef3086e4f11b294 100644 --- a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -57,10 +57,14 @@ function(check_linker_flag) endforeach() set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE) endfunction() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if (LITE_ON_TINY_PUBLISH) - if(NOT LITE_WITH_PYTHON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + if((NOT LITE_WITH_PYTHON)) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + endif() + if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") diff --git a/cmake/device/apu.cmake b/cmake/device/apu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..bb690c38074dfb85ec58aa2395af3806176e5829 --- /dev/null +++ b/cmake/device/apu.cmake @@ -0,0 +1,34 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_APU) + return() +endif() + +if(NOT DEFINED APU_DDK_ROOT) + set(APU_DDK_ROOT $ENV{APU_DDK_ROOT}) + if(NOT APU_DDK_ROOT) + message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON") + endif() +endif() + +message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}") +find_path(APU_DDK_INC NAMES NeuronAdapter.h + PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH) +if(NOT APU_DDK_INC) + message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include") +endif() +message(STATUS "APU_DDK_INC: ${APU_DDK_INC}") + +include_directories("${APU_DDK_ROOT}/include") diff --git a/cmake/cross_compiling/npu.cmake b/cmake/device/npu.cmake similarity index 83% rename from cmake/cross_compiling/npu.cmake rename to cmake/device/npu.cmake index c22bb1db4fbf8a7370ff3e7c9aca40cc94d550a2..88598f4690a157b20ac1873d84ad13c2f8652725 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/device/npu.cmake @@ -17,15 +17,16 @@ if(NOT LITE_WITH_NPU) endif() if(NOT DEFINED NPU_DDK_ROOT) - set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) - if(NOT NPU_DDK_ROOT) - message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON") - endif() + set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) + if(NOT NPU_DDK_ROOT) + message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON") + endif() endif() message(STATUS "NPU_DDK_ROOT: ${NPU_DDK_ROOT}") find_path(NPU_DDK_INC NAMES HiAiModelManagerService.h - PATHS ${NPU_DDK_ROOT}/include NO_DEFAULT_PATH) + PATHS ${NPU_DDK_ROOT}/include + NO_DEFAULT_PATH) if(NOT NPU_DDK_INC) message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") endif() @@ -34,21 +35,24 @@ include_directories("${NPU_DDK_ROOT}/include") set(NPU_SUB_LIB_PATH "lib64") if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(NPU_SUB_LIB_PATH "lib64") + set(NPU_SUB_LIB_PATH "lib64") endif() if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(NPU_SUB_LIB_PATH "lib") + set(NPU_SUB_LIB_PATH "lib") endif() find_library(NPU_DDK_HIAI_FILE NAMES hiai - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) find_library(NPU_DDK_IR_FILE NAMES hiai_ir - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") @@ -76,6 +80,3 @@ endif() set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") - - - diff --git a/cmake/device/rknpu.cmake b/cmake/device/rknpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d430888072b0219bba3112534818d2e10a55579 --- /dev/null +++ b/cmake/device/rknpu.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_RKNPU) + return() +endif() + +if(NOT DEFINED RKNPU_DDK_ROOT) + set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT}) + if(NOT RKNPU_DDK_ROOT) + message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON") + endif() +endif() + +message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}") +find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h + PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH) +if(NOT RKNPU_DDK_INC) + message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include") +endif() + +include_directories("${RKNPU_DDK_ROOT}/include") + +set(RKNPU_SUB_LIB_PATH "lib64") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(RKNPU_SUB_LIB_PATH "lib64") +endif() + +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(RKNPU_SUB_LIB_PATH "lib") +endif() + +find_library(RKNPU_DDK_FILE NAMES rknpu_ddk + PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}) + +if(NOT RKNPU_DDK_FILE) + message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}") +else() + message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}") + add_library(rknpu_ddk SHARED IMPORTED GLOBAL) + set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE}) +endif() + +set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs") diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..823048552f3cb5f05375e97e94cd5b5ad63e7563 --- /dev/null +++ b/cmake/device/xpu.cmake @@ -0,0 +1,104 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_XPU) + return() +endif() + +if(NOT DEFINED XPU_SDK_ROOT) + set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) + if(NOT XPU_SDK_ROOT) + message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") + endif() +endif() +message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") + +include_directories("${XPU_SDK_ROOT}/XTDK/include") + +find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) + +if(NOT XPU_SDK_XPU_API_FILE) + message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU API Library: ${XPU_SDK_XPU_API_FILE}") + add_library(xpu_sdk_xpu_api SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_api PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_API_FILE}) +endif() + +find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) + +if(NOT XPU_SDK_XPU_RT_FILE) + message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU RT Library: ${XPU_SDK_XPU_RT_FILE}") + add_library(xpu_sdk_xpu_rt SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE}) +endif() + +set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs") + +if(LITE_WITH_XTCL) + find_path(XPU_SDK_INC NAMES xtcl.h + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) + if(NOT XPU_SDK_INC) + message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") + endif() + include_directories("${XPU_SDK_ROOT}/XTCL/include") + + find_library(XPU_SDK_XTCL_FILE NAMES xtcl + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_XTCL_FILE) + message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") + add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) + endif() + + find_library(XPU_SDK_TVM_FILE NAMES tvm + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_TVM_FILE) + message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") + add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) + endif() + + find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_LLVM_FILE) + message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") + add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) + endif() + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1") + + set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") + set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") +endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 599e7bba7eaf12da7506ce44e706bd9f50ec6998..f0cbedcba39258327519f45310f24792b4962b91 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -36,7 +36,16 @@ else() # eigen on cuda9.1 missing header of 
math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen GIT_TAG - URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen before v2.3.0 + # URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen since v2.6.0 + # github address: https://github.com/eigenteam/eigen-git-mirror + # we changed the source code to adapt for windows compiling + # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h + ###################################################################################################### + URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 142fce816de4f06aa0a36b91e3e4ecb962a8dc2a..8d094d6e064fe57b170d1a50a5457c104d3c3ac2 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(APPLE) - MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -38,7 +32,17 @@ IF(WIN32) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +ELSEIF(APPLE) + #TODO(intel-huying): + # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. + SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) ELSE() #TODO(intel-huying): # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. 
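Note: the new device backends wired in above all follow the same pattern. A `LITE_WITH_*` option gates the backend, and a root variable (read from the CMake cache or, failing that, the environment) tells the corresponding `cmake/device/*.cmake` or `cmake/mlu.cmake` script where the vendor SDK lives; each script aborts with a fatal error when that variable is missing. The sketch below is a hypothetical set of configure invocations, not part of this patch: the SDK paths are placeholders, and the other options a real build needs (toolchain, target architecture, and so on) are omitted.

```bash
# Hypothetical configure commands showing how the new options and SDK root
# variables introduced above fit together; the paths are placeholders.

# XPU (XTDK, plus XTCL when LITE_WITH_XTCL=ON) -- cmake/device/xpu.cmake
cmake .. -DLITE_WITH_XPU=ON -DLITE_WITH_XTCL=ON -DXPU_SDK_ROOT=/opt/xpu_sdk

# RKNPU -- cmake/device/rknpu.cmake
cmake .. -DLITE_WITH_RKNPU=ON -DRKNPU_DDK_ROOT=/opt/rknpu_ddk

# APU -- cmake/device/apu.cmake
cmake .. -DLITE_WITH_APU=ON -DAPU_DDK_ROOT=/opt/apu_ddk

# MLU (Neuware) -- cmake/mlu.cmake
cmake .. -DLITE_WITH_MLU=ON -DNEUWARE_HOME=/opt/neuware
```

The same roots can instead be exported as environment variables (for example `export XPU_SDK_ROOT=/opt/xpu_sdk`), which the scripts fall back to when the CMake variable is not defined.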
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index ae99f4df9a3676ae8f5b2c4c01305ead9b7a8254..57e332f1c103b28a194670de609ee521aa41cdf3 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) - find_python_module(numpy REQUIRED) + #find_python_module(numpy REQUIRED) #find_python_module(wheel REQUIRED) #find_python_module(google.protobuf REQUIRED) - FIND_PACKAGE(NumPy REQUIRED) + #FIND_PACKAGE(NumPy REQUIRED) #IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") # MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " # "please use pip to upgrade protobuf. pip install -U protobuf") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 225a3c19a16435c4df6403ff7d1bdd01e628dd72..d859404d559282970d96a735c400f745481e8efa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -276,7 +276,7 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} mklml) if(WIN32) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) - else(WIN32) + elseif(NOT APPLE) target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif(WIN32) endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index fd40fa437b52ff33089b55c6cfb7df6604a0530d..8408a79fa4265b08771e435dcc5e82801a9d40f9 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -88,6 +88,18 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_APU) + foreach(var ${lite_deps_APU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + + if (LITE_WITH_RKNPU) + foreach(var ${lite_deps_RKNPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + if (LITE_WITH_XPU) foreach(var ${lite_deps_XPU_DEPS}) set(deps ${deps} ${var}) @@ -100,6 +112,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_MLU) + foreach(var ${lite_deps_MLU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -125,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -136,14 +154,17 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} 
NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) if (args_SHARED OR ARGS_shared) @@ -154,8 +175,10 @@ function(lite_cc_library TARGET) else() cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() # collect targets need to compile for lite if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) add_dependencies(lite_compile_deps ${TARGET}) @@ -170,7 +193,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -183,15 +206,20 @@ function(lite_cc_binary TARGET) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() if (NOT APPLE) # strip binary target to reduce size if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -218,7 +246,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -239,12 +267,15 @@ function(lite_cc_test TARGET) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -254,7 +285,9 @@ function(lite_cc_test TARGET) "${TARGET}" COMMENT "Strip debug symbols done on final executable file.") endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() file(APPEND ${offline_test_registry_file} "${TARGET}\n") # collect targets need to compile for lite @@ -268,24 +301,32 @@ set(x86_kernels CACHE INTERNAL "x86 kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") +set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels 
CACHE INTERNAL "xpu kernels") +set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") +set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt") file(WRITE ${kernels_src_list} "") # clean + +# file to record faked kernels for opt python lib +set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt") +file(WRITE ${fake_kernels_src_list} "") # clean + if(LITE_BUILD_TAILOR) set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list") file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -302,63 +343,106 @@ function(add_kernel TARGET device level) if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() - - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - # the source list will collect for model_optimize_tool to fake kernel generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() endif() - # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated, - # no need to continue the compilation of the true kernel source. 
- if (LITE_ON_MODEL_OPTIMIZE_TOOL) - return() - endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - if ("${device}" STREQUAL "Host") + if (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") if (NOT LITE_WITH_ARM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") - if (NOT LITE_WITH_X86) + if (NOT LITE_WITH_X86 OR LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "NPU") if (NOT LITE_WITH_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "APU") + if (NOT LITE_WITH_APU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "XPU") if (NOT LITE_WITH_XPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "FPGA") if (NOT LITE_WITH_FPGA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "BM") if (NOT LITE_WITH_BM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "RKNPU") + if (NOT LITE_WITH_RKNPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() + if ("${device}" STREQUAL "MLU") + if (NOT LITE_WITH_MLU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") @@ -366,6 +450,9 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "CUDA") if (NOT LITE_WITH_CUDA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "") @@ -389,8 +476,11 @@ function(add_kernel TARGET device level) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} 
NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -409,16 +499,18 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() + endif() foreach(src ${args_SRCS}) if(LITE_BUILD_TAILOR) @@ -440,14 +532,40 @@ function(add_operator TARGET level) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} ) endfunction() +#only for windows +function(create_static_lib TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 60) + list(LENGTH libs libs_len) + + foreach(lib ${libs}) + list(APPEND dummy_list ${lib}) + list(LENGTH dummy_list listlen) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len})) + merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list}) + set(dummy_list) + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list}) +endfunction() # Bundle several static libraries into one. function(bundle_static_library tgt_name bundled_tgt_name fake_target) @@ -491,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) set(bundled_tgt_full_name ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) - #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") + message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}") + + if(WIN32) + set(dummy_tgt_name dummy_${bundled_tgt_name}) + create_static_lib(${bundled_tgt_name} ${static_libs}) + add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name}) + add_dependencies(${fake_target} ${tgt_name}) + + add_library(${dummy_tgt_name} STATIC IMPORTED) + set_target_properties(${dummy_tgt_name} + PROPERTIES + IMPORTED_LOCATION ${bundled_tgt_full_name} + INTERFACE_INCLUDE_DIRECTORIES $) + add_dependencies(${dummy_tgt_name} ${fake_target}) + return() + endif() if(NOT IOS) file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in diff --git a/cmake/mlu.cmake b/cmake/mlu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b73ab16462b83e952807289d511fdb95ad74c6cd --- /dev/null +++ b/cmake/mlu.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_MLU) + return() +endif() + +if(NOT DEFINED NEUWARE_HOME) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + if(NOT NEUWARE_HOME) + message(FATAL_ERROR "Must set NEUWARE_HOME or env NEUWARE_HOME when LITE_WITH_MLU=ON") + endif() +endif() + +message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}") +find_path(CNML_INC NAMES cnml.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNML_INC) + message(FATAL_ERROR "Can not find cnml.h in ${NEUWARE_HOME}/include") +endif() + +find_path(CNRT_INC NAMES cnrt.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNRT_INC) + message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include") +endif() + +include_directories("${NEUWARE_HOME}/include") + +find_library(CNML_LIB_FILE NAMES cnml + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNML_LIB_FILE) + message(FATAL_ERROR "Can not find CNML Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNML Library: ${CNML_LIB_FILE}") + add_library(cnml_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE}) +endif() + +find_library(CNRT_LIB_FILE NAMES cnrt + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNRT_LIB_FILE) + message(FATAL_ERROR "Can not find CNRT Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNRT Library: ${CNRT_LIB_FILE}") + add_library(cnrt_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE}) +endif() diff --git a/cmake/xpu.cmake b/cmake/xpu.cmake deleted file mode 100644 index 2112f6b658f5f89b20d63c957cd0b979299c350b..0000000000000000000000000000000000000000 --- a/cmake/xpu.cmake +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -if(NOT LITE_WITH_XPU) - return() -endif() - -if(NOT DEFINED XPU_SDK_ROOT) - set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) - if(NOT XPU_SDK_ROOT) - message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") - endif() -endif() - -message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") -find_path(XPU_SDK_INC NAMES xtcl.h - PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) -if(NOT XPU_SDK_INC) - message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") -endif() - -include_directories("${XPU_SDK_ROOT}/XTCL/include") -include_directories("${XPU_SDK_ROOT}/XTDK/include") - -find_library(XPU_SDK_XTCL_FILE NAMES xtcl - PATHS ${XPU_SDK_ROOT}/XTCL/so) - -if(NOT XPU_SDK_XTCL_FILE) - message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") - add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) -endif() - -find_library(XPU_SDK_TVM_FILE NAMES tvm - PATHS ${XPU_SDK_ROOT}/XTCL/so) - -if(NOT XPU_SDK_TVM_FILE) - message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") - add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) -endif() - -find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) - -if(NOT XPU_SDK_XPU_API_FILE) - message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU API Library: ${XPU_SDK_XPU_API_FILE}") - add_library(xpu_sdk_xpu_api SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xpu_api PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_API_FILE}) -endif() - -find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) - -if(NOT XPU_SDK_XPU_RT_FILE) - message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU RT Library: ${XPU_SDK_XPU_RT_FILE}") - add_library(xpu_sdk_xpu_rt SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE}) -endif() - -find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) - -if(NOT XPU_SDK_XPU_JITC_FILE) - message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}") - add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE}) -endif() - -find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) - -if(NOT XPU_SDK_LLVM_FILE) - message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") - add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) -endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0") - -set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") -set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/docs/advanced_user_guides/index.rst b/docs/advanced_user_guides/index.rst deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/advanced_user_guides/model_quantization.md b/docs/advanced_user_guides/model_quantization.md deleted file mode 100644 index 7d781ba9904400c26b64aed5f5dc764ecc5b24fa..0000000000000000000000000000000000000000 --- a/docs/advanced_user_guides/model_quantization.md +++ /dev/null @@ -1,327 +0,0 @@ -# 模型量化 - -本文主要介绍使用Paddle-Lite加载PaddlePaddle产出的量化模型,并进行推理执行。我们以MobileNetV1模型为示例,首先介绍准备量化模型,然后介绍部署执行。 - -## 准备量化模型 - -PaddlePaddle使用量化训练和训练后量化两种方法将FP32模型量化成Int8模型,下面分别介绍两种方法如何产出量化模型。 - -### 量化训练 - -目前,PaddlePaddle框架的量化训练主要针对卷积层(包括二维卷积和Depthwise卷积)、和全连接层,对应算子是conv2d、depthwise_conv2d和mul,更多量化训练的原理请参考[文档](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D)。Paddle-Lite支持运行PaddlePaddle框架量化训练产出的模型,可以进一步加快模型在移动端的执行速度。 - -温馨提示:如果您是初次接触PaddlePaddle框架,建议首先学习[新人入门](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)和[使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html)。 - - -您可以选择下载训练好的量化模型,或者使用PaddleSlim模型压缩工具训练得到量化模型。 - -#### 下载量化模型 - -官方发布了[MobileNetV1量化模型](https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip),直接下载到本地。 - -```bash -wget https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip -``` - -#### 使用PaddleSlim模型压缩工具训练量化模型 - -##### 安装PaddlePaddle - -根据操作系统、安装方式、Python版本和CUDA版本,按照[官方说明](https://paddlepaddle.org.cn/start)安装PaddlePaddle。例如: - -Ubuntu 16.04.4 LTS操作系统,CUDA9,cuDNN7,GPU版本安装: -```bash -pip install paddlepaddle-gpu==1.6.0.post97 -i https://mirrors.aliyun.com/pypi/simple/ -``` - -Ubuntu 16.04.4 LTS操作系统,CPU版本安装: -```bash -pip install paddlepaddle==1.6.0 -i https://mirrors.aliyun.com/pypi/simple/ -``` - -##### 克隆量化训练所需的代码库 - -克隆[PaddlePaddle/models](https://github.com/PaddlePaddle/models)到本地,并进入models/PaddleSlim路径。 - -```bash -git clone https://github.com/PaddlePaddle/models.git -cd models/PaddleSlim -``` - -##### 数据准备 -###### 训练数据准备 - -参考[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#data-preparation)中的数据准备教程,下载训练数据,并且保存到PaddleSlim/data路径下。 - -###### 预训练模型准备 - -参考/models/PaddleSlim/run.sh脚本, 从[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances)下载MobileNetV1的预训练模型,并保存到PaddleSlim/pretrain路径下。 - -经过以上三步,PaddleSlim目录下的文件结构如下所示: - -```bash -. 
-├── compress.py # 模型压缩任务主脚本,定义了压缩任务需要的模型相关信息 -├── configs # 压缩任务的配置文件,包括:蒸馏、int8量化量化、filter剪切和组合策略的配置文件 -├── data # 存放训练数据(需要用户自己创建) -│   └── ILSVRC2012 -├── pretrain # 存放预训练模型参数,执行run.sh自动生成 -│   ├── MobileNetV1_pretrained -│   ├── MobileNetV1_pretrained.tar -│   ├── ResNet50_pretrained -│   └── ResNet50_pretrained.tar -├── docs # 文档目录 -├── light_nas -├── models # 模型网络结构的定义,如MobileNetV1 -├── quant_low_level_api # 量化训练的底层API, 用于灵活定制量化训练的过程,适用于高阶用户 -├── reader.py # 定义数据处理逻辑 -├── README.md -├── run.sh # 模型压缩任务启动脚本 -└── utility.py # 定义了常用的工具方法 -``` - -##### 压缩脚本介绍 - -在`compress.py`中定义了执行压缩任务需要的所有模型相关的信息,这里对几个关键的步骤进行简要介绍: - -###### 目标网络的定义 - -compress.py的以下代码片段定义了train program, 这里train program只有前向计算操作。 -```python -out = model.net(input=image, class_dim=args.class_dim) -cost = fluid.layers.cross_entropy(input=out, label=label) -avg_cost = fluid.layers.mean(x=cost) -acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) -acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) -``` - -然后,通过clone方法得到eval_program, 用来在压缩过程中评估模型精度,如下: - -```python -val_program = fluid.default_main_program().clone() -``` - -定义完目标网络结构,需要对其初始化,并根据需要加载预训练模型。 - -###### 定义feed_list和fetch_list -对于train program, 定义train_feed_list用于指定从train data reader中取的数据feed给哪些variable。定义train_fetch_list用于指定在训练时,需要在log中展示的结果。如果需要在训练过程中在log中打印accuracy信心,则将('acc_top1', acc_top1.name)添加到train_fetch_list中即可。 -```python -train_feed_list = [('image', image.name), ('label', label.name)] -train_fetch_list = [('loss', avg_cost.name)] -``` - -> 注意: 在train_fetch_list里必须有loss这一项。 - -对于eval program. 同上定义eval_feed_list和train_fetch_list: - -```python -val_feed_list = [('image', image.name), ('label', label.name)] -val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)] -``` - -###### Compressor和量化配置文件 -`compress.py`主要使用Compressor和yaml文件完成对模型的量化训练工作。Compressor类的定义如下: -```python -class Compressor(object): - def __init__(self, - place, - scope, - train_program, - train_reader=None, - train_feed_list=None, - train_fetch_list=None, - eval_program=None, - eval_reader=None, - eval_feed_list=None, - eval_fetch_list=None, - teacher_programs=[], - checkpoint_path='./checkpoints', - train_optimizer=None, - distiller_optimizer=None): -``` - -在定义Compressor对象时,需要注意以下问题: -* train program如果带反向operators和优化更新相关的operators, 参数train_optimizer需要设置为None. -* eval_program中parameter的名称需要与train_program中的parameter的名称完全一致。 -* 最终保存的量化模型是在eval_program网络基础上进行剪枝保存的。所以,如果用户希望最终保存的模型可以用于inference, 则eval program需要包含推理阶段需要的各种operators. 
-* checkpoint保存的是float数据类型的模型。 - -`configs/quantization.yaml`量化配置文件示例如下: - -```python -version: 1.0 -strategies: - quantization_strategy: - class: 'QuantizationStrategy' - start_epoch: 0 - end_epoch: 9 - float_model_save_path: './output/float' - mobile_model_save_path: './output/mobile' - int8_model_save_path: './output/int8' - weight_bits: 8 - activation_bits: 8 - weight_quantize_type: 'abs_max' - activation_quantize_type: 'moving_average_abs_max' - save_in_nodes: ['image'] - save_out_nodes: ['fc_0.tmp_2'] -compressor: - epoch: 10 - checkpoint_path: './checkpoints_quan/' - strategies: - - quantization_strategy -``` -其中,可配置参数包括: -- **class:** 量化策略的类名称,目前仅支持`QuantizationStrategy`。 -- **start_epoch:** 在start_epoch开始之前,量化训练策略会往train_program和eval_program插入量化operators和反量化operators。 从start_epoch开始,进入量化训练阶段。 -- **end_epoch:** 在end_epoch结束之后,会保存用户指定格式的模型。注意:end_epoch之后并不会停止量化训练,而是继续训练直到epoch数等于compressor.epoch值为止。举例来说,当start_epoch=0,end_epoch=0,compressor.epoch=2时,量化训练开始于epoch0,结束于epoch1,但保存的模型是epoch0结束时的参数状态。 -- **float_model_save_path:** 保存float数据格式的模型路径,即该路径下的模型参数范围为int8范围但参数数据类型为float32。如果设置为None, 则不存储float格式的模型,默认为None。**注意:Paddle-Lite即使用该目录下的模型进行量化模型推理优化,详见本文[使用Paddle-Lite运行量化模型推理](#二使用Paddle-Lite运行量化模型推理)部分。** -- **int8_model_save_path:** 保存int8数据格式的模型路径,即该路径下的模型参数范围为int8范围且参数数据类型为int8。如果设置为None, 则不存储int8格式的模型,默认为None. -- **mobile_model_save_path:** 保存兼容paddle-mobile框架的模型路径。如果设置为None, 则不存储paddle-mobile格式的模型,默认为None。目前paddle-mobile已升级为Paddle-Lite。 -- **weight_bits:** 量化weight的bit数,注意偏置(bias)参数不会被量化。 -- **activation_bits:** 量化activation的bit数。 -- **weight_quantize_type:** weight量化方式,目前量化训练支持`abs_max`、 `channel_wise_abs_max`。 -- **activation_quantize_type:** activation量化方式,目前量化训练支持`range_abs_max`、`moving_average_abs_max`。PaddlePaddle中还支持 `abs_max` 方法对激活进行量化,但是该方法动态计算输入的量化scale,这会增加计算量、减慢模型推理速度,所以lite不支持 `abs_max`激活量化方式。 -- **save_in_nodes:** variable名称列表。在保存量化后模型的时候,需要根据save_in_nodes对eval programg 网络进行前向遍历剪枝。默认为eval_feed_list内指定的variable的名称列表。 -- **save_out_nodes:** varibale名称列表。在保存量化后模型的时候,需要根据save_out_nodes对eval programg 网络进行回溯剪枝。默认为eval_fetch_list内指定的variable的名称列表。 - -> **备注:** -> -> 1)`abs_max`意为在训练的每个step及inference阶段均动态计算量化scale值。`channel_wise_abs_max`与`abs_max`类似,不同点在于它会对卷积权重进行分channel求取量化scale。换言之,`abs_max`属于tensor-wise量化,而`channel_wise_abs_max`属于channel-wise量化,详细说明请猛戳[此处](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/quantization/training_quantization_model_format.md)。 -> -> 2)`moving_average_abs_max`和`range_abs_max`意为在训练阶段计算出一个静态的量化scale值,并将其用于inference阶段。`moving_average_abs_max`使用窗口滑动平均的方法计算量化scale,而`range_abs_max`则使用窗口绝对值最大值的方式。 -> -> 3)**目前,Paddle-Lite仅支持运行weight量化方式使用`abs_max`且activation量化方式使用`moving_average_abs_max`或`range_abs_max`产出的量化模型**。 - -##### 执行int8量化训练 - -修改run.sh,即注释掉`# enable GC strategy`与`# for sensitivity filter pruning`之间的内容并打开`#for quantization`相关的脚本命令(所需打开注释的命令如下所示)。 - -```bash -# for quantization -#--------------------------- -export CUDA_VISIBLE_DEVICES=0 -python compress.py \ ---batch_size 64 \ ---model "MobileNet" \ ---pretrained_model ./pretrain/MobileNetV1_pretrained \ ---compress_config ./configs/quantization.yaml \ ---quant_only True -``` -最后,运行`sh run.sh`命令开始int8量化训练。 - -上述量化训练过程完成后,若按照本文中所述`configs/quantization.yaml`文件内容配置的模型输出路径,则可在models/PaddleSlim/output目录下看到`float`、`int8`和`mobile`三个目录,其中: -* float目录: 参数范围为int8范围但参数数据类型为float32的量化模型。Paddle-Lite即使用该目录下的模型文件及参数进行量化模型的部署。 -* int8目录: 参数范围为int8范围且参数数据类型为int8的量化模型。 -* mobile目录:参数特点与int8目录相同且兼容paddle-mobile的量化模型(目前paddle-mobile已升级为Paddle-Lite)。 - -### 训练后量化 - 
-下面以MobileNetV1为例,介绍使用训练后量化方法产出量化模型。关于训练后量化的原理和详细使用方法,请参考[文档](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)。 - -> 该示例的代码放在[models/PaddleSlim/quant_low_level_api/](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)目录下。如果需要执行该示例,首先clone下来[models](https://github.com/PaddlePaddle/models.git),安装具有训练后量化功能的PaddlePaddle。因为目前Lite支持支持对conv2d、depthwise_conv2d和mul量化,所以修改[run_post_training_quanzation.sh](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/run_post_training_quanzation.sh) 脚本,设置is_full_quantize=False,然后执行该脚本;执行结束后,量化模型保存在`mobilenetv1_int8_model`目录下。下面介绍详细步骤。 - -1)**准备模型和校准数据** - -安装PaddlePaddle的develop分支编译的whl包,准备已经训练好的FP32预测模型。 - -准备校准数据,文件结构如下。val文件夹中有100张图片,val_list.txt文件中包含图片的label。 -```bash -samples_100 -└──val -└──val_list.txt -``` - -2)**配置校准数据生成器** - -MobileNetV1的输入是图片和标签,所以配置读取校准数据的sample_generator,每次返回一张图片和一个标签。详细代码在[models/PaddleSlim/reader.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/reader.py)。 - -3)**调用训练后量化** - -调用训练后量化的核心代码如下,详细代码在[post_training_quantization.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/post_training_quantization.py)。 -``` python -place = fluid.CUDAPlace(0) if args.use_gpu == "True" else fluid.CPUPlace() -exe = fluid.Executor(place) -sample_generator = reader.val(data_dir=args.data_path) - -ptq = PostTrainingQuantization( - executor=exe, - sample_generator=sample_generator, - model_dir=args.model_dir, - model_filename=args.model_filename, - params_filename=args.params_filename, - batch_size=args.batch_size, - batch_nums=args.batch_nums, - algo=args.algo, - is_full_quantize=args.is_full_quantize == "True") -quantized_program = ptq.quantize() -ptq.save_quantized_model(args.save_model_path) -``` - -## 使用Paddle-Lite运行量化模型推理 - -#### 使用模型优化工具对量化模型进行优化 - -接下来,使用原始的量化模型生成适合在移动端直接部署的模型。 - -参考[源码编译](../source_compile)配置编译环境,确保可以编译成功。参考[模型转化方法](../model_optimize_tool),首先编译model_optimize_tool工具,然后执行下面命令对量化训练的模型进行优化(注意,需要自行修改model_file、param_file和optimize_out)。 -```bash -./model_optimize_tool \ ---model_file=mobilenet_v1_quant/float/model \ ---param_file=mobilenet_v1_quant/float/weights \ ---optimize_out_type=naive_buffer \ ---optimize_out=mobilenet_v1_quant_opt \ ---valid_targets=arm \ ---prefer_int8_kernel=true -``` - -如前所述,量化训练后,float目录下的模型参数范围为int8,但参数数据类型仍为float32类型,这样确实没有起到模型参数压缩的效果。但是,经过model\_optimize\_tool工具优化后对应的量化参数均会以int8类型重新存储达到参数压缩的效果,且模型结构也被优化(如进行了各种operator fuse操作)。 - -#### 在手机端准备量化模型文件 - -使用如下命令将mobilenet_v1_quant_opt目录下的量化模型文件导入到手机端: - -```bash -adb push mobilenet_v1_quant_opt /data/local/tmp -``` - -#### 使用mobilenetv1\_light\_api运行优化后的量化模型 - -参考[源码编译](../source_compile)配置编译环境后,在Paddle-Lite执行如下命令获取轻量级API的demo: - -```bash -cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light -make clean && make -j -``` -执行完上述命令后,可在`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/`路径下看到`mobilenetv1_light_api`可执行文件。将`mobilenetv1_light_api`导入到手机端并运行量化模型推理。执行命令如下: - -```bash -adb push Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp -adb shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb shell /data/local/tmp/mobilenetv1_light_api \ - --model_dir=/data/local/tmp/mobilenet_v1_quant_opt -``` -**程序运行结果如下:** -```bash -Output dim: 1000 -Output[0]: 0.000228 -Output[100]: 0.000260 -Output[200]: 0.000250 -Output[300]: 0.000560 
-Output[400]: 0.000950 -Output[500]: 0.000275 -Output[600]: 0.005143 -Output[700]: 0.002509 -Output[800]: 0.000538 -Output[900]: 0.000969 -``` -在C++中使用Paddle-Lite API的方法请猛戳[此处](../cpp_demo),用户也可参考[mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc)的代码示例。 - -### FAQ - -**问题**:Compiled with WITH_GPU, but no GPU found in runtime - -**解答**:检查本机是否支持GPU训练,如果不支持请使用CPU训练。如果在docker进行GPU训练,请使用nvidia_docker启动容器。 - -**问题**:Inufficient GPU memory to allocation. at [/paddle/paddle/fluid/platform/gpu_info.cc:262] - -**解答**:正确设置run.sh脚本中`CUDA_VISIBLE_DEVICES`,确保显卡剩余内存大于需要内存。 diff --git a/docs/advanced_user_guides/x86.md b/docs/advanced_user_guides/x86.md deleted file mode 100644 index 7cb08683440312b0349662699b05e99df0cb6df1..0000000000000000000000000000000000000000 --- a/docs/advanced_user_guides/x86.md +++ /dev/null @@ -1,104 +0,0 @@ -# 使用X86预测库 - -Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../installation/source_compile)。 - -(注意:非docker Linux环境需要是Ubuntu16.04) - -## 编译 - -1、 下载代码 -```bash -git clone https://github.com/PaddlePaddle/Paddle-Lite.git -#需要切换到 release/v2.0.0之后版本 -git checkout -``` - -2、 源码编译 - -```bash -cd Paddle-Lite -./lite/tools/build.sh x86 -``` - -## 编译结果说明 - -x86编译结果位于 `build.lite.x86/inference_lite_lib` -**具体内容**说明: - -1、 `bin`文件夹:可执行工具文件 `test_model_bin` - -2、 `cxx`文件夹:包含c++的库文件与相应的头文件 - -- `include` : 头文件 -- `lib` : 库文件 - - 打包的静态库文件: - - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 - - `libpaddle_api_light_bundled.a` :只包含 light_api 功能的静态库 - - 打包的动态态库文件: - - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 - - `libpaddle_light_api_shared.so`:只包含 light_api 功能的动态库 - -3、 `third_party` 文件夹:第三方库文件 - -## x86预测API使用示例 - -```c++ -#include -#include -#include -#include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT - -using namespace paddle::lite_api; // NOLINT - -DEFINE_string(model_dir, "", "Model dir path."); -DEFINE_string(optimized_model_dir, "", "Optimized model dir."); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); - -int64_t ShapeProduction(const shape_t& shape) { - int64_t res = 1; - for (auto i : shape) res *= i; - return res; -} -void RunModel() { - // 1. Set CxxConfig - CxxConfig config; - config.set_model_file(FLAGS_model_dir + "model"); - config.set_param_file(FLAGS_model_dir + "params"); - - config.set_valid_places({ - lite_api::Place{TARGET(kX86), PRECISION(kFloat)} - }); - - // 2. Create PaddlePredictor by CxxConfig - std::shared_ptr predictor = - CreatePaddlePredictor(config); - - // 3. Prepare input data - std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize(shape_t({1, 3, 224, 224})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; - } - - // 4. Run predictor - predictor->Run(); - - // 5. 
Get output - std::unique_ptr output_tensor( - std::move(predictor->GetOutput(0))); - std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl; - for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - std::cout << "Output[" << i << "]:" << output_tensor->data()[i] << std::endl; - } -} - -int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - RunModel(); - return 0; -} -``` diff --git a/docs/advanced_user_guides/cv.md b/docs/api_reference/cv.md similarity index 97% rename from docs/advanced_user_guides/cv.md rename to docs/api_reference/cv.md index 1f53ac87564c80dcc15c5979a4212da5c3e730b8..5110e40c423c39e33feb084fa0d09c89ddd13d16 100644 --- a/docs/advanced_user_guides/cv.md +++ b/docs/api_reference/cv.md @@ -1,6 +1,6 @@ -# CV 图像预处理API接口介绍 +# CV图像预处理API -请把编译脚本`Paddle-Lite/lite/too/build.sh`中`BUILD_CV`变量设置为`ON`, 其他编译参数设置请参考[源码编译](../source_compile), 以确保 Lite 可以正确编译。这样`CV`图像的加速库就会编译进去,且会生成`paddle_image_preprocess.h`的API文件 +请把编译脚本`Paddle-Lite/lite/too/build.sh`中`BUILD_CV`变量设置为`ON`, 其他编译参数设置请参考[源码编译](../user_guides/source_compile), 以确保 Lite 可以正确编译。这样`CV`图像的加速库就会编译进去,且会生成`paddle_image_preprocess.h`的API文件 - 硬件平台: `ARM` - 操作系统:`MAC` 和 `LINUX` diff --git a/docs/api_reference/cxx_api_doc.md b/docs/api_reference/cxx_api_doc.md index 38385a4267d5727d9c5c7d985d3457dd011e203c..1eda7d66ca7fbec1d8280d3ae1bc6e28220be6b4 100644 --- a/docs/api_reference/cxx_api_doc.md +++ b/docs/api_reference/cxx_api_doc.md @@ -1,5 +1,5 @@ -# C++ API文档 +# C++ API ## CreatePaddlePredictor @@ -260,14 +260,14 @@ class MobileConfig; `MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、模型的内存地址(从内存加载模型时使用)、能耗模式、工作线程数等等。 -*注意:输入的模型需要使用[Model Optimize Tool](../model_optimize_tool)转化为NaiveBuffer格式的优化模型。* +*注意:输入的模型需要使用[Model Optimize Tool](../user_guides/model_optimize_tool)转化为NaiveBuffer格式的优化模型。* 示例: ```c++ MobileConfig config; // 设置NaiveBuffer格式模型目录,从文件加载模型时使用 -config.set_model_dir(FLAGS_model_dir); +config.set_model_from_file(); // 设置工作线程数 config.set_threads(4); // 设置能耗模式 @@ -277,13 +277,13 @@ config.set_power_mode(LITE_POWER_HIGH); std::shared_ptr predictor = CreatePaddlePredictor(config); ``` -### `set_model_from_file(model_dir)` +### `set_model_from_file(model_file)` 设置模型文件,当需要从磁盘加载模型时使用。 参数: -- `model_dir(std::string)` - 模型文件路径 +- `model_file(std::string)` - 模型文件路径 返回:`None` @@ -400,7 +400,7 @@ std::shared_ptr predictor = CreatePaddlePredictor - `None` -返回:内存中模型结构数据 +返回:内存中模型参数数据 返回类型:`const std::string&` @@ -589,7 +589,7 @@ for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { 根据名称获取输出Tensor的指针。 -**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。 +**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../user_guides/model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。 参数: diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/api_reference/java_api_doc.md b/docs/api_reference/java_api_doc.md new file mode 100644 index 0000000000000000000000000000000000000000..3ef8edb6e68daef0a86c04d7bb216106d36b26d5 --- /dev/null +++ b/docs/api_reference/java_api_doc.md @@ -0,0 +1,394 @@ +# Java API + +## MobileConfig + +```java +public class MobileConfig extends ConfigBase; +``` + 
+`MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。 + +*注意:输入的模型需要使用Model Optimize Tool转化为NaiveBuffer格式的优化模型。* + +示例: + +```java +MobileConfig config = new MobileConfig(); +// 设置NaiveBuffer格式模型目录 +config.setModelFromFile(modelfile); +// 设置能耗模式 +config.setPowerMode(PowerMode.LITE_POWER_HIGH); +// 设置工作线程数 +config.setThreads(1); + +// 根据MobileConfig创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); +``` + +### ``setModelFromFile(model_file)`` + +设置模型文件夹路径。 + +参数: + +- `model_file(String)` - 模型文件路径 + +返回:`None` + +返回类型:`void` + + + +### ``setModelDir(model_dir)`` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。 + +设置模型文件夹路径。 + +参数: + +- `model_dir(String)` - 模型文件夹路径 + +返回:`None` + +返回类型:`void` + + + +### ``setModelFromBuffer(model_buffer)`` + +设置模型的内存数据,当需要从内存加载模型时使用。 + +参数: + +- `model_buffer(str)` - 内存中的模型数据 + +返回:`None` + +返回类型:`void` + + + +### `getModelDir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`String` + + + +### `setPowerMode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式。 + +返回:`None` + +返回类型:`void` + + + +### `getPowerMode()` + +获取设置的CPU能耗模式。 + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `setThreads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。* + +参数: + +- `threads(int)` - 工作线程数。默认为1。 + +返回:`None` + +返回类型:`void` + + + +### `getThreads()` + +获取设置的工作线程数。 + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## PaddlePredictor + +```java +public class PaddlePredictor; +``` + +`PaddlePredictor`是Paddle-Lite的预测器。用户可以根据PaddlePredictor提供的接口使用MobileConfig创建新的预测器、设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```java +// 设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelDir(modelPath); + +// 创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +Tensor input = predictor.getInput(0); +input.resize(dims); +input.setData(inputBuffer); + +// 执行预测 +predictor.run(); + +// 获取输出数据 +Tensor output = predictor.getOutput(0); +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` + + + +### `CreatePaddlePredictor(config)` + +```java +public static PaddlePredictor createPaddlePredictor(ConfigBase config); +``` + +`CreatePaddlePredictor`用来根据`ConfigBase`动态创建预测器,目前Java API支持使用MobileConfig`。框架会根据您在config中指定的模型路径、能耗模型、工作线程数等自动创建一个预测器。 + +参数: + +- `config(ConfigBase,目前应使用MobileConfig)` - 创建预测器的配置信息 + +返回:根据config创建完成的预测器 + +返回类型:`PaddlePredictor` + + + +### `getInput(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `getOutput(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出Tensor + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:预测执行状态,成功返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `getVersion()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`String` + +## PowerMode + +```java +public enum PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + +示例: + +```java +MobileConfig config = new MobileConfig(); +// 
设置NaiveBuffer格式模型目录 +config.setModelDir(modelPath); +// 设置能耗模式 +config.setPowerMode(PowerMode.LITE_POWER_HIGH); + +// 根据MobileConfig创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); +``` + +PowerMode详细说明如下: + +| 选项 | 说明 | +| :------------------: | ------------------------------------------------------------ | +| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 | +| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 | +| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 | +| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 | +| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | +| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | + + + +## Tensor + +```c++ +public class Tensor; +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置维度、数据等。 + +*注意:用户应使用`PaddlePredictor`的`getInput`和`getOuput`接口获取输入/输出的`Tensor`。* + +示例: + +```java +// 导入Java API +import com.baidu.paddle.lite.MobileConfig; +import com.baidu.paddle.lite.Tensor; +import com.baidu.paddle.lite.Predictor; +import com.baidu.paddle.lite.PowerMode; + +// 设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelDir(modelPath); + +// 创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +// 获取输入Tensor +Tensor input = predictor.getInput(0); +// 设置输入维度 +input.resize(dims); +// 设置输入数据 +input.setData(inputBuffer); + +// 执行预测 +predictor.run(); + +// 获取输出Tensor +Tensor result = predictor.getOutput(0); +// 获取输出数据 +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` + +### `resize(dims)` + +设置Tensor的维度信息。 + +参数: + +- `dims(long[])` - 维度信息 + +返回:设置成功返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `shape()` + +获取Tensor的维度信息。 + +参数: + +- `None` + +返回:Tensor的维度信息 + +返回类型:`long[]` + + + +### `setData(data)` + +设置Tensor数据。 + +参数: + +- `data(float[])` - 需要设置的数据 + +返回:成功则返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `getFloatData()` + +获取Tensor的底层float型数据。 + +参数: + +- `None` + +返回:`Tensor`底层数据 + +返回类型:`float[]` diff --git a/docs/api_reference/python_api/CxxConfig.md b/docs/api_reference/python_api/CxxConfig.md new file mode 100755 index 0000000000000000000000000000000000000000..4ee8448a60420dd98e4bd129b2059bfe6a46a0ed --- /dev/null +++ b/docs/api_reference/python_api/CxxConfig.md @@ -0,0 +1,200 @@ +## CxxConfig + +```python +class CxxConfig; +``` + +`CxxConfig`用来配置构建CxxPredictor的配置信息,如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。 + +示例: + +```python +from paddlelite.lite import * + +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置工作线程数(该接口只支持armlinux) +# config.set_threads(4); +# 设置能耗模式(该接口只支持armlinux) +# config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) +# 设置valid places +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = lite.create_paddle_predictor(config) +``` + +### `set_model_dir(model_dir)` + +设置模型文件夹路径,当需要从磁盘加载非combined模型时使用。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + +返回类型:`None` + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`str` + + + +### 
`set_model_file(model_file)` + +设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `model_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `model_file()` + +获取设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型文件路径 + +返回类型:`str` + + + +### `set_param_file(param_file)` + +设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `param_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `param_file()` + +获取设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型参数文件路径 + +返回类型:`str` + + + +### `set_valid_places(valid_places)` + +设置可用的places列表。 + +参数: + +- `valid_places(list)` - 可用place列表。 + +返回类型:`None` + +示例: + +```python +from paddlelite.lite import * + +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置valid places +# 注意,valid_places列表中Place的排序表明了用户对Place的偏好程度,如用户想优先使用ARM上Int8精度的 +# kernel,则应把Place(TargetType.ARM, PrecisionType.INT8)置于valid_places列表的首位。 +places = [Place(TargetType.ARM, PrecisionType.INT8), + Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = create_paddle_predictor(config) +``` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式,该接口只支持`armlinux`平台。若不设置,则默认使用`PowerMode.LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`None` + + + +### `power_mode()` + +获取设置的CPU能耗模式,该接口只支持`armlinux`平台。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数,该接口只支持`armlinux`平台。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`None` + + + +### `threads()` + +获取设置的工作线程数,该接口只支持`armlinux`平台。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` diff --git a/docs/api_reference/python_api/CxxPredictor.md b/docs/api_reference/python_api/CxxPredictor.md new file mode 100755 index 0000000000000000000000000000000000000000..5c745e86ba91bd3041e0ca2b346513ce52d33658 --- /dev/null +++ b/docs/api_reference/python_api/CxxPredictor.md @@ -0,0 +1,94 @@ +## CxxPredictor + +```c++ +class CxxPredictor +``` + +`CxxPredictor`是Paddle-Lite的预测器,由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```python +from paddlelite.lite import * +from lite_core import * + +# 1. 设置CxxConfig +config = CxxConfig() +if args.model_file != '' and args.param_file != '': + config.set_model_file(args.model_file) + config.set_param_file(args.param_file) +else: + config.set_model_dir(args.model_dir) +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 2. 创建CxxPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `get_input(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `get_output(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出`Tensor` + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:`None` + +返回类型:`None` + + + +### `get_version()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`str` diff --git a/docs/api_reference/python_api/LightPredictor.md b/docs/api_reference/python_api/LightPredictor.md new file mode 100755 index 0000000000000000000000000000000000000000..a714777d52b8fe8599184d83d2c1339881d8494a --- /dev/null +++ b/docs/api_reference/python_api/LightPredictor.md @@ -0,0 +1,88 @@ +## LightPredictor + +```c++ +class LightPredictor +``` + +`LightPredictor`是Paddle-Lite的预测器,由`create_paddle_predictor`根据`MobileConfig`进行创建。用户可以根据LightPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```python +from __future__ import print_function +from paddlelite.lite import * + +# 1. 设置MobileConfig +config = MobileConfig() +config.set_model_dir(args.model_dir) + +# 2. 创建LightPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `get_input(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `get_output(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出`Tensor` + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:`None` + +返回类型:`None` + + + +### `get_version()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`str` diff --git a/docs/api_reference/python_api/MobileConfig.md b/docs/api_reference/python_api/MobileConfig.md new file mode 100755 index 0000000000000000000000000000000000000000..58b30a18cbe451f1bc95f2aa1bf829e00edde299 --- /dev/null +++ b/docs/api_reference/python_api/MobileConfig.md @@ -0,0 +1,147 @@ +## MobileConfig + +```python +class MobileConfig; +``` + +`MobileConfig`用来配置构建LightPredictor的配置信息,如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。 + +示例: + +```python +from paddlelite.lite import * + +config = MobileConfig() +# 设置NaiveBuffer格式模型目录 +config.set_model_from_file() +# 设置工作线程数 +config.set_threads(4); +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) + +# 根据MobileConfig创建LightPredictor +predictor = create_paddle_predictor(config) +``` + +### `set_model_from_file(model_file)` + +**注意**:`model_file`应该是经过`opt`优化后产生的`NaiveBuffer`格式的模型。 + +设置模型文件夹路径。 + +参数: + +- `model_file(str)` - 模型文件路径 + +返回:`None` + +返回类型:`None` + + + +### `set_model_dir(model_dir)` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。`model_dir`应该是经过`Model Optimize Tool`优化后产生的`NaiveBuffer`格式的模型。 + +设置模型文件夹路径。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + +返回类型:`None` + + + +### `set_model_from_buffer(model_buffer)` + 
+设置模型的内存数据,当需要从内存加载模型时使用。 + +参数: + +- `model_buffer(str)` - 内存中的模型数据 + +返回:`None` + +返回类型:`void` + + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`str` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`PowerMode.LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`None` + + + +### `power_mode()` + +获取设置的CPU能耗模式,该接口只支持`armlinux`平台。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数,该接口只支持`armlinux`平台。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`None` + + + +### `threads()` + +获取设置的工作线程数,该接口只支持`armlinux`平台。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` diff --git a/docs/api_reference/python_api/PowerMode.md b/docs/api_reference/python_api/PowerMode.md new file mode 100755 index 0000000000000000000000000000000000000000..30070c91b6d85b30d374eee4e938a66744c3bf10 --- /dev/null +++ b/docs/api_reference/python_api/PowerMode.md @@ -0,0 +1,33 @@ +## PowerMode + +```python +class PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + +示例: + +```python +from paddlelite.lite import * + +config = MobileConfig() +# 设置NaiveBuffer格式模型目录 +config.set_model_dir() +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) + +# 根据MobileConfig创建LightPredictor +predictor = create_paddle_predictor(config) +``` + +PowerMode详细说明如下: + +| 选项 | 说明 | +| :------------------: | ------------------------------------------------------------ | +| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 | +| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 | +| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 | +| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 | +| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | +| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | diff --git a/docs/api_reference/python_api/Tensor.md b/docs/api_reference/python_api/Tensor.md new file mode 100755 index 0000000000000000000000000000000000000000..7f2e81b643e49f5bed9bd6af4f2e5b3623bc49f5 --- /dev/null +++ b/docs/api_reference/python_api/Tensor.md @@ -0,0 +1,140 @@ +## Tensor + +```c++ +class Tensor +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置Shape、数据、LoD信息等。 + +*注意:用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。* + +示例: + +```python +from paddlelite.lite import * +from lite_core import * + +# 1. 设置CxxConfig +config = CxxConfig() +if args.model_file != '' and args.param_file != '': + config.set_model_file(args.model_file) + config.set_param_file(args.param_file) +else: + config.set_model_dir(args.model_dir) +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 2. 创建CxxPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `resize(shape)` + +设置Tensor的维度信息。 + +参数: + +- `shape(list)` - 维度信息 + +返回:`None` + +返回类型:`None` + + + +### `shape()` + +获取Tensor的维度信息。 + +参数: + +- `None` + +返回:Tensor的维度信息 + +返回类型:`list` + + + +### `float_data()` + +获取Tensor的持有的float型数据。 + +示例: + +```python +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +参数: + +- `None` + +返回:`Tensor`持有的float型数据 + +返回类型:`list` + + + +### `set_float_data(float_data)` + +设置Tensor持有float数据。 + +示例: + +```python +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) +``` + +参数: + +- `float_data(list)` - 待设置的float型数据 + +返回:`None` + +返回类型:`None` + + + +### `set_lod(lod)` + +设置Tensor的LoD信息。 + +参数: + +- `lod(list[list])` - Tensor的LoD信息 + +返回:`None` + +返回类型:`None` + + + +### `lod()` + +获取Tensor的LoD信息 + +参数: + +- `None` + +返回:`Tensor`的LoD信息 + +返回类型:`list[list]` diff --git a/docs/api_reference/python_api/TypePlace.md b/docs/api_reference/python_api/TypePlace.md new file mode 100755 index 0000000000000000000000000000000000000000..e2d223bec8598f8187240011e48ba70538007f93 --- /dev/null +++ b/docs/api_reference/python_api/TypePlace.md @@ -0,0 +1,54 @@ +## TargetType + +```python +class TargetType; +``` +`TargetType`为目标设备硬件类型,用户可以根据应用场景选择硬件平台类型。 + +枚举型变量`TargetType`的所有可能取值包括: + +`{X86, CUDA, ARM, OpenCL, FPGA, NPU}` + + +## PrecisionType +```python +class PrecisionType {FP32}; +``` +`PrecisionType`为模型中Tensor的数据精度,默认值为FP32(float32)。 + +枚举型变量`PrecisionType`的所有可能取值包括: + +`{FP32, INT8, INT32, INT64}` + + + + +## DataLayoutType + +```python +class DataLayoutType {NCHW}; +``` +`DataLayoutType`为Tensor的数据格式,默认值为NCHW(number, channel, height, weigth)。 + +枚举型变量`DataLayoutType`的所有可能取值包括: + +` {NCHW, NHWC}` + + + +## Place +```python +class Place{ + TargetType target; + PrecisionType precision{FP32}; + DataLayoutType layout{NCHW} +} +``` +`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合,说明运行时的设备类型、数据精度和数据格式。 + +示例: +```python +from lite_core import * + +Place{TargetType(ARM), PrecisionType(FP32), DataLayoutType(NCHW)} +``` diff --git a/docs/api_reference/python_api/create_paddle_predictor.md b/docs/api_reference/python_api/create_paddle_predictor.md new file mode 100755 index 0000000000000000000000000000000000000000..9d476ad674a3d0677ef04bc5f4dfd894b192884e --- /dev/null +++ b/docs/api_reference/python_api/create_paddle_predictor.md @@ -0,0 +1,32 @@ + +## create_paddle_predictor + +```python +CxxPredictor create_paddle_predictor(config); # config为CxxConfig类型 +LightPredictor create_paddle_predictor(config); # config为MobileConfig类型 +``` + +`create_paddle_predictor`函数用来根据`CxxConfig`或`MobileConfig`构建预测器。 + +示例: + +```python +from paddlelite.lite import * + +# 设置CxxConfig +config = CxxConfig() +config.set_model_dir() +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = create_paddle_predictor(config) +``` + +参数: + +- `config(CxxConfig或MobileConfig)` - 用于构建Predictor的配置信息。 + +返回:预测器`predictor` + +返回类型:`CxxPredictor`或`LightPredictor` diff --git a/docs/api_reference/python_api/opt.md b/docs/api_reference/python_api/opt.md new file mode 100755 index 0000000000000000000000000000000000000000..859d9932416e217c69cc278b12780fe77207bfce --- /dev/null +++ b/docs/api_reference/python_api/opt.md @@ -0,0 +1,128 @@ +## Opt + +```python +class Opt; +``` 
+ +`Opt`模型离线优化接口,Paddle原生模型需经`opt`优化图结构后才能在Paddle-Lite上运行。 + +示例: + +假设待转化模型问当前文件夹下的`mobilenet_v1`,可以使用以下脚本转换 + +```python +# 引用Paddlelite预测库 +from paddlelite.lite import * + +# 1. 创建opt实例 +opt=Opt() +# 2. 指定输入模型地址 +opt.set_model_dir("./mobilenet_v1") +# 3. 指定转化类型: arm、x86、opencl、xpu、npu +opt.set_valid_places("arm") +# 4. 指定模型转化类型: naive_buffer、protobuf +opt.set_model_type("naive_buffer") +# 4. 输出模型地址 +opt.set_optimize_out("mobilenetv1_opt") +# 5. 执行模型优化 +opt.run() +``` + +### `set_model_dir(model_dir)` + +设置模型文件夹路径,当需要从磁盘加载非combined模型时使用。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + + + +### `set_model_file(model_file)` + +设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `model_file(str)` - 模型文件路径 + + + +### `set_param_file(param_file)` + +设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `param_file(str)` - 模型文件路径 + + +### `set_model_type(type)` + +设置模型的输出类型,当前支持`naive_buffer`和`protobuf`两种格式,移动端预测需要转化为`naive_buffer` + +参数: + +- `type(str)` - 模型格式(`naive_buffer/protobuf`) + + + +### `set_valid_places(valid_places)` + +设置可用的places列表。 + +参数: + +- `valid_places(str)` - 可用place列表,不同place用`,`隔开 + +示例: + +```python +# 引用Paddlelite预测库 +from paddlelite.lite import * + +# 1. 创建opt实例 +opt=Opt() +# 2. 指定转化类型: arm、x86、opencl、xpu、npu +opt.set_valid_places("arm, opencl") +``` + + + + +### `set_optimize_out(optimized_model_name)` + +设置优化后模型的名称,优化后模型文件以`.nb`作为文件后缀。 + +参数: + +- `optimized_model_name(str)` + +### `run()` + +执行模型优化,用以上接口设置完 `模型路径`、`model_type`、`optimize_out`和`valid_places`后,执行`run()`接口会根据以上设置转化模型,转化后模型保存在当前路径下。 + + +### `run_optimize(model_dir, model_file, param_file, type, valid_places, optimized_model_name)` + +执行模型优化,无需设置以上接口,直接指定 `模型路径`、`model_type`、`optimize_out`和`valid_places`并执行模型转化。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 +- `model_file(str)` - 模型文件路径 +- `param_file(str)` - 模型文件路径 +- `type(str)` - 模型格式(`naive_buffer/protobuf`) +- `valid_places(str)` - 可用place列表,不同place用`,`隔开 +- `optimized_model_name(str)` + +```python +# 引用Paddlelite预测库 +from paddlelite.lite import * +# 1. 创建opt实例 +opt=Opt() +# 2. 
执行模型优化 +opt.run_optimize("./mobilenet_v1","","","arm","mobilenetv1_opt"); +``` diff --git a/docs/api_reference/python_api_doc.md b/docs/api_reference/python_api_doc.md new file mode 100755 index 0000000000000000000000000000000000000000..80b20f949b4fa3df3bcdbaaff195eb75b6443013 --- /dev/null +++ b/docs/api_reference/python_api_doc.md @@ -0,0 +1,74 @@ +# Python API + + +### [create_paddle_predictor](./python_api/create_paddle_predictor) + +创建预测执行器[`CxxPredictor`](./python_api/CxxPredictor)或者[`LightPredictor`](./python_api/LightPredictor) + +### [Opt](./python_api/opt) + +```python +class Opt; +``` + +`Opt`模型离线优化接口,Paddle原生模型需经`opt`优化图结构后才能在Paddle-Lite上运行。 + +### [CxxConfig](./python_api/CxxConfig) +```python +class CxxConfig; +``` + +`CxxConfig`用来配置构建CxxPredictor的配置信息,如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。 + + +### [MobileConfig](./python_api/MobileConfig) + +```python +class MobileConfig; +``` + +`MobileConfig`用来配置构建LightPredictor的配置信息,如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。 + + +### [CxxPredictor](./python_api/CxxPredictor) + +```python +class CxxPredictor +``` + +`CxxPredictor`是Paddle-Lite的预测器,由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + + + +### [TargetType 、PrecisionType、DataLayoutType、Place](./python_api/TypePlace) + +`TargetType`为目标设备硬件类型,用户可以根据应用场景选择硬件平台类型。 + +`PrecisionType`为模型中Tensor的数据精度,默认值为FP32(float32)。 + +`DataLayoutType`为Tensor的数据格式,默认值为NCHW(number, channel, height, weigth)。 + +`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合,说明运行时的设备类型、数据精度和数据格式。 + + + + +### [PowerMode](./python_api/PowerMode) + +```python +class PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + + + +### [Tensor](./python_api/Tensor) + +```c++ +class Tensor +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置Shape、数据、LoD信息等。 + +*注意:用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。* diff --git a/docs/benchmark/benchmark.md b/docs/benchmark/benchmark.md index efb0805fddc0bd62a2b21a130018edaa9213e0cf..2868d0e7e573d83a0fa804732c80744e566e78d3 100644 --- a/docs/benchmark/benchmark.md +++ b/docs/benchmark/benchmark.md @@ -1,4 +1,4 @@ -# Benchmark 数据 +# 性能数据 可以参考[benchmark_tools](benchmark_tools),推荐**一键benchmark**。 @@ -15,14 +15,12 @@ * int8模型 * mobilenet_v1 * mobilenet_v2 - * resnet50 * 测试机器(android ndk ndk-r17c) * 骁龙855 * xiaomi mi9, snapdragon 855 * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz - * 骁龙845 * xiaomi mi8, 845 * 2.8GHz(大四核),1.7GHz(小四核) @@ -30,20 +28,12 @@ * 骁龙835 * xiaomi mix2, snapdragon 835 * 2.45GHz(大四核),1.9GHz(小四核) - - * 骁龙625 - * oppo R9s, snapdragon625 - * A53 x 8, big core@2.0GHz - - * 骁龙653 - * 360 N5, snapdragon 653 - * 4 x A73@2.0GHz + 4 x A53@1.4GHz - + * 麒麟970 * HUAWEI Mate10 * 测试说明 - * branch: release/2.0.0 + * branch: release/v2.3.0 * warmup=10, repeats=30,统计平均时间,单位是ms * 当线程数为1时,```DeviceInfo::Global().SetRunMode```设置LITE_POWER_HIGH,否者设置LITE_POWER_NO_BIND * 模型的输入图像的维度是{1, 3, 224, 224},输入图像的每一位数值是1 @@ -55,78 +45,59 @@ #### paddlepaddle model - 骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15 -mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84 -shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66 -squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71 -mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55 +mobilenet_v1 |33.27 |19.52 |11.14 |31.72 |18.76 |10.24 | +mobilenet_v2 |29.08 |15.79 |9.25 |25.89 |14.17 |8.38 | +shufflenet_v2 |4.40 |3.09 
|2.30 |4.28 |3.02 |2.35 | +squeezenet_v1.1 |19.96 |12.61 |8.76 |18.25 |11.46 |7.97 | +mnasnet |21.00 |12.54 |7.28 |19.65 |11.65 |6.96 | - -骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64 -mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63 -shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97 -squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06 -mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09 - +mobilenet_v1 |66.36 |35.97 |19.45 |62.66 |33.87 |17.85 | +mobilenet_v2 |45.86 |25.53 |14.6 |41.58 |23.24 |13.39 | +shufflenet_v2 |7.58 |4.89 |3.41 |7.44 |4.91 |3.58 | +squeezenet_v1.1 |37.15 |22.74 |13.51 |34.69 |21.27 |12.74 | +mnasnet |40.09 |21.73 |11.91 |38.19 |21.02 |12.11 | -麒麟980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 -----| ---- | ---- | ---- | ---- |---- |---- -threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41 -mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38 -shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82 -squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66 -mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68 -麒麟970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43 -mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53 -shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91 -squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62 -mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56 +mobilenet_v1 |96.98 |53.92 |32.24 |89.31 |48.02 |27.58 | +mobilenet_v2 |67.72 |37.66 |23.82 |60.10 |34.36 |21.05 | +shufflenet_v2 |10.72 |6.62 |4.63 |10.10 |6.44 |4.63 | +squeezenet_v1.1 |53.89 |33.28 |20.73 |50.83 |32.31 |19.51 | +mnasnet |59.55 |33.53 |20.32 |56.21 |31.58 |19.06 | #### caffe model 骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 | -mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 | -shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 | - - -骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 -----| ---- | ---- | ---- | ---- |---- |---- -threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 | -mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 | -shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 | +mobilenet_v1 |33.36 |19.45 |11.26 |31.63 |18.74 |10.31 | +mobilenet_v2 |31.63 |19.21 |11.61 |28.34 |17.14 |10.16 | +shufflenet_v2 |4.46 |3.08 |2.32 |4.26 |2.98 |2.35 | -麒麟980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 | -mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 | -shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 | +mobilenet_v1 |66.32 |35.83 |19.56 |62.52 |33.79 |17.91 | +mobilenet_v2 |58.46 |32.69 |18.56 |53.72 |29.86 |16.80 | +shufflenet_v2 |7.65 |4.82 |3.46 |7.55 |4.97 |3.62 | -麒麟970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 | -mobilenet_v2 
|87.40 |50.25 |31.85 |85.55 |48.11 |28.24 | -shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 | +mobilenet_v1 |95.38 |54.09 |32.03 |95.05 |48.33 |27.54 | +mobilenet_v2 |88.46 |48.98 |30.23 |79.28 |44.64 |27.10 | +shufflenet_v2 |10.07 |6.51 |4.61 |10.31 |6.50 |4.66 | #### int8量化模型测试数据 @@ -136,6 +107,7 @@ threads num|1 |2 |4 |1 |2 |4 | mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 | mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 | + 骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | diff --git a/docs/benchmark/benchmark_tools.md b/docs/benchmark/benchmark_tools.md index 60341762b70772bc46196b836050714b9d43228b..96a67931c91f1323508bdd4d2fda6d3a55bbb307 100644 --- a/docs/benchmark/benchmark_tools.md +++ b/docs/benchmark/benchmark_tools.md @@ -1,4 +1,4 @@ -# Benchmark 测试方法 +# 测试方法 本文将会介绍,在**Ubuntu:16.04交叉编译环境**下,用安卓手机在终端测试Paddle-Lite的性能,并介绍两种Benchmark方法: @@ -28,63 +28,64 @@ List of devices attached 执行以下命令,完成Benchmark: ```shell -wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/run_benchmark.sh +# Test v2.6 branch +wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_2.6/run_benchmark.sh +sh run_benchmark.sh + +# Test v2.3 branch +wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_2.3/run_benchmark.sh sh run_benchmark.sh ``` 该`run_benchmark.sh`脚本会: -1. 下载模型,并上传手机:包含mobilenetv1/v2、shufflenetv2、squeezenetv1.1、mnasnet; +1. 下载模型,并上传手机:包含mobilenetv1、mobilenetv2、shufflenetv2、squeezenetv1.1、mnasnet、mobilenetv1_int8、mobilenetv2_int8; 2. 下载pre-built android-armv7和android-armv8的可执行文件,并上传手机:`benchmark_bin_v7`和`benchmark_bin_v8`; 3. 自动执行另一个脚本`benchmark.sh`(多台手机连接USB,请在`benchmark.sh`脚本中对`adb`命令后加上测试手机的`serial number`); 4. 从手机下载benchmark结果`result_armv7.txt`和`result_armv8.txt`,到当前目录,并显示Benchmark结果。 ## 二. 逐步Benchmark -### 1. 获取benchmark可执行文件 - -benchmark_bin文件可以测试PaddleLite的性能,有下面两种方式获得。 - -#### 方式一:下载benchmark_bin可执行文件 - -```shell -# Download benchmark_bin for android-armv7 -wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v7 - -# Download benchmark_bin for android-armv8 -wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v8 -``` - -#### 方式二:由源码编译benchmark_bin文件 +### 1. 
编译benchmark可执行文件 -根据[源码编译](../source_compile)准备编译环境,拉取PaddleLite最新release发布版代码,并在仓库根目录下,执行: +根据[源码编译](../user_guides/source_compile)准备编译环境,拉取PaddleLite最新特定分支代码,并在仓库根目录下,执行: ```shell ########################################### # Build benchmark_bin for android-armv7 # ########################################### -./lite/tools/ci_build.sh \ - --arm_os="android" \ - --arm_abi="armv7" \ - --arm_lang="gcc " \ - build_arm + +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv7 \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_extra=ON \ + --with_log=OFF \ + full_publish # `benchmark_bin` 在: /build.lite.android.armv7.gcc/lite/api/benchmark_bin ########################################### # Build benchmark_bin for android-armv8 # ########################################### -./lite/tools/ci_build.sh \ - --arm_os="android" \ - --arm_abi="armv8" \ - --arm_lang="gcc " \ - build_arm + +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_extra=ON \ + --with_log=OFF \ + full_publish # `benchmark_bin` 在: /build.lite.android.armv8.gcc/lite/api/benchmark_bin ``` > **注意**:为了避免在docker内部访问不到手机的问题,建议编译得到benchmark_bin后退出到docker外面,并且将benchmark_bin文件拷贝到一个临时目录。然后在该临时目录下,按照下面步骤下载模型、拷贝脚本、测试。 +> **注意**:如果不是测试常见分类模型(单输入,输入shape是1x3x224x224),需要根据实际情况修改`/PaddleLite/lite/api/benchmark.cc`文件,然后编译得到可执行文件。 + ### 2. 准备模型 PaddleLite为Benchmark准备好了[常见Benchmark模型](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz)。 @@ -135,53 +136,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true > 不同手机,不同版本,测试模型的性能数据不同。 ```shell -run benchmark armv7 +run benchmark armv8 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 159.8427 ms --- mobilenet_v1 avg = 235.0072 ms --- mobilenet_v2 avg = 173.0387 ms --- shufflenet_v2 avg = 76.0040 ms --- squeezenet_v11 avg = 164.2957 ms +mnasnet min = 19.83500 max = 19.38500 average = 19.65503 +mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983 +mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623 +shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890 +squeezenet min = 17.67400 max = 17.47900 average = 17.57677 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 83.1287 ms --- mobilenet_v1 avg = 121.6029 ms --- mobilenet_v2 avg = 86.6175 ms --- shufflenet_v2 avg = 41.5761 ms --- squeezenet_v11 avg = 87.8678 ms +mnasnet min = 11.85600 max = 11.72000 average = 11.77127 +mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593 +mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450 +shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400 +squeezenet min = 12.07100 max = 11.33400 average = 11.41253 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 73.3880 ms --- mobilenet_v1 avg = 119.0739 ms --- mobilenet_v2 avg = 85.3050 ms --- shufflenet_v2 avg = 38.0762 ms --- squeezenet_v11 avg = 64.2201 ms +mnasnet min = 7.19300 max = 7.02600 average = 7.08480 +mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267 +mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707 +shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477 +squeezenet min = 8.60000 max = 7.85200 average = 7.98407 -------------------------------------- -run benchmark armv8 +run benchmark armv7 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 165.3073 ms --- mobilenet_v1 avg = 306.0188 ms --- mobilenet_v2 avg = 195.1884 ms --- shufflenet_v2 
avg = 99.3692 ms --- squeezenet_v11 avg = 156.6971 ms +mnasnet min = 20.98300 max = 20.81400 average = 20.92527 +mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490 +mobilenetv2 min = 25.91400 max = 25.61700 average = 25.73097 +shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757 +squeezenet min = 19.31800 max = 19.20000 average = 19.26530 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 90.2290 ms --- mobilenet_v1 avg = 157.0007 ms --- mobilenet_v2 avg = 118.1607 ms --- shufflenet_v2 avg = 68.6804 ms --- squeezenet_v11 avg = 91.3090 ms +mnasnet min = 12.59900 max = 12.46600 average = 12.52207 +mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897 +mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843 +shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863 +squeezenet min = 12.87900 max = 12.12900 average = 12.22530 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 179.9730 ms --- mobilenet_v1 avg = 204.0684 ms --- mobilenet_v2 avg = 181.6486 ms --- shufflenet_v2 avg = 123.2728 ms --- squeezenet_v11 avg = 412.9046 ms +mnasnet min = 7.31400 max = 7.12900 average = 7.20357 +mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383 +mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907 +shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360 +squeezenet min = 8.27000 max = 8.10600 average = 8.19000 -------------------------------------- ``` diff --git a/docs/benchmark/index.rst b/docs/benchmark/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/demo_guides/android_app_demo.md b/docs/demo_guides/android_app_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..7c40e1eb52bec0112b98fac7b1c49ef79273089f --- /dev/null +++ b/docs/demo_guides/android_app_demo.md @@ -0,0 +1,133 @@ +# Android Demo + +## 多种应用场景 + +我们提供的Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo),其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。涵盖[人脸识别](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/face_detection_demo)、[人像分割](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/human_segmentation_demo)、[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)4个应用场景。 + +### 1. 人脸识别 + +人脸检测是Paddle-Lite提供的人像检测demo。在移动端上提供了高精度、实时的人脸检测能力,能处理基于人脸检测的业务场景。在移动端预测的效果图如下: + +
+（人脸检测demo在移动端运行的效果图）
+ +### 2. 人像分割 + +人像分割是Paddle-Lite 提供的图像分割demo ,在移动端上提供了实时的人像分割能力,可以应用到证件照自动抠图、面积测量、智能交通(标记车道和交通标志)等场景。 在移动端预测的效果图如下: + +
+（人像分割demo在移动端运行的效果图）
+ +### 3. 图像分类 + +图像分类是Paddle-Lite 提供的图像处理demo ,在移动端上提供了实时的物体识别能力,可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下: + +
+（图像分类demo在移动端运行的效果图）
+ +### 4. 物体检测 + +物体检测是Paddle-Lite 提供的图像识别demo ,在移动端上提供了检测多个物体的位置、名称及数量的能力。可以应用到视频监控(是否有违规物体或行为)、工业质检(微小瑕疵的数量和位置)、医疗诊断(细胞计数、中药识别)等场景。在移动端预测的效果图如下: + +
+（物体检测demo在移动端运行的效果图）
+ +## Android demo部署方法 + +下面我们以 **目标检测示例(object_detection_demo)** 为例讲解如何部署。 + +**目的**:将基于Paddle-Lite预测库的Android APP 部署到手机,实现物体检测 + +**需要的环境**: Android Studio、Android手机(开启USB调试模式)、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程 + +**部署步骤**: + +1、 目标检测的Android示例位于 `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo` + +2、用Android Studio 打开object_detection_demo工程 (本步骤需要联网)。 + +3、手机连接电脑,打开**USB调试**和**文件传输模式**,在Android Studio上连接自己的手机设备(手机需要开启允许从 USB安装软件权限) + +![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png) + +4、按下 Run按钮,自动编译APP并安装到手机。(该过程会自动下载Paddle-Lite预测库和模型,需要联网) + +成功后效果如下,图一:APP安装到手机 图二: APP打开后的效果,会自动识别图片中的物体并标记 + +
+（图一、图二:APP安装与运行效果图）
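+
+**注意**:若第3步中Android Studio未能检测到手机,可先在终端确认adb能够识别设备(以下命令仅为连通性检查的示例,输出中的serial number以实际设备为准):
+
+```shell
+# 列出当前已通过USB调试连接的设备,应能看到手机的serial number
+adb devices
+```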
+ +## Android demo结构讲解 + +Android 示例的代码结构如下图所示: + +
+（Android示例代码结构图）
+ + + 1、 Predictor.java: 预测代码 + +```shell +# 位置: +object_detection_demo/app/src/main/java/com/baidu/paddle/lite/demo/object_detection/Predictor.java +``` + + 2、 model.nb : 模型文件 (opt 工具转化后Paddle-Lite模型);pascalvoc_label_list:训练模型时的`labels`文件 + +```shell +# 位置: +object_detection_demo/app/src/main/assets/models/ssd_mobilenet_v1_pascalvoc_for_cpu/model.nb +object_detection_demo/app/src/main/assets/labels/pascalvoc_label_list +``` + + 3、 libpaddle_lite_jni.so、PaddlePredictor.jar:Paddle-Lite Java 预测库与Jar包 + +```shell +# 位置 +object_detection_demo/app/src/main/jniLibs/arm64-v8a/libpaddle_lite_jni.so +object_detection_demo/app/libs/PaddlePredictor.jar +``` + + 4、 build.gradle : 定义编译过程的 gradle 脚本。(不用改动,定义了自动下载Paddle-Lite预测和模型的过程) + +```shell +# 位置 +object_detection_demo/app/build.gradle +``` + + + +## 代码讲解 (使用Paddle-Lite Java API 执行预测) + +Android 示例基于Java API 开发,调用Paddle-Lite Java API包括以下五步。更详细的API 描述参考: [Paddle-Lite Java API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。 + +```c++ +// 导入Java API +import com.baidu.paddle.lite.MobileConfig; +import com.baidu.paddle.lite.Tensor; +import com.baidu.paddle.lite.Predictor; +import com.baidu.paddle.lite.PowerMode; + +// 1. 写入配置:设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelFromFile(); // 设置Paddle-Lite模型路径 +config.setPowerMode(PowerMode.LITE_POWER_NO_BIND); // 设置CPU运行模式 +config.setThreads(4); // 设置工作线程数 + +// 2. 创建 PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 3. 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +Tensor input = predictor.getInput(0); +input.resize(dims); +input.setData(inputBuffer); + +// 4. 执行预测 +predictor.run(); + +// 5. 
获取输出数据 +Tensor result = predictor.getOutput(0); +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` diff --git a/docs/demo_guides/baidu_xpu.md b/docs/demo_guides/baidu_xpu.md new file mode 100644 index 0000000000000000000000000000000000000000..ead2c958e1028ef217f09a8db8796f266d6646ee --- /dev/null +++ b/docs/demo_guides/baidu_xpu.md @@ -0,0 +1,243 @@ +# PaddleLite使用百度XPU预测部署 + +Paddle Lite已支持百度XPU在x86和arm服务器(例如飞腾 FT-2000+/64)上进行预测部署。 +目前支持Kernel和子图两种接入方式,其中子图接入方式与之前华为NPU类似,即加载并分析Paddle模型,将Paddle算子转成XTCL组网API进行网络构建,在线生成并执行模型。 + +## 支持现状 + +### 已支持的芯片 + +- 昆仑818-100(推理芯片) +- 昆仑818-300(训练芯片) + +### 已支持的设备 + +- K100/K200昆仑AI加速卡 + +### 已支持的Paddle模型 + +- [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz) +- [BERT](https://paddlelite-demo.bj.bcebos.com/models/bert_fp32_fluid.tar.gz) +- [ERNIE](https://paddlelite-demo.bj.bcebos.com/models/ernie_fp32_fluid.tar.gz) +- YOLOv3 +- Mask R-CNN +- Faster R-CNN +- UNet +- SENet +- SSD +- 百度内部业务模型(由于涉密,不方便透露具体细节) + +### 已支持(或部分支持)的Paddle算子(Kernel接入方式) + +- scale +- relu +- tanh +- sigmoid +- stack +- matmul +- pool2d +- slice +- lookup_table +- elementwise_add +- elementwise_sub +- cast +- batch_norm +- mul +- layer_norm +- softmax +- conv2d +- io_copy +- io_copy_once +- __xpu__fc +- __xpu__multi_encoder +- __xpu__resnet50 +- __xpu__embedding_with_eltwise_add + +### 已支持(或部分支持)的Paddle算子(子图/XTCL接入方式) + +- relu +- tanh +- conv2d +- depthwise_conv2d +- elementwise_add +- pool2d +- softmax +- mul +- batch_norm +- stack +- gather +- scale +- lookup_table +- slice +- transpose +- transpose2 +- reshape +- reshape2 +- layer_norm +- gelu +- dropout +- matmul +- cast +- yolo_box + + +## 参考示例演示 + +### 测试设备(K100昆仑AI加速卡) + +![baidu_xpu](https://paddlelite-demo.bj.bcebos.com/devices/baidu/baidu_xpu.jpg) + +### 准备设备环境 + +- K100/200昆仑AI加速卡[规格说明书](https://paddlelite-demo.bj.bcebos.com/devices/baidu/K100_K200_spec.pdf),如需更详细的规格说明书或购买产品,请联系欧阳剑ouyangjian@baidu.com; +- K100为全长半高PCI-E卡,K200为全长全高PCI-E卡,要求使用PCI-E x16插槽,且需要单独的8针供电线进行供电; +- 安装K100/K200驱动,目前支持Ubuntu和CentOS系统,由于驱动依赖Linux kernel版本,请正确安装对应版本的驱动安装包。 + +### 准备本地编译环境 + +- 为了保证编译环境一致,建议参考[源码编译](../user_guides/source_compile)中的Linux开发环境进行配置; +- 由于编译示例程序需要依赖OpenCV和CMake 3.10.3,请执行如下命令进行安装; + +```shell +$ sudo apt-get update +$ sudo apt-get install gcc g++ make wget unzip libopencv-dev pkg-config +$ wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz +$ tar -zxvf cmake-3.10.3.tar.gz +$ cd cmake-3.10.3 +$ ./configure +$ make +$ sudo make install +``` + +### 运行图像分类示例程序 + +- 从[https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz)下载示例程序,解压后清单如下: + +```shell +- PaddleLite-linux-demo + - image_classification_demo + - assets + - images + - tabby_cat.jpg # 测试图片 + - labels + - synset_words.txt # 1000分类label文件 + - models + - resnet50_fp32_224_fluid # Paddle fluid non-combined格式的resnet50 float32模型 + - __model__ # Paddle fluid模型组网文件,可拖入https://lutzroeder.github.io/netron/进行可视化显示网络结构 + - bn2a_branch1_mean # Paddle fluid模型参数文件 + - bn2a_branch1_scale + ... 
+ - shell + - CMakeLists.txt # 示例程序CMake脚本 + - build + - image_classification_demo # 已编译好的,适用于amd64的示例程序 + - image_classification_demo.cc # 示例程序源码 + - build.sh # 示例程序编译脚本 + - run.sh # 示例程序运行脚本 + - libs + - PaddleLite + - amd64 + - include # PaddleLite头文件 + - lib + - libiomp5.so # Intel OpenMP库 + - libmklml_intel.so # Intel MKL库 + - libxpuapi.so # XPU API库,提供设备管理和算子实现。 + - llibxpurt.so # XPU runtime库 + - libpaddle_full_api_shared.so # 预编译PaddleLite full api库 + - arm64 + - include # PaddleLite头文件 + - lib + - libxpuapi.so # XPU API库,提供设备管理和算子实现。 + - llibxpurt.so # XPU runtime库 + - libpaddle_full_api_shared.so # 预编译PaddleLite full api库 +``` + +- 进入PaddleLite-linux-demo/image_classification_demo/shell,直接执行./run.sh amd64即可; + +```shell +$ cd PaddleLite-linux-demo/image_classification_demo/shell +$ ./run.sh amd64 # 默认已生成amd64版本的build/image_classification_demo,因此,无需重新编译示例程序就可以执行。 +$ ./run.sh arm64 # 需要在arm64(FT-2000+/64)服务器上执行./build.sh arm64后才能执行该命令。 +... +AUTOTUNE:(12758016, 16, 1, 2048, 7, 7, 512, 1, 1, 1, 1, 0, 0, 0) = 1by1_bsp(1, 32, 128, 128) +Find Best Result in 150 choices, avg-conv-op-time = 40 us +[INFO][XPUAPI][/home/qa_work/xpu_workspace/xpu_build_dailyjob/api_root/baidu/xpu/api/src/wrapper/conv.cpp:274] Start Tuning: (12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0) +AUTOTUNE:(12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0) = wpinned_bsp(1, 171, 16, 128) +Find Best Result in 144 choices, avg-conv-op-time = 79 us +I0502 22:34:18.176113 15876 io_copy_compute.cc:75] xpu to host, copy size 4000 +I0502 22:34:18.176406 15876 io_copy_compute.cc:36] host to xpu, copy size 602112 +I0502 22:34:18.176697 15876 io_copy_compute.cc:75] xpu to host, copy size 4000 +iter 0 cost: 2.116000 ms +I0502 22:34:18.178530 15876 io_copy_compute.cc:36] host to xpu, copy size 602112 +I0502 22:34:18.178792 15876 io_copy_compute.cc:75] xpu to host, copy size 4000 +iter 1 cost: 2.101000 ms +I0502 22:34:18.180634 15876 io_copy_compute.cc:36] host to xpu, copy size 602112 +I0502 22:34:18.180881 15876 io_copy_compute.cc:75] xpu to host, copy size 4000 +iter 2 cost: 2.089000 ms +I0502 22:34:18.182726 15876 io_copy_compute.cc:36] host to xpu, copy size 602112 +I0502 22:34:18.182976 15876 io_copy_compute.cc:75] xpu to host, copy size 4000 +iter 3 cost: 2.085000 ms +I0502 22:34:18.184814 15876 io_copy_compute.cc:36] host to xpu, copy size 602112 +I0502 22:34:18.185068 15876 io_copy_compute.cc:75] xpu to host, copy size 4000 +iter 4 cost: 2.101000 ms +warmup: 1 repeat: 5, average: 2.098400 ms, max: 2.116000 ms, min: 2.085000 ms +results: 3 +Top0 tabby, tabby cat - 0.689418 +Top1 tiger cat - 0.190557 +Top2 Egyptian cat - 0.112354 +Preprocess time: 1.553000 ms +Prediction time: 2.098400 ms +Postprocess time: 0.081000 ms +``` + +- 如果需要更改测试图片,可将图片拷贝到PaddleLite-linux-demo/image_classification_demo/assets/images目录下,然后将run.sh的IMAGE_NAME设置成指定文件名即可; +- 如果需要重新编译示例程序,直接运行./build.sh amd64或./build.sh arm64即可。 + +```shell +$ cd PaddleLite-linux-demo/image_classification_demo/shell +$ ./build.sh amd64 # For amd64 +$ ./build.sh arm64 # For arm64(FT-2000+/64) +``` + +### 更新模型 + +- 通过Paddle Fluid训练,或X2Paddle转换得到ResNet50 float32模型[resnet50_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz); +- 由于XPU一般部署在Server端,因此将使用PaddleLite的full api加载原始的Paddle Fluid模型进行预测,即采用CXXConfig配置相关参数。 + +### 更新支持百度XPU的Paddle Lite库 + +- 下载PaddleLite源码; + +```shell +$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git +$ cd Paddle-Lite +$ git checkout +``` + +- 下载xpu_toolchain for amd64 or 
arm64(FT-2000+/64); + +```shell +$ wget +$ tar -xvf output.tar.gz +$ mv output xpu_toolchain +``` + +- 编译full_publish for amd64 or arm64(FT-2000+/64); + +```shell +For amd64,如果报找不到cxx11::符号的编译错误,请将gcc切换到4.8版本。 +$ ./lite/tools/build.sh --build_xpu=ON --xpu_sdk_root=./xpu_toolchain x86 + +For arm64(FT-2000+/64) +$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --build_xpu=ON --xpu_sdk_root=./xpu_toolchain --with_log=ON full_publish +``` + +- 将编译生成的build.lite.x86/inference_lite_lib/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/amd64/include目录; +- 将编译生成的build.lite.x86/inference_lite_lib/cxx/include/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so文件; +- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/arm64/include目录; +- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so文件。 + +## 其它说明 + +- 如需更进一步的了解相关产品的信息,请联系欧阳剑ouyangjian@baidu.com; +- 百度昆仑的研发同学正在持续适配更多的Paddle算子,以便支持更多的Paddle模型。 diff --git a/docs/demo_guides/cpp_demo.md b/docs/demo_guides/cpp_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..55abd3a70fe23dd0e8798d6a772ee216140c2875 --- /dev/null +++ b/docs/demo_guides/cpp_demo.md @@ -0,0 +1,266 @@ +# C++ Demo + +## 1. 下载最新版本预测库 + +预测库下载界面位于[Paddle-Lite官方预编译库](../user_guides/release_lib),可根据需求选择合适版本。 + +以**Android-ARMv8架构**为例,可以下载以下版本: + + +|ARM Version|build_extra|arm_stl|target|下载| +|:-------:|:-----:|:-----:|:-----:|:-------:| +|armv8|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.tiny_publish.tar.gz)| + +**解压后内容如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/1inference_lib.png) + +## 2. 转化模型 + +PaddlePaddle的原生模型需要经过[opt]()工具转化为Paddle-Lite可以支持的naive_buffer格式。 + +以`mobilenet_v1`模型为例: + +(1)下载[mobilenet_v1模型](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz)后解压: + +```shell +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxf mobilenet_v1.tar.gz +``` + +**如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/3inference_model.png) + +(2)下载[opt工具](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt)。放入同一文件夹,终端输入命令转化模型: + +```shell +wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt +chmod +x opt +./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=./mobilenet_v1_opt +``` + +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/2opt_model.png) + + + +## 3. 编写预测程序 + +准备好预测库和模型,我们便可以编写程序来执行预测。我们提供涵盖图像分类、目标检测等多种应用场景的C++示例demo可供参考,位于`inference_lite_lib.android.armv8/demo/cxx`。 + +以mobile net_v1预测为例:`mobile_light`为mobilenet_v1预测示例,可以直接调用。 + +**示例如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/4light_demo.png) + + + +## 4. 编译 + +预测程序需要编译为Android可执行文件。 + +以mobilenet_v1模型为例,C++示例位于`inference_lite_lib.android.armv8/demo/mobile_light` + +```shell +cd inference_lite_lib.android.armv8/demo/mobile_light +``` + +编译demo + +```shell +make +``` + +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/5compile_demo.png) + +## 5. 
执行预测 + +通过adb工具将可执行文件推送到手机上执行预测 + +(1)保证电脑已经安装adb工具,手机以"USB调试"、"文件传输模式"连接到电脑。 + +``` shell +adb deveices #查看adb设备是否已被识别 +``` + +**连接如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/6adb_devices.png) + +(2)准备预测库、模型和预测文件 + +1、将模型、动态库和预测文件放入同一文件夹: + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/7files.png) + +**注意**:动态预测库文件位于: `inference_lite_lib.android.armv8/cxx/liblibpaddle_light_api_shared.so` + +2、文件推送到手机: + +``` shell +chmod +x mobilenetv1_light_api +adb push mobilenet_v1_opt.nb /data/local/tmp +adb push libpaddle_light_api_shared.so /data/local/tmp +adb push mobilenetv1_light_api /data/local/tmp +``` +**效果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/8push_file.png) + +(3)执行预测 + +```shell +adb shell 'cd /data/local/tmp && export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp && mobilenetv1_light_api ./mobilenet_v1_opt.nb' +``` +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/9result.png) + +上图的`Output`为mobilenet_v1模型在全1输入时,得到的预测输出。至此,Paddle-Lite的C++ demo执行完毕。 + + + + + +## 注:如何在代码中使用 API + +C++代码调用Paddle-Lite执行预测库仅需以下五步: + +(1)引用头文件和命名空间 + +```c++ +#include "paddle_api.h" +using namespace paddle::lite_api; +``` + +(2)指定模型文件,创建Predictor + +```C++ +// 1. Set MobileConfig, model_file_path is +// the path to model model file. +MobileConfig config; +config.set_model_from_file(model_file_path); +// 2. Create PaddlePredictor by MobileConfig +std::shared_ptr predictor = + CreatePaddlePredictor(config); +``` + +(3)设置模型输入 (下面以全一输入为例) + +```c++ +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +input_tensor->Resize({1, 3, 224, 224}); +auto* data = input_tensor->mutable_data(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} +``` + +(4)执行预测 + +```c++ +predictor->Run(); +``` + +(5)获得预测结果 + +```c++ +std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); +// 转化为数据 +auto output_data=output_tensor->data(); +``` + + + + + +## 其他cxx_demo的编译与预期结果 + +### Light API Demo + +```shell +cd ../mobile_light +make +adb push mobilenetv1_light_api /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_light_api +adb shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt " +``` + + +### 图像分类 Demo + +```shell +cd ../mobile_classify +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +make +adb push mobile_classify /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push labels.txt /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_classify +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1.opt /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +``` + +### 目标检测 Demo + +```shell +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb push mobile_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_detection_result.jpg ./ +``` + +### light API Demo 运行结果 + +运行成功后 
,将在控制台输出预测结果的前10个类别的预测概率: + +```shell +Output dim: 1000 +Output[0]: 0.000191 +Output[100]: 0.000160 +Output[200]: 0.000264 +Output[300]: 0.000211 +Output[400]: 0.001032 +Output[500]: 0.000110 +Output[600]: 0.004829 +Output[700]: 0.001845 +Output[800]: 0.000202 +Output[900]: 0.000586 +``` + +### 图像分类 Demo 运行结果 + +运行成功后 ,将在控制台输出预测结果的前5个类别的类型索引、名字和预测概率: + +```shell +parameter: model_dir, image_path and label_file are necessary +parameter: topk, input_width, input_height, are optional +i: 0, index: 285, name: Egyptian cat, score: 0.482870 +i: 1, index: 281, name: tabby, tabby cat, score: 0.471593 +i: 2, index: 282, name: tiger cat, score: 0.039779 +i: 3, index: 287, name: lynx, catamount, score: 0.002430 +i: 4, index: 722, name: ping-pong ball, score: 0.000508 +``` + +### 目标检测 Demo 运行结果 + +运行成功后 ,将在控制台输出检测目标的类型、预测概率和坐标: + +```shell +running result: +detection image size: 935, 1241, detect object: person, score: 0.996098, location: x=187, y=43, width=540, height=592 +detection image size: 935, 1241, detect object: person, score: 0.935293, location: x=123, y=639, width=579, height=597 +``` diff --git a/docs/user_guides/cuda.md b/docs/demo_guides/cuda.md similarity index 72% rename from docs/user_guides/cuda.md rename to docs/demo_guides/cuda.md index 45597057bb18c44b60234459f9a49a59b54135f6..f863fd86864194c6d022e4cf1fc75eb46725cc2c 100644 --- a/docs/user_guides/cuda.md +++ b/docs/demo_guides/cuda.md @@ -1,4 +1,4 @@ -# Lite基于CUDA的模型预测 +# PaddleLite使用CUDA预测部署 Lite支持在x86_64,arm64架构上(如:TX2)进行CUDA的编译运行。 @@ -28,7 +28,27 @@ cd Paddle-Lite ./lite/tools/build.sh --build_python=ON cuda ``` -编译结束会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 +## 编译结果说明 + +cuda的编译结果位于 `build_cuda/inference_lite_lib` +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件,目前为空 + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- `include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 + - 打包的动态态库文件: + - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 + +3、 `third_party` 文件夹:第三方库文件 + +4、 `demo` 文件夹:c++ demo. + +如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`。 ## 运行 @@ -36,7 +56,6 @@ cd Paddle-Lite 一: 下载darknet_yolov3模型,模型信息请参考[这里](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3) - ``` # 下载模型 wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz @@ -47,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg 二: 运行 -**NOTE:**此处示例使用的是python接口,后续会开放C++接口以及示例。 +**NOTE:** 此处示例使用的是python接口。 ``` python #-*- coding: utf-8 -*- @@ -56,7 +75,7 @@ import sys import numpy as np import cv2 sys.path.append('build_cuda/inference_lite_lib/python/lib') -from lite_core import * +from lite import * def read_img(im_path, resize_h, resize_w): im = cv2.imread(im_path).astype('float32') @@ -107,4 +126,14 @@ print (output_tensor.float_data()[:6]) ``` -**NOTE:** 对CUDA的支持还在持续开发中。 +**NOTE:** 此处示例使用的是C++接口。 + +``` +cd build_cuda/inference_lite_lib/demo/cxx/ +mkdir build && cd build +cmake .. 
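+# 编译C++示例程序(此处假设本机已正确安装与预测库编译时相匹配的CUDA工具链)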
+make +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz +tar -zxf yolov3_infer.tar.gz +./demo yolov3_infer +``` diff --git a/docs/user_guides/fpga.md b/docs/demo_guides/fpga.md similarity index 97% rename from docs/user_guides/fpga.md rename to docs/demo_guides/fpga.md index a7c398af2036cab7d914a692ce4f8fdbae13d45c..f7885fd3b7f6600fe890332d2805a386008659e5 100644 --- a/docs/user_guides/fpga.md +++ b/docs/demo_guides/fpga.md @@ -1,4 +1,4 @@ -# Lite基于FPGA的模型预测 +# PaddleLite使用FPGA预测部署 Paddle Lite支持基于arm的FPGA zu3/zu5/zu9的模型预测,提供armv8的交叉编译 @@ -22,7 +22,7 @@ CMAKE编译选项: - 设置`LITE_WITH_FPGA=ON`和`LITE_WITH_ARM=ON` -其他编译选项与ARM编译相同,可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile)。 +其他编译选项与ARM编译相同,可以参考[“Paddle Lite在Docker下的ARM编译”](../user_guides/source_compile)。 示例如下: ```shell cmake .. \ diff --git a/docs/demo_guides/ios_app_demo.md b/docs/demo_guides/ios_app_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..2d9bbcbf83e1703a116d65c7ce8379638bd13cfe --- /dev/null +++ b/docs/demo_guides/ios_app_demo.md @@ -0,0 +1,129 @@ +# iOS Demo + +## 多种应用场景 + +我们提供Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo),其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。iOS demo涵盖[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)2个应用场景。 + +### 1. 图像分类 + +图像分类是Paddle-Lite 提供的图像处理demo ,在移动端上提供了实时的物体识别能力,可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下: + +

     

+ +### 2. 物体检测 + +物体检测是Paddle-Lite 提供的图像识别demo,在移动端上提供了检测多个物体的位置、名称及数量的能力。可以应用到视频监控(是否有违规物体或行为)、工业质检(微小瑕疵的数量和位置)、医疗诊断(细胞计数、中药识别)等场景。在移动端预测的效果图如下: + +

     

+ +## iOS demo部署方法 + +下面我们以**目标检测(object_detection_demo)**为例讲解如何部署iOS工程。 + +**目的**:将基于Paddle-Lite预测库的iOS APP部署到苹果手机,实现物体检测。 + +**需要的环境**:Mac 电脑上安装Xcode、苹果手机、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程 + +**部署步骤**: + +1、 目标检测的iOS示例位于 `Paddle-Lite-Demo/PaddleLite-ios-demo/object_detection_demo` + +2、终端中执行 `download_dependencies.sh`脚本自动下载模型和Paddle-Lite预测库 + +```shell +cd PaddleLite-ios-demo # 1. 终端中进入 Paddle-Lite-Demo/PaddleLite-ios-demo +sh download_dependencies.sh # 2. 执行脚本下载依赖项 (需要联网) +``` + +下载完成后会出现提示: `Extract done ` + +3、用Xcode打开`object_detection_demo/detection_demo.xcodeproj`文件,修改工程配置。 +依次修改 `General/Identity`和`Signing&Capabilities`属性,替换为自己的工程代号和团队名称。(必须修改,否则无法通过编译) + +![Xcode1](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode1.png) + + + +![Xcode2](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode2.png) + +4、 iPhone手机连接电脑,在Xcode中连接自己的手机 (第一次连接iPhone到电脑时,需要在iPhone的`设置->通用->设备管理`中选择本电脑并信任) + +

+ +5、按下左上角的 Run按钮,Xcode会自动编译APP并安装到手机。在苹果手机中设置信任该APP(进入`设置->通用->设备管理`,选中新安装的APP并`验证该应用`)。 + +成功后效果如下。图一:APP已安装到手机;图二:APP打开后的效果,会自动识别图片中的物体并标记。 + +

     

+ +## iOS demo结构讲解 + +iOS 示例的代码结构如下图所示: + +

+ + 1、 mobilenetv1-ssd: 模型文件 (opt 工具转化后Paddle-Lite模型) + +```shell +# 位置: +ios-detection_demo/detection_demo/models/mobilenetv1-ssd +``` + + 2、 libpaddle_api_light_bundled.a、paddle_api.h : Paddle-Lite C++ 预测库和头文件 + +```shell +# 位置: +# iOS预测库 +ios-detection_demo/detection_demo/lib/libpaddle_api_light_bundled.a +# 预测库头文件 +ios-detection_demo/detection_demo/include/paddle_api.h +ios-detection_demo/detection_demo/include/paddle_use_kernels.h +ios-detection_demo/detection_demo/include/paddle_use_ops.h +``` + + 3、 ViewController.mm:主要预测代码 + +```shell +# 位置 +ios-detection_demo/detection_demo/ViewController.mm +``` + +## 代码讲解 (如何使用Paddle-Lite C++ API 执行预测) + +IOS 示例基于C++ API 开发,调用Paddle-Lite C++ API包括以下五步。更详细的API 描述参考: [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。 + +```c++ +#include +// 引入C++ API +#include "paddle_lite/paddle_api.h" +#include "paddle_lite/paddle_use_ops.h" +#include "paddle_lite/paddle_use_kernels.h" + +// 1. 设置MobileConfig +MobileConfig config; +config.set_model_from_file(); // 设置NaiveBuffer格式模型路径 +config.set_power_mode(LITE_POWER_NO_BIND); // 设置CPU运行模式 +config.set_threads(4); // 设置工作线程数 + +// 2. 创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); + +// 3. 设置输入数据 +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +input_tensor->Resize({1, 3, 224, 224}); +auto* data = input_tensor->mutable_data(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} + +// 4. 执行预测 +predictor->run(); + +// 5. 获取输出数据 +std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); +std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; +for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; +} +``` diff --git a/docs/user_guides/java_demo.md b/docs/demo_guides/java_demo.md similarity index 94% rename from docs/user_guides/java_demo.md rename to docs/demo_guides/java_demo.md index 4a09826cd45f6ae1b8c46331d54d2f61af32fb14..ad37e7b95dbd439ccc7393af27140a404e16cf07 100644 --- a/docs/user_guides/java_demo.md +++ b/docs/demo_guides/java_demo.md @@ -9,7 +9,7 @@ ## 编译 -首先在PaddleLite的开发 [Docker镜像](../source_compile) 中,拉取最新PaddleLite代码,编译对应你手机架构的预测库, +首先在PaddleLite的开发 [Docker镜像](../user_guides/source_compile) 中,拉取最新PaddleLite代码,编译对应你手机架构的预测库, 下面我们以arm8 架构举例。进入paddlelite 目录,运行以下命令: ```shell @@ -73,7 +73,7 @@ resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_o 下载完后,assets文件夹里要包含解压后的上面五个模型文件夹,但demo里不需要保存原压缩.tar.gz 文件。 -注意:输入的模型要求为naive buffer存储格式,您可以通过 [**Model Optimize Tool**](../model_optimize_tool) 将fluid模型转为naive buffer存储格式。 +注意:输入的模型要求为naive buffer存储格式,您可以通过 [**Model Optimize Tool**](../user_guides/model_optimize_tool) 将fluid模型转为naive buffer存储格式。 ## 运行 Android 程序结果 diff --git a/docs/demo_guides/mediatek_apu.md b/docs/demo_guides/mediatek_apu.md new file mode 100644 index 0000000000000000000000000000000000000000..d2ad860ec850325a07893de89fe2a2ad3b01dc32 --- /dev/null +++ b/docs/demo_guides/mediatek_apu.md @@ -0,0 +1,173 @@ +# PaddleLite使用MTK APU预测部署 + +Paddle Lite已支持MTK APU的预测部署。 +其接入原理是与之前华为NPU类似,即加载并分析Paddle模型,将Paddle算子转成MTK的Neuron adapter API(类似Android NN API)进行网络构建,在线生成并执行模型。 + +## 支持现状 + +### 已支持的芯片 + +- [MT8168](https://www.mediatek.cn/products/tablets/mt8168)/[MT8175](https://www.mediatek.cn/products/tablets/mt8175)及其他智能芯片。 + +### 已支持的设备 + +- MT8168-P2V1 Tablet。 + +### 已支持的Paddle模型 + +- 
[全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz) + +### 已支持(或部分支持)的Paddle算子 + +- relu +- conv2d +- depthwise_conv2d +- elementwise_add +- elementwise_mul +- fc +- pool2d +- softmax + +## 参考示例演示 + +### 测试设备(MT8168-P2V1 Tablet) + +![mt8168_p2v1_tablet_front](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_front.jpg) + +![mt8168_p2v1_tablet_back](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_back.jpg) + +### 准备设备环境 + +- 由于需要依赖特定版本的firmware,感兴趣的同学通过MTK官网[https://www.mediatek.cn/about/contact-us](https://www.mediatek.cn/about/contact-us)提供的联系方式(类别请选择"销售"),获取测试设备和firmware; + +### 准备交叉编译环境 + +- 为了保证编译环境一致,建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置。 + +### 运行图像分类示例程序 + +- 从[https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz)下载示例程序,解压后清单如下: + +```shell +- PaddleLite-android-demo + - image_classification_demo + - assets + - images + - tabby_cat.jpg # 测试图片 + - labels + - synset_words.txt # 1000分类label文件 + - models + - mobilenet_v1_int8_224_for_cpu.nb # 已通过opt转好的、适合arm cpu的mobilenetv1量化模型 + - mobilenet_v1_int8_224_for_apu.nb # 已通过opt转好的、适合mtk apu的mobilenetv1量化模型 + - shell # android shell端的示例程序 + - CMakeLists.txt # 示例程序CMake脚本 + - build + - image_classification_demo # 已编译好的android shell端的示例程序 + - image_classification_demo.cc # 示例程序源码 + - build.sh # 示例程序编译脚本 + - run.sh # 示例程序运行脚本 + - apk # 常规android应用程序 + - app + - src + - main + - java # java层代码 + - cpp # 自定义的jni实现 + - app.iml + - build.gradle + - gradle + ... + - libs + - PaddleLite + - arm64-v8a + - include # PaddleLite头文件 + - lib + - libc++_shared.so + - libpaddle_light_api_shared.so # 预编译PaddleLite库 + - OpenCV # OpenCV 4.2 for android +``` + +- Android shell端的示例程序 + - 进入PaddleLite-android-demo/image_classification_demo/shell,直接执行./run.sh即可,注意:run.sh不能在docker环境执行,否则可能无法找到设备; + - 如果需要更改测试图片,可将图片拷贝到PaddleLite-android-demo/image_classification_demo/assets/images目录下,然后将run.sh的IMAGE_NAME设置成指定文件名即可; + - 如果需要重新编译示例程序,直接运行./build.sh即可,注意:build.sh的执行必须在docker环境中,否则可能编译出错; + - 需要说明的是,由于MTK APU暂时只支持NHWC的数据布局格式,而PaddleLite默认使用NCHW的数据布局格式,导致额外增加了预测中输入张量的NCHW到NHWC的转换,大约耗费8~9ms。 +```shell +$ cd PaddleLite-android-demo/image_classification_demo/shell +$ ./run.sh +... 
+warmup: 5 repeat: 10, average: 30.998502 ms, max: 31.049002 ms, min: 30.937002 ms +results: 3 +Top0 Egyptian cat - -0.122845 +Top1 tabby, tabby cat - -0.122845 +Top2 tiger cat - -0.544028 +Preprocess time: 3.620000 ms +Prediction time: 30.998502 ms +Postprocess time: 0.069000 ms + +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b00000, pa = 0xfb3f9000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af8000, pa = 0xfb3fa000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af7000, pa = 0xf8ffe000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af6000, pa = 0xf7bfe000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af5000, pa = 0xf7bfd000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0c000, pa = 0xfb3fe000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0b000, pa = 0xfb3ff000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0a000, pa = 0xf31ff000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b09000, pa = 0xfb3f6000, len = 255 +[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b08000, pa = 0xf7bff000, len = 255 +``` + +- 常规Android应用程序 + - 安装Android Studio 3.4 + - 打开Android Studio,在"Welcome to Android Studio"窗口点击"Open an existing Android Studio project",在弹出的路径选择窗口中进入"PaddleLite-android-demo/image_classification_demo/apk"目录,然后点击右下角的"Open"按钮即可导入工程; + - 通过USB连接Android手机、平板或开发板; + - 临时关闭selinux模式,允许app调用系统库; +```shell +$ adb root +# setenforce 0 +``` + - 待工程加载完成后,点击菜单栏的Build->Rebuild Project按钮,如果提示CMake版本不匹配,请点击错误提示中的'Install CMake xxx.xxx.xx'按钮,重新安装CMake,然后再次点击菜单栏的Build->Rebuild Project按钮; + - 待工程编译完成后,点击菜单栏的Run->Run 'App'按钮,在弹出的"Select Deployment Target"窗口选择已经连接的Android设备,然后点击"OK"按钮; + - 等待大约1分钟后(第一次时间比较长,需要耐心等待),app已经安装到设备上。默认使用ARM CPU模型进行预测,由于MT8168的CPU由四核Arm-Cortex A53组成,性能较一般手机的A7x系列要弱很多,如下图所示,只有6fps; + +![mt8168_p2v1_tablet_cpu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_cpu.jpg) + + - 点击app界面右下角的设置按钮,在弹出的设置页面点击"Choose pre-installed models",选择"mobilenet_v1_int8_for_apu",点击返回按钮后,app将切换到APU模型,如下图所示,帧率提高到14fps。 + +![mt8168_p2v1_tablet_apu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_apu.jpg) + + +### 更新模型 + +- 通过Paddle Fluid训练,或X2Paddle转换得到MobileNetv1 foat32模型[mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz); +- 参考[模型量化-有校准数据训练后量化](../user_guides/post_quant_with_data)使用PaddleSlim对float32模型进行量化(注意:由于MTK APU只支持量化OP,在启动量化脚本时请注意相关参数的设置),最终得到全量化MobileNetV1模型[mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz); +- 参考[模型转化方法](../user_guides/model_optimize_tool),利用opt工具转换生成MTK APU模型,仅需要将valid_targets设置为apu,arm即可。 +```shell +$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_int8_224_for_apu \ + --valid_targets=apu,arm +``` +- 注意:opt生成的模型只是标记了MTK APU支持的Paddle算子,并没有真正生成MTK APU模型,只有在执行时才会将标记的Paddle算子转成MTK Neuron adapter API调用实现组网,最终生成并执行模型。 + +### 更新支持MTK APU的Paddle Lite库 + +- 下载PaddleLite源码和APU DDK; +```shell +$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git +$ cd Paddle-Lite +$ git checkout +$ wget https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz +$ tar -xvf apu_ddk.tar.gz +``` +- 编译tiny_publish for MT8168-P2V1 Tablet +```shell +$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc 
--android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk tiny_publish +``` +- 将编译生成的build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/include替换PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/include目录; +- 将编译生成的build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so文件。 + + +## 其它说明 + +- 由于涉及到License的问题,无法提供用于测试的firmware,我们深感抱歉。如果确实对此非常感兴趣,可以参照之前提到的联系方式,直接联系MTK的销售; +- MTK研发同学正在持续增加用于适配Paddle算子bridge/converter,以便适配更多Paddle模型。 diff --git a/docs/advanced_user_guides/npu.md b/docs/demo_guides/npu.md similarity index 52% rename from docs/advanced_user_guides/npu.md rename to docs/demo_guides/npu.md index c84a3c3bd151dbc1574a0d874bacfbcd0af330a3..7b37d13350c93c4c39e2970d23024d291f6edd2f 100644 --- a/docs/advanced_user_guides/npu.md +++ b/docs/demo_guides/npu.md @@ -1,4 +1,4 @@ -# 使用华为NPU +# PaddleLite使用NPU(华为)预测部署 Paddle Lite是首款支持华为自研达芬奇架构NPU(Kirin 810/990 SoC搭载的NPU)的预测框架。 原理是在线分析Paddle模型,将Paddle算子转成HiAI IR后,调用HiAI IR/Builder/Runtime APIs生成并执行HiAI模型。 @@ -91,7 +91,7 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared tiny_publish ``` -注意:为了保证编译环境一致,建议参考[源码编译](../installation/source_compile)中的Docker开发环境进行配置,然后再执行上述命令。 +注意:为了保证编译环境一致,建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置,然后再执行上述命令。 ## 优化生成NPU模型 @@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ --valid_targets=npu,arm \ - --prefer_int8_kernel=(true|false) \ --record_tailoring_info =(true|false) ``` - model_optimize_tool生成的模型只是标记了NPU支持的Paddle算子,并没有真正生成NPU HiAI模型,只有在执行时才会将标记的Paddle算子转成HiAI IR,最终生成并执行HiAI模型,具体实现参考PR[2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576)。 @@ -111,19 +110,91 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an ## 通过JAVA接口加载并执行NPU模型 -- 使用方法和[Java实例](../user_guides/java_demo)一致,无需额外设置任何参数,只需将模型换成NPU模型即可。[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中的Image Classification Demo for Android是同时支持CPU和NPU两种模型的图像分类Demo。 +**注意:由于华为手机root权限限制,现在仅支持JAVA接口加载和执行NPU模型** -注意:在拷贝libpaddle_lite_jni.so的时候,由于依赖HiAI DDK so和libc++_shared.so库,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,拷到libpaddle_lite_jni.so同级目录下。 - -## 通过C++接口加载并执行NPU模型 - -- 使用方法和[C++实例](../user_guides/cpp_demo)一致,同样无需额外设置任何参数,只需将模型换成NPU模型即可。 - -注意:1)不能使用安卓模拟器,需要使用真实设备,且必须是支持NPU的华为手机。2)在使用adb push命令向手机推送目标程序时,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,推送到目标程序同级目录下。 +- 使用方法和[Java实例](java_demo)一致,无需额外设置任何参数,只需将模型换成NPU模型即可。[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中的Image Classification Demo for Android是同时支持CPU和NPU两种模型的图像分类Demo。 +注意:在拷贝libpaddle_lite_jni.so的时候,由于依赖HiAI DDK so和libc++_shared.so库,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,拷到libpaddle_lite_jni.so同级目录下。 ## 其它说明 - 华为达芬奇架构的NPU内部大量采用float16进行运算,因此,预测结果会存在偏差,但大部分情况下精度不会有较大损失,可参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中Image Classification Demo for Android对同一张图片CPU与NPU的预测结果。 - 华为Kirin 810/990 Soc搭载的自研达芬奇架构的NPU,与Kirin 970/980 Soc搭载的寒武纪NPU不一样,同样的,与Hi3559A、Hi3519A使用的NNIE也不一样,Paddle Lite只支持华为自研达芬奇架构NPU。 - 我们正在持续增加能够适配HiAI IR的Paddle算子bridge/converter,以便适配更多Paddle模型,同时华为研发同学也在持续对HiAI 
IR性能进行优化。 + + +## 手动分割子图 + +### 背景 +- Paddle-Lite已经支持了大量的华为NPU的算子,但是仍然不能满足所有模型的需求。对于一个有部分算子不支持的模型,Paddle-Lite会将模型划分为可以跑在NPU上的子图和跑在CPU上的子图,实现NPU和CPU自动调度的功能,通常情况下可以获得比较好的性能。在一些特殊情况下,模型会被自动划分为比较多的子图,导致CPU和NPU的切换开销很大,从而导致整体性能变差。因此,需要手动分割子图的功能来指定一些算子跑在CPU上,避免子图过多。 + +### 功能 +- 通过配置文件来指定需要强制跑在CPU上的算子 + +### 使用方法 +- 1、通过netron打开paddle模型文件,可以查看模型结构,获得算子的类型、输入名称。输出名称。 + - 注意:Paddle-Lite会对模型进行优化,模型算子可以改变,需要以优化后的模型算子为准。后面会举例说明。 +- 2、生成配置文件 ```split_cfg.txt```,记录需要跑在CPU上的算子信息。 + - 每行一条OP记录信息,以冒号":"分隔"op名称","op输入名","op输出名",以逗号","分隔"op输入名"和"op输出名"中的不同var名。 + - 可以部分省略输入或者输出名。比如:```op3:in3_var0```表示,指定类型为"op3",输入为"in3_var0"的算子;```op4```表示所有类型为"op4"的算子 + - 例子1: + ``` + op0:in0_var0,in0_var1:out0_var0,out0_var1 + op1:in1_var0,in1_var1:out1_var0 + op2::out2_var0 + op3:in3_var0 + op4 + ``` + - 例子2: + ``` + transpose:conv2d_22.tmp_1:transpose_0.tmp_0 + ``` + ![image](https://user-images.githubusercontent.com/50474132/80475316-4a5fda80-897b-11ea-910a-6aee13243387.png) + +- 3、使用环境变量```SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE```指定配置文件的位置。 + - 例如: + ``` + export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=/data/local/tmp/split_sfg.txt + ``` +- 4、以上步骤完成后,运行的模型中符合条件的算子将被强制跑在CPU上。 + +### 举例 +- 以模型[image](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz)为例 + +- 1、可以使用netron查看模型 + +- 2、初步分析 + + - 下图是ssd_mobilenet_v1中的部分结构。其中红色部分暂时不支持在NPU上运行,蓝色部分可能NPU上的性能不理想。此时,如果直接让预测库自动调度的话,可能会分成多个子图,而且整体性能不佳。因此,可以将蓝色部分和绿色部分整体指定在CPU上运行,让其他部分自动运行在NPU上(红色部分会自动在CPU上运行)。 + ![ssd_mobilenet_v1_example](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png) + +- 3、使用opt转换模型 + + - opt转换过程中会打印log信息。在log中搜索```digraph G```和```// end G```可以找到优化后的模型图。 + ![image](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png) + ![image](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png) + - 将从```digraph G```开始的,到```// end G```结束的整段模型图信息,保存到```.dot```格式的文件中。可以用```graphviz```打开查看,或者在[网页版](http://dreampuf.github.io/GraphvizOnline/)查看。 + ![image](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png) + - 在此处确认需要被指定的算子是否被优化了。(期望是被指定的算子都还独立存在,如果被融合为了一个算子,需要指定此时融合后的算子)。 + +- 4、写配置文件 + + - 在配置文件中指定可以支持NPU但是需要指定在CPU上运行的算子。 + ``` + reshape + transpose + concat + softmax + ``` + - 由于这些算子都指定在CPU上运行,因此不需要特意配置算子的输入输出名称。 + +- 5、指定配置文件路径 + + - 通过```export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=your_split_config_file```的方式实现。 + +- 6、性能测试 + + - 设备:华为mate30 5G + - HIAI ddk版本:320 + - 性能:CPU约71.8ms,NPU约16.6ms。 + diff --git a/docs/demo_guides/opencl.md b/docs/demo_guides/opencl.md new file mode 100644 index 0000000000000000000000000000000000000000..31a0e411566297d5556e6b7fffcec1343cd83781 --- /dev/null +++ b/docs/demo_guides/opencl.md @@ -0,0 +1,213 @@ +# PaddleLite使用OpenCL预测部署 + +Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环境下armv8、armv7的交叉编译。 + +## 1. 编译 + +### 1.1 编译环境 + +1. Docker 容器环境; +2. 
Linux(推荐 Ubuntu 16.04)环境。 + +详见 **源码编译指南-环境准备** 章节。 + +### 1.2 编译Paddle-Lite OpenCL库范例 + +注:以android/armv7/opencl的目标、Docker容器的编译开发环境为例,CMake3.10,android-ndk-r17c位于`/opt/`目录下。 + +#### 针对 Lite 用户的编译命令(无单元测试,有编译产物,适用于benchmark) + +- `with_opencl`: `[ON | OFF]`,编译OpenCL必选; +- `arm_abi`: `[armv7 | armv8]`; +- `toolchain`: `[gcc | clang]`; +- `build_extra`: `[OFF | ON]`,编译全量op和kernel,包含控制流NLP相关的op和kernel体积会大,编译时间长; +- `build_cv`: `[OFF | ON]`,编译arm cpu neon实现的的cv预处理模块; +- `android_stl`: `[c++_shared | c++_static | gnu_static | gnu_shared]`,paddlelite的库以何种方式链接`android_stl`,选择`c++_shared`得到的动态库体积更小,但使用时候记得上传paddlelite所编译版本(armv7或armv8)一致的`libc++_shared.so`。默认使用`c++_static`。 + +```bash +###################################### +# 假设当前位于处于Lite源码根目录下 # +###################################### + +# 导入NDK_ROOT变量,注意检查NDK安装目录若与本示例是否不同 +export NDK_ROOT=/opt/android-ndk-r17c + +# 删除上一次CMake自动生成的.h文件 +rm ./lite/api/paddle_use_kernels.h +rm ./lite/api/paddle_use_ops.h + +# 设置编译参数并开始编译 +./lite/tools/build_android.sh \ + --arch=armv7 \ + --toolchain=clang \ + --with_cv=OFF \ + --with_log=OFF \ + --with_extra=OFF \ + --with_opencl=ON + +# 注:编译帮助请执行: ./lite/tools/build_android.sh help +``` + +注:该方式的编译产物中的`demo/cxx/mobile_light`适用于做benchmark,该过程不会打印开发中加入的log,注意需要提前转好模型。关于使用,详见下文**运行示例1: 编译产物demo示例**。 + +#### 针对 Lite 开发者的编译命令(有单元测试,编译产物) + +注:调用`./lite/tools/ci_build.sh`执行编译,该命令会编译armv7和armv8的opencl库。虽然有编译产物,但因编译单元测试,编译产物包体积可能较大,生产环境不推荐使用。 + +```bash +# 假设当前位于处于Lite源码根目录下 + +# 导入NDK_ROOT变量,注意检查您的安装目录若与本示例不同 +export NDK_ROOT=/opt/android-ndk-r17c + +# 删除上一次CMake自动生成的.h文件 +rm ./lite/api/paddle_use_kernels.h +rm ./lite/api/paddle_use_ops.h + +# 根据指定编译参数编译 +./lite/tools/ci_build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + build_opencl +``` + +注:如果要调试cl kernel,假设已经完成上述脚本编译(已生成cmake文件)。调试只需要修改`./lite/backends/opencl/cl_kernel/`下对应的kernel文件,保存后在项目根目录执行`python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc`,该命令会自动将修改后,再切到build目录下执行`make publish_inference`或者你要编译的单测的可执行文件名,cl kernel文件的内容会随着编译自动打包到产物包如 .so 中或者对应单测可执行文件中。 + +### 1.3 编译产物说明 + +编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,根据编译参数不同,文件夹名字会略有不同。这里仅罗列关键产物: + +- `cxx`:该目录是编译目标的C++的头文件和库文件; +- `demo`:该目录包含了两个demo,用来调用使用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`,分别对应`mobile_full`和`mobile_light`文件夹。编译对应的demo仅需在`mobile_full`或`mobile_light`文 + - `mobile_full`:使用cxx config,可直接加载fluid模型,若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏,详细见该文件的代码注释; + - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型。 +注:`opencl`实现的相关kernel已经打包到动态库中。 + +```bash +. +|-- cxx +| |-- include +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib +| |-- libpaddle_api_full_bundled.a +| |-- libpaddle_api_light_bundled.a +| |-- libpaddle_full_api_shared.so +| `-- libpaddle_light_api_shared.so +`-- demo + `-- cxx + |-- Makefile.def + |-- README.md + |-- include + | |-- paddle_api.h + | |-- paddle_lite_factory_helper.h + | |-- paddle_place.h + | |-- paddle_use_kernels.h + | |-- paddle_use_ops.h + | `-- paddle_use_passes.h + |-- mobile_full + | |-- Makefile + | `-- mobilenetv1_full_api.cc + `-- mobile_light + |-- Makefile + `-- mobilenetv1_light_api.cc +``` + +调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`见下一部分运行示例。 + + + +## 2. 
运行示例 + +下面以android的环境为例,介绍3个示例,分别如何在手机上执行基于OpenCL的ARM GPU推理过程。 + +### 2.1 运行示例1: 编译产物demo示例和benchmark + +需要提前用模型优化工具opt转好模型(下面假设已经转换好模型,且模型名为`mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb`)。编译脚本为前文**针对 Lite 用户的编译命令(无单元测试,有编译产物,适用于benchmark)**。 + +```bash +################################# +# 假设当前位于build.xxx目录下 # +################################# + +# prepare enviroment on phone +adb shell mkdir -p /data/local/tmp/opencl/ + +# build demo +cd inference_lite_lib.android.armv7.opencl/demo/cxx/mobile_light/ +make +cd - + +# push executable binary, library to device +adb push inference_lite_lib.android.armv7.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/ +adb shell chmod +x /data/local/tmp/opencl/mobilenetv1_light_api +adb push inference_lite_lib.android.armv7.opencl/cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/opencl/ + +# push model with optimized(opt) to device +adb push ./mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb /data/local/tmp/opencl/ + +# run demo on device +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/opencl/; \ + /data/local/tmp/opencl/mobilenetv1_light_api \ + /data/local/tmp/opencl/mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb \ + 1 3 224 224 \ + 100 10 0" # round=100, warmup=10, print_output_tensor=0 +``` + +**注:** 权重参数会在第一次运行时加载,所以第一次执行时间略长。一般将warmup的值设为10,repeats值设为多次。 + +### 2.2 运行示例2: test_mobilenetv1单元测试 + +编译脚本为前文**针对 Lite 开发者的编译命令(有单元测试,编译产物)**。 + +- **运行文件准备** + +```bash +# 在/data/local/tmp目录下创建OpenCL文件目录 +adb shell mkdir -p /data/local/tmp/opencl + +# 将mobilenet_v1的模型文件推送到/data/local/tmp/opencl目录下 +adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1 +adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/ + +# 将OpenCL单元测试程序test_mobilenetv1,推送到/data/local/tmp/opencl目录下 +adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/local/tmp/opencl +``` + +- **执行OpenCL推理过程** + +```bash +adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1 + +adb shell "export GLOG_v=1; \ + /data/local/tmp/opencl/test_mobilenetv1 \ + --model_dir=/data/local/tmp/opencl/mobilenetv1_fluid/ \ + --warmup=10 \ + --repeats=100" +``` + +### 2.3 运行示例3: test_layout_opencl单元测试 + +编译脚本为前文**针对 Lite 开发者的编译命令(有单元测试,编译产物)**。 + +```bash +adb shell mkdir -p /data/local/tmp/opencl +adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/ +adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl +adb shell "export GLOG_v=4; \ + /data/local/tmp/opencl/test_layout_opencl" +``` + +## 3. 
如何在Code中使用 + +即编译产物`demo/cxx/mobile_light`目录下的代码,在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc); + +注:这里给出的链接会跳转到线上最新develop分支的代码,很可能与您本地的代码存在差异,建议参考自己本地位于`lite/demo/cxx/`目录的代码,查看如何使用。 + +**NOTE:** 对OpenCL的支持还在持续开发中。 diff --git a/docs/demo_guides/rockchip_npu.md b/docs/demo_guides/rockchip_npu.md new file mode 100644 index 0000000000000000000000000000000000000000..c207e7e486d658b98a604b9e66a79210ac45e45e --- /dev/null +++ b/docs/demo_guides/rockchip_npu.md @@ -0,0 +1,157 @@ +# PaddleLite使用RK NPU预测部署 + +Paddle Lite已支持RK NPU的预测部署。 +其接入原理是与之前华为NPU类似,即加载并分析Paddle模型,将Paddle算子转成RK组网API进行网络构建,在线生成并执行模型。 + +## 支持现状 + +### 已支持的芯片 + +- RK1808, RK1806,暂时不支持RK3399Pro。 + +### 已支持的设备 + +- RK1808/1806 EVB。 + +### 已支持的Paddle模型 + +- [全量化MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz) + +### 已支持(或部分支持)的Paddle算子 + +- relu +- conv2d +- depthwise_conv2d +- pool2d +- fc +- softmax +- batch_norm +- concat +- elementwise_add +- elementwise_sub +- elementwise_mul +- elementwise_div + +## 参考示例演示 + +### 测试设备(RK1808 EVB) + +![rk1808_evb_front](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_front.jpg) + +![rk1808_evb_back](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_back.jpg) + +### 准备设备环境 + +- 需要依赖特定版本的firmware,请参照[rknpu_ddk](https://github.com/airockchip/rknpu_ddk)的说明对设备进行firmware的更新; +- 由于RK1808 EVB在刷firmware后,只是一个纯净的Linux系统,无法像Ubuntu那样使用apt-get命令方便的安装软件,因此,示例程序和PaddleLite库的编译均采用交叉编译方式; +- 将MicroUSB线插入到设备的MicroUSB OTG口,就可以使用Android的adb命令进行设备的交互,再也不用配置网络使用ssh或者通过串口的方式访问设备了,这个设计非常赞! + +### 准备交叉编译环境 + +- 为了保证编译环境一致,建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置。 + +### 运行图像分类示例程序 + +- 从[https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz)下载示例程序,解压后清单如下: + +```shell +- PaddleLite-linux-demo + - image_classification_demo + - assets + - images + - tabby_cat.jpg # 测试图片 + - tabby_cat.raw # 已处理成raw数据的测试图片 + - labels + - synset_words.txt # 1000分类label文件 + - models + - mobilenet_v1_int8_224_for_cpu.nb # 已通过opt转好的、适合arm cpu的mobilenetv1量化模型 + - mobilenet_v1_int8_224_for_rknpu.nb # 已通过opt转好的、适合rknpu的mobilenetv1量化模型 + - shell + - CMakeLists.txt # 示例程序CMake脚本 + - build + - image_classification_demo # 已编译好的示例程序 + - image_classification_demo.cc # 示例程序源码 + - convert_to_raw_image.py # 将测试图片保存为raw数据的python脚本 + - build.sh # 示例程序编译脚本 + - run.sh # 示例程序运行脚本 + - libs + - PaddleLite + - arm64 + - include # PaddleLite头文件 + - lib + - libGAL.so # RK DDK库 + - libOpenVX.so + - libVSC.so + - librknpu_ddk.so + - libgomp.so.1 # gnuomp库 + - libpaddle_light_api_shared.so # 预编译PaddleLite库 + - armhf + - include # PaddleLite头文件 + - lib + - libGAL.so + - libOpenVX.so + - libVSC.so + - librknpu_ddk.so + - libgomp.so.1 + - libpaddle_light_api_shared.so +``` + +- 进入PaddleLite-linux-demo/image_classification_demo/shell,直接执行./run.sh arm64即可,注意:run.sh不能在docker环境执行,否则无法找到设备; +```shell +$ cd PaddleLite-linux-demo/image_classification_demo/shell +$ ./run.sh arm64 # For RK1808 EVB +$ ./run.sh armhf # For RK1806 EVB +... 
+warmup: 5 repeat: 10, average: 6.499500 ms, max: 6.554000 ms, min: 6.468000 ms +results: 3 +Top0 Egyptian cat - 0.532328 +Top1 tabby, tabby cat - 0.345136 +Top2 tiger cat - 0.111146 +Preprocess time: 2.414000 ms +Prediction time: 6.499500 ms +Postprocess time: 0.414000 ms +``` +- 如果需要更改测试图片,可通过convert_to_raw_image.py工具生成; +- 如果需要重新编译示例程序,直接运行./build.sh即可,注意:build.sh的执行必须在docker环境中,否则可能编译出错。 + + +### 更新模型 + +- 通过Paddle Fluid训练,或X2Paddle转换得到MobileNetv1 foat32模型[mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz); +- 参考[模型量化-有校准数据训练后量化](../user_guides/post_quant_with_data)使用PaddleSlim对float32模型进行量化(注意:由于RK NPU只支持tensor-wise的全量化模型,在启动量化脚本时请注意相关参数的设置),最终得到全量化MobileNetV1模型[mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz); +- 参考[模型转化方法](../user_guides/model_optimize_tool),利用opt工具转换生成RKNPU模型,仅需要将valid_targets设置为rknpu,arm即可。 +```shell +$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_int8_224_for_rknpu \ + --valid_targets=rknpu,arm +``` +- 注意:opt生成的模型只是标记了RKNPU支持的Paddle算子,并没有真正生成RK NPU模型,只有在执行时才会将标记的Paddle算子转成RK NPU组网API,最终生成并执行模型。 + +### 更新支持RK NPU的Paddle Lite库 + +- 下载PaddleLite源码和RK DDK; +```shell +$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git +$ cd Paddle-Lite +$ git checkout +$ git clone https://github.com/airockchip/rknpu_ddk.git +``` +- 编译full_publish and tiny_publish for RK1808 and RK1806 EVB +```shell +For RK1808 EVB +$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish +$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish + +For RK1806 EVB +$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish +$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish +``` +- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/arm64/include目录; +- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so文件; +- 将编译生成的build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/armhf/include目录; +- 将编译生成的build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so文件。 + +## 其它说明 + +- RK研发同学正在持续增加用于适配Paddle算子bridge/converter,以便适配更多Paddle模型。 diff --git a/docs/demo_guides/x86.md b/docs/demo_guides/x86.md new file mode 100644 index 0000000000000000000000000000000000000000..9d31aab05b31df8f96caa1cb70b302cd02f879ff --- /dev/null +++ b/docs/demo_guides/x86.md @@ -0,0 +1,242 @@ +# PaddleLite使用X86预测部署 + +## 一、Docker或者Linux环境 + +Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../user_guides/source_compile)。 + +(注意:非docker Linux环境需要是Ubuntu16.04) + +### 编译 + +1、 下载代码 +```bash +# 下载Paddle-Lite源码 +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +# 切换到release分支 +git checkout release/v2.6.0 
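+# 注意:上面的 git checkout 需在克隆得到的 Paddle-Lite 仓库目录内执行,若尚未进入该目录,请先执行 cd Paddle-Lite 再切换分支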
+``` + +2、 源码编译 + +```bash +cd Paddle-Lite +./lite/tools/build.sh x86 + +# 其他可选择编译选项 +# --with_log=OFF 关闭LOG信息输出 +``` + +### 编译结果说明 + +x86编译结果位于 `build.lite.x86/inference_lite_lib` +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件 `test_model_bin` + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- `include` : 头文件 +- `lib` : 库文件 + - 静态库文件: + - `libpaddle_api_full_bundled.a` :full_api 静态库 + - `libpaddle_api_light_bundled.a` :light_api 静态库 + - 动态库文件: + - `libpaddle_full_api_shared.so` :full_api 动态库 + - `libpaddle_light_api_shared.so`:light_api 动态库 + +3、 `third_party` 文件夹:依赖的第三方预测库mklml + +- mklml : Paddle-Lite预测库依赖的mklml数学库 + +4、 `demo/cxx`文件夹:x86预测库的C++ 示例demo + +- `mobilenetv1_full` :使用full_api 执行mobilenet_v1预测的C++ demo +- `mobilenetv1_light` :使用light_api 执行mobilenet_v1预测的C++ demo + + + + +### x86预测API使用示例 + +1、`mobilenetv1_full`目录结构 + +```bash +mobilenetv1_full/ +|-- CMakeLists.txt +|-- build.sh +`-- mobilenet_full_api.cc +``` + +本demo使用cmake构建`CMakeLists.txt`为cmake脚本,`mobilenet_full_api.cc`是x86示例的源代码、`build.sh`为编译的脚本。 + +2、demo使用方法 + +``` bash +# 1、编译 +cd mobilenetv1_full +sh build.sh +``` +编译结果为当前目录下的 `mobilenet_full_api ` +``` bash +# 2、执行预测 +./mobilenet_full_api ./mobilenet_v1 +``` +下载并解压模型[`mobilenet_v1`](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz)到当前目录,执行以上命令进行预测。 + +```bash +# 3、执行demo后输出结果如下,全一输入下mobilenet_v1的预测结果 +Output shape 1000 +Output[0]: 0.000191312 +Output[100]: 0.000159713 +Output[200]: 0.000264313 +Output[300]: 0.000210793 +Output[400]: 0.00103236 +Output[500]: 0.000110071 +Output[600]: 0.00482924 +Output[700]: 0.00184533 +Output[800]: 0.000202116 +Output[900]: 0.000585591 +``` + + + +3、示例源码`mobilenet_full_api.cc` + +```c++ +#include +#include +#include "paddle_api.h" + + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +void RunModel(std::string model_dir) { + // 1. Create CxxConfig + CxxConfig config; + config.set_model_dir(model_dir); + config.set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)} + }); + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 3, 224, 224}); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + + // 4. Run predictor + predictor->Run(); + + // 5. Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); + return 0; +} + +``` + +## 二、Windows环境 + +### 环境准备 + +#### 编译环境需求 + +- Windows 10 专业版 + - 目前Windows暂不支持GPU模式 +- *Python 版本 2.7/3.5.1+/3.6/3.7 (64 bit)* +- *pip 或 pip3 版本 9.0.1+ (64 bit)* +- *Visual Studio 2015 Update3* + +#### 安装步骤 + +1. cmake 需要3.15版本, 可在官网[下载](https://cmake.org/download/),并添加到环境变量中。 + +2. python 需要2.7 及以上版本, 可在官网[下载](https://www.python.org/download/releases/2.7/)。 + +3. 
git可以在官网[下载](https://gitforwindows.org/),并添加到环境变量中 + +### 编译 + +1、 下载代码 +```bash +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +# 切换到release分支 +git checkout release/v2.3 +``` +2、 源码编译 + +```bash +cd Paddle-Lite +lite/tools/build_windows.bat with_extra with_python with_profile +``` +编译脚本`lite/tools/build.bat`,追加参数说明: + +| 参数 | 介绍 | 值 | +|-----------|-------------|-------------| +| with_extra | 可选,是否编译全量预测库(默认为OFF)。详情可参考[预测库说明](./library.html)。 | `ON`、`OFF` | +| with_python | 可选,是否编译python预测库(默认为OFF) 。 | `ON`、`OFF` | +| with_profile | 可选,是否支持分析器模式(默认为OFF) 。 | `ON`、`OFF` | + +### 编译结果 + +x86编译结果位于 `build.lite.x86/inference_lite_lib` +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件 `test_model_bin` + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- `include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.lib` :full_api 静态库 + - `libpaddle_api_light_bundled.lib` :light_api 静态库 + +3、 `third_party` 文件夹:第三方库文件 + +### x86预测API使用示例 + +1、我们提供Windows环境下x86 API运行mobilenet_v1的示例:[mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip)。下载解压后内容如下>: + +![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png) + +`mobilenet_v1`为模型文件、`lib`和`include`分别是Paddle-Lite的预测库和头文件、`third_party`下是编译时依赖的第三方库`mklml`、`mobilenet_full_api.cc`是x86示例的源代码、`build.bat`为编译的脚本。 + +2、demo内容与使用方法 + +``` bash +# 1、编译(需在vs2015的命令窗口执行该脚本) +build.bat +``` +编译结果为当前目录下的 `Release\\mobilenet_full_api.exe` +``` bash +# 2、执行预测 +Release\\mobilenet_full_api.exe ..\mobilenet_v1 +``` +`mobilenet_v1`为模型路径,`mobilenet_full_api.exe`为第一步编译出的可执行文件。 diff --git a/docs/advanced_user_guides/add_layout.md b/docs/develop_guides/add_layout.md similarity index 99% rename from docs/advanced_user_guides/add_layout.md rename to docs/develop_guides/add_layout.md index 11e504f93c2b1bcaefaa06c0a5f51aea0995884e..26b7a07cc5788ee6e7fa36206c2432f5fc3def1c 100644 --- a/docs/advanced_user_guides/add_layout.md +++ b/docs/develop_guides/add_layout.md @@ -1,4 +1,4 @@ -# 如何增加Layout +# 新增Layout Paddle-Lite中Place包含了Target、Layout、Precision信息,用来注册和选择模型中的具体Kernel。下面以增加Place中的layout:`ImageDefault`、`ImageFolder`、`ImageNW`为例,讲解如何增加新Layout。 diff --git a/docs/advanced_user_guides/add_new_pass.md b/docs/develop_guides/add_new_pass.md similarity index 99% rename from docs/advanced_user_guides/add_new_pass.md rename to docs/develop_guides/add_new_pass.md index 93b27cd038642c702cd213adffcc378dc852a1b3..5740b7978f18cfad5754c0f77a8208bece565893 100644 --- a/docs/advanced_user_guides/add_new_pass.md +++ b/docs/develop_guides/add_new_pass.md @@ -1,5 +1,4 @@ - -# 新增Pass方法 +# 新增Pass 本文从三个方面介绍了`Lite`中的`Pass`结构:**Pass是什么**、**Pass的实现与接口**、**Pass的一般注册流程**。最后以`Fc_fuse_pass`为例介绍了`fusion_pass`的作用与注册方法。 diff --git a/docs/advanced_user_guides/add_operation.md b/docs/develop_guides/add_operation.md similarity index 99% rename from docs/advanced_user_guides/add_operation.md rename to docs/develop_guides/add_operation.md index 525832f8a9d7341c3124498084e05b160358b2ad..1aa955fa6a1b260fd3a17401e658e33b2b862fd9 100644 --- a/docs/advanced_user_guides/add_operation.md +++ b/docs/develop_guides/add_operation.md @@ -1,4 +1,4 @@ -# 新增OP的方法 +# 新增OP 以下以添加argmax为例,详细说明新增op的方法。 diff --git a/docs/develop_guides/architecture-intro.md b/docs/develop_guides/architecture-intro.md new file mode 100644 index 0000000000000000000000000000000000000000..f49f0525e122de9da19bacb441dfa84ab0eef7ca --- /dev/null +++ b/docs/develop_guides/architecture-intro.md @@ -0,0 +1,245 @@ +# 架构详解 + +这篇文档会从开发者角度详细介绍开发 Paddle-Lite 需要的相关信息。 + +## 设计及思考 + 
+近年来,各种深度学习预估硬件称出不穷,从手机APP到车载设备,再到音箱,均需要部署深度学习预测,且有如下共性需求: + +1. 高性能 +2. 硬件支持和扩展容易 +3. 轻量级部署 + +Paddle-Lite 的架构方面便是定向参考如上需求设计实现的,具体地 + +- 高性能方面 + - 通过 MIR(Machine IR) 实现精细复杂的计算图的分析和优化 + - 执行期 Kernel 的简单设计,几乎没有额外调度开销 + - 适当的硬件层抽象,框架支持各个硬件后端中做特定的调度实现 +- 轻量级部署方面 + - 拆分分析和执行两个阶段,执行阶段轻量级实现,可以单独部署 + - 轻量级 Op 和 Kernel 设计 +- 硬件支持和扩展方面 + - 通过 MIR 支撑带硬件和执行信息的宏观分析优化 + - TypeSystem 抽象带硬件的不同计算模式的表示,实现整个计算图的强类型推导,以及执行状态机的静态分析 + +Paddle-Lite 的架构尝试从强类型推导的角度建模支持多硬件,多种计算模式(不同量化精度、不同的 data layout等)的混合计算,从而实现宏观上的各异硬件和计算模式的混合。 + +框架部分已经经过 FPGA,GPU,NPU 等异构硬件的打磨,各项能力也在完善中。 + +## 重要模块介绍 + +### OpLite + +[OpLite](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/op_lite.h#L52) 是 Paddle-Lite 中的 Operator,用户扩展单个硬件时,最多的就是扩展 Op 和 Kernel。 + +重要方法如下: + +```c++ +class OpLite : public Registry { + public: + // Check the shape. + virtual bool CheckShape() const { return true; } + // Inference the outputs' shape. + virtual bool InferShape() const { return true; } + // Link the external execution environ to internal context. + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope); +}; +``` + +其中,分析期执行 + +- `AttachImpl` + +执行期执行 + +- `CheckShape` +- `InferShape` + +扩展须知: + +1. `CheckShape` 只在第一个 batch 执行,所以耗时不敏感 + +2. `InferShape` 需要在每个 batch 执行,应该严格耗时 + + 1. 可以通过添加 member variable 的方式,对其中一部分信息增加 cache,比如 + + ```c++ + class XXOp : public OpLite { + void InferShape() { + int batch_size = param().input.shape[0]; + if (!shape_cache_.empty()) { + shape_cache_[0] = batch_size; + param().output->Resize(shape_cache_); + } + } + + private: + shape_t shape_cache_; + } + ``` + + + +### OpParam + +[OpParam](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/operators/op_params.h) 用于存储执行期 Kernel 需要的各项参数。 所有字段可以直接存储(比如指针或者 `int`),以避免执行中获取参数的延迟。 + +因为没有需求,OpParam 暂时没有设置基类。 + +实际例子: + +```c++ +// For Softmax op +struct SoftmaxParam { + lite::Tensor* x{}; + lite::Tensor* output{}; + int axis{-1}; +}; +``` + +OpLite 的 `AttachImpl` 方法就用于构建 `OpParam` ,复制传递给 `Kernel` 用于执行。 + +OpParam 是执行期的重要模块,需要严格保证性能,相应的扩展要求: + +1. 字段的获取必须是低延迟的,可以直接用指针,或者直接复制值 +2. 避免执行无关信息混入,包括 debug 信息 +3. 命名需要与 Paddle OpDesc 中的信息严格一致,以降低功能对齐和理解的难度 + +### Kernel + +```c++ +template +class KernelLite : public KernelBase { + public: + // Run the kernel. 
+ virtual void Run() { CHECK(false) << "Not Implemented"; } + + TargetType target() const override { return Target; } + PrecisionType precision() const override { return Precision; } + DataLayoutType layout() const override { return DataLayout; } + Place place() const override { return Place{Target, Precision, DataLayout}; } + std::string name() const override; +}; +``` + +由于是执行期的重要概念,因此 Kernel 设计地非常简单高效。 + +其中,执行期的 `Run` 是其唯一重要的接口,其中包含具体的计算逻辑。 + +模板中的参数主要用于方便多硬件编译,以及自解释: + +- Target: 执行硬件 +- Precision: 主要的计算精度 +- DataLayout:主要计算的 data layout + +这部分信息用于帮助挑选 kernel,具体的值并不严格。 + + + +Kernel 的注册需要用到 TypeSystem,不光对 Kernel 本身的特性进行描述,对其输入和输出均进行详尽的定义。 + +例如 FullyConnected 的注册 + +```c++ +REGISTER_LITE_KERNEL( + fc, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FcCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +``` + +Kernel自身定义是 `kARM` 的,也就是ARM上的kernel,主要的计算精度是 `kFloat`,主要的 Data layout 是 `kNCHW`。 + +接着会对其所有的输入和输出做详细定义,比如看 `Input` 输入的定义是 `LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))`,也就是声明其 Target 是 `kARM`, PRECISION 是 `kFloat`,Data Layout 是 `kNCHW`。 + +这里的设计思想是类似C++中的函数重载,同一个 Kernel(的名字),在重载了其输入输出的类型之后可以是不同的kernel。 + +#### 扩展须知 + +1. 模板参数选用计算中主要的来表示 + 1. 比如,scale kernel,同时能接受 `float` 和 `int` 的输入,但其不算量化 kernel,那应该设置为 `Precision=float`,代表常规的计算精度中使用 +2. Kernel 输入输出的定义需要足够精确,是什么类型就是什么类型;框架会根据其输入输出的定义来动态构建状态机,否则会出现分析期和执行期的状态机不一致,造成未定义行为 + +### MIR + +MIR 类似于 LLVM 里的 IR,只是加上了硬件和执行期的信息参与分析优化。 + +Pass 是MIR中的模块化策略,其输入和输出都是 SSA Graph. + +框架会自动基于模型的Program 构建 SSA Graph,之后按 [Optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/optimizer.h) 中定义的pass的顺序调用一系列 Pass。 + +#### Op Fusion + +MIR 中的 [PatternMacher](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/pattern_matcher.h) 实现了简单有效的基于图的模板识别的算法,相关的 op fusion 的图操作可以基于此实现。 + +实际的例子可以参考 [fc_fuse_pass.h](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/fusion/fc_fuse_pass.h)。 + +### TypeSystem + +TypeSystem 是 Paddle-Lite 中构建复杂计算图的基础模块,核心思想是协助 SSA Graph 构建一个状态机,表示其中不同的状态。 + +这里的 Type 主要包含下面四组信息,更多的信息可以按需扩展: + +- TargetType +- Precision +- DataLayout +- device id,用于表示卡号 + + + +状态机的表示: + +```python +Tensor0(kARM, kFloat, kNCHW) --pass--> Tensor1(kOpenCL, kFloat, kNCHW) +``` + +MIR 会识别出,Tensor0 和 Tensor1 的硬件位置不同,因此触发相依的 Pass 插入对应的 cast op 来进行 type cast,比如 + +``` +Tensor0(kARM, kFloat, kNCHW) --pass-> IoCopyOp(kARM, kOpenCL) --pass-> Tensor1(kOpenCL, kFloat, kNCHW) +``` + +### KernelContext + +KernelContext 是硬件支持的核心封装,主要用于为 Kernel 提供执行期的硬件上下文。 + +KernelContext 的设计类似于 OpParam,两者均没有基类;对于 KernelContext,其假定是,不同的硬件间的接口和逻辑可能完全不同,比如 kARM 和 kCUDA,因此不设定基类,也不需要提供统一的接口来封装不同硬件行为。 + +不同硬件的 KernelContext 直接与该硬件对应的 Kernel 对接。 + +KernelContext 的行为可以被 MIR 在分析期确定和调度。 + +注意事项: + +1. 由于是执行期概念,KernelContext 也需要注意性能和轻量化 +2. 
移动端部署时只会部署执行期,因此 MIR 和 KernelContext 会拆开,因此 KernelContext 相应的设置需要能够序列化到 ProgramDesc 中,以便执行期载入和执行 + +## 扩展硬件后端 + +### 扩展现有的硬件后端 + +主要是扩充 Op 和 Kernel 的工作,如果需要 fuse,则参考 MIR 章节,增加相应的fuse pass便可,具体地,可以参考 + +- [fc_op](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/operators/fc_op.h) 实现类似的 Op +- [fc_compute](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/kernels/arm/fc_compute.h) 实现类似的 Kernel +- [fc_fuse_pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/fusion/fc_fuse_pass.h) 实现fuse逻辑,并注册到 [optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/optimizer.h) + +### 扩展全新硬件后端 + +需要额外扩充如下模块,让框架能够支撑硬件执行: + +- TypeSystem,需要扩充其中相关的 type + - 相关 [enum](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/api/paddle_place.h#L44) +- MIR,需要扩展其中的 type cast 相关的 pass + - [TargetType cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.cc) 用于拷贝不同硬件上的tensor + - [Data layout cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.h) 用于转化不同的 data layout + - [Precision cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_precision_cast_pass.h) 用于转化不同 tensor 的量化精度 +- KernelContext,具体地可以参考 + - [ARM context](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/context.h#L91) + - 需要注意的是,硬件 context 的接口只服务于该硬件的 kernel + - context 有分析期和执行期两个阶段,如果分析期没有特殊的优化,则无需考虑;否则,需要注意将分析期的信息整理并序列化到离线模型中,用于执行期直接加载。 diff --git a/docs/develop_guides/for-developer.md b/docs/develop_guides/for-developer.md new file mode 100644 index 0000000000000000000000000000000000000000..fc7bd412ee5091552c7244a621f9e298496973a4 --- /dev/null +++ b/docs/develop_guides/for-developer.md @@ -0,0 +1,14 @@ +# 开发基础须知 + +可以参考 [Paddle 开发者文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/local_dev_guide.html)。 + +## 提交PR + +需要在 commit message 里加上 `test=develop` 才能触发 CI + +## 版本发布检查清单 + +1. 所有 feature 梳理,确认状态 +2. 所有 QA 测试结果梳理,确认版本可靠 +3. Release note 确认 review 通过 +4. 确认需要 release 的 binary 编译完毕 diff --git a/docs/develop_guides/index.rst b/docs/develop_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/index.rst b/docs/index.rst index 9f9a2be8c9a34901cabc9f69d21de4fa57cc9057..120af007df4232cfad5c0ff8b61b3aa90458555c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,10 +14,11 @@ Welcome to Paddle-Lite's documentation! introduction/tech_highlights introduction/architecture introduction/support_hardware + introduction/support_operation_list .. toctree:: :maxdepth: 1 - :caption: Benchmark数据和方法 + :caption: Benchmark :name: sec-benchmark benchmark/benchmark @@ -25,51 +26,70 @@ Welcome to Paddle-Lite's documentation! .. toctree:: :maxdepth: 1 - :caption: 安装 - :name: sec-install - - installation/source_compile - -.. 
toctree:: - :maxdepth: 1 - :caption: 使用指南 + :caption: 使用方法 :name: sec-user-guides + user_guides/tutorial + user_guides/release_lib + user_guides/source_compile + user_guides/x2paddle user_guides/model_optimize_tool + user_guides/post_quant_with_data + user_guides/post_quant_no_data + user_guides/model_quantization + user_guides/debug user_guides/library_tailoring - user_guides/cuda - user_guides/fpga - user_guides/opencl - user_guides/cpp_demo - user_guides/java_demo .. toctree:: :maxdepth: 1 - :caption: 进阶使用指南 - - advanced_user_guides/support_operation_list - advanced_user_guides/add_operation - advanced_user_guides/add_layout - advanced_user_guides/model_quantization - advanced_user_guides/add_new_pass - advanced_user_guides/npu - advanced_user_guides/x86 - advanced_user_guides/cv + :caption: 部署示例 + :name: sec-demo_guides + + demo_guides/cpp_demo + demo_guides/java_demo + demo_guides/android_app_demo + demo_guides/ios_app_demo + demo_guides/x86 + demo_guides/cuda + demo_guides/opencl + demo_guides/fpga + demo_guides/npu + demo_guides/baidu_xpu + demo_guides/rockchip_npu + demo_guides/mediatek_apu .. toctree:: :maxdepth: 1 - :caption: 开发者文档 + :caption: API文档 + + api_reference/cxx_api_doc + api_reference/java_api_doc + api_reference/python_api_doc + api_reference/cv + +.. toctree:: + :maxdepth: 1 + :caption: 开发者贡献 + + develop_guides/for-developer + develop_guides/architecture-intro + develop_guides/add_operation + develop_guides/add_layout + develop_guides/add_new_pass .. toctree:: :maxdepth: 1 - :caption: API文档 + :caption: Roadmap + :name: sec-roadmap - api_reference/cxx_api_doc + introduction/roadmap .. toctree:: :maxdepth: 1 :caption: FAQ + introduction/faq + .. toctree:: :maxdepth: 1 :caption: paddle-mobile diff --git a/docs/installation/library.md b/docs/installation/library.md deleted file mode 100644 index ef2f8fdb18ade439d620b348738cbb752d5bd8b6..0000000000000000000000000000000000000000 --- a/docs/installation/library.md +++ /dev/null @@ -1,61 +0,0 @@ - -# 预测库说明 - -Paddle-Lite的编译结果为预测库文件(包括静态库和动态库),具体编译过程参考[源码编译](./source_compile)。 - -Lite预测库分为**基础预测库**和**全量预测库**:基础预测库只打包了基础模型需要的基础算子,预测库体积较小;全量预测库打包了所有的Lite算子,可以支持更多的模型,但是预测库的体积也更大。 编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译基础预测库,`--build_extra=ON`时编译全量的预测库。 - -## 基础预测库 - -### 编译方法 -编译时设置`--build_extra=OFF` (默认值) 或不指定即可编译出基础预测库。例如: - -``` -./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish -``` - -### 基础预测库支持的功能 - -(1)支持基础CV模型 - -(2)支持基础的in8量化模型 - -(3)支持[benchmark测试](../benchmark/benchmark) - - -### 基础预测库支持的基础模型: - -1. fluid基础模型(paddle model 提供的基础模型9个) - -``` -mobileNetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 -mobileNetV2 resnet50 unet squeezenet_v11 -``` - -2. int8量化模型模型 - -``` -mobilenet_v1 mobilenet_v2 resnet50 -``` - -### 特点 - 轻量级预测库,体积更小,支持常用的基础模型。 - - - -## 全量预测库 - -### 编译方法 -编译时设置`--build_extra=ON` 即可编译出全量预测库。例如: - -``` -./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish -``` -### 全量预测库功能 - -(1) 基础预测库所有功能 - -(2)支持所有Paddle-Lite中注册的所有算子 - -### 特点 - 支持更多的硬件平台和算子,可以支持更多模型但体量更大。 diff --git a/docs/introduction/faq.md b/docs/introduction/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..768b92a31b42934d454bfa3afbee6f8dba1ef462 --- /dev/null +++ b/docs/introduction/faq.md @@ -0,0 +1,8 @@ +# FAQ 常见问题 + +问题或建议可以发Issue,为加快问题解决效率,可先检索是否有类似问题,我们也会及时解答! +欢迎加入Paddle-Lite百度官方QQ群:696965088 + +1. 
在Host端采用交叉编译方式编译PaddleLite,将编译后的libpaddle_light_api_shared.so和可执行程序放到板卡上运行,出现了如下图所示的错误,怎么解决? +![host_target_compiling_env_miss_matched](https://user-images.githubusercontent.com/9973393/75761527-31b8b700-5d74-11ea-8a9a-0bc0253ee003.png) +- 原因是Host端的交叉编译环境与Target端板卡的运行环境不一致,导致libpaddle_light_api_shared.so链接的GLIBC库高于板卡环境的GLIBC库。目前有四种解决办法(为了保证编译环境与官方一致,推荐第一种方式):1)在Host端,参考[源码编译](../user_guides/source_compile)中的Docker方式重新编译libpaddle_light_api_shared.so;2)在Host端,使用与Target端版本一致的ARM GCC和GLIBC库重新编译libpaddle_light_api_shared.so;3)在Target端板卡上,参考[源码编译](../user_guides/source_compile)中的ARM Linux本地编译方式重新编译libpaddle_light_api_shared.so;4)在Target端板卡上,将GLIBC库升级到和Host端一致的版本,即GLIBC2.27。 diff --git a/docs/introduction/roadmap.md b/docs/introduction/roadmap.md new file mode 100644 index 0000000000000000000000000000000000000000..0c5b5366041ff4cf406fe5d9d67833925c7795f8 --- /dev/null +++ b/docs/introduction/roadmap.md @@ -0,0 +1,32 @@ +# Road map + +这篇文档会介绍 Paddle-Lite 近期对外的开源版本和计划。 + +其中包含的 feature 为最小集合,按最终发布的版本为准。 + + +## 2.0.0-beta1-prerelease + +预计发布 *2019-8-26 ~ 2days* + +- 完善编译和 benchmark 文档 +- 增加第三方依赖代码的离线下载功能,加速编译过程 +- 去掉 `tiny_publish` 模式下无关的第三方代码下载,可以不依赖任何第三方 + +## 2.0.0-beta1 + +预计发布 *2019-9-1~2days* + +- `model_optimize_tool` 从 ARM 上执行修改为 Host 上执行,只从 kernel 分布来确定计算图优化;后续硬件针对优化会发布新的工具; +- Paddle 模型支持参数 composed 的格式 +- 增加分层编译来控制常用模型的部署库的大小,分两个模式 `basic`, `extra`;默认 `basic` 模式只发布核心的op 和kernel;将控制流相关的Op和kernel 折叠进 `extra` 按需编译 +- 增加 INT8 量化,从 PaddleSlim 训练到 PaddleLite 部署完整案例 +- 支持内存中加载模型,以支持 APP 的简易加密 + +## 2.3 + +[v2.3 project](https://github.com/PaddlePaddle/Paddle-Lite/milestone/3?closed=1) + +## 2.6 + +[v2.6 project](https://github.com/PaddlePaddle/Paddle-Lite/milestones/v2.6) diff --git a/docs/introduction/support_hardware.md b/docs/introduction/support_hardware.md index b4f76577bc9a0b80b188aedfc2c5cf33f786033a..b1a6823d26d4fe8838afee00732707608b836599 100644 --- a/docs/introduction/support_hardware.md +++ b/docs/introduction/support_hardware.md @@ -1,5 +1,5 @@ -# 支持硬件列表 +# 支持硬件 ## ARM CPU @@ -30,3 +30,16 @@ Paddle Lite支持移动端GPU和Nvidia端上GPU设备,支持列表如下: - ARM Mali G 系列 - Qualcomm Adreno 系列 - Nvida tegra系列: tx1, tx2, nano, xavier + +## NPU +Paddle Lite支持NPU,支持列表如下: +- 华为达芬奇架构NPU + +## FPGA +Paddle Lite支持FPGA,支持列表如下: +- 百度Edgeboard系列:ZU9, ZU5, ZU3 + +## XPU +Paddle Lite支持XPU,支持列表如下: +- 百度昆仑818-100芯片 +- 百度昆仑818-300芯片 diff --git a/docs/advanced_user_guides/support_operation_list.md b/docs/introduction/support_operation_list.md similarity index 96% rename from docs/advanced_user_guides/support_operation_list.md rename to docs/introduction/support_operation_list.md index c0acb02b9d7fb71f8abf79a651e07f2d78c1d2c1..7a60cf46e424dfe610a0541c9e364cf6e5d98531 100644 --- a/docs/advanced_user_guides/support_operation_list.md +++ b/docs/introduction/support_operation_list.md @@ -1,40 +1,26 @@ -# 支持OP列表 +# 支持OP -## Ops +## Ops (共计158个算子) +### Basic Operators (默认编译的算子) - affine_channel -- anchor_generator - arg_max -- assign -- assign_value -- attention_padding_mask -- axpy - batch_norm -- beam_search -- beam_search_decode - bilinear_interp -- box_clip - box_coder - calib -- calib_once - cast -- collect_fpn_proposals - concat -- conditional_block - conv2d - conv2d_transpose -- crop -- decode_bboxes - density_prior_box - depthwise_conv2d -- distribute_fpn_proposals - dropout - elementwise_add - elementwise_div - elementwise_max - elementwise_mul - elementwise_sub -- equal - exp - expand - fake_channel_wise_dequantize_max_abs @@ -56,29 +42,87 @@ - fusion_elementwise_max_activation - fusion_elementwise_mul_activation - 
fusion_elementwise_sub_activation -- gather - gelu +- grid_sampler +- hard_sigmoid +- instance_norm +- io_copy +- io_copy_once +- layout +- leaky_relu +- log +- matmul +- mean +- mul +- multiclass_nms +- nearest_interp +- pad2d +- pool2d +- prelu +- prior_box +- range +- reduce_mean +- relu +- relu6 +- relu_clipped +- reshape +- reshape2 +- rsqrt +- scale +- search_fc +- sequence_topk_avg_pooling +- shuffle_channel +- sigmoid +- slice +- softmax +- softsign +- split +- sqrt +- square +- squeeze +- squeeze2 +- stack +- subgraph +- swish +- tanh +- transpose +- transpose2 +- unsqueeze +- unsqueeze2 +- yolo_box + +### Extra Operators (打开 `--build_extra=ON`开关才会编译) + +- anchor_generator +- assign +- assign_value +- attention_padding_mask +- axpy +- beam_search +- beam_search_decode +- box_clip +- calib_once +- collect_fpn_proposals +- conditional_block +- crop +- decode_bboxes +- distribute_fpn_proposals +- equal +- gather - generate_proposals +- graph_op - greater_equal - greater_than -- grid_sampler - gru - gru_unit -- hard_sigmoid - im2sequence - increment -- instance_norm -- io_copy -- io_copy_once - is_empty - layer_norm -- layout - layout_once -- leaky_relu - less_equal - less_than - lod_reset -- log - logical_and - logical_not - logical_or @@ -87,37 +131,18 @@ - lookup_table_v2 - lrn - match_matrix_tensor -- matmul -- mean - merge_lod_tensor -- mul -- multiclass_nms -- nearest_interp - negative - norm - not_equal -- pad2d -- pool2d - power -- prelu -- prior_box -- range - read_from_array - reduce_max -- reduce_mean - reduce_prod - reduce_sum -- relu -- relu6 -- relu_clipped -- reshape -- reshape2 - roi_align -- rsqrt -- scale - search_aligned_mat_mul - search_attention_padding_mask -- search_fc - search_grnn - search_group_padding - search_seq_arithmetic @@ -129,37 +154,18 @@ - sequence_expand - sequence_expand_as - sequence_pool -- sequence_pool_concat - sequence_reshape - sequence_reverse - sequence_softmax -- sequence_topk_avg_pooling - shape -- shuffle_channel -- sigmoid -- slice -- softmax -- softsign -- split - split_lod_tensor -- sqrt -- square -- squeeze -- squeeze2 -- stack -- subgraph -- swish -- tanh - top_k -- transpose -- transpose2 - uniform_random -- unsqueeze -- unsqueeze2 - var_conv_2d - while - write_to_array -- yolo_box + + ## Kernels @@ -220,7 +226,6 @@ - generate_proposals - greater_equal - greater_than -- grid_sampler - gru - gru_unit - hard_sigmoid @@ -306,9 +311,6 @@ - gelu - gru - layer_norm -- leaky_relu -- lookup_table -- lookup_table_v2 - match_matrix_tensor - matmul - mul @@ -386,11 +388,9 @@ - yolo_box ### OpenCL kernels -- concat - conv2d - depthwise_conv2d - elementwise_add -- elementwise_mul - fc - fusion_elementwise_add_activation - layout @@ -398,10 +398,5 @@ - io_copy - io_copy_once - mul -- nearest_interp - pool2d - relu -- reshape -- reshape2 -- scale -- sigmoid diff --git a/docs/user_guides/Compile/Android.md b/docs/user_guides/Compile/Android.md new file mode 100644 index 0000000000000000000000000000000000000000..5ff0525f2eec8ef5fe6e49835b6a92447799b46c --- /dev/null +++ b/docs/user_guides/Compile/Android.md @@ -0,0 +1,106 @@ + +# 编译Android预测库 + +**注意:本编译方法只适用于release/v2.6.0之后版本(包括 v2.6.0)** + +安装了Android的编译环境,可以下载并编译 Paddle-Lite源码 + +```shell +# 1. 下载Paddle-Lite源码 并切换到release分支 +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite && git checkout release/v2.3 + +# 2. 
编译Paddle-Lite Android预测库 (armv8, gcc编译, 静态链接ndk stl) +./lite/tools/build_android.sh +``` + + + +### 编译结果 + +位于`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8`: + +```shell +inference_lite_lib.android.armv8/ +|-- cxx C++ 预测库和头文件 +| |-- include C++ 头文件 +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib C++预测库 +| |-- libpaddle_api_light_bundled.a C++静态库 +| `-- libpaddle_light_api_shared.so C++动态库 +|-- java Java预测库 +| |-- jar +| | `-- PaddlePredictor.jar +| |-- so +| | `-- libpaddle_lite_jni.so +| `-- src +|-- demo C++和Java示例代码 +| |-- cxx C++ 预测库demo +| `-- java Java 预测库demo +``` + + + +### 编译命令 + +- 默认编译方法: (armv8, gcc, c++_static) +``` shell +./lite/tools/build_android.sh +``` + +- 打印 help 信息: + +```shell +./lite/tools/build_android.sh help +``` + +- 其他可选编译命令: + +```shell +--arch: (armv8|armv7) arm版本,默认为armv8 +--toolchain: (gcc|clang) 编译器类型,默认为gcc +--android_stl: (c++_static|c++_shared) NDK stl库链接方法,默认为静态链接c++_static +--with_java: (OFF|ON) 是否编译Java预测库, 默认为 ON +--with_cv: (OFF|ON) 是否编译CV相关预处理库, 默认为 OFF +--with_log: (OFF|ON) 是否输出日志信息, 默认为 ON +--with_extra: (OFF|ON) 是否编译OCR或NLP相关模型的kernel&OP,默认为OFF,只编译CV模型相关kernel&OP +``` + +- 裁剪预测库方法(只编译模型中的kernel&OP,降低预测库体积): + +```shell +./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir +``` +```shell +--with_strip: (OFF|ON); 是否根据输入模型裁剪预测库,默认为OFF +--opt_model_dir: 输入模型的绝对路径,需要为opt转化之后的模型 +``` +详情请参考: [裁剪预测库](https://paddle-lite.readthedocs.io/zh/latest/user_guides/library_tailoring.html) + + +- 编译 Android npu 预测库方法: + +```shell +./lite/tools/build_android.sh --with_huawei_kirin_npu=ON --huawei_kirin_npu_sdk_root=YourNpuSdkPath +``` +```shell +--with_huawei_kirin_npu: (OFF|ON); 是否编译编译huawei_kirin_npu 的预测库,默认为OFF +--huawei_kirin_npu_sdk_root: `huawei HiAi DDK`文件的绝对路径,可从下面网址下载: +https://developer.huawei.com/consumer/cn/hiai/ +``` +详情请参考:[PaddleLite使用NPU(华为)预测部署](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/npu.html) + +- 编译Android opencl 预测库方法:(armv8, gcc, c++_static) + +```shell +./lite/tools/build_android.sh --with_opencl=ON +``` +```shell +--with_opencl: (OFF|ON); 是否编译opencl预测库, 默认为 OFF +``` diff --git a/docs/user_guides/Compile/Linux.md b/docs/user_guides/Compile/Linux.md new file mode 100644 index 0000000000000000000000000000000000000000..01f2341c5c73e5d4a90a48f1cba3fc16b84d4f7e --- /dev/null +++ b/docs/user_guides/Compile/Linux.md @@ -0,0 +1,101 @@ + +# 编译Linux预测库 + +**注意:本编译方法只适用于release/v2.6.0之后版本(包括 v2.6.0)** +**注意:本编译方法暂时只适用于ARM的设备** + +安装了ArmLinux的编译环境,可以下载并编译 Paddle-Lite源码 + +```shell +# 1. 下载Paddle-Lite源码 并切换到release分支 +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite && git checkout release/v2.6 + +# 2. 
编译Paddle-Lite Android预测库 (armv8, gcc编译) +./lite/tools/build_linux.sh +``` + + +### 编译结果 + +位于 `Paddle-Lite/build.lite.linux.armv8.gcc/inference_lite_lib.armlinux.armv8` : + +```shell +inference_lite_lib.armlinux.armv8/ +|-- cxx C++ 预测库和头文件 +| |-- include C++ 头文件 +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib C++预测库 +| |-- libpaddle_api_light_bundled.a C++静态库 +| `-- libpaddle_light_api_shared.so C++动态库 +| +|-- demo +| `-- python python预测库demo +| +|-- python Python预测库(需要打开with_python选项) +| |-- install +| | `-- dist +| | `-- paddlelite-*.whl python whl包 +| |-- lib +| `-- lite.so python预测库 +``` + + +### 编译命令 + +- 默认编译方法: (armv8, gcc) +```shell +./lite/tools/build_linux.sh +``` + +- 打印 help 信息: + +```shell +./lite/tools/build_linux.sh help +``` + +- 其他可选编译命令: + +```shell +--arch: (armv8|armv7|armv7hf) arm版本,默认为armv8 +--toolchain: (gcc|clang) 编译器类型,默认为gcc +--with_extra: (OFF|ON) 是否编译OCR或NLP相关模型的kernel&OP,默认为OFF,只编译CV模型相关kernel&OP +--with_python: (OFF|ON) 是否编译python预测库, 默认为 OFF +--with_cv: (OFF|ON) 是否编译CV相关预处理库, 默认为 OFF +--with_log: (OFF|ON) 是否输出日志信息, 默认为 ON +``` +**注意:with_python现在仅支持armlinux的本地编译,尚不支持docker环境和ubuntu环境** + +- 裁剪预测库方法(只编译模型中的kernel&OP,降低预测库体积): + +```shell +./lite/tools/build_linux.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir +``` +```shell +--with_strip: (OFF|ON); 是否根据输入模型裁剪预测库,默认为OFF +--opt_model_dir: 输入模型的绝对路径,需要为opt转化之后的模型 +``` +详情请参考: [裁剪预测库](https://paddle-lite.readthedocs.io/zh/latest/user_guides/library_tailoring.html) + + +- 使用 rockchip npu 方法: + +```shell +--with_rockchip_npu: (OFF|ON); 是否编译编译 huawei_kirin_npu 的预测库,默认为OFF +--rockchip_npu_sdk_root: `rockchip_npu DDK`文件的绝对路径 +``` +详情请参考:[PaddleLite使用RK NPU预测部署](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html) + +- 使用 baidu xpu 方法: + +```shell +--with_baidu_xpu: (OFF|ON); 是否编译编译 baidu_xpu 的预测库,默认为OFF +--baidu_xpu_sdk_root: `baidu_xpu DDK`文件的绝对路径 +``` +详情请参考:[PaddleLite使用百度XPU预测部署](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/baidu_xpu.html) diff --git a/docs/user_guides/Compile/iOS.md b/docs/user_guides/Compile/iOS.md new file mode 100644 index 0000000000000000000000000000000000000000..355cc11875ce8f8db891fb843d2f1624180b71ff --- /dev/null +++ b/docs/user_guides/Compile/iOS.md @@ -0,0 +1,70 @@ + +# 编译iOS预测库 + +**注意:本编译方法只适用于release/v2.6.0之后版本(包括 v2.6.0)** + +安装了iOS的编译环境,可以下载并编译 Paddle-Lite源码 + +```shell +# 1. 下载Paddle-Lite源码 并切换到release分支 +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite && git checkout release/v2.6.0 + +# 2. 
编译Paddle-Lite Android预测库 (armv8, gcc编译, 静态链接ndk stl) +./lite/tools/build_ios.sh +``` + + + +### 编译结果 + +位于`Paddle-Lite/build.ios.ios64.armv8/inference_lite_lib.ios64.armv8`: + +```shell +inference_lite_lib.ios64.armv8 iOS预测库和头文件 +|-- include C++头文件 +| |-- paddle_api.h +| |-- paddle_image_preprocess.h +| |-- paddle_lite_factory_helper.h +| |-- paddle_place.h +| |-- paddle_use_kernels.h +| |-- paddle_use_ops.h +| `-- paddle_use_passes.h +`-- lib C++预测库(静态库) + `-- libpaddle_api_light_bundled.a +``` + + + +### 编译命令 + +- 默认编译方法: (armv8) +``` shell +./lite/tools/build_ios.sh +``` + +- 打印 help 信息: + +```shell +./lite/tools/build_ios.sh help +``` + +- 其他可选编译命令: + +```shell +--arch: (armv8|armv7) arm版本,默认为armv8 +--with_cv: (OFF|ON) 是否编译CV相关预处理库, 默认为 OFF +--with_log: (OFF|ON) 是否输出日志信息, 默认为 ON +--with_extra: (OFF|ON) 是否编译OCR或NLP相关模型的kernel&OP,默认为OFF,只编译CV模型相关kernel&OP +``` + +- 裁剪预测库方法(只编译模型中的kernel&OP,降低预测库体积): + +```shell +./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir +``` +```shell +--with_strip: (OFF|ON); 是否根据输入模型裁剪预测库,默认为OFF +--opt_model_dir: 输入模型的绝对路径,需要为opt转化之后的模型 +``` +详情参考: [裁剪预测库](https://paddle-lite.readthedocs.io/zh/latest/user_guides/library_tailoring.html) diff --git a/docs/user_guides/Compile/v2.3_compile.md b/docs/user_guides/Compile/v2.3_compile.md new file mode 100644 index 0000000000000000000000000000000000000000..3bd4923ddb6d51e484f8c04fc1fe0f5eb24674a4 --- /dev/null +++ b/docs/user_guides/Compile/v2.3_compile.md @@ -0,0 +1,164 @@ +# release/v2.3 源码编译 +**说明:release/v2.3 之前版本(包括v2.3版本)的源码编译请参考本文档** + +**注意:OpenCL、华为NPU、FPGA、CUDA、X86预测库、CV模块的编译,请见进阶使用指南的对应章节。** + +### 下载代码 + +```shell +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite +git checkout +``` + +### 编译模式与参数 + +编译脚本`./lite/tools/build.sh`,支持三种编译模式: + +| 编译模式 | 介绍 | 适用对象 | +|:-------:|-----|:-------:| +| tiny_publish | 编译移动端部署库,无第三方库依赖 | 用户 | +| full_publish | 编译移动端部署库,有第三方依赖如protobuf、glags等,含有可将模型转换为无需protobuf依赖的naive buffer格式的工具,供tiny_publish库使用 | 用户 | +| test | 编译指定`arm_os`、`arm_abi`下的移动端单元测试 | 框架开发者 | + +编译脚本`./lite/tools/build.sh`,追加参数说明: + +| 参数 | 介绍 | 值 | +|-----------|-------------|-------------| +| --arm_os |必选,选择安装平台 | `android`、`ios`、`ios64`、`armlinux` | +| --arm_abi |必选,选择编译的arm版本,其中`armv7hf`为ARMLinux编译时选用| `armv8`、`armv7`、`armv7hf`(仅`armlinux`支持) | +| --arm_lang |arm_os=android时必选,选择编译器 | `gcc`、`clang`(`clang`当前暂不支持) | +| --android_stl |arm_os=android时必选,选择静态链接STL或动态链接STL | `c++_static`、`c++_shared`| +| --build_java | 可选,是否编译java预测库(默认为ON) | `ON`、`OFF` | +| --build_extra | 可选,是否编译全量预测库(默认为OFF)。详情可参考[预测库说明](./library.html)。 | `ON`、`OFF` | +| target |必选,选择编译模式,`tiny_publish`为编译移动端部署库、`full_publish`为带依赖的移动端部署库、`test`为移动端单元测试、`ios`为编译ios端`tiny_publish` | `tiny_publish`、`full_publish`、`test`、 `ios` | + +### 编译代码 + +**注意**:非开发者建议在编译前使用[**“加速第三方依赖库的下载”**](#id22)的方法,加速工程中第三方依赖库的下载与编译。 + +#### 编译`tiny publish`动态库 + +##### Android +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --build_extra=OFF \ + --arm_lang=gcc \ + --android_stl=c++_static \ + tiny_publish +``` +##### IOS +```shell +./lite/tools/build.sh \ + --arm_os=ios64 \ + --arm_abi=armv8 \ + --build_extra=OFF \ + ios +``` +**注意:mac环境编译IOS 时,cmake版本需要高于cmake 3.15;mac环境上编译Android时,cmake版本需要设置为cmake 3.10。** + +ios tiny publish支持的编译选项: + +* `--arm_os`: 可选ios或者ios64 +* `--arm_abi`: 可选armv7和armv8(**注意**:当`arm_os=ios`时只能选择`arm_abi=armv7`,当`arm_os=ios64`时只能选择`arm_abi=armv8`) +* 如果mac编译过程中报错:"Invalid CMAKE_DEVELOPER_ROOT: does not exist", 运行: +```shell +sudo xcode-select -s 
/Applications/Xcode.app/Contents/Developer +``` +##### ARMLinux +```shell +./lite/tools/build.sh \ + --build_extra=OFF \ + --arm_os=armlinux \ + --arm_abi=armv7hf \ + --arm_lang=gcc \ + tiny_publish +``` +- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 + +#### 编译`full publish`动态库 + +##### Android +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --build_extra=OFF \ + --arm_lang=gcc \ + --android_stl=c++_static \ + full_publish +``` +##### ARMLinux +```shell +./lite/tools/build.sh \ + --arm_os=armlinux \ + --arm_abi=armv7hf \ + --arm_lang=gcc \ + --build_extra=OFF \ + full_publish +``` +- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 + +### 编译结果说明 + +**编译最终产物位置**在 `build.lite.xxx.xxx.xxx` 下的 `inference_lite_lib.xxx.xxx` ,如 Android 下 ARMv8 的产物位于`inference_lite_lib.android.armv8`: + +![](https://user-images.githubusercontent.com/45189361/65375706-204e8780-dccb-11e9-9816-ab4563ce0963.png) + +**目录内容**(可能)如下: + +**Full_publish编译结果:** + +![](https://user-images.githubusercontent.com/45189361/65375704-19c01000-dccb-11e9-9650-6856c7a5bf82.png) + +**Tiny_publish结果:** + +![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) + +**IOS编译结果:** + +![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) + + + +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件 `paddle_code_generator`、`test_model_bin` + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- `include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 + - `libpaddle_api_light_bundled.a` :只包含 light_api 功能的静态库 + - 打包的动态态库文件: + - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 + - `libpaddle_light_api_shared.so`:只包含 light_api 功能的动态库 + +3、 `demo`文件夹:示例 demo ,包含 C++ demo 和 Java demo。 + +- `cxx` : C++示例 demo + - `mobile_full` : full_api 的使用示例 + - `mobile_light` : light_api的使用示例 +- `java` :Java 示例 demo + - `android` : Java的 Android 示例 + +4、 `java` 文件夹:包含 Jni 的动态库文件与相应的 Jar 包 + +- `jar` : `PaddlePredictor.jar` +- `so` : Jni动态链接库 `libpaddle_lite_jni.so` + +5、 `third_party` 文件夹:第三方库文件`gflags` + +**注意:** + +1、 只有当`--arm_os=android` 时才会编译出: + +- Java库文件与示例:`Java`和`demo/java` + +- 动态库文件:`libpaddle_full_api_shared.so`,`libpaddle_light_api_shared.so` + +2、 `tiny_publish`编译结果不包括 C++ demo和 C++ 静态库,但提供 C++ 的 light_api 动态库、 Jni 动态库和Java demo diff --git a/docs/user_guides/cpp_demo.md b/docs/user_guides/cpp_demo.md deleted file mode 100644 index a915a3f05ef133988db10a77584b565352a1a8f6..0000000000000000000000000000000000000000 --- a/docs/user_guides/cpp_demo.md +++ /dev/null @@ -1,343 +0,0 @@ -# C++ Demo - -## 编译 - -首先按照[PaddleLite 源码编译](https://github.com/PaddlePaddle/Paddle-Lite/wiki/source_compile)准备交叉编译环境,之后拉取最新[PaddleLite release发布版代码](https://github.com/PaddlePaddle/Paddle-Lite)。下面以Android-ARMv8架构为例,介绍编译过程,并最终在手机上跑通MobilNetv1模型。 - -进入 Paddle-Lite 目录,运行以下命令编译代码(**需加编译选项`--build_extra=ON`确保完整编译**): - -``` -./lite/tools/build.sh \ - --arm_os=android \ - --arm_abi=armv8 \ - --arm_lang=gcc \ - --android_stl=c++_static \ - --build_extra=ON \ - full_publish -``` - -编译完成后 `./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/` 文件夹下包含: - -- cxx - - include (头文件文件夹) - - lib (库文件文件夹) - - libpaddle_api_full_bundled.a - - libpaddle_api_light_bundled.a - - libpaddle_light_api_shared.so - - libpaddle_full_api_shared.so -- demo - - cxx (C++ demo) - - mobile_light (light api demo) - - mobile_full (full api demo) - - mobile_detection (detection model api demo) - - mobile_classify (classify model api demo) - - 
Makefile.def - - include -- third_party (第三方库文件夹) - - gflags - -## 准备执行环境 - -执行环境有两种:使用安卓手机;若没安卓手机,也可在安卓模拟器中执行。 - -### 环境一:使用安卓手机 - -将手机连上电脑,在手机上打开选项 -> 开启-开发者模式 -> 开启-USB调试模式。确保 `adb devices` 能够看到相应的设备。 - -### 环境二:使用安卓模拟器 - -运行下面命令,分别创建安卓armv8、armv7架构的模拟器。若需在真机测试,将模拟器换成相应架构的真机环境即可。 - -``` -*android-armv8* -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` - -``` -*android-armv7* -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` - -## 下载模型并运行示例 - -``` -cd inference_lite_lib.android.armv8/demo/cxx/mobile_full -wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz -tar zxvf mobilenet_v1.tar.gz - -make - -adb push mobilenet_v1 /data/local/tmp/ -adb push mobilenetv1_full_api /data/local/tmp/ -adb shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" -``` - -注:我们也提供了轻量级 API 的 demo、图像分类demo和目标检测demo,支持图像输入; - -### Light API Demo - -``` -cd ../mobile_light -make -adb push mobilenetv1_light_api /data/local/tmp/ -adb shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt " -``` - - -### 图像分类 Demo - -``` -cd ../mobile_classify -wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz -tar zxvf mobilenet_v1.tar.gz -make -adb push mobile_classify /data/local/tmp/ -adb push test.jpg /data/local/tmp/ -adb push labels.txt /data/local/tmp/ -adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb shell chmod +x /data/local/tmp/mobile_classify -adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1.opt /data/local/tmp/test.jpg /data/local/tmp/labels.txt" -``` - -### 目标检测 Demo - -``` -cd ../mobile_detection -wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz -tar zxvf mobilenetv1-ssd.tar.gz -make -adb push mobile_detection /data/local/tmp/ -adb push test.jpg /data/local/tmp/ -adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb shell chmod +x /data/local/tmp/mobile_detection -adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" -adb pull /data/local/tmp/test_detection_result.jpg ./ -``` - -## Demo 程序运行结果 - -### light API Demo 运行结果 - -运行成功后 ,将在控制台输出预测结果的前10个类别的预测概率: - -``` -Output dim: 1000 -Output[0]: 0.000191 -Output[100]: 0.000160 -Output[200]: 0.000264 -Output[300]: 0.000211 -Output[400]: 0.001032 -Output[500]: 0.000110 -Output[600]: 0.004829 -Output[700]: 0.001845 -Output[800]: 0.000202 -Output[900]: 0.000586 -``` - -### 图像分类 Demo 运行结果 - -运行成功后 ,将在控制台输出预测结果的前5个类别的类型索引、名字和预测概率: - -``` -parameter: model_dir, image_path and label_file are necessary -parameter: topk, input_width, input_height, are optional -i: 0, index: 285, name: Egyptian 
cat, score: 0.482870 -i: 1, index: 281, name: tabby, tabby cat, score: 0.471593 -i: 2, index: 282, name: tiger cat, score: 0.039779 -i: 3, index: 287, name: lynx, catamount, score: 0.002430 -i: 4, index: 722, name: ping-pong ball, score: 0.000508 -``` - -### 目标检测 Demo 运行结果 - -运行成功后 ,将在控制台输出检测目标的类型、预测概率和坐标: - -``` -running result: -detection image size: 935, 1241, detect object: person, score: 0.996098, location: x=187, y=43, width=540, height=592 -detection image size: 935, 1241, detect object: person, score: 0.935293, location: x=123, y=639, width=579, height=597 -``` - -## 如何在代码中使用 API - -在C++中使用PaddleLite API非常简单,不需要添加太多额外代码,具体步骤如下: - -- 加入头文件引用 - -``` - #include - #include - #include "paddle_api.h" - #include "paddle_use_kernels.h" - #include "paddle_use_ops.h" - #include "paddle_use_passes.h" -``` - -- 通过MobileConfig设置:模型文件位置(model_dir)、线程数(thread)和能耗模式( power mode )。输入数据(input),从 MobileConfig 创建 PaddlePredictor 并执行预测。 (注:Lite还支持从memory直接加载模型,可以通过MobileConfig::set_model_buffer方法实现) - -代码示例: - -``` -// 1. Create MobileConfig -MobileConfig config; - -// 2. Load model -config.set_model_dir("path to your model directory"); // model dir -/*load model: Lite supports loading model from file or from memory (naive buffer from optimized model) -//Method One: Load model from memory: -void set_model_buffer(const char* model_buffer, - size_t model_buffer_size, - const char* param_buffer, - size_t param_buffer_size) -//Method Two: Load model from file: -void set_model_dir(const std::string& model_dir) */ - -// 3. Set MobileConfig (or you can skip this step to use default value): -config.set_power_mode(LITE_POWER_HIGH); // power mode -/*power modes: Lite supports the following power modes - LITE_POWER_HIGH - LITE_POWER_LOW - LITE_POWER_FULL - LITE_POWER_NO_BIND - LITE_POWER_RAND_HIGH - LITE_POWER_RAND_LOW */ -config.set_threads("num of threads"); // threads - -// 4. Create PaddlePredictor by MobileConfig -std::shared_ptr predictor = - CreatePaddlePredictor(config); - -// 5. Prepare input data -std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); -input_tensor->Resize({1, 3, 224, 224}); -auto *data = input_tensor -> mutable_data(); -for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; -} - -// 6. Run predictor -predictor->Run(); - -// 7. Get output -std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); -``` - -## CxxConfig案例: OCR_model的运行 - -1. OCR 模型文件: - - 我们提供Pb格式的[ocr_attention_mode](https://paddle-inference-dist.cdn.bcebos.com/ocr_attention.tar.gz)l下载 - - 也可以从[Paddle/model项目](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ocr_recognition)中训练出模型 -2. 示例代码: - - -``` -#include "paddle_api.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT -#include -#include -#include -using namespace paddle::lite_api; // NOLINT - -DEFINE_string(model_dir, "", "Model dir path."); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); - -int64_t ShapeProduction(const shape_t &shape) { - int64_t res = 1; - for (auto i : shape) - res *= i; - return res; -} - -void RunModel() { - // 1. Set CxxConfig - CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - std::vector valid_places({Place{TARGET(kARM), PRECISION(kFloat)}}); - if (FLAGS_prefer_int8_kernel) { - valid_places.insert(valid_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } - config.set_valid_places(valid_places); - - // 2. Create PaddlePredictor by CxxConfig - std::shared_ptr predictor = - CreatePaddlePredictor(config); - - // 3. 
Prepare input data - // input 0 - std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize(shape_t({1, 1, 48, 512})); - auto *data = input_tensor->mutable_data(); - for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { - data[i] = 1; - } - // input1 - std::unique_ptr init_ids(std::move(predictor->GetInput(1))); - init_ids->Resize(shape_t({1, 1})); - auto *data_ids = init_ids->mutable_data(); - for (int i = 0; i < ShapeProduction(init_ids->shape()); ++i) { - data_ids[i] = 0; - } - - lod_t lod_i; - lod_i.push_back({0, 1}); - lod_i.push_back({0, 1}); - init_ids->SetLoD(lod_i); - // input2 - std::unique_ptr init_scores(std::move(predictor->GetInput(2))); - init_scores->Resize(shape_t({1, 1})); - auto *data_scores = init_scores->mutable_data(); - for (int i = 0; i < ShapeProduction(init_scores->shape()); ++i) { - data_scores[i] = 0; - } - lod_t lod_s; - lod_s.push_back({0, 1}); - lod_s.push_back({0, 1}); - init_scores->SetLoD(lod_s); - - // 4. Run predictor - predictor->Run(); - - // 5. Get output - std::unique_ptr output_tensor( - std::move(predictor->GetOutput(0))); - for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) { - printf("Output[%d]: %f\n", i, output_tensor->data()[i]); - } -} - -int main(int argc, char **argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - RunModel(); - return 0; -} -``` - -3. 运行方法: - 参考以上代码编译出可执行文件`OCR_DEMO`,模型文件夹为`ocr_attention`。手机以USB调试、文件传输模式连接电脑。 -``` -简单编译出`OCR_DEMO`的方法:用以上示例代码替换编译结果中`build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_full/mobilenetv1_full_api.cc`文件的内容,终端进入该路径(`demo/cxx/mobile_full/`),终端中执行`make && mv mobilenetv1_full_api OCR_DEMO`即编译出了OCR模型的可执行文件`OCR_DEMO` -``` - 在终端中输入以下命令执行OCR model测试: - -``` -#OCR_DEMO为编译出的可执行文件名称;ocr_attention为ocr_attention模型的文件夹名称;libpaddle_full_api_shared.so是编译出的动态库文件,位于`build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib` -adb push OCR_DEMO /data/local/tmp -adb push ocr_attention /data/local/tmp -adb push libpaddle_full_api_shared.so /data/local/tmp/ -adb shell 'export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && cd /data/local/tmp && ./OCR_DEMO --model_dir=./OCR_DEMO' -``` - -4. 运行结果 - - diff --git a/docs/user_guides/debug.md b/docs/user_guides/debug.md new file mode 100644 index 0000000000000000000000000000000000000000..93395b25fae772954f83a1128cdb7e86c9eee994 --- /dev/null +++ b/docs/user_guides/debug.md @@ -0,0 +1,89 @@ +# 调试 + +## Profiler工具 + +Basic profiler 用于 CPU 上kernel 耗时的统计。 + +### 开启方法: + +参照 [编译安装](../user_guides/source_compile) 中的**full_publish**部分进行环境配置,在 cmake 时添加 `-DLITE_WITH_PROFILE=ON` ,就可以开启相应支持。 + +### 使用示例: + +在模型执行完毕后,会自动打印类似如下 profiler 的日志 + +``` + kernel average min max count + feed/def/1/4/2 0 0 0 1 + conv2d/def/4/1/1 1175 1175 1175 1 + conv2d/def/4/1/1 1253 1253 1253 1 + depthwise_conv2d/def/4/1/1 519 519 519 1 + conv2d/def/4/1/1 721 721 721 1 + elementwise_add/def/4/1/1 18 18 18 1 + conv2d/def/4/1/1 2174 2174 2174 1 + depthwise_conv2d/def/4/1/1 380 380 380 1 + conv2d/def/4/1/1 773 773 773 1 + elementwise_add/def/4/1/1 2 2 2 1 + conv2d/def/4/1/1 1248 1248 1248 1 + depthwise_conv2d/def/4/1/1 492 492 492 1 + conv2d/def/4/1/1 1150 1150 1150 1 + elementwise_add/def/4/1/1 33 33 33 1 + elementwise_add/def/4/1/1 3 3 3 1 + conv2d/def/4/1/1 1254 1254 1254 1 + depthwise_conv2d/def/4/1/1 126 126 126 1 +``` + +## Debug工具 + +**Lite Model Debug Tool** 是用来检查Paddle-Lite框架与Paddle-Fluid框架运行时tensor(包括variable与weight)之间diff信息的基础工具。 + +### 编译方法: + +1. 
参照 [编译安装](../user_guides/source_compile) 中的**full_publish**部分进行环境配置和编译。 +2. 在生成的`build`目录下,执行`make lite_model_debug_tool`,`lite_model_debug_tool`产出在编译目录的`lite/tools/debug`目录下。 + +### 工作流程: + +1. 运行 `/bin/bash check_model.sh --model_dir= --build_root_dir= debug_cpp_stage` 获得模型在Paddle-Lite框架下的运行拓扑信息、varibles信息和weights信息。运行后拓扑信息将会存储在默认名为 `topo_file.txt` 的文件中,variables和weights信息将会存储在默认名为 `tensor_cpp.txt` 的文件中。 +2. 运行 `/bin/bash check_model.sh --model_dir= --build_root_dir= debug_py_stage`执行fluid框架预测以获取相同模型在fluid框架下的variable与weight信息(注意:我们使用fluid的python api运行fluid模型,因此您在运行此步之前应确保已正确安装fluid的python api)。然后debug tool将会自动比较Paddle-Lite框架输出的信息和Paddle-Fluid框架输出的信息来检查是否存在运行时diff。 执行Paddle-Fluid框架,输出的信息将会存储在默认名为 `tensor_py.txt` 的文件中,相应的diff信息将会存储在默认名为 `diff.txt`的文件中(默认情况下,只会输出执行拓扑序中第一个有diff的variable相关的信息)。 + +### 注意事项: + +1. 输出的结果是在**执行完一次预测后**输出的相应变量/权重的最终值,因此如果您在预测过程进行过诸如变量复用/子图融合等优化方法,则相应的输出可能会出现偏差。 +2. 默认情况下debug tools将以全1作为输入进行比对。 +3. 默认情况下,为了保证与Paddle-Fluid框架的结果可比对,debug tool将会禁用掉所有的Paddle-Lite的优化策略。 +4. Paddle-Lite框架的执行环境由与您的编译选项有关,比如您开启了LITE_WITH_ARM编译选项,那debug tool的`debug_cpp_stage`也需要在ARM平台下运行。 + +### Diff信息输出: + +如果debug tool检测到diff信息,那么在`diff.txt`中将会输出类似以下结构信息 + +```c++ +>>>>>>>>>>>>>>>>>>DIFF VARIABLE: dropout_0.tmp_0<<<<<<<<<<<<<<<<<<< +dropout (X:pool2d_7.tmp_0) (Mask:dropout_0.tmp_1 Out:dropout_0.tmp_0) +--------------- Tensor File info --------------- +pool2d_7.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ... +dropout_0.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ... +--------------- Fluid Tensor info --------------- +pool2d_7.tmp_0 {1,1536,1,1} 0.7498912 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015033395 0.6216395 0.14709876 0.63672537 0.0 0.0 0.0041093696 0.7847073 0.0 0.07048465 0.23359808 0.8401219 0.23919891 0.1128789 0.0 0.1553514 0.3069055 0.0 0.0 0.8609365 0.22103554 ... +dropout_0.tmp_0 {1,1536,1,1} 0.599913 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.012026716 0.4973116 0.117679015 0.5093803 0.0 0.0 0.0032874958 0.62776583 0.0 0.056387722 0.18687847 0.67209756 0.19135913 0.090303116 0.0 0.12428112 0.2455244 0.0 0.0 0.68874925 ... +``` + +其中第二行为op相关信息,标明了执行哪个op出现了diff及其对应的输入输出变量名。Tensor File info为Paddle-Lite框架的输出信息,而Fluid Tensor info为Paddle-Fluid框架的相应输出信息。 +示例中的`dropout_0.tmp_1`没有相应的tensor信息是因为工具检测到其在预测的后序流程中未被使用,因此不会对预测结果造成影响,从而将其自动屏蔽掉以保证输出尽量简洁。 + +### 其他选项: + +| Option | Description | +| --------------------------- | ------------------------------------------------------------ | +| --input_file | 输入文件名,不同field以逗号分隔,相同field内以空格分隔, 只有文件中的第一行输入信息会被使用. 如果您不指定input_file,那么所有输入将会被置为1。注意:`debug_py_stage`目前不支持多field输入。 | +| --cpp_topo_file | 存储运行时拓扑信息,由`debug_cpp_stage`写入并且由`debug_py_stage`读取使用。 默认为`topo_file.txt` 。 | +| --cpp_tensor_file | 存储`debug_cpp_stage` 在运行拓扑序下的输出信息,默认为 `tensor_cpp.txt` 。 | +| --tensor_names | 如果此选项不为空,那么只输出由此选项中指定名字的variable/weight信息,名字间用逗号分隔。 | +| --tensor_output_length | 输出数据的长度,默认为全部输出。 | +| --py_threshold | 判断diff发生的阈值,默认为 `1e-5` 。 | +| --py_tensor_file | 存储`debug_py_stage` 在运行拓扑序下的输出信息,默认为`tensor_py.txt`. | +| --py_output_file | diff信息的存储文件,默认为`diff.txt`。 | +| --py_only_output_first_diff | 是否只输出运行时拓扑序中第一个有diff的var/op信息,默认为true | + +您可以参考 `check_model.sh` 脚本中的代码以获得更多细节. 
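+
+下面给出一个把上述两个阶段串联起来的参考脚本,仅作示意:其中的模型目录与编译根目录均为假设的示例路径,`check_model.sh` 的实际位置(通常位于源码的 `lite/tools/debug` 目录)及其支持透传的可选参数,请以脚本实现为准。
+
+```shell
+# 示例路径,请替换为实际的模型目录与 Paddle-Lite 编译根目录(以下均为假设值)
+MODEL_DIR=./mobilenet_v1
+BUILD_ROOT=./build.lite.android.armv8.gcc
+
+# 第一步:在 Paddle-Lite 框架下运行,记录拓扑信息与 variables/weights 信息
+# (默认生成 topo_file.txt 与 tensor_cpp.txt)
+/bin/bash check_model.sh \
+    --model_dir=${MODEL_DIR} \
+    --build_root_dir=${BUILD_ROOT} \
+    debug_cpp_stage
+
+# 第二步:运行 fluid(Python)预测并与上一步结果自动比对
+# (需已正确安装 fluid 的 python api;默认生成 tensor_py.txt 与 diff.txt)
+# 如需调整阈值或只对比指定 tensor,可参考上表追加 --py_threshold、--tensor_names 等可选参数
+/bin/bash check_model.sh \
+    --model_dir=${MODEL_DIR} \
+    --build_root_dir=${BUILD_ROOT} \
+    debug_py_stage
+
+# 若检测到 diff,查看默认输出文件 diff.txt
+cat diff.txt
+```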
diff --git a/docs/user_guides/library.md b/docs/user_guides/library.md new file mode 100644 index 0000000000000000000000000000000000000000..20f16322c67cc9d10d2f667fa2ca7bceb83e338b --- /dev/null +++ b/docs/user_guides/library.md @@ -0,0 +1,57 @@ + +# `build_extra`参数说明: + +Lite预测库分为**基础预测库**和**全量预测库(with_extra)**:基础预测库只包含基础CV算子(OP),体积较小;全量预测库包含所有Lite算子,体积较大,支持模型较多。 + +编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译**基础预测库**,`--build_extra=ON`时编译**全量预测库**。 + +## 基础预测库( [基础OP列表](../advanced_user_guides/support_operation_list.html#basic-operators) ) + + +### 支持功能 + +(1)87个[基础OP](../advanced_user_guides/support_operation_list.html#basic-operators) (2)9个基础模型 (3)3个in8量化模型 + + +### 支持的模型 + +1. fluid基础模型(来源:[paddle-models](https://github.com/PaddlePaddle/models) ) + +``` +mobilenetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 +mobilenetV2 resnet50 unet squeezenet_v11 +``` + +2. int8量化模型 + +``` +mobilenet_v1 mobilenet_v2 resnet50 +``` + +### 特点 + 轻量级预测库,体积更小,支持常用模型。 + +### 编译方法 +编译时设置`--build_extra=OFF` (默认值) 编译出基础预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish +``` + + +## 全量预测库( [OP列表](../advanced_user_guides/support_operation_list.html#op) ) + + +### 支持功能 + + Paddle-Lite中的全量算子( [基础OP](../advanced_user_guides/support_operation_list.html#basic-operators) + [Extra OP](../advanced_user_guides/support_operation_list.html#extra-operators-build-extra-on) ) + +### 特点 + 包含更多算子、支持更多模型,但体量更大。 + +### 编译方法 +设置`--build_extra=ON` 可编译出全量预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish +``` diff --git a/docs/user_guides/library_tailoring.md b/docs/user_guides/library_tailoring.md index 5ba12cf819945ab2f182f672a2c96123bc12e070..704974ec0d91b2d6aec10ba898f74f2fcf3b2db7 100644 --- a/docs/user_guides/library_tailoring.md +++ b/docs/user_guides/library_tailoring.md @@ -1,5 +1,5 @@ -# 裁剪预测库方法 +# 裁剪预测库 Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编译会将所有已注册的operator打包到预测库中,造成库文件体积膨胀;**裁剪预测库**能针对具体的模型,只打包优化后该模型需要的operator,有效降低预测库文件大小。 @@ -24,22 +24,29 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 ### 1、转化模型时记录优化后模型信息 -说明:使用model_optimize_tool转化模型时,选择 `--record_tailoring_info =true` 会将优化后模型的OP和kernel信息保存到输出文件夹,这些信息将用于编译裁剪后的动态库。 -注意:需要使用Paddle-Lite 最新版本(release/v2.0.0之后)代码编译出的model_optimize_tool +说明:使用`opt`转化模型时,选择 `--record_tailoring_info =true` 会将优化后模型的OP和kernel信息保存到输出文件夹,这些信息将用于编译裁剪后的动态库。 例如: ```bash -./model_optimize_tool --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1NB --record_tailoring_info =true --valid_targets=arm +./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1NB --record_tailoring_info =true --valid_targets=arm ``` -效果:优化后模型使用的OP和kernel信息被保存在 `mobilenet_v1NB`文件夹中的隐藏文件里了 +效果:优化后模型使用的`OP`和`kernel`信息被保存在 `mobilenet_v1NB`文件夹中的隐藏文件里了 ### 2、根据模型信息编译裁剪后的预测库 说明:编译Paddle-Lite时选择`--build_tailor=ON` ,并且用 `–-opt_model_dir=` 指定优化后的模型的地址 例如: +**release/v2.6.0以后版本或develop分支使用以下命令**: + +```bash +./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=../mobilenet_v1NB +``` + +**release/v2.3之前版本使用以下命令**: + ```bash -./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish 
``` **注意**:上面命令中的`../mobilenet_v1NB`是第1步得到的转化模型的输出路径 @@ -88,9 +95,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 #include #include #include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -151,13 +155,13 @@ int main(int argc, char** argv) { ## 按模型集合裁剪预测库 -为了方便用户使用,我们同时提供了按模型集合进行预测库裁剪的功能。用户可以提供一个模型集合,Model Optimize Tool会根据用户所指定的模型集合分析其**优化后的**模型所需要的算子信息对预测库进行裁剪。使用此功能用户根据自己的需要使用模型集合来对预测库中的算子进行任意裁剪。 +为了方便用户使用,我们同时提供了按模型集合进行预测库裁剪的功能。用户可以提供一个模型集合,opt 会根据用户所指定的模型集合分析其**优化后的**模型所需要的算子信息对预测库进行裁剪。使用此功能用户根据自己的需要使用模型集合来对预测库中的算子进行任意裁剪。 使用方法如下所示: ```shell # 非combined模型集合 -./model_optimize_tool \ +./opt \ --model_set_dir= \ --optimize_out_type=naive_buffer \ --optimize_out= \ @@ -165,7 +169,7 @@ int main(int argc, char** argv) { --valid_targets=arm # combined模型集合 -./model_optimize_tool \ +./opt \ --model_set_dir= \ --optimize_out_type=naive_buffer \ --model_filename= \ @@ -175,11 +179,11 @@ int main(int argc, char** argv) { --valid_targets=arm ``` -经过以上步骤后会在``中生成模型集合中各模型对应的NaiveBuffer格式的优化模型。此步会对模型集合中所需算子信息进行搜集并存储到``中。下一步编译预测库的流程与使用单模型进行预测库裁剪步骤相同。 +经过以上步骤后会在``中生成模型集合中各模型对应的`NaiveBuffer`格式的优化模型。此步会对模型集合中所需算子信息进行搜集并存储到``中。下一步编译预测库的流程与使用单模型进行预测库裁剪步骤相同。 **注意:** 1. 模型集合**必须**均为combined参数模型或均为非combined参数模型。 2. 使用非combined参数模型时,模型拓扑文件名应为`__model__`,使用非combined参数模型时,集合中各模型的拓扑与参数名应相同,分别由`--model_filename`和`--param_filename`指定。 3. 模型集合**必须**均为INT8量化模型或均为非INT8量化模型。 -4. 需要使用Paddle-Lite 最新版本(release/v2.1.0之后)代码编译出的model_optimize_tool。 +4. 需要使用Paddle-Lite `release/v2.1.0`之后版本代码编译出的模型优化工具。 diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index fccc6d8b23c78474257d11399d121816f57fc422..fed728cb0e06c9758a0497a9cbb93d7edf39bda7 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -1,161 +1,61 @@ -# 模型转化方法 +# 模型优化工具 opt -Lite架构在预测过程中表现出来的高性能得益于其丰富的优化组件,其中包括量化、子图融合、混合调度、Kernel优选等等策略。为了使优化过程更加方便易用,我们提供了**opt**来自动完成优化步骤,输出一个轻量的、最优的可执行模型。具体使用方法介绍如下: +Paddle-Lite 提供了多种策略来自动优化原始的训练模型,其中包括量化、子图融合、混合调度、Kernel优选等等方法。为了使优化过程更加方便易用,我们提供了**opt** 工具来自动完成优化步骤,输出一个轻量的、最优的可执行模型。 -**注意**:release/v2.2.0之前的模型转化工具名称为`model_optimize_tool`,从release/v2.3开始模型转化工具名称修改为`opt` +具体使用方法介绍如下: -## 准备opt -当前获得opt方法有三种: - -1. 我们提供当前develop分支编译结果下载:[opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt)、[opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) -release/v2.2.0之前版本的model_optimize_tool: [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool)、[model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) - -2. 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择release版本下载对应的转化工具`opt` - (release/v2.2.0之前的转化工具为model_optimize_tool、release/v2.3.0之后为opt) +**注意**:`v2.2.0` 之前的模型转化工具名称为`model_optimize_tool`,从 `v2.3` 开始模型转化工具名称修改为 `opt`,从`v2.6.0`开始支持python调用`opt`转化模型(Windows/Ubuntu/Mac) -3. 
可以下载Paddle-Lite源码,从源码编译出opt工具 -```bash -git clone https://github.com/PaddlePaddle/Paddle-Lite.git -cd Paddle-Lite -git checkout -./lite/tools/build.sh build_optimize_tool -``` -编译结果位于`Paddle-Lite/build.opt/lite/api/opt` -**注意**:从源码编译opt前需要先[安装Paddle-Lite的开发环境](../installation/source_compile)。 +## 准备opt +当前获得`opt`工具的方法有三种: -## 使用opt +- 方法一: 安装opt的python版本 -opt是x86平台上的可执行文件,需要在PC端运行:包括Linux终端和Mac终端。 +安装`paddlelite` python库,安装成功后调用opt转化模型(支持`windows\Mac\Ubuntu`) -### 帮助信息 - 执行opt时不加入任何输入选项,会输出帮助信息,提示当前支持的选项: ```bash - ./opt +pip install paddlelite ``` -![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png) - -### 功能一:转化模型为Paddle-Lite格式 -opt可以将PaddlePaddle支持的模型转化为Paddle-Lite支持的模型格式,期间执行的操作包括:将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积;执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等性能指标。 - -模型优化过程: -(1)准备待优化的PaddlePaddle模型 +- 方法二: 下载opt可执行文件 +从[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择当前预测库对应版本的`opt`转化工具 -PaddlePaddle模型有两种保存格式: - Combined Param:所有参数信息保存在单个文件`params`中,模型的拓扑信息保存在`__model__`文件中。 +本文提供`release/v2.6`和`release/v2.2.0`版本的优化工具下载 -![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png) +|版本 | Linux | MacOS| +|---|---|---| +| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) | +|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) | - Seperated Param:参数信息分开保存在多个参数文件中,模型的拓扑信息保存在`__model__`文件中。 -![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png) - -(2) 终端中执行`opt`优化模型 -**使用示例**:转化`mobilenet_v1`模型 +- 方法三: 源码编译opt +源码编译 opt 可执行文件 ``` -./opt --model_dir=./mobilenet_v1 --valid_targets=arm --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1_opt +cd Paddle-Lite && ./lite/tools/build.sh build_optimize_tool ``` -以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型,优化后的模型文件为`mobilenet_v1_opt.nb`,转化结果如下图所示: -![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png) +编译结果位于`build.opt/lite/api/`下的可执行文件`opt` + +## 使用opt +当前使用`opt`工具转化模型的方法有以下三种: -(3) **更详尽的转化命令**总结: +- 方法一: [安装 python版本opt后,使用终端命令](./opt/opt_python) (支持Mac/Ubuntu) +- 方法二: [安装python版本opt后,使用python脚本](../api_reference/python_api/opt)(支持window/Mac/Ubuntu) +- 方法三:[直接下载并执行opt可执行工具](./opt/opt_bin)(支持Mac/Ubuntu) +- Q&A:如何安装python版本opt ? 
+可以通过以下命令安装paddlelite的python库(支持`windows/Mac/Ubuntu`): ```shell -./opt \ - --model_dir= \ - --model_file= \ - --param_file= \ - --optimize_out_type=(protobuf|naive_buffer) \ - --optimize_out= \ - --valid_targets=(arm|opencl|x86|npu|xpu) \ - --prefer_int8_kernel=(true|false) \ - --record_tailoring_info =(true|false) +pip install paddlelite ``` -| 选项 | 说明 | -| ------------------- | ------------------------------------------------------------ | -| --model_dir | 待优化的PaddlePaddle模型(非combined形式)的路径 | -| --model_file | 待优化的PaddlePaddle模型(combined形式)的网络结构文件路径。 | -| --param_file | 待优化的PaddlePaddle模型(combined形式)的权重文件路径。 | -| --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | -| --optimize_out | 优化模型的输出路径。 | -| --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | -| --prefer_int8_kernel | 若待优化模型为int8量化模型(如量化训练得到的量化模型),则设置该选项为true以使用int8内核函数进行推理加速,默认为false。 | -| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | - -* 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 -* 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 -* 优化后的模型包括__model__.nb和param.nb文件。 - -### 功能二:统计模型算子信息、判断是否支持 - -opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支持该模型。并可以打印出当前Paddle-Lite的算子支持情况。 - -(1)使用opt统计模型中算子信息 - -下面命令可以打印出mobilenet_v1模型中包含的所有算子,并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型 -`./opt --print_model_ops=true --model_dir=mobilenet_v1 --valid_targets=arm` -![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/doc_images/3.png) - -(2)使用opt打印当前Paddle-Lite支持的算子信息 - -`./opt --print_all_ops=true` - -以上命令可以打印出当前Paddle-Lite支持的所有算子信息,包括OP的数量和每个OP支持哪些硬件平台: - -![opt_print_allops](https://paddlelite-data.bj.bcebos.com/doc_images/4.png) - -`./opt ----print_supported_ops=true --valid_targets=x86` - -以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP: - -![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/doc_images/5.png) - -## 其他功能:合并x2paddle和opt的一键脚本 +## 合并x2paddle和opt的一键脚本 **背景**:如果想用Paddle-Lite运行第三方来源(tensorflow、caffe、onnx)模型,一般需要经过两次转化。即使用x2paddle工具将第三方模型转化为PaddlePaddle格式,再使用opt将PaddlePaddle模型转化为Padde-Lite可支持格式。 -为了简化这一过程,我们提供一键脚本,将x2paddle转化和opt转化合并: +为了简化这一过程,我们提供了: -**一键转化脚本**:[auto_transform.sh](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/auto_transform.sh) - - -**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和其环境依赖项。 - -**使用方法**: - -(1)打印帮助帮助信息:` ./auto_transform.sh` - -(2)转化模型方法 - -```bash -USAGE: - auto_transform.sh combines the function of x2paddle and opt, it can - tranform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form. 
----------------------------------------- -example: - ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result ----------------------------------------- -Arguments about x2paddle: - --framework=(tensorflow|caffe|onnx); - --model='model file for tensorflow or onnx'; - --prototxt='proto file for caffe' --weight='weight file for caffe' - For TensorFlow: - --framework=tensorflow --model=tf_model.pb - - For Caffe: - --framework=caffe --prototxt=deploy.prototxt --weight=deploy.caffemodel - - For ONNX - --framework=onnx --model=onnx_model.onnx - -Arguments about opt: - --valid_targets=(arm|opencl|x86|npu|xpu); valid targets on Paddle-Lite. - --fluid_save_dir='path to outputed model after x2paddle' - --optimize_out='path to outputed Paddle-Lite model' ----------------------------------------- -``` + [合并x2paddle和opt的一键脚本](./opt/x2paddle&opt) diff --git a/docs/user_guides/model_quantization.md b/docs/user_guides/model_quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..cb1e4a4337594521cdebaf479faa77547f2c8bf8 --- /dev/null +++ b/docs/user_guides/model_quantization.md @@ -0,0 +1,66 @@ +# 模型量化-量化训练 + +本文主要介绍使用Paddle-Lite加载PaddlePaddle产出的量化模型,并进行推理执行。 + +## 1 简介 + +量化训练是使用较多练数据,对训练好的预测模型进行量化。该方法使用模拟量化的思想,在训练阶段更新权重,实现减小量化误差。 + +使用条件: +* 有预训练模型 +* 有较多训练数据(大于5000) + +使用步骤: +* 产出量化模型:使用PaddlePaddle调用量化训练接口,产出量化模型 +* 量化模型预测:使用PaddleLite加载量化模型进行预测推理 + +优点: +* 减小计算量、降低计算内存、减小模型大小 +* 模型精度受量化影响小 + +缺点: +* 使用条件较苛刻,使用门槛稍高 + +建议首先使用“有校准数据训练后量化”对模型进行量化,然后使用使用量化模型进行预测。如果该量化模型的精度达不到要求,再使用“量化训练”。 + +## 2 产出量化模型 + +目前,PaddleSlim 框架的量化训练主要针对卷积层(包括二维卷积和Depthwise卷积)、和全连接层,对应算子是conv2d、depthwise_conv2d和mul。Paddle-Lite支持运行PaddlePaddle框架量化训练产出的模型,可以进一步加快模型在移动端的执行速度。 + +温馨提示:如果您是初次接触PaddlePaddle框架,建议首先学习[新人入门](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)和[使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html)。 + +使用PaddleSlim模型压缩工具训练量化模型,请参考文档: +* 量化训练[快速开始教程](https://paddlepaddle.github.io/PaddleSlim/quick_start/quant_aware_tutorial.html) +* 量化训练[API接口说明](https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html) +* 量化训练[Demo](https://github.com/PaddlePaddle/PaddleSlim/tree/release/1.0.1/demo/quant/quant_aware) + +## 3 使用Paddle-Lite运行量化模型推理 + +首先,使用PaddleLite提供的模型转换工具(model_optimize_tool)将量化模型转换成移动端预测的模型,然后加载转换后的模型进行预测部署。 + +### 3.1 模型转换 + +参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 + +参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具,参数按照实际情况设置。比如在安卓手机ARM端进行预测,模型转换的命令为: +```bash +./opt --model_dir=./mobilenet_v1_quant \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_quant_opt \ + --valid_targets=arm +``` + +### 3.2 量化模型预测 + +和FP32模型一样,转换后的量化模型可以在Android/IOS APP中加载预测,建议参考[C++ Demo](../demo_guides/cpp_demo)、[Java Demo](../demo_guides/java_demo)、[Android/IOS Demo](../demo_guides/android_app_demo)。 + + +## FAQ + +**问题**:Compiled with WITH_GPU, but no GPU found in runtime + +**解答**:检查本机是否支持GPU训练,如果不支持请使用CPU训练。如果在docker进行GPU训练,请使用nvidia_docker启动容器。 + +**问题**:Inufficient GPU memory to allocation. 
at [/paddle/paddle/fluid/platform/gpu_info.cc:262] + +**解答**:正确设置run.sh脚本中`CUDA_VISIBLE_DEVICES`,确保显卡剩余内存大于需要内存。 diff --git a/docs/user_guides/opencl.md b/docs/user_guides/opencl.md deleted file mode 100644 index e9533af1ff6e2447a8e4d389df90cdb457f58fb2..0000000000000000000000000000000000000000 --- a/docs/user_guides/opencl.md +++ /dev/null @@ -1,242 +0,0 @@ -# Lite基于OpenCL的ARM GPU预测 - -Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环境下armv8、armv7的交叉编译。 - -## 编译 - -### 编译环境 - -1. Docker 容器环境; -2. Linux(推荐 Ubuntu 16.04)环境。 - -详见 **源码编译指南-环境准备** 章节。 - -### 编译选项 - -|参数|介绍|值| -|--------|--------|--------| -|--arm_os|代表目标操作系统|目前仅支持且默认为`android`| -|--arm_abi|代表体系结构类型,支持armv8和armv7|默认为`armv8`即arm64-v8a;`armv7`即armeabi-v7a| -|--arm_lang|代表编译目标文件所使用的编译器|默认为gcc,支持 gcc和clang两种| - -### 编译Paddle-Lite OpenCL库范例 - -注:以android-armv8-opencl的目标、Docker容器的编译开发环境为例,CMake3.10,android-ndk-r17c位于`/opt/`目录下。 - -```bash -# 假设当前位于处于Lite源码根目录下 - -# 导入NDK_ROOT变量,注意检查您的安装目录若与本示例不同 -export NDK_ROOT=/opt/android-ndk-r17c - -# 删除上一次CMake自动生成的.h文件 -rm ./lite/api/paddle_use_kernels.h -rm ./lite/api/paddle_use_ops.h - -# 根据指定编译参数编译 -./lite/tools/ci_build.sh \ - --arm_os=android \ - --arm_abi=armv8 \ - --arm_lang=gcc \ - build_test_arm_opencl -``` - -编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,这里仅罗列关键产物: - -- `cxx`:该目录是编译目标的C++的头文件和库文件; -- `demo`:该目录包含了两个demo,用来调用使用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`,分别对应`mobile_full`和`mobile_light`文件夹。编译对应的demo仅需在`mobile_full`或`mobile_light`文 - - `mobile_full`:使用cxx config,可直接加载fluid模型,若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏,详细见代码注释; - - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型; -- `opencl`:该目录存放opencl实现的相关kernel。 - -```bash -. -|-- cxx -| |-- include -| | |-- paddle_api.h -| | |-- paddle_image_preprocess.h -| | |-- paddle_lite_factory_helper.h -| | |-- paddle_place.h -| | |-- paddle_use_kernels.h -| | |-- paddle_use_ops.h -| | `-- paddle_use_passes.h -| `-- lib -| |-- libpaddle_api_full_bundled.a -| |-- libpaddle_api_light_bundled.a -| |-- libpaddle_full_api_shared.so -| `-- libpaddle_light_api_shared.so -|-- demo -| `-- cxx -| |-- Makefile.def -| |-- README.md -| |-- include -| | |-- paddle_api.h -| | |-- paddle_lite_factory_helper.h -| | |-- paddle_place.h -| | |-- paddle_use_kernels.h -| | |-- paddle_use_ops.h -| | `-- paddle_use_passes.h -| |-- mobile_full -| | |-- Makefile -| | `-- mobilenetv1_full_api.cc -| `-- mobile_light -| |-- Makefile -| `-- mobilenetv1_light_api.cc -`-- opencl - `-- cl_kernel - |-- buffer - | |-- depthwise_conv2d_kernel.cl - | |-- elementwise_add_kernel.cl - | |-- fc_kernel.cl - | |-- im2col_kernel.cl - | |-- layout_kernel.cl - | |-- mat_mul_kernel.cl - | |-- pool_kernel.cl - | `-- relu_kernel.cl - |-- cl_common.h - `-- image - |-- channel_add_kernel.cl - |-- elementwise_add_kernel.cl - |-- pool_kernel.cl - `-- relu_kernel.cl -``` - -调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`见下一部分运行示例。 - - - -## 运行示例 - -下面以android、ARMv8、gcc的环境为例,介绍3个示例,分别如何在手机上执行基于OpenCL的ARM GPU推理过程。 - - -**注意:** 以下命令均在Lite源码根目录下运行。在3个示例前,下面这段命令都先要执行用来准备环境: - -```bash -# 在/data/local/tmp目录下创建OpenCL文件目录 -adb shell mkdir -p /data/local/tmp/opencl -adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer -adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image - -# 将OpenCL的kernels文件推送到/data/local/tmp/opencl目录下 -adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/ -adb push lite/backends/opencl/cl_kernel/buffer/* 
/data/local/tmp/opencl/cl_kernel/buffer/ -adb push lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/ -``` - -### 运行示例1: 编译产物demo示例 - -```bash -###################################################################### -# 编译mobile_full的demo # -###################################################################### -# 步骤: # -# 0.确保编译Paddle-Lite时编译了OpenCL; # -# 1.编辑`mobilenetv1_full_api.cc`代码, 开启`DEMO_USE_OPENCL`的宏; # -# 2.在产物目录`demo/cxx/mobile_full`下编译`mobile_full`的demo; # -# 3.上传demo, 模型, opencl kernel文件到手机; # -# 4.运行demo得到预期结果. # -###################################################################### -adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 -chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api -adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/ -adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 - -# use mobile_full run mobilenet_v1 -# `GLOG_v` is log level -adb shell "export GLOG_v=0; \ - /data/local/tmp/opencl/mobilenetv1_full_api \ - --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ - --optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model" - - - -###################################################################### -# 编译mobile_light的demo # -###################################################################### -# 步骤: # -# 0.确保编译Paddle-Lite时编译了OpenCL; # -# 1.编译model_optimize_tool并对模型优化, `targets`参数为`opencl`; # -# 2.在产物目录`demo/cxx/mobile_light`下编译`mobile_light`的demo; # -# 3.上传demo, 模型, opencl kernel文件到手机; # -# 4.运行demo得到预期结果. # -###################################################################### - -# use model_optimize_tool to optimize model -./build.model_optimize_tool/lite/api/model_optimize_tool \ - --model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ - --optimize_out_type=naive_buffer \ - --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ - --valid_targets=opencl - -adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 -chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api -adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/ -adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 - -# use mobile_light run mobilenet_v1 -adb shell "export GLOG_v=5; \ - /data/local/tmp/opencl/mobilenetv1_light_api \ - --model_dir=/data/local/tmp/opencl/" -``` - -### 运行示例2: test_mobilenetv1单元测试 - -- **运行文件准备** - -```bash -# 将mobilenet_v1的模型文件推送到/data/local/tmp/opencl目录下 -adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1 -adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/ - -# 将OpenCL单元测试程序test_mobilenetv1,推送到/data/local/tmp/opencl目录下 -adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/local/tmp/opencl -``` - -- **执行OpenCL推理过程** - -使用如下命令运行OpenCL程序。其中: - -- `--cl_path`指定了OpenCL的kernels文件即cl\_kernel所在目录; -- `--modle_dir`指定了模型文件所在目录。 - -```bash -adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1 - -adb shell /data/local/tmp/opencl/test_mobilenetv1 \ - --cl_path=/data/local/tmp/opencl \ - --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ - --warmup=1 \ - 
--repeats=1 -``` - -**注意:** 因为权重参数均会在Op Kernel第一次运行时进行加载,所以第一次的执行时间会略长。一般将warmup的值设为1,repeats值设为多次。 - -### 运行示例3: test_layout_opencl单元测试 - -- **运行文件准备** - -```bash -# 将OpenCL单元测试程序test_layout_opencl,推送到/data/local/tmp/opencl目录下 -adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/ -``` - - -OpenCL推理过程** - -```bash -adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl -adb shell /data/local/tmp/opencl/test_layout_opencl -``` - - -# 如何在Code中使用 - -见运行示例1的demo代码: - -1. [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc); -2. [./lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc). - -注:这里给出的链接会跳转到线上最新develop分支的代码,很可能与您本地的代码存在差异,建议参考自己本地位于`lite/demo/cxx/`目录的代码,查看如何使用。 - -**NOTE:** 对OpenCL的支持还在持续开发中。 diff --git a/docs/user_guides/opt/opt_bin.md b/docs/user_guides/opt/opt_bin.md new file mode 100644 index 0000000000000000000000000000000000000000..0b9b614d6f18ab1cfd1e4bad0ccbf234752ef00c --- /dev/null +++ b/docs/user_guides/opt/opt_bin.md @@ -0,0 +1,96 @@ +## 使用opt转化模型 + +opt是 x86 平台上的可执行文件,需要在PC端运行:支持Linux终端和Mac终端。 + +### 帮助信息 + 执行opt时不加入任何输入选项,会输出帮助信息,提示当前支持的选项: +```bash + ./opt +``` +![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png) + +### 功能一:转化模型为Paddle-Lite格式 +opt可以将PaddlePaddle的部署模型格式转化为Paddle-Lite 支持的模型格式,期间执行的操作包括: + +- 将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积 +- 执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等效果 + +模型优化过程: + +(1)准备待优化的PaddlePaddle模型 + +PaddlePaddle模型有两种保存格式: + Combined Param:所有参数信息保存在单个文件`params`中,模型的拓扑信息保存在`__model__`文件中。 + +![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png) + + Seperated Param:参数信息分开保存在多个参数文件中,模型的拓扑信息保存在`__model__`文件中。 +![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png) + +(2) 终端中执行`opt`优化模型 +**使用示例**:转化`mobilenet_v1`模型 + +```shell +paddle_lite_opt --model_dir=./mobilenet_v1 \ + --valid_targets=arm \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_opt +``` +以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型,优化后的模型文件为`mobilenet_v1_opt.nb`,转化结果如下图所示: + +![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png) + + +(3) **更详尽的转化命令**总结: + +```shell +paddle_lite_opt \ + --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=(arm|opencl|x86|npu|xpu) \ + --record_tailoring_info =(true|false) +``` + +| 选项 | 说明 | +| ------------------- | ------------------------------------------------------------ | +| --model_dir | 待优化的PaddlePaddle模型(非combined形式)的路径 | +| --model_file | 待优化的PaddlePaddle模型(combined形式)的网络结构文件路径。 | +| --param_file | 待优化的PaddlePaddle模型(combined形式)的权重文件路径。 | +| --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | +| --optimize_out | 优化模型的输出路径。 | +| --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | +| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | + +* 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 +* 
如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 +* 优化后的模型为以`.nb`名称结尾的单个文件。 +* 删除`prefer_int8_kernel`的输入参数,`opt`自动判别是否是量化模型,进行相应的优化操作。 + +### 功能二:统计模型算子信息、判断是否支持 + +opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支持该模型。并可以打印出当前Paddle-Lite的算子支持情况。 + +(1)使用opt统计模型中算子信息 + +下面命令可以打印出mobilenet_v1模型中包含的所有算子,并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型 + +`./opt --print_model_ops=true --model_dir=mobilenet_v1 --valid_targets=arm` + +![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/doc_images/3.png) + +(2)使用opt打印当前Paddle-Lite支持的算子信息 + +`./opt --print_all_ops=true` + +以上命令可以打印出当前Paddle-Lite支持的所有算子信息,包括OP的数量和每个OP支持哪些硬件平台: + +![opt_print_allops](https://paddlelite-data.bj.bcebos.com/doc_images/4.png) + +`./opt --print_supported_ops=true --valid_targets=x86` + +以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP: + +![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/doc_images/5.png) diff --git a/docs/user_guides/opt/opt_python.md b/docs/user_guides/opt/opt_python.md new file mode 100644 index 0000000000000000000000000000000000000000..f681d637f828ba52a97a55903c96c1bae19c705c --- /dev/null +++ b/docs/user_guides/opt/opt_python.md @@ -0,0 +1,103 @@ + +## python调用opt转化模型 + +安装了paddle-lite 的python库后,可以通过python调用 opt 工具转化模型。(支持MAC&Ubuntu系统) + +### 安装Paddle-Lite + +``` +pip install paddlelite +``` + +### 帮助信息 +安装成功后可以查看帮助信息 +```bash + paddle_lite_opt +``` +![](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/help.jpg) + +### 功能一:转化模型为Paddle-Lite格式 +opt可以将PaddlePaddle的部署模型格式转化为Paddle-Lite 支持的模型格式,期间执行的操作包括: + +- 将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积 +- 执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等效果 + +模型优化过程: + +(1)准备待优化的PaddlePaddle模型 + +PaddlePaddle模型有两种保存格式: + Combined Param:所有参数信息保存在单个文件`params`中,模型的拓扑信息保存在`__model__`文件中。 + +![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png) + + Seperated Param:参数信息分开保存在多个参数文件中,模型的拓扑信息保存在`__model__`文件中。 +![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png) + +(2) 终端中执行`opt`优化模型 +**使用示例**:转化`mobilenet_v1`模型 + +``` +paddle_lite_opt --model_dir=./mobilenet_v1 \ + --valid_targets=arm \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_opt +``` +以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型,优化后的模型文件为`mobilenet_v1_opt.nb`,转化结果如下图所示: + +![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png) + + +(3) **更详尽的转化命令**总结: + +```shell +paddle_lite_opt \ + --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=(arm|opencl|x86|npu|xpu) \ + --record_tailoring_info =(true|false) +``` + +| 选项 | 说明 | +| ------------------- | ------------------------------------------------------------ | +| --model_dir | 待优化的PaddlePaddle模型(非combined形式)的路径 | +| --model_file | 待优化的PaddlePaddle模型(combined形式)的网络结构文件路径。 | +| --param_file | 待优化的PaddlePaddle模型(combined形式)的权重文件路径。 | +| --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | +| --optimize_out | 优化模型的输出路径。 | +| --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | +| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | + +* 
如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 +* 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 +* 优化后的模型为以`.nb`名称结尾的单个文件。 +* 删除`prefer_int8_kernel`的输入参数,`opt`自动判别是否是量化模型,进行相应的优化操作。 + +### 功能二:统计模型算子信息、判断是否支持 + +opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支持该模型。并可以打印出当前Paddle-Lite的算子支持情况。 + +(1)使用opt统计模型中算子信息 + +下面命令可以打印出mobilenet_v1模型中包含的所有算子,并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型 + +`paddle_lite_opt --print_model_ops=true --model_dir=mobilenet_v1 --valid_targets=arm` + +![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/check_model.png) + +(2)使用opt打印当前Paddle-Lite支持的算子信息 + +`paddle_lite_opt --print_all_ops=true` + +以上命令可以打印出当前Paddle-Lite支持的所有算子信息,包括OP的数量和每个OP支持哪些硬件平台: + +![opt_print_allops](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/print_op.png) + +`paddle_lite_opt --print_supported_ops=true --valid_targets=x86` + +以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP: + +![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/python_opt/print_x86op.png) diff --git a/docs/user_guides/opt/x2paddle&opt.md b/docs/user_guides/opt/x2paddle&opt.md new file mode 100644 index 0000000000000000000000000000000000000000..1316f5e4c12b035d9b1ab2972b0e39195007a9ac --- /dev/null +++ b/docs/user_guides/opt/x2paddle&opt.md @@ -0,0 +1,43 @@ +## 合并x2paddle和opt的一键脚本 + +**背景**:如果想用Paddle-Lite运行第三方来源(tensorflow、caffe、onnx)模型,一般需要经过两次转化。即使用x2paddle工具将第三方模型转化为PaddlePaddle格式,再使用opt将PaddlePaddle模型转化为Paddle-Lite可支持格式。 +为了简化这一过程,我们提供一键脚本,将x2paddle转化和opt转化合并: + +**一键转化脚本**:[auto_transform.sh](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.3/lite/tools/auto_transform.sh) + + +**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和x2paddle依赖项(tensorflow、caffe等)。 + +**使用方法**: + +(1)打印帮助信息:` sh ./auto_transform.sh` + +(2)转化模型方法 + +```bash +USAGE: + auto_transform.sh combines the function of x2paddle and opt, it can + tranform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form. +---------------------------------------- +example: + sh ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result +---------------------------------------- +Arguments about x2paddle: + --framework=(tensorflow|caffe|onnx); + --model='model file for tensorflow or onnx'; + --prototxt='proto file for caffe' --weight='weight file for caffe' + For TensorFlow: + --framework=tensorflow --model=tf_model.pb + + For Caffe: + --framework=caffe --prototxt=deploy.prototxt --weight=deploy.caffemodel + + For ONNX + --framework=onnx --model=onnx_model.onnx + +Arguments about opt: + --valid_targets=(arm|opencl|x86|npu|xpu); valid targets on Paddle-Lite. 
+ --fluid_save_dir='path to outputed model after x2paddle' + --optimize_out='path to outputed Paddle-Lite model' +---------------------------------------- +``` diff --git a/docs/user_guides/paddle_mobile.md b/docs/user_guides/paddle_mobile.md new file mode 100644 index 0000000000000000000000000000000000000000..43d17db7be4935b11ff0101e06e1f06998e9f532 --- /dev/null +++ b/docs/user_guides/paddle_mobile.md @@ -0,0 +1,7 @@ +# paddle-mobile 编译 + +详情可以参考 [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/mobile) + +要切换 paddle-mobile 编译,cmake 需要加上 **-DWITH_PADDLE_MOBILE=ON** 开关,其余 flag 请参考上面文档添加到后面 + +所有其他选项跟 paddle-mobile 原始操作完全一致 diff --git a/docs/user_guides/post_quant_no_data.md b/docs/user_guides/post_quant_no_data.md new file mode 100644 index 0000000000000000000000000000000000000000..7443b4cac9d8de977ce6d52e6a61b8f78b7aaef4 --- /dev/null +++ b/docs/user_guides/post_quant_no_data.md @@ -0,0 +1,109 @@ +# 模型量化-无校准数据训练后量化 + +本文首先简单介绍无校准数据训练后量化,然后说明产出量化模型,最后阐述量化模型预测。 + +## 1 简介 + +无校准数据训练后量化,将模型中特定OP的权重从FP32类型量化成INT8/16类型,可以减小预测模型的大小。使用该量化模型预测,首先将INT8/16类型的权重反量化成FP32类型,然后再进行预测。 + +使用条件: +* 有训练好的预测模型 + +使用步骤: +* 产出量化模型:使用PaddlePaddle调用无校准数据训练后量化接口,产出量化模型 +* 量化模型预测:使用PaddleLite加载量化模型进行预测推理 + +优点: +* 权重量化成INT16类型,模型精度不受影响,模型大小为原始的1/2 +* 权重量化成INT8类型,模型精度会受到影响,模型大小为原始的1/4 + +缺点: +* 只可以减小模型大小,不能加快模型推理 + +## 2 产出量化模型 + +因为目前该方法还没有在PaddleSlim中集成,大家可以使用PaddlePaddle调用无校准数据训练后量化接口,得到量化模型。 + +### 2.1 安装PaddlePaddle + +参考PaddlePaddle[官网](https://www.paddlepaddle.org.cn/install/quick),安装PaddlePaddle CPU/GPU 1.7版本。 + +### 2.2 准备模型 + +准备已经训练好的FP32预测模型,即 `save_inference_model()` 保存的模型。 + +### 2.3 调用无校准数据训练后量化 + +对于调用无校准数据训练后量化,首先给出一个例子。 + +```python +from paddle.fluid.contrib.slim.quantization import WeightQuantization + +model_dir = path/to/fp32_model_params +save_model_dir = path/to/save_model_path +weight_quant = WeightQuantization(model_dir=model_dir) +weight_quant.quantize_weight_to_int(save_model_dir=save_model_dir, + weight_bits=8, + quantizable_op_type=['conv2d', 'mul'], + weight_quantize_type="channel_wise_abs_max", + generate_test_model=False) +``` + +执行完成后,可以在 `save_model_dir/quantized_model` 目录下得到量化模型。 + + +对于调用无校准数据训练后量化,以下对api接口进行详细介绍。 + +```python +class WeightQuantization(model_dir, model_filename=None, params_filename=None) +``` +参数说明如下: +* model_dir(str):待量化模型的路径,其中保存模型文件和权重文件。 +* model_filename(str, optional):待量化模型的模型文件名,如果模型文件名不是`__model__`,则需要使用model_filename设置模型文件名。 +* params_filename(str, optional):待量化模型的权重文件名,如果所有权重保存成一个文件,则需要使用params_filename设置权重文件名。 + +```python +WeightQuantization.quantize_weight_to_int(self, + save_model_dir, + save_model_filename=None, + save_params_filename=None, + quantizable_op_type=["conv2d", "mul"], + weight_bits=8, + weight_quantize_type="channel_wise_abs_max", + generate_test_model=False, + threshold_rate=0.0) +``` +参数说明如下: +* save_model_dir(str):保存量化模型的路径。 +* save_model_filename(str, optional):如果save_model_filename等于None,则模型的网络结构保存到__model__文件,如果save_model_filename不等于None,则模型的网络结构保存到特定的文件。默认为None。 +* save_params_filename(str, optional):如果save_params_filename等于None,则模型的参数分别保存到一系列文件中,如果save_params_filename不等于None,则模型的参数会保存到一个文件中,文件名为设置的save_params_filename。默认为None。 +* quantizable_op_type(list[str]): 需要量化的op类型,默认是`['conv2d', 'mul']`,列表中的值可以是任意支持量化的op类型 `['conv2d', 'depthwise_conv2d', 'mul']`。一般不对 `depthwise_conv2d` 量化,因为对减小模型大小收益不大,同时可能影响模型精度。 +* weight_bits(int, optional):权重量化保存的比特数,可以是8~16,一般设置为8/16,默认为8。量化为8bit,模型体积最多可以减小4倍,可能存在微小的精度损失。量化成16bit,模型大小最多可以减小2倍,基本没有精度损失。 +* weight_quantize_type(str, optional): 权重量化的方式,支持 
`channel_wise_abs_max` 和 `abs_max`,一般都是 `channel_wise_abs_max`,量化模型精度损失小。 +* generate_test_model(bool, optional): 是否产出测试模型,用于测试量化模型部署时的精度。测试模型保存在 `save_model_dir/test_model` 目录下,可以和FP32模型一样使用Fluid加载测试,但是该模型不能用于预测端部署。 + + +## 3 量化模型预测 + +目前,对于无校准数据训练后量化产出的量化模型,只能使用PaddleLite进行预测部署。 + +很简单,首先使用PaddleLite提供的模型转换工具(opt)将量化模型转换成移动端预测的模型,然后加载转换后的模型进行预测部署。 + +注意,PaddleLite 2.3版本才支持无校准数据训练后量化产出的量化,所以转换工具和预测库必须是2.3及之后的版本。 + +### 3.1 模型转换 + +参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 + +参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具。 +比如在安卓手机ARM端进行预测,模型转换的命令为: +```bash +./opt --model_dir=./mobilenet_v1_quant \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_quant_opt \ + --valid_targets=arm +``` + +### 3.2 量化模型预测 + +和FP32模型一样,转换后的量化模型可以在Android/IOS APP中加载预测,建议参考[C++ Demo](../demo_guides/cpp_demo)、[Java Demo](../demo_guides/java_demo)、[Android/IOS Demo](../demo_guides/android_app_demo)。 diff --git a/docs/user_guides/post_quant_with_data.md b/docs/user_guides/post_quant_with_data.md new file mode 100644 index 0000000000000000000000000000000000000000..11b33c06e31f7f6ab63970ef307d7741888445e3 --- /dev/null +++ b/docs/user_guides/post_quant_with_data.md @@ -0,0 +1,105 @@ +# 模型量化-有校准数据训练后量化 + +## 1 简介 + +有校准数据训练后量化,使用少量校准数据计算量化因子,可以快速得到量化模型。使用该量化模型进行预测,可以减少计算量、降低计算内存、减小模型大小。 + +有校准数据训练后量化中,有两种计算量化因子的方法,非饱和量化方法和饱和量化方法。非饱和量化方法计算整个Tensor的绝对值最大值`abs_max`,将其映射为127。饱和量化方法使用KL散度计算一个合适的阈值`T` (`0 -``` - -### 编译模式与参数 +`develop分支`和`release/v2.6.0`之后版本的源码编译请参考以下说明,release/v2.3之前版本(包括v2.3)源码编译请参考[release/v2.3源码编译方法](./Compile/v2.3_compile)。 -编译脚本`./lite/tools/build.sh`,支持三种编译模式: +### Android 预测库编译方法 -| 编译模式 | 介绍 | 适用对象 | -|:-------:|-----|:-------:| -| tiny_publish | 编译移动端部署库,无第三方库依赖 | 用户 | -| full_publish | 编译移动端部署库,有第三方依赖如protobuf、glags等,含有可将模型转换为无需protobuf依赖的naive buffer格式的工具,供tiny_publish库使用 | 用户 | -| test | 编译指定`arm_os`、`arm_abi`下的移动端单元测试 | 框架开发者 | +Paddle-Lite支持在 “Docker 环境、Linux 环境、Mac 环境” 源码编译Android 预测库 -编译脚本`./lite/tools/build.sh`,追加参数说明: +**编译方法参见**:[Android预测库编译方法](./Compile/Android) -| 参数 | 介绍 | 值 | -|-----------|-------------|-------------| -| --arm_os |必选,选择安装平台 | `android`、`ios`、`ios64`、`armlinux` | -| --arm_abi |必选,选择编译的arm版本,其中`armv7hf`为ARMLinux编译时选用| `armv8`、`armv7`、`armv7hf`(仅`armlinux`支持) | -| --arm_lang |arm_os=android时必选,选择编译器 | `gcc`、`clang`(`clang`当前暂不支持) | -| --android_stl |arm_os=android时必选,选择静态链接STL或动态链接STL | `c++_static`、`c++_shared`| -| --build_java | 可选,是否编译java预测库(默认为OFF) | `ON`、`OFF` | -| --build_extra | 可选,是否编译全量预测库(默认为OFF)。详情可参考[预测库说明](./library.html)。 | `ON`、`OFF` | -| target |必选,选择编译模式,`tiny_publish`为编译移动端部署库、`full_publish`为带依赖的移动端部署库、`test`为移动端单元测试、`ios`为编译ios端`tiny_publish` | `tiny_publish`、`full_publish`、`test`、 `ios` | -### 编译代码 +### iOS 预测库编译方法 -**注意**:非开发者建议在编译前使用[**“加速第三方依赖库的下载”**](#id22)的方法,加速工程中第三方依赖库的下载与编译。 +Paddle-Lite只支持在 “Mac 环境” 源码编译iOS 预测库 -#### 编译`tiny publish`动态库 +**编译方法参见**:[iOS预测库编译方法](./Compile/iOS) -##### Android -```shell -./lite/tools/build.sh \ - --arm_os=android \ - --arm_abi=armv8 \ - --build_extra=OFF \ - --arm_lang=gcc \ - --android_stl=c++_static \ - --build_extra=OFF \ - tiny_publish -``` -##### IOS -```shell -./lite/tools/build.sh \ - --arm_os=ios64 \ - --arm_abi=armv8 \ - --build_extra=OFF \ - ios -``` -**注意:mac环境编译IOS 时,cmake版本需要高于cmake 3.15;mac环境上编译Android时,cmake版本需要设置为cmake 3.10。** - -ios tiny publish支持的编译选项: - -* `--arm_os`: 可选ios或者ios64 -* `--arm_abi`: 可选armv7和armv8(**注意**:当`arm_os=ios`时只能选择`arm_abi=armv7`,当`arm_os=ios64`时只能选择`arm_abi=armv8`) -* 如果mac编译过程中报错:"Invalid CMAKE_DEVELOPER_ROOT: does 
not exist", 运行: -```shell -sudo xcode-select -s /Applications/Xcode.app/Contents/Developer -``` -##### ARMLinux -```shell -./lite/tools/build.sh \ - --build_extra=OFF \ - --arm_os=armlinux \ - --arm_abi=armv7hf \ - --arm_lang=gcc \ - --build_extra=OFF \ - tiny_publish -``` -- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 - -#### 编译`full publish`动态库 - -##### Android -```shell -./lite/tools/build.sh \ - --arm_os=android \ - --arm_abi=armv8 \ - --build_extra=OFF \ - --arm_lang=gcc \ - --android_stl=c++_static \ - --build_extra=OFF \ - full_publish -``` -##### ARMLinux -```shell -./lite/tools/build.sh \ - --arm_os=armlinux \ - --arm_abi=armv7hf \ - --arm_lang=gcc \ - --build_extra=OFF \ - full_publish -``` -- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 - -### 编译结果说明 -**编译最终产物位置**在 `build.lite.xxx.xxx.xxx` 下的 `inference_lite_lib.xxx.xxx` ,如 Android 下 ARMv8 的产物位于`inference_lite_lib.android.armv8`: +### Linux 预测库编译方法 -![](https://user-images.githubusercontent.com/45189361/65375706-204e8780-dccb-11e9-9816-ab4563ce0963.png) +**编译方法参见**:[Linux预测库编译方法](./Compile/Linux) -**目录内容**(可能)如下: -**Full_publish编译结果:** - -![](https://user-images.githubusercontent.com/45189361/65375704-19c01000-dccb-11e9-9650-6856c7a5bf82.png) - -**Tiny_publish结果:** - -![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) - -**IOS编译结果:** - -![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) - - - -**具体内容**说明: - -1、 `bin`文件夹:可执行工具文件 `paddle_code_generator`、`test_model_bin` - -2、 `cxx`文件夹:包含c++的库文件与相应的头文件 - -- `include` : 头文件 -- `lib` : 库文件 - - 打包的静态库文件: - - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 - - `libpaddle_api_light_bundled.a` :只包含 light_api 功能的静态库 - - 打包的动态态库文件: - - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 - - `libpaddle_light_api_shared.so`:只包含 light_api 功能的动态库 - -3、 `demo`文件夹:示例 demo ,包含 C++ demo 和 Java demo。 - -- `cxx` : C++示例 demo - - `mobile_full` : full_api 的使用示例 - - `mobile_light` : light_api的使用示例 -- `java` :Java 示例 demo - - `android` : Java的 Android 示例 - -4、 `java` 文件夹:包含 Jni 的动态库文件与相应的 Jar 包 - -- `jar` : `PaddlePredictor.jar` -- `so` : Jni动态链接库 `libpaddle_lite_jni.so` - -5、 `third_party` 文件夹:第三方库文件`gflags` - -**注意:** - -1、 只有当`--arm_os=android` 时才会编译出: - -- Java库文件与示例:`Java`和`demo/java` - -- 动态库文件:`libpaddle_full_api_shared.so`,`libpaddle_light_api_shared.so` +### 加速第三方依赖库的下载 -2、 `tiny_publish`编译结果不包括 C++ demo和 C++ 静态库,但提供 C++ 的 light_api 动态库、 Jni 动态库和Java demo +如出现源码编译耗时过长,一般是第三方库下载过慢或失败导致: -### 加速第三方依赖库的下载 +- 移动端相关编译所需的第三方库均位于 `/third-party` 目录下,默认编译过程中,会利用`git submodule update --init --recursive`链上相关的第三方依赖的仓库。 -移动端相关编译所需的第三方库均位于 `/third-party` 目录下,默认编译过程中,会利用`git submodule update --init --recursive`链上相关的第三方依赖的仓库。 +- 为加速`full_publish`、`test`编译模式中对`protobuf`等第三方依赖的下载,`build.sh` 和 `ci_build.sh`支持了从国内 CDN 下载第三方依赖的压缩包。 -为加速`full_publish`、`test`编译模式中对`protobuf`等第三方依赖的下载,`build.sh` 和 `ci_build.sh`支持了从国内 CDN 下载第三方依赖的压缩包。 +可使用本节方法加速第三方库下载过程,以加速编译: -使用方法:`git clone`完`Paddle-Lite`仓库代码后,手动删除本地仓库根目录下的`third-party`目录: +- **加速方法**:`git clone`完`Paddle-Lite`仓库代码后,手动删除本地仓库根目录下的`third-party`目录: ```shell git clone https://github.com/PaddlePaddle/Paddle-Lite.git @@ -412,4 +276,4 @@ cd Paddle-Lite rm -rf third-party ``` -之后再根据本文档,进行后续编译时,便会忽略第三方依赖对应的`submodule`,改为下载第三方压缩包。 +之后再根据本文档,进行后续编译时,便会忽略第三方依赖对应的`submodule`,改为直接下载第三方压缩包。 diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md new file mode 100644 index 
0000000000000000000000000000000000000000..338449bfcb92e4029763c4357eb6d1fd5b820272 --- /dev/null +++ b/docs/user_guides/tutorial.md @@ -0,0 +1,52 @@ +# 使用流程 + +Lite是一种轻量级、灵活性强、易于扩展的高性能的深度学习预测框架,它可以支持诸如ARM、OpenCL、NPU等等多种终端,同时拥有强大的图优化及预测加速能力。如果您希望将Lite框架集成到自己的项目中,那么只需要如下几步简单操作即可。 + +## 一. 准备模型 + +Lite框架目前支持的模型结构为[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)深度学习框架产出的模型格式。因此,在您开始使用 Lite 框架前您需要准备一个由PaddlePaddle框架保存的模型。 +如果您手中的模型是由诸如Caffe2、Tensorflow等框架产出的,那么我们推荐您使用 [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) 工具进行模型格式转换。 + +## 二. 模型优化 + +Lite框架拥有强大的加速、优化策略及实现,其中包含诸如量化、子图融合、Kernel优选等等优化手段,为了方便您使用这些优化策略,我们提供了[opt](model_optimize_tool)帮助您轻松进行模型优化。优化后的模型更轻量级,耗费资源更少,并且执行速度也更快。 + +opt的详细介绍,请您参考 [模型优化方法](model_optimize_tool) 。 + +下载opt工具后执行以下代码: + +``` shell +$ ./opt \ + --model_dir=<model_param_dir> \ + --model_file=<model_path> \ + --param_file=<param_path> \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out=<output_optimize_model_dir> \ + --valid_targets=(arm|opencl|x86) +``` + +其中,optimize_out为您希望的优化模型的输出路径。optimize_out_type则可以指定输出模型的序列化方式,其目前支持Protobuf与Naive Buffer两种方式,其中Naive Buffer是一种更轻量级的序列化/反序列化实现。如果你需要使用Lite在mobile端进行预测,那么您需要设置optimize_out_type=naive_buffer。 + +## 三. 使用Lite框架执行预测 + +在上一节中,我们已经通过`opt`获取到了优化后的模型,使用优化模型进行预测也十分的简单。为了方便您的使用,Lite进行了良好的API设计,隐藏了大量您不需要投入时间研究的细节。您只需要简单的五步即可使用Lite在移动端完成预测(以C++ API进行说明): + + +1. 声明MobileConfig。在config中可以设置**从文件加载模型**也可以设置**从memory加载模型**。从文件加载模型需要声明模型文件路径,如 `config.set_model_from_file(FLAGS_model_file)` ;从memory加载模型方法现只支持加载优化后模型的naive buffer,实现方法为: +`void set_model_from_buffer(model_buffer) ` + +2. 创建Predictor。Predictor即为Lite框架的预测引擎,为了方便您的使用我们提供了 `CreatePaddlePredictor` 接口,你只需要简单的执行一行代码即可完成预测引擎的初始化,`std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config)` 。 +3. 准备输入。执行predictor->GetInput(0)您将会获得输入的第0个field,同样的,如果您的模型有多个输入,那您可以执行 `predictor->GetInput(i)` 来获取相应的输入变量。得到输入变量后您可以使用Resize方法指定其具体大小,并填入输入值。 +4. 执行预测。您只需要执行 `predictor->Run()` 即可使用Lite框架完成预测。 +5. 获取输出。与输入类似,您可以使用 `predictor->GetOutput(i)` 来获得输出的第i个变量。您可以通过其shape()方法获取输出变量的维度,通过 `data<T>()` 模板方法获取其输出值。上述五个步骤对应的最小示例代码见下文。 + + + + +## 四. Lite API + +为了方便您的使用,我们提供了C++、Java、Python三种API,并且提供了相应的api的完整使用示例:[C++完整示例](../demo_guides/cpp_demo)、[Java完整示例](../demo_guides/java_demo)、[Python完整示例](../demo_guides/cuda),您可以参考示例中的说明快速了解C++/Java/Python的API使用方法,并集成到您自己的项目中去。需要说明的是,为了减少第三方库的依赖、提高Lite预测框架的通用性,在移动端使用Lite API您需要准备Naive Buffer存储格式的模型,具体方法可参考第2节`模型优化`。 +
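+结合上面「三. 使用Lite框架执行预测」中的五个步骤,下面给出一份最小的 C++ 示意代码。仅供参考:其中模型文件名 `mobilenet_v1_opt.nb` 与输入尺寸 `{1, 3, 224, 224}` 均为假设值,实际可用的接口与写法请以您所使用版本的 `paddle_api.h` 以及 `lite/demo/cxx/` 下的官方 demo 为准:
+
+```cpp
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+int main() {
+  // 1. 声明MobileConfig,从文件加载经opt优化后的模型(文件名为假设值)
+  MobileConfig config;
+  config.set_model_from_file("mobilenet_v1_opt.nb");
+
+  // 2. 创建Predictor
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
+
+  // 3. 准备输入:Resize到假设的输入尺寸,并填入输入值
+  std::unique_ptr<Tensor> input_tensor = predictor->GetInput(0);
+  input_tensor->Resize({1, 3, 224, 224});
+  auto* input_data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) {
+    input_data[i] = 1.0f;
+  }
+
+  // 4. 执行预测
+  predictor->Run();
+
+  // 5. 获取输出:打印输出的第0个值
+  std::unique_ptr<const Tensor> output_tensor = predictor->GetOutput(0);
+  std::cout << "output[0] = " << output_tensor->data<float>()[0] << std::endl;
+  return 0;
+}
+```
+
+## 五. 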
测试工具 + +为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug#debug) 和 [Profile工具](debug#profiler)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 diff --git a/docs/user_guides/x2paddle.md b/docs/user_guides/x2paddle.md new file mode 100644 index 0000000000000000000000000000000000000000..7e44ba980cc6836189d3f1a03bbbf29c8d7bd5c1 --- /dev/null +++ b/docs/user_guides/x2paddle.md @@ -0,0 +1,69 @@ +# 模型转换工具 X2Paddle + +X2Paddle可以将caffe、tensorflow、onnx模型转换成Paddle支持的模型。 + +[X2Paddle](https://github.com/PaddlePaddle/X2Paddle)支持将Caffe/TensorFlow模型转换为PaddlePaddle模型。目前X2Paddle支持的模型参考[x2paddle_model_zoo](https://github.com/PaddlePaddle/X2Paddle/blob/develop/x2paddle_model_zoo.md)。 + + +## 多框架支持 + +|模型 | caffe | tensorflow | onnx | +|---|---|---|---| +|mobilenetv1 | Y | Y | | +|mobilenetv2 | Y | Y | Y | +|resnet18 | Y | Y | | +|resnet50 | Y | Y | Y | +|mnasnet | Y | Y | | +|efficientnet | Y | Y | Y | +|squeezenetv1.1 | Y | Y | Y | +|shufflenet | Y | Y | | +|mobilenet_ssd | Y | Y | | +|mobilenet_yolov3 | | Y | | +|inceptionv4 | | | | +|mtcnn | Y | Y | | +|facedetection | Y | | | +|unet | Y | Y | | +|ocr_attention | | | | +|vgg16 | | | | + + +## 安装 + +``` +pip install x2paddle +``` + +安装最新版本,可使用如下安装方式 + +``` +pip install git+https://github.com/PaddlePaddle/X2Paddle.git@develop +``` + +## 使用 + +### Caffe + +``` +x2paddle --framework caffe \ + --prototxt model.proto \ + --weight model.caffemodel \ + --save_dir paddle_model +``` + +### TensorFlow + +``` +x2paddle --framework tensorflow \ + --model model.pb \ + --save_dir paddle_model +``` + +## 转换结果说明 + +在指定的`save_dir`下生成两个目录 +1. inference_model : 模型结构和参数均序列化保存的模型格式 +2. 
model_with_code : 保存了模型参数文件和模型的python代码 + +## 问题反馈 + +X2Paddle使用时存在问题时,欢迎您将问题或Bug报告以[Github Issues](https://github.com/PaddlePaddle/X2Paddle/issues)的形式提交给我们,我们会实时跟进。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index bac6f80c4721e0c5de201eebfe7e6a39a0bdc73a..1c1fc1b0deadc9b16cbd3b30be6f062aa5d63212 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -7,8 +7,12 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") +message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}") +message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") +message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") @@ -64,12 +68,21 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_NPU) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.npu") endif(LITE_WITH_NPU) + if (LITE_WITH_XPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") + endif(LITE_WITH_XPU) + if (LITE_WITH_APU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu") + endif(LITE_WITH_APU) if (LITE_WITH_FPGA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") endif(LITE_WITH_FPGA) if (LITE_WITH_BM) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") endif(LITE_WITH_BM) + if (LITE_WITH_RKNPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu") + endif(LITE_WITH_RKNPU) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() @@ -77,9 +90,61 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") # add python lib if (LITE_WITH_PYTHON) - add_custom_target(publish_inference_python_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so") + if(WIN32) + set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd") + set(LITE_CORE_DEPS ${LITE_CORE}) + add_custom_command(OUTPUT ${LITE_CORE} + COMMAND cmake -E copy $ ${LITE_CORE} + DEPENDS lite_pybind) + add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS}) + + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd" + DEPENDS copy_lite_pybind + ) + + 
add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) + add_custom_target(publish_inference_python_light_demo ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + ) + add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) + add_dependencies(publish_inference publish_inference_python_light_demo) + else() + if(APPLE) + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + else() + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + endif() + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/") @@ -91,11 +156,29 @@ if (LITE_WITH_PYTHON) endif() add_dependencies(publish_inference_python_lib lite_pybind) add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) + endif(WIN32) endif() -if (LITE_WITH_X86) - add_custom_target(publish_inference_x86_cxx_lib ${TARGET} +if (LITE_WITH_CUDA OR LITE_WITH_X86) + if(APPLE) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp 
"${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + elseif(NOT WIN32) + add_custom_target(publish_inference_cxx_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" @@ -103,28 +186,85 @@ if (LITE_WITH_X86) COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + if (LITE_WITH_CUDA) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference publish_inference_third_party) + endif() + add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + endif() +endif() + +if (LITE_WITH_X86) + if(WIN32) + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api//${CMAKE_BUILD_TYPE}/test_model_bin.exe" "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + + add_dependencies(publish_inference_x86_cxx_lib test_model_bin) + 
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) + add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + + add_custom_target(publish_inference_x86_cxx_demos ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/lite/demo/cxx" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) + add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3) + + else() + + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" ) - add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) - add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) - add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - ) + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/mklml" "${INFER_LITE_PUBLISH_ROOT}/third_party/" + ) add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + add_dependencies(publish_inference publish_inference_x86_cxx_demos) + endif() endif() if(LITE_WITH_CUDA) - add_dependencies(publish_inference paddle_full_api_shared) -endif(LITE_WITH_CUDA) + add_custom_target(publish_inference_cuda_cxx_demos ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared) + add_dependencies(publish_inference publish_inference_cuda_cxx_demos) +endif(LITE_WITH_CUDA) + if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -135,27 +275,29 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" 
"${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - #COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) if(NOT IOS) - #add_dependencies(publish_inference_cxx_lib model_optimize_tool) add_dependencies(publish_inference_cxx_lib paddle_code_generator) add_dependencies(publish_inference_cxx_lib bundle_full_api) add_dependencies(publish_inference_cxx_lib bundle_light_api) add_dependencies(publish_inference_cxx_lib test_model_bin) + add_dependencies(publish_inference_cxx_lib benchmark_bin) if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux") add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) add_dependencies(publish_inference paddle_light_api_shared) add_custom_command(TARGET publish_inference_cxx_lib - COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib) + COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/benchmark_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" + ) endif() add_dependencies(publish_inference publish_inference_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a) + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so) endif() endif() else() @@ -185,6 +327,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference tiny_publish_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() @@ -234,6 +377,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -281,6 +426,10 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/opencl" COMMAND cp -r 
"${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel" "${INFER_LITE_PUBLISH_ROOT}/opencl" ) + if (NOT LITE_ON_TINY_PUBLISH) add_dependencies(publish_inference_cxx_lib publish_inference_opencl) + else() + add_dependencies(tiny_publish_cxx_lib publish_inference_opencl) + endif() endif() endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 8ef2257f17465be8e6ac92a842862ac68e45f765..7296429f934f4eaee92133c1bd235712ab751ce9 100755 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -1,4 +1,5 @@ - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + +if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG)) lite_cc_library(place SRCS paddle_place.cc DEPS logging) else() lite_cc_library(place SRCS paddle_place.cc DEPS glog) @@ -8,49 +9,78 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() -set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) + +set(light_lib_DEPS light_api paddle_api paddle_api_light) + +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library - add_library(paddle_full_api_shared SHARED "") - target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) + lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc + DEPS paddle_api paddle_api_light paddle_api_full) add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) target_link_libraries(paddle_full_api_shared framework_proto) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) - if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) + if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) add_dependencies(paddle_full_api_shared dynload_mklml) endif() + if(WIN32) + target_link_libraries(paddle_full_api_shared shlwapi.lib) + endif() endif() if(LITE_WITH_CUDA) target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") endif(LITE_WITH_CUDA) #light api dynamic library - lite_cc_library(paddle_light_api_shared MODULE - SRCS light_api_shared.cc - DEPS ${light_lib_DEPS} - ARM_DEPS ${arm_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels}) - - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) - set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") - add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) 
- add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) - set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) - add_dependencies(paddle_full_api_shared custom_linker_map) + lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc + DEPS ${light_lib_DEPS} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} + ) + + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + if(WIN32) + target_link_libraries(paddle_light_api_shared shlwapi.lib) + endif() + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels}) + if(APPLE) + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds") + set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) + set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(paddle_full_api_shared custom_linker_map) + elseif(NOT WIN32) + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) + set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(paddle_full_api_shared custom_linker_map) + endif() else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + set(TARGET_COMIPILE_FLAGS "-fdata-sections") + if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc + set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") + endif() + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}") add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) endif() + if (LITE_WITH_RKNPU) + # Need to add RKNPU runtime libs dependency + target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs}) + endif() + endif() endif() @@ -61,7 +91,11 @@ if (WITH_TESTING) CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} - BM_DEPS ${bm_kernels}) + RKNPU_DEPS ${rknpu_kernels} + BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels}) + endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) @@ -73,15 +107,25 @@ if(LITE_WITH_BM) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) endif() +if(LITE_WITH_RKNPU) + set(light_api_deps ${light_api_deps} ${rknpu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) +endif() + + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") +message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") +message(STATUS "get APU kernels 
${apu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") +message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") +message(STATUS "get MLU kernels ${mlu_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -96,6 +140,8 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels}) @@ -116,74 +162,88 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels}) + BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") if(WITH_TESTING) - lite_cc_test(test_cxx_api SRCS cxx_api_test.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} - X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels} - ARM_DEPS ${arm_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels} - XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels} - EXCLUDE_COMPILE_DEPS "ON" - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model - --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz) + if(NOT WITH_COVERAGE) + lite_cc_test(test_cxx_api SRCS cxx_api_test.cc + DEPS cxx_api mir_passes lite_api_test_helper + ${ops} ${host_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} + XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} + EXCLUDE_COMPILE_DEPS "ON" + ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model + --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) + add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz) + endif() if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) - add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) - lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) - add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) - lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) - add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz) - lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light 
gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) - add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) - lite_cc_test(test_resnet50_lite_x86 SRCS test_resnet50_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) - add_dependencies(test_resnet50_lite_x86 extern_lite_download_resnet50_tar_gz) - lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) - add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + if(LITE_WITH_X86) + lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) + add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) + lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) + add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) + lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) + add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz) + lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) + add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) + lite_cc_test(test_resnet50_lite_x86 SRCS test_resnet50_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + add_dependencies(test_resnet50_lite_x86 extern_lite_download_resnet50_tar_gz) + lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) + add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + endif() if(LITE_WITH_BM) - lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc + lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + ARGS --model_dir=${LITE_MODEL_DIR}/classify) + lite_cc_test(test_yolov3_lite_bm SRCS test_yolov3_lite_bm.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} + ARGS --model_dir=${LITE_MODEL_DIR}/yolov3) endif() endif() endif() 
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) + set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels}) lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc DEPS ${lite_model_test_DEPS} @@ -199,8 +259,10 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) add_dependencies(test_mobilenetv1 extern_lite_download_mobilenet_v1_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + if(NOT WIN32) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc DEPS ${lite_model_test_DEPS} @@ -208,7 +270,9 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu SERIAL) add_dependencies(test_mobilenetv2 extern_lite_download_mobilenet_v2_relu_tar_gz) - set_target_properties(test_mobilenetv2 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + if(NOT WIN32) + set_target_properties(test_mobilenetv2 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() lite_cc_test(test_resnet50 SRCS resnet50_test.cc DEPS ${lite_model_test_DEPS} paddle_api_light @@ -239,9 +303,15 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) + lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc DEPS ${lite_model_test_DEPS}) + + # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model + # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc + # DEPS ${lite_model_test_DEPS}) + # lite_cc_test(model_run_test_image SRCS model_run_test_image.cc # DEPS ${lite_model_test_DEPS} # CL_DEPS ${opencl_kernels} @@ -263,8 +333,10 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) @@ -276,22 +348,27 @@ bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api # These tests needs CLI arguments, and is not supported in ARM CI. # TODO(Superjomn) support latter. 
-lite_cc_test(test_light_api SRCS light_api_test.cc +if(NOT WITH_COVERAGE) + lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) -lite_cc_test(test_apis SRCS apis_test.cc + lite_cc_test(test_apis SRCS apis_test.cc DEPS cxx_api light_api ${ops} paddle_api_light CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) +endif() if (LITE_WITH_JAVA AND LITE_WITH_ARM) add_subdirectory(android) @@ -305,26 +382,36 @@ if (LITE_ON_TINY_PUBLISH) return() endif() + +# add library for opt_base +lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) +add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) + if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc - DEPS gflags kernel op optimizer mir_passes utils) + DEPS gflags kernel op optimizer mir_passes utils ${host_kernels}) add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) -lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light - ${ops} - ARM_DEPS ${arm_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels} - XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kernels} - X86_DEPS ${x86_kernels} - FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) -if (WITH_TESTING) - add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) +if(NOT WITH_COVERAGE) + lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light + ${ops} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} + CL_DEPS ${opencl_kernels} + X86_DEPS ${x86_kernels} + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) + if (WITH_TESTING) + add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) + endif() endif() # Some bins @@ -335,8 +422,41 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils + 
${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -347,21 +467,41 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} + MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h index 6da47e53789d651f4a36d0b8d6a7ca1ea5a0a3d3..63d5938cf5eacd5f829d92a391d82212923829e4 100644 --- a/lite/api/_paddle_use_ops.h +++ b/lite/api/_paddle_use_ops.h @@ -48,6 +48,7 @@ USE_LITE_OP(concat) USE_LITE_OP(conv2d) USE_LITE_OP(depthwise_conv2d) USE_LITE_OP(pool2d) +USE_LITE_OP(max_pool2d_with_index) USE_LITE_OP(batch_norm) USE_LITE_OP(fusion_elementwise_sub_activation) USE_LITE_OP(transpose) @@ -63,6 +64,7 @@ USE_LITE_OP(swish) USE_LITE_OP(log) USE_LITE_OP(exp) USE_LITE_OP(conv2d_transpose) +USE_LITE_OP(depthwise_conv2d_transpose) USE_LITE_OP(negative) USE_LITE_OP(pad2d) USE_LITE_OP(power) diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index c1766772f8aaa417c3da1d72f2692c10c10194b4..d46e9f7cdec1cf422340ff11165ee166c7520bab 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -25,7 +25,11 @@ if (NOT LITE_ON_TINY_PUBLISH) endif() else() add_library(paddle_lite_jni SHARED "") - set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + set(TARGET_COMIPILE_FLAGS "-fdata-sections") + if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc + set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") + endif() + set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) add_dependencies(paddle_lite_jni op_list_h kernel_list_h) if (LITE_WITH_NPU) diff --git a/lite/api/android/jni/native/paddle_lite_jni.h b/lite/api/android/jni/native/paddle_lite_jni.h index f447ce105a1ca7b2d94a00287d2b699f920a09af..983f108a869db91c7cfeb9eb539286e2a3f0bf99 100644 --- a/lite/api/android/jni/native/paddle_lite_jni.h +++ b/lite/api/android/jni/native/paddle_lite_jni.h @@ -17,11 +17,6 @@ #include /* Header for class com_baidu_paddle_lite_PaddlePredictor */ #include "lite/api/paddle_lite_factory_helper.h" 
-#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif #ifdef __cplusplus extern "C" { #endif diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java index e150f98f22113ef6bcedd5e9882e0bd2a6378c97..fe05c4302c71b439ae125e165244146726b3bf3d 100644 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java +++ b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java @@ -78,7 +78,7 @@ public class MobileConfig extends ConfigBase { * * @return liteModelFile */ - public String getModelFile() { + public String getModelFromFile() { return liteModelFile; } @@ -96,7 +96,7 @@ public class MobileConfig extends ConfigBase { * * @return liteModelBuffer */ - public String getModelBuffer() { + public String getModelFromBuffer() { return liteModelBuffer; } diff --git a/lite/api/apis_test.cc b/lite/api/apis_test.cc index bb852297d11a8862460ed6f12e007d727aca9428..917f2a73a95c3fbd7464fd40824b833993a2a18c 100644 --- a/lite/api/apis_test.cc +++ b/lite/api/apis_test.cc @@ -21,9 +21,6 @@ #include #include "lite/api/cxx_api.h" #include "lite/api/light_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/core/mir/pass_registry.h" DEFINE_string(model_dir, "", ""); diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 718dbe44296f2d197efc5b567cf0cc211835d176..63d498c41fe5eb265a65a7fe4e849ced8153530e 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -13,7 +13,14 @@ // limitations under the License. #include +#if !defined(_WIN32) #include +#else +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include +#include "lite/backends/x86/port.h" +#endif +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include @@ -23,31 +30,34 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/core/device_info.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +DEFINE_string(optimized_model_path, + "", + "the path of the model that is optimized by opt."); DEFINE_string(model_dir, "", - "the path of the model, set model_dir when the model is no " - "combined formate. This option will be ignored if model_file " - "and param_file are exist."); -DEFINE_string(model_file, + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, "", - "the path of model file, set model_file when the model is " - "combined formate."); -DEFINE_string(param_file, + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, "", - "the path of param file, set param_file when the model is " + "the filename of param file, set param_file when the model is " "combined formate."); DEFINE_string(input_shape, "1,3,224,224", "set input shapes according to the model, " "separated by colon and comma, " - "such as 1,3,244,244:1,3,300,300."); + "such as 1,3,244,244"); +DEFINE_string(input_img_path, + "", + "the path of input image, if not set " + "input_img_path, the input of model will be 1.0."); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(power_mode, @@ -60,16 +70,8 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_string(result_filename, "result.txt", - "save benchmark " - "result to the file"); -DEFINE_bool(run_model_optimize, - false, - "if set true, apply model_optimize_tool to " - "model and use optimized model to test. "); -DEFINE_bool(is_quantized_model, - false, - "if set true, " - "test the performance of the quantized model. "); + "save the inference time to the file."); +DEFINE_bool(show_output, false, "Wether to show the output in shell."); namespace paddle { namespace lite_api { @@ -80,19 +82,16 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } -void OutputOptModel(const std::string& save_optimized_model_dir, - const std::vector>& input_shapes) { +void OutputOptModel(const std::string& save_optimized_model_dir) { lite_api::CxxConfig config; config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); + if (!FLAGS_model_filename.empty() && !FLAGS_param_filename.empty()) { + config.set_model_file(FLAGS_model_dir + "/" + FLAGS_model_filename); + config.set_param_file(FLAGS_model_dir + "/" + FLAGS_param_filename); + } std::vector vaild_places = { Place{TARGET(kARM), PRECISION(kFloat)}, }; - if (FLAGS_is_quantized_model) { - vaild_places.insert(vaild_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } config.set_valid_places(vaild_places); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -108,30 +107,45 @@ void OutputOptModel(const std::string& save_optimized_model_dir, LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; } +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::vector>& input_shapes, - const std::string& model_dir, +void Run(const std::vector& input_shape, + const std::string& model_path, const std::string model_name) { // set config and create predictor lite_api::MobileConfig config; config.set_threads(FLAGS_threads); config.set_power_mode(static_cast(FLAGS_power_mode)); - config.set_model_from_file(model_dir + ".nb"); + config.set_model_from_file(model_path); auto predictor = lite_api::CreatePaddlePredictor(config); // set input - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; - } + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize(input_shape); + auto input_data = input_tensor->mutable_data(); + int64_t input_num = ShapeProduction(input_shape); + if (FLAGS_input_img_path.empty()) { for (int i = 
0; i < input_num; ++i) { input_data[i] = 1.f; } + } else { + std::fstream fs(FLAGS_input_img_path); + if (!fs.is_open()) { + LOG(FATAL) << "open input image " << FLAGS_input_img_path << " error."; + } + for (int i = 0; i < input_num; i++) { + fs >> input_data[i]; + } + // LOG(INFO) << "input data:" << input_data[0] << " " << + // input_data[input_num-1]; } // warmup @@ -165,39 +179,78 @@ void Run(const std::vector>& input_shapes, ofs << "average = " << std::setw(12) << avg_res; ofs << std::endl; ofs.close(); + + if (FLAGS_show_output) { + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + LOG(INFO) << "max_value:" << max_value; + LOG(INFO) << "max_index:" << max_index; + LOG(INFO) << "output data[0:10]:"; + for (int i = 0; i < 10; i++) { + LOG(INFO) << out_data[i]; + } + } } #endif } // namespace lite_api } // namespace paddle +void print_usage() { + std::string help_info = + "Usage: \n" + "./benchmark_bin \n" + " --optimized_model_path (The path of the model that is optimized\n" + " by opt. If the model is optimized, please set the param.) \n" + " type: string \n" + " --model_dir (The path of the model that is not optimized by opt,\n" + " the model and param files is under model_dir.) type: string \n" + " --model_filename (The filename of model file. When the model is\n " + " combined formate, please set model_file. Otherwise, it is not\n" + " necessary to set it.) type: string \n" + " --param_filename (The filename of param file, set param_file when\n" + " the model is combined formate. Otherwise, it is not necessary\n" + " to set it.) type: string \n" + " --input_shape (Set input shapes according to the model, separated by\n" + " colon and comma, such as 1,3,244,244) type: string\n" + " default: 1,3,224,224 \n" + " --input_img_path (The path of input image, if not set\n" + " input_img_path, the input will be 1.0.) type: string \n " + " --power_mode (Arm power mode: 0 for big cluster, 1 for little\n" + " cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n" + " --repeats (Repeats times) type: int32 default: 1 \n" + " --result_filename (Save the inference time to the file.) type: \n" + " string default: result.txt \n" + " --threads (Threads num) type: int32 default: 1 \n" + " --warmup (Warmup times) type: int32 default: 0 \n" + "Note that: \n" + " If load the optimized model, set optimized_model_path. Otherwise, \n" + " set model_dir, model_filename and param_filename according to \n" + " the model. 
\n"; + LOG(INFO) << help_info; +} + int main(int argc, char** argv) { + // Check inputs gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "" || FLAGS_result_filename == "") { - LOG(INFO) << "please run ./benchmark_bin --help to obtain usage."; + bool is_opt_model = (FLAGS_optimized_model_path != ""); + bool is_origin_model = (FLAGS_model_dir != ""); + if (!is_origin_model && !is_opt_model) { + LOG(INFO) << "Input error, the model path should not be empty.\n"; + print_usage(); exit(0); } - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "opt2"; - - auto split_string = - [](const std::string& str_in) -> std::vector { - std::vector str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return str_out; - }; - + // Get input shape auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; std::string tmp_str = str_shape; @@ -213,23 +266,31 @@ int main(int argc, char** argv) { } return shape; }; + std::vector input_shape = get_shape(FLAGS_input_shape); - std::vector str_input_shapes = split_string(FLAGS_input_shape); - std::vector> input_shapes; - for (size_t i = 0; i < str_input_shapes.size(); ++i) { - input_shapes.push_back(get_shape(str_input_shapes[i])); - } - - // Output optimized model if needed - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir, input_shapes); + // Get model_name and run_model_path + std::string model_name; + std::string run_model_path; + if (is_origin_model) { + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } + std::size_t found = FLAGS_model_dir.find_last_of("/"); + model_name = FLAGS_model_dir.substr(found + 1); + std::string optimized_model_path = FLAGS_model_dir + "_opt2"; + paddle::lite_api::OutputOptModel(optimized_model_path); + run_model_path = optimized_model_path + ".nb"; + } else { + size_t found1 = FLAGS_optimized_model_path.find_last_of("/"); + size_t found2 = FLAGS_optimized_model_path.find_last_of("."); + size_t len = found2 - found1 - 1; + model_name = FLAGS_optimized_model_path.substr(found1 + 1, len); + run_model_path = FLAGS_optimized_model_path; } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? 
save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shapes, run_model_dir, model_name); + // Run test + paddle::lite_api::Run(input_shape, run_model_path, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index f6f7ec75e65ff54e3f3642822e51057d3522ae3a..ceb874e9650f66f703f857b41275465c72cbb864 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -19,6 +19,7 @@ #include #include #include +#include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { @@ -150,6 +151,11 @@ std::vector Predictor::GetInputNames() { return input_names_; } // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } +// get param names +std::vector Predictor::GetParamNames() { + return exec_scope_->AttributeVarNames(); +} + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { if (!program_) { @@ -291,9 +297,42 @@ void Predictor::Build(const cpp::ProgramDesc &desc, program_desc_ = desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; - inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); - inner_places.emplace_back( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + for (auto &valid_place : valid_places) { + if (valid_place.target == TARGET(kOpenCL)) continue; + inner_places.emplace_back( + Place(TARGET(kHost), valid_place.precision, valid_place.layout)); + } + + // Analysis whether the modle is quantized. + // For quantized model, add place(arm, int8) to inner_places + const std::vector quant_dequant_op = { + "fake_quantize_abs_max", + "fake_quantize_range_abs_max", + "fake_quantize_moving_average_abs_max", + "fake_quantize_dequantize_moving_average_abs_max", + "fake_dequantize_max_abs", + "fake_channel_wise_dequantize_max_abs"}; + bool is_quantized_model = false; + for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model; + ++i) { + auto *block_desc = program_desc_.GetBlock(i); + for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) { + auto *op_desc = block_desc->GetOp(j); + std::string op_type = op_desc->Type(); + if (std::find(quant_dequant_op.begin(), + quant_dequant_op.end(), + op_type) != quant_dequant_op.end()) { + is_quantized_model = true; + } + } + } + if (is_quantized_model) { +#ifdef LITE_WITH_ARM + inner_places.insert(inner_places.begin(), + Place{TARGET(kARM), PRECISION(kInt8)}); +#endif + } + Program program(desc, scope_, inner_places); core::KernelPickFactor factor; @@ -314,9 +353,16 @@ void Predictor::GenRuntimeProgram() { const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); + CHECK(var) << "no variable named with " << name << " in exec_scope"; return &var->Get(); } +lite::Tensor *Predictor::GetMutableTensor(const std::string &name) { + auto *var = exec_scope_->FindVar(name); + CHECK(var) << "no variable named with " << name << " in exec_scope"; + return var->GetMutable(); +} + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); @@ -333,16 +379,16 @@ lite::Tensor *Predictor::GetInputByName(const std::string &name) { } } -#ifdef LITE_WITH_TRAIN -void Predictor::FeedVars(const std::vector &tensors) { - auto var = scope_->FindVar("feed"); - auto &feed_list = *(var->GetMutable>()); - feed_list.resize(tensors.size()); +// #ifdef LITE_WITH_TRAIN +// void 
Predictor::FeedVars(const std::vector &tensors) { +// auto var = scope_->FindVar("feed"); +// auto &feed_list = *(var->GetMutable>()); +// feed_list.resize(tensors.size()); - for (size_t i = 0; i < tensors.size(); ++i) - feed_list[i].ShareDataWith(tensors[i]); -} -#endif +// for (size_t i = 0; i < tensors.size(); ++i) +// feed_list[i].ShareDataWith(tensors[i]); +// } +// #endif } // namespace lite } // namespace paddle diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 504710d9fa29420b8762f31e0c675b59c6c626bd..cd542e87ed3bf4632bce141f019e974af6ef4308 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -43,6 +43,7 @@ class LITE_API Predictor { public: // Create an empty predictor. Predictor() { scope_ = std::make_shared(); } + // Create a predictor with the weight variable scope set. explicit Predictor(const std::shared_ptr& root_scope) : scope_(root_scope) {} @@ -84,6 +85,9 @@ class LITE_API Predictor { // get inputnames and get outputnames. std::vector GetInputNames(); std::vector GetOutputNames(); + // get param names + std::vector GetParamNames(); + void PrepareFeedFetch(); // Get offset-th col of fetch results. @@ -91,6 +95,9 @@ class LITE_API Predictor { std::vector GetOutputs() const; const cpp::ProgramDesc& program_desc() const; + // get a mutable tensor according to its name + lite::Tensor* GetMutableTensor(const std::string& name); + // get a const tensor according to its name const lite::Tensor* GetTensor(const std::string& name) const; const RuntimeProgram& runtime_program() const; @@ -101,14 +108,14 @@ class LITE_API Predictor { bool record_info = false); void SaveOpKernelInfo(const std::string& model_dir); -#ifdef LITE_WITH_TRAIN - void Run(const std::vector& tensors) { - FeedVars(tensors); - program_->Run(); - } + // #ifdef LITE_WITH_TRAIN + // void Run(const std::vector& tensors) { + // FeedVars(tensors); + // program_->Run(); + // } - void FeedVars(const std::vector& tensors); -#endif + // void FeedVars(const std::vector& tensors); + // #endif private: Optimizer optimizer_; @@ -141,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor { // get inputs names and get outputs names std::vector GetInputNames() override; std::vector GetOutputNames() override; + // get param names + std::vector GetParamNames() override; + // get tensor according to tensor's name std::unique_ptr GetTensor( const std::string& name) const override; + // get a mutable tensor according to tensor's name + std::unique_ptr GetMutableTensor( + const std::string& name) override; // Get InputTebsor by name std::unique_ptr GetInputByName( diff --git a/lite/api/cxx_api_bin.cc b/lite/api/cxx_api_bin.cc index 8c929e9c8700a65c868e2facd763b0ec36719e23..eec17cc30e308e7169b7d8c394c0e47eee0c1c3e 100644 --- a/lite/api/cxx_api_bin.cc +++ b/lite/api/cxx_api_bin.cc @@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) { int main(int argc, char** argv) { CHECK_EQ(argc, 3) << "usage: ./cmd "; - paddle::lite::Run(argv[1], std::stoi(argv[2])); + paddle::lite::Run(argv[1], atoi(argv[2])); return 0; } diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 81ea60eac66849f8ce42fb8cb210226d18bbfa9b..d85ed3b64494b47fc6155bf3f9177a0c94fec5b2 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -20,28 +20,58 @@ #include "lite/core/device_info.h" #include "lite/core/version.h" +#ifndef LITE_ON_TINY_PUBLISH +#include "lite/api/paddle_use_passes.h" +#endif + #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) 
+ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) #include #include "lite/backends/x86/mklml.h" #endif - namespace paddle { namespace lite { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config_ = config; + auto places = config.valid_places(); + std::vector passes = config.get_passes_internal(); #ifdef LITE_WITH_CUDA - Env::Init(); + // if kCUDA is included in valid places, it should be initialized first, + // otherwise skip this step. + for (auto &p : places) { + if (p.target == TARGET(kCUDA)) { + Env::Init(); + if (config_.multi_stream()) { + passes = {"multi_stream_analysis_pass"}; + VLOG(3) << "add pass: " << passes[0]; + } + break; + } + } #endif - auto places = config.valid_places(); - raw_predictor_.Build(config, places); - +#ifdef LITE_WITH_MLU + Env::Init(); + lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), + config.mlu_core_number(), + config.mlu_use_first_conv(), + config.mlu_first_conv_mean(), + config.mlu_first_conv_std(), + config.mlu_input_layout()); +#endif // LITE_WITH_MLU + auto use_layout_preprocess_pass = + config.model_dir().find("OPENCL_PRE_PRECESS"); + VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; + if (places[0].target == TARGET(kOpenCL) && + use_layout_preprocess_pass != std::string::npos) { + passes = {"type_layout_cast_preprocess_pass"}; + VLOG(1) << "add pass:" << passes[0]; + } + raw_predictor_.Build(config, places, passes); mode_ = config.power_mode(); threads_ = config.threads(); - #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); @@ -67,6 +97,10 @@ std::vector CxxPaddleApiImpl::GetInputNames() { return raw_predictor_.GetInputNames(); } +std::vector CxxPaddleApiImpl::GetParamNames() { + return raw_predictor_.GetParamNames(); +} + std::vector CxxPaddleApiImpl::GetOutputNames() { return raw_predictor_.GetOutputNames(); } @@ -93,6 +127,12 @@ std::unique_ptr CxxPaddleApiImpl::GetTensor( return std::unique_ptr(new lite_api::Tensor(x)); } +std::unique_ptr CxxPaddleApiImpl::GetMutableTensor( + const std::string &name) { + return std::unique_ptr( + new lite_api::Tensor(raw_predictor_.GetMutableTensor(name))); +} + std::unique_ptr CxxPaddleApiImpl::GetInputByName( const std::string &name) { return std::unique_ptr( diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index d517383d2773a02f9edba46c6df0df131c746876..65ce77276afdb4c3b7a7247cdb8ae120497d8145 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -14,6 +14,9 @@ #include "lite/api/light_api.h" #include +#include +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT namespace paddle { namespace lite { @@ -26,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + // For weight quantization of post training, load the int8/16 weights + // for optimized model, and dequant it to fp32. 
DequantizeWeight(); + BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } @@ -76,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) { if (element == input_names_.end()) { LOG(ERROR) << "Model do not have input named with: [" << name << "], model's inputs include:"; - for (int i = 0; i < input_names_.size(); i++) { + for (size_t i = 0; i < input_names_.size(); i++) { LOG(ERROR) << "[" << input_names_[i] << "]"; } return nullptr; @@ -108,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() { auto current_block = cpp_program_desc_.GetBlock(0); std::vector feeds; std::vector fetchs; - for (int i = 0; i < current_block->OpsSize(); i++) { + for (size_t i = 0; i < current_block->OpsSize(); i++) { auto op = current_block->GetOp(i); if (op->Type() == "feed") { feeds.push_back(op); @@ -118,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() { } input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); - for (int i = 0; i < feeds.size(); i++) { + for (size_t i = 0; i < feeds.size(); i++) { input_names_[feeds[i]->GetAttr("col")] = feeds[i]->Output("Out").front(); } - for (int i = 0; i < fetchs.size(); i++) { + for (size_t i = 0; i < fetchs.size(); i++) { output_names_[fetchs[i]->GetAttr("col")] = fetchs[i]->Input("X").front(); } @@ -133,7 +139,12 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { // 1. Create op first Program program(prog, scope_, {}); - // 2. Create Instructs +// 2. Create Instructs +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif // Create the kernels of the target places, and filter out the specific // kernel with the target alias. @@ -149,7 +160,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { return it->alias() == alias; }); CHECK(it != kernels.end()); + +#ifdef LITE_WITH_OPENCL + if ((*it)->target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx).As().CopySharedTo(&ctx->As()); + (*it)->SetContext(std::move(ctx)); + } else { + (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); + } +#else (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); +#endif insts.emplace_back(op, std::move(*it)); } @@ -160,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { } void LightPredictor::DequantizeWeight() { -#define PROCESS_CONV2D_DATA() \ - for (int64_t i = 0; i < h; ++i) { \ - for (int64_t j = 0; j < w; ++j) { \ - fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ - } \ +#define PROCESS_CONV2D_DATA() \ + for (int64_t i = 0; i < ch; ++i) { \ + for (int64_t j = 0; j < offset; ++j) { \ + fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \ + } \ } -#define PROCESS_FC_DATA() \ - for (int i = 0; i < input_tensor->numel(); i++) { \ - *fp_data = scale_list[0] * (*int_data); \ - ++fp_data; \ - ++int_data; \ +#define PROCESS_FC_DATA() \ + for (int64_t i = 0; i < chin; i++) { \ + for (int64_t j = 0; j < chout; j++) { \ + fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \ + } \ } + auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) { + bool result = false; + if (op_desc->HasAttr("quantization_type")) { + std::string type = op_desc->GetAttr("quantization_type"); + result = (type == "post_weight_abs_max") || + (type == "post_weight_channel_wise_abs_max"); + } else { + result = op_desc->HasAttr("quantize_weight_bits"); + } + return result; + }; + 
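// A minimal, self-contained sketch of the arithmetic that PROCESS_CONV2D_DATA
// and PROCESS_FC_DATA expand to, assuming an 8-bit weight tensor; the function
// name and parameters below are illustrative only and not part of this file.
// For conv2d/depthwise_conv2d, `ch` is the output-channel count (dims()[0]),
// `offset` is numel()/ch, and scale_list holds one scale per output channel.
#include <cstdint>
#include <vector>
inline void DequantConv2dWeightSketch(const int8_t* int_data,
                                      const std::vector<float>& scale_list,
                                      int64_t ch,
                                      int64_t offset,
                                      float* fp_data) {
  for (int64_t i = 0; i < ch; ++i) {        // one scale per output channel
    for (int64_t j = 0; j < offset; ++j) {  // every weight in that channel
      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j];
    }
  }
}
// For fc/mul weights the scale is instead indexed by the output channel (the
// second dimension): fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j].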
Tensor tmp_tensor; - CHECK(cpp_program_desc_.BlocksSize()); - auto* main_block = cpp_program_desc_.GetBlock(0); - for (size_t k = 0; k < main_block->OpsSize(); ++k) { - auto* op_desc = main_block->GetOp(k); - if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op - auto input_names = op_desc->input_vars(); - for (auto& input_name : input_names) { - std::string input_scale_name = input_name + "_quant_scale"; - if (op_desc->HasAttr(input_scale_name)) { // the input is quantized - auto input_tensor = - scope_->FindVar(input_name)->GetMutable(); - tmp_tensor.CopyDataFrom(*input_tensor); - auto scale_list = - op_desc->GetAttr>(input_scale_name); - int quantize_weight_bits = - op_desc->GetAttr("quantize_weight_bits"); - float* fp_data = input_tensor->mutable_data(); - - std::string op_type = op_desc->Type(); - if (op_type == "conv2d" || op_type == "depthwise_conv2d") { - int64_t h = input_tensor->dims()[0]; - int64_t w = input_tensor->numel() / h; - CHECK_EQ(scale_list.size(), h); - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } - } else if (op_type == "fc" || op_type == "mul") { - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() + for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { + auto* block = cpp_program_desc_.GetBlock(i); + for (size_t k = 0; k < block->OpsSize(); ++k) { + auto* op_desc = block->GetOp(k); + if (is_weight_quantized_op(op_desc)) { + auto input_names = op_desc->input_vars(); + for (auto& input_name : input_names) { + std::string input_scale_name = input_name + "_quant_scale"; + if (op_desc->HasAttr(input_scale_name)) { // the input is quantized + auto input_tensor = + scope_->FindVar(input_name)->GetMutable(); + tmp_tensor.CopyDataFrom(*input_tensor); + auto scale_list = + op_desc->GetAttr>(input_scale_name); + + int quantize_weight_bits = + op_desc->GetAttr("quantize_weight_bits"); + CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16); + float* fp_data = input_tensor->mutable_data(); + + std::string op_type = op_desc->Type(); + if (op_type == "conv2d" || op_type == "depthwise_conv2d") { + int64_t ch = input_tensor->dims()[0]; + int64_t offset = input_tensor->numel() / ch; + CHECK_EQ(scale_list.size(), ch); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } + } else if (op_type == "fc" || op_type == "mul") { + int64_t chin = input_tensor->dims()[0]; + int64_t chout = input_tensor->dims()[1]; + CHECK_EQ(scale_list.size(), chout); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } } } } diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 3965843250abe45c43490bdbb4aaed58915e0908..e76e89af43a7e1d8341c2f43b30e62d6f9306bd2 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -36,6 +36,11 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { } mode_ = config.power_mode(); threads_ = config.threads(); + +#ifdef LITE_WITH_NPU + Context::SetSubgraphModelCacheDir( + config.subgraph_model_cache_dir()); +#endif } std::unique_ptr 
LightPredictorImpl::GetInput(int i) { @@ -58,6 +63,7 @@ void LightPredictorImpl::Run() { std::shared_ptr LightPredictorImpl::Clone() { LOG(FATAL) << "The Clone API is not supported in LigthPredictor"; + return nullptr; } std::string LightPredictorImpl::GetVersion() const { return lite::version(); } diff --git a/lite/api/light_api_shared.cc b/lite/api/light_api_shared.cc deleted file mode 100644 index 557804bfa56787fa8a83bfbfc3046df08be010f8..0000000000000000000000000000000000000000 --- a/lite/api/light_api_shared.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif - -namespace paddle { -namespace lite_api { - -void RunModel() { - // 1. Set MobileConfig - MobileConfig mobile_config; - - // 2. Create PaddlePredictor by MobileConfig - std::shared_ptr mobile_predictor = - CreatePaddlePredictor(mobile_config); -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc index 7d322530f624c43737018d8ece98fb24d48bc16a..08779c0b5c9802ebc5095241b2543d8724981dff 100644 --- a/lite/api/light_api_test.cc +++ b/lite/api/light_api_test.cc @@ -15,9 +15,6 @@ #include "lite/api/light_api.h" #include #include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" DEFINE_string(optimized_model, "", ""); @@ -40,11 +37,11 @@ TEST(LightAPI, load) { const std::vector inputs = predictor.GetInputNames(); LOG(INFO) << "input size: " << inputs.size(); - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { LOG(INFO) << "inputnames: " << inputs[i]; } const std::vector outputs = predictor.GetOutputNames(); - for (int i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); i++) { LOG(INFO) << "outputnames: " << outputs[i]; } diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc index addd512eb0039c43edeca562b8f568528aab76f9..8da192701c9d232196c0dbbc9fd374e214821345 100644 --- a/lite/api/lite_multithread_test.cc +++ b/lite/api/lite_multithread_test.cc @@ -16,9 +16,6 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" #include "lite/core/profile/timer.h" @@ -39,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0"); DEFINE_string(input_shape_0, "1,3,224,224", "input shapes another, separated by colon and comma"); - +DEFINE_string(target, "arm", "main target for Predictor: arm, opencl"); DEFINE_bool(use_optimize_nb, false, "optimized & naive buffer model for mobile devices"); @@ -54,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir, const 
std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); - config.set_valid_places({ - Place{TARGET(kARM), PRECISION(kFloat)}, - }); + if (FLAGS_target == "arm") { + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + } else if (FLAGS_target == "opencl") { + config.set_valid_places({ + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, + Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel + }); + } auto predictor = lite_api::CreatePaddlePredictor(config); // delete old optimized model @@ -81,7 +88,7 @@ void Run(const std::vector>& input_shapes, int tid, const int warmup_times = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -200,7 +207,7 @@ void RunTestType_10(const std::vector>& input_shapes, const int repeat, int warmup = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -221,13 +228,13 @@ void RunTestType_11(const std::vector>& input_shapes, const int repeat, int warmup = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); auto predictor = lite_api::CreatePaddlePredictor(config); - config.set_model_dir(model_dir_0); + config.set_model_from_file(model_dir_0 + ".nb"); auto predictor_0 = lite_api::CreatePaddlePredictor(config); for (int i = 0; i < 2 * repeat; i += 2) { @@ -249,7 +256,8 @@ int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_model_dir == "") { LOG(INFO) << "usage: " - << "--model_dir /path/to/your/model"; + << "--model_dir /path/to/your/model --model_dir_0 " + "/path/to/your/model0 --target `arm` or `opencl`"; exit(0); } std::string save_optimized_model_dir = ""; @@ -296,13 +304,13 @@ int main(int argc, char** argv) { std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { input_shapes.push_back(get_shape(str_input_shapes[i])); } std::vector str_input_shapes_0 = split_string(FLAGS_input_shape_0); std::vector> input_shapes_0; - for (int i = 0; i < str_input_shapes_0.size(); ++i) { + for (size_t i = 0; i < str_input_shapes_0.size(); ++i) { input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); } diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index bcc9644f81542ab6fb8a0badf8ecaea89fc8dedb..5342a36ec154b2bdde44fa72bc21e9d430ad4efe 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -53,9 +53,13 @@ void TestModel(const std::vector& valid_places, predictor.Run(); } - auto start = GetCurrentUS(); + double sum_duration = 0.0; // millisecond; for (int i = 0; i < FLAGS_repeats; ++i) { + auto start = GetCurrentUS(); predictor.Run(); + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + VLOG(1) << "run_idx:" << i << " " << duration << " ms"; } if (save_model) { @@ -68,8 +72,7 @@ void TestModel(const std::vector& 
valid_places, LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; + << ", spend " << sum_duration / FLAGS_repeats << " ms in average."; std::vector> ref; ref.emplace_back(std::vector( @@ -81,29 +84,63 @@ void TestModel(const std::vector& valid_places, auto* out = predictor.GetOutput(0); const auto* pdata = out->data(); int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); + + // Get target and check result + VLOG(1) << "valid_places.size():" << valid_places.size(); + for (int i = 0; i < valid_places.size(); ++i) { + auto p = valid_places[i]; + VLOG(1) << "valid_places[" << i << "]:" << p.DebugString(); + } + auto first_target = valid_places[0].target; + + if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { + ASSERT_EQ(out->dims().production(), 1000); + double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); + VLOG(3) << diff; + EXPECT_LT(diff, eps); + } + } + } else { + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + double eps = 1e-6; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + EXPECT_NEAR(result, ref[i][j], eps); + } } } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - double eps = 1e-6; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - EXPECT_NEAR(result, ref[i][j], eps); + + // Get detailed result + size_t output_tensor_num = predictor.GetOutputNames().size(); + VLOG(1) << "output tensor num:" << output_tensor_num; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + auto* output_tensor = predictor.GetOutput(tidx); + VLOG(1) << "============= output tensor " << tidx << " =============\n"; + auto out_dims = output_tensor->dims(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; + + // print result + for (int i = 0; i < out_dims.production(); ++i) { + VLOG(2) << "output_tensor->data()[" << i + << "]:" << output_tensor->data()[i]; } } -#endif } #ifdef LITE_WITH_NPU @@ -130,7 +167,7 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, 
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 012d6d48d9e6d3747f83a7f1089944bbaf359f71..465f82056c6bb80b706cfb7d875773d75735911b 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -54,9 +54,13 @@ void TestModel(const std::vector& valid_places, predictor.Run(); } - auto start = GetCurrentUS(); + double sum_duration = 0.0; // millisecond; for (int i = 0; i < FLAGS_repeats; ++i) { + auto start = GetCurrentUS(); predictor.Run(); + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + VLOG(1) << "run_idx:" << i << " " << duration << " ms"; } if (save_model) { @@ -69,8 +73,7 @@ void TestModel(const std::vector& valid_places, LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; + << ", spend " << sum_duration / FLAGS_repeats << " ms in average."; std::vector> ref; // i = 1 @@ -83,27 +86,63 @@ void TestModel(const std::vector& valid_places, auto* out = predictor.GetOutput(0); const auto* pdata = out->data(); int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); + + // Get target and check result + VLOG(1) << "valid_places.size():" << valid_places.size(); + for (int i = 0; i < valid_places.size(); ++i) { + auto p = valid_places[i]; + VLOG(1) << "valid_places[" << i << "]:" << p.DebugString(); + } + auto first_target = valid_places[0].target; + + if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { + ASSERT_EQ(out->dims().production(), 1000); + double eps = first_target == TARGET(kOpenCL) ? 
0.15 : 0.1; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); + VLOG(3) << diff; + EXPECT_LT(diff, eps); + } + } + } else { + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + double eps = 1e-6; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + EXPECT_NEAR(result, ref[i][j], eps); + } } } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); + + // Get detailed result + size_t output_tensor_num = predictor.GetOutputNames().size(); + VLOG(1) << "output tensor num:" << output_tensor_num; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + auto* output_tensor = predictor.GetOutput(tidx); + VLOG(1) << "============= output tensor " << tidx << " =============\n"; + auto out_dims = output_tensor->dims(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; + + // print result + for (int i = 0; i < out_dims.production(); ++i) { + VLOG(2) << "output_tensor->data()[" << i + << "]:" << output_tensor->data()[i]; } } -#endif } #ifdef LITE_WITH_NPU @@ -130,7 +169,7 @@ TEST(MobileNetV2, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV2, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 190890da4c109f39cc52ca5209cd952f8937f780..f61ed9b4c38fcc3a6fe33fd26d6d3a80edcb9373 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -17,9 +17,6 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" #include "lite/core/profile/timer.h" @@ -47,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); +#ifdef LITE_WITH_X86 + config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kInt64)}, + Place{TARGET(kHost), PRECISION(kFloat)}}); +#else config.set_valid_places({ Place{TARGET(kARM), PRECISION(kFloat)}, }); +#endif auto predictor = lite_api::CreatePaddlePredictor(config); // delete old optimized model @@ -141,7 +144,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << 
std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. / arg_num; @@ -201,7 +204,7 @@ int main(int argc, char** argv) { LOG(INFO) << "input shapes: " << FLAGS_input_shape; std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { LOG(INFO) << "input shape: " << str_input_shapes[i]; input_shapes.push_back(get_shape(str_input_shapes[i])); } diff --git a/lite/api/model_test_classify.cc b/lite/api/model_test_classify.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d2011e29bfdeb166ae1ad202d96a204893888b0 --- /dev/null +++ b/lite/api/model_test_classify.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); +DEFINE_string(arg_name, "", "the arg name"); + +DEFINE_string(threshold, "0.5", "threshold value default 0.5f"); +DEFINE_string(in_txt, "", "input text"); +DEFINE_string(out_txt, "", "output text"); +DEFINE_string(label_file, "", "label file path"); +DEFINE_int32(topk, 1, "topk num"); + +namespace paddle { +namespace lite_api { + +void OutputOptModel(const std::string& load_model_dir, + const std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +std::vector load_labels(std::string label_path) { + FILE* fp = fopen(label_path.c_str(), "r"); + if (fp == nullptr) { + LOG(FATAL) << "load label file failed! 
" << label_path; + } + std::vector labels; + while (!feof(fp)) { + char str[1024]; + fgets(str, 1024, fp); + std::string str_s(str); + + if (str_s.length() > 0) { + for (int i = 0; i < str_s.length(); i++) { + if (str_s[i] == ' ') { + std::string strr = str_s.substr(i, str_s.length() - i - 1); + labels.push_back(strr); + i = str_s.length(); + } + } + } + } + fclose(fp); + return labels; +} + +void print_topk(const float* scores, + const int size, + const int topk, + const std::vector labels) { + std::vector> vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(scores[i], i); + } + std::partial_sort(vec.begin(), + vec.begin() + topk, + vec.end(), + std::greater>()); + + // print topk and score + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + fprintf(fp, "%d \n", topk); + for (int i = 0; i < topk; i++) { + float score = vec[i].first; + int index = vec[i].second; + fprintf(fp, "%d ", index); + fprintf(fp, "%f \n", score); + LOG(INFO) << i << ": " << index << " " << labels[index] << " " << score; + } + fclose(fp); +} + +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 0) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + bool flag_in = true; + bool flag_out = true; + if (FLAGS_in_txt == "") { + flag_in = false; + } + if (FLAGS_out_txt == "") { + flag_out = false; + } + printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out); + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + + FILE* fp_r = nullptr; + if (flag_in) { + fp_r = fopen(FLAGS_in_txt.c_str(), "r"); + } + for (int i = 0; i < input_num; ++i) { + if (flag_in) { + fscanf(fp_r, "%f\n", &input_data[i]); + } else { + input_data[i] = 1.f; + } + } + if (flag_in) { + fclose(fp_r); + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup_times + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() + << " ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; + + auto output = predictor->GetOutput(0); + auto out = output->data(); + auto output_shape = output->shape(); + int output_num = 1; + for (int i = 0; i < output_shape.size(); ++i) { + output_num *= output_shape[i]; + } + // classify + printf("load_labels \n"); + std::vector labels = load_labels(FLAGS_label_file); + printf("print_topk \n"); + print_topk(out, output_num, FLAGS_topk, labels); + LOG(INFO) << "output_num: " << output_num; + LOG(INFO) << "out " << out[0]; + LOG(INFO) << "out " << out[1]; + FILE* fp = nullptr; + if (flag_out) { + fp = fopen(FLAGS_out_txt.c_str(), "w"); + } + double sum1 = 0.f; + for (int i = 0; 
i < output_num; ++i) { + if (flag_out) { + fprintf(fp, "%f\n", out[i]); + } + sum1 += out[i]; + } + if (flag_out) { + fclose(fp); + } + printf("out mean: %f \n", sum1 / output_num); + + FILE* fp_w = fopen("time.txt", "a+"); + if (!fp_w) { + printf("open file failed \n"); + return; + } + fprintf(fp_w, + "model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n", + model_dir.c_str(), + thread_num, + ti.LapTimes().Avg(), + ti.LapTimes().Min(), + ti.LapTimes().Max()); + fclose(fp_w); + + // please turn off memory_optimize_pass to use this feature. + if (FLAGS_arg_name != "") { + auto arg_tensor = predictor->GetTensor(FLAGS_arg_name); + auto arg_shape = arg_tensor->shape(); + int arg_num = 1; + std::ostringstream os; + os << "{"; + for (int i = 0; i < arg_shape.size(); ++i) { + arg_num *= arg_shape[i]; + os << arg_shape[i] << ","; + } + os << "}"; + float sum = 0.; + std::ofstream out(FLAGS_arg_name + ".txt"); + for (size_t i = 0; i < arg_num; ++i) { + sum += arg_tensor->data()[i]; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; + } + LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() + << ", mean value is " << sum * 1. / arg_num; + } +} +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + LOG(INFO) << "input shapes: " << FLAGS_input_shape; + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> input_shapes; + for (size_t i = 0; i < str_input_shapes.size(); ++i) { + LOG(INFO) << "input shape: " << str_input_shapes[i]; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + paddle::lite_api::Run( + input_shapes, + save_optimized_model_dir, + static_cast(FLAGS_power_mode), + FLAGS_threads, + FLAGS_repeats, + FLAGS_warmup); +#endif + return 0; +} diff --git a/lite/api/model_test_detection.cc b/lite/api/model_test_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..f059aca6330613f66fa93267c0c594cfba6d8833 --- /dev/null +++ b/lite/api/model_test_detection.cc @@ -0,0 +1,349 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); +DEFINE_string(arg_name, "", "the arg name"); + +DEFINE_string(threshold, "0.5", "threshold value default 0.5f"); +DEFINE_string(in_txt, "", "input text"); +DEFINE_string(out_txt, "", "output text"); +DEFINE_int32(orih, 1920, "input image height"); +DEFINE_int32(oriw, 1080, "input image width"); + +namespace paddle { +namespace lite_api { + +struct Object { + float x; + float y; + float width; + float height; + float class_id; + float prob; +}; + +void OutputOptModel(const std::string& load_model_dir, + const std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +void detect_choose(const float* dout, + std::vector dims, + const float thresh) { + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + for (int iw = 0; iw < dims[0]; iw++) { + const float* values = dout + iw * dims[1]; + if (values[1] > thresh) { // pro > 0.01 + fprintf(fp, "%f \n", values[0]); + fprintf(fp, "%f \n", values[1]); + fprintf(fp, "%f \n", values[2]); + fprintf(fp, "%f \n", values[3]); + fprintf(fp, "%f \n", values[4]); + fprintf(fp, "%f \n", values[5]); + } + } + fclose(fp); +} +void detect_object(const float* dout, + std::vector dims, + const float thresh, + int orih, + int oriw) { + std::vector objects; + for (int iw = 0; iw < dims[0]; iw++) { + Object object; + const float* values = dout + iw * dims[1]; + object.class_id = values[0]; + object.prob = values[1]; + object.x = values[2] * oriw; + object.y = values[3] * orih; + object.width = values[4] * oriw - object.x; + object.height = values[5] * orih - object.y; + objects.push_back(object); + } + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + for (size_t i = 0; i < objects.size(); 
++i) { + Object object = objects.at(i); + if (object.prob > thresh && object.x > 0 && object.y > 0 && + object.width > 0 && object.height > 0) { + if (object.x >= oriw || object.width >= oriw || object.y >= orih || + object.height >= orih) + continue; + fprintf(fp, "%f \n", object.x); + fprintf(fp, "%f \n", object.y); + fprintf(fp, "%f \n", object.width); + fprintf(fp, "%f \n", object.height); + fprintf(fp, "%f \n", object.prob); + fprintf(fp, "%f \n", object.class_id); + LOG(INFO) << "object id: " << object.class_id << ", image size: " << oriw + << ", " << orih << ", detect object: " << object.prob + << ", location: x=" << object.x << ", y=" << object.y + << ", width=" << object.width << ", height=" << object.height; + } + } + fclose(fp); +} +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 0) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + bool flag_in = true; + bool flag_out = true; + if (FLAGS_in_txt == "") { + flag_in = false; + } + if (FLAGS_out_txt == "") { + flag_out = false; + } + printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out); + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + + FILE* fp_r = nullptr; + if (flag_in) { + fp_r = fopen(FLAGS_in_txt.c_str(), "r"); + } + for (int i = 0; i < input_num; ++i) { + if (flag_in) { + fscanf(fp_r, "%f\n", &input_data[i]); + } else { + input_data[i] = 1.f; + } + } + if (flag_in) { + fclose(fp_r); + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup_times + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() + << " ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; + + auto output = predictor->GetOutput(0); + auto out = output->data(); + auto output_shape = output->shape(); + // detect + detect_object( + out, output_shape, atof(FLAGS_threshold.data()), FLAGS_orih, FLAGS_oriw); + // detect_choose(out, output_shape, atof(FLAGS_threshold.data())); + LOG(INFO) << "out " << out[0]; + LOG(INFO) << "out " << out[1]; + int output_num = 1; + for (int i = 0; i < output_shape.size(); ++i) { + output_num *= output_shape[i]; + } + LOG(INFO) << "output_num: " << output_num; + FILE* fp = nullptr; + if (flag_out) { + fp = fopen(FLAGS_out_txt.c_str(), "w"); + } + double sum1 = 0.f; + for (int i = 0; i < output_num; ++i) { + if (flag_out) { + fprintf(fp, "%f\n", out[i]); + } + sum1 += out[i]; + } + if (flag_out) { + fclose(fp); + } + + printf("out mean: %f \n", sum1 / output_num); + + FILE* fp_w = fopen("time.txt", "a+"); + if (!fp_w) { + printf("open file failed \n"); + return; + } + fprintf(fp_w, + 
"model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n", + model_dir.c_str(), + thread_num, + ti.LapTimes().Avg(), + ti.LapTimes().Min(), + ti.LapTimes().Max()); + fclose(fp_w); + + // please turn off memory_optimize_pass to use this feature. + if (FLAGS_arg_name != "") { + auto arg_tensor = predictor->GetTensor(FLAGS_arg_name); + auto arg_shape = arg_tensor->shape(); + int arg_num = 1; + std::ostringstream os; + os << "{"; + for (int i = 0; i < arg_shape.size(); ++i) { + arg_num *= arg_shape[i]; + os << arg_shape[i] << ","; + } + os << "}"; + float sum = 0.; + std::ofstream out(FLAGS_arg_name + ".txt"); + for (size_t i = 0; i < arg_num; ++i) { + sum += arg_tensor->data()[i]; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; + } + LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() + << ", mean value is " << sum * 1. / arg_num; + } +} +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + LOG(INFO) << "input shapes: " << FLAGS_input_shape; + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> input_shapes; + for (size_t i = 0; i < str_input_shapes.size(); ++i) { + LOG(INFO) << "input shape: " << str_input_shapes[i]; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + paddle::lite_api::Run( + input_shapes, + save_optimized_model_dir, + static_cast(FLAGS_power_mode), + FLAGS_threads, + FLAGS_repeats, + FLAGS_warmup); +#endif + return 0; +} diff --git a/lite/api/ocr_attention_test.cc b/lite/api/ocr_attention_test.cc index 5e39c5437c18990be9c6414695a94c6f2c9fcf20..ae45b8e2282d0946019d83a76298c0b0a61f9832 100644 --- a/lite/api/ocr_attention_test.cc +++ b/lite/api/ocr_attention_test.cc @@ -32,18 +32,10 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { predictor.Build(FLAGS_model_dir, "", "", valid_places); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - auto* init_scores = 
predictor.GetInput(2); init_scores->Resize(DDim(std::vector({1, 1}))); auto* data_scores = init_scores->mutable_data(); - auto scores_size = input_tensor->dims().production(); + auto scores_size = init_scores->dims().production(); for (int i = 0; i < scores_size; i++) { data_scores[i] = 0; } @@ -53,7 +45,7 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { auto* init_ids = predictor.GetInput(1); init_ids->Resize(DDim(std::vector({1, 1}))); - auto* data_ids = init_ids->mutable_data(); + auto* data_ids = init_ids->mutable_data(); auto ids_size = init_ids->dims().production(); for (int i = 0; i < ids_size; i++) { data_ids[i] = 0; @@ -62,6 +54,13 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { std::vector> lod_i{{0, 1}, {0, 1}}; *lod_ids = lod_i; + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); } @@ -102,6 +101,7 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { TEST(OcrAttention, test_arm) { std::vector valid_places({ + Place{TARGET(kARM), PRECISION(kInt64)}, Place{TARGET(kARM), PRECISION(kFloat)}, }); diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 92f83371e30affa017a3796cd92cdce7fecc0753..4956c1ae3922a8e041184444dd8b4db0b8fbc9af 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -23,6 +23,7 @@ #include "kernel_src_map.h" // NOLINT #include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" @@ -54,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string( optimize_out_type, - "protobuf", + "naive_buffer", "store type of the output optimized model. 
protobuf/naive_buffer"); DEFINE_bool(display_kernels, false, "Display kernel information"); DEFINE_bool(record_tailoring_info, @@ -67,7 +68,6 @@ DEFINE_string(valid_targets, "arm", "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); DEFINE_bool(print_supported_ops, false, "Print supported operators on the inputed target"); @@ -88,10 +88,17 @@ std::vector ParserValidPlaces() { auto target_reprs = lite::Split(FLAGS_valid_targets, ","); for (auto& target_repr : target_reprs) { if (target_repr == "arm") { - valid_places.emplace_back(TARGET(kARM)); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kInt32), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kInt64), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kAny), DATALAYOUT(kNCHW)}); } else if (target_repr == "opencl") { valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}); + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}); valid_places.emplace_back( Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); valid_places.emplace_back( @@ -101,11 +108,21 @@ std::vector ParserValidPlaces() { valid_places.emplace_back( TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if (target_repr == "x86") { - valid_places.emplace_back(TARGET(kX86)); + valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)}); + valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); } else if (target_repr == "npu") { valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); + } else if (target_repr == "mlu") { + valid_places.emplace_back(TARGET(kMLU)); + } else if (target_repr == "rknpu") { + valid_places.emplace_back(TARGET(kRKNPU)); + valid_places.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); + } else if (target_repr == "apu") { + valid_places.emplace_back( + Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " @@ -118,11 +135,6 @@ std::vector ParserValidPlaces() { << "At least one target should be set, should set the " "command argument 'valid_targets'"; - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only support by ARM target"; - valid_places.insert(valid_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } return valid_places; } @@ -187,6 +199,8 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", + "kAPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -197,7 +211,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { } std::cout << std::setiosflags(std::ios::internal); std::cout << std::setw(maximum_optype_length) << "OP_name"; - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { std::cout << std::setw(10) << targets[i].substr(1); } std::cout << std::endl; @@ -205,7 +219,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { std::cout << std::setw(maximum_optype_length) << it->first; auto ops_valid_places = it->second; - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { if 
(std::find(ops_valid_places.begin(), ops_valid_places.end(), targets[i]) != ops_valid_places.end()) { @@ -225,7 +239,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { } // Print OP info. auto ops_valid_places = supported_ops.at(*op); - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { if (std::find(ops_valid_places.begin(), ops_valid_places.end(), targets[i]) != ops_valid_places.end()) { @@ -251,17 +265,16 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" - " `--prefer_int8_kernel=(true|false)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; @@ -279,11 +292,11 @@ void ParseInputCommand() { auto valid_places = paddle::lite_api::ParserValidPlaces(); // get valid_targets string std::vector target_types = {}; - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { target_types.push_back(valid_places[i].target); } std::string targets_str = TargetToStr(target_types[0]); - for (int i = 1; i < target_types.size(); i++) { + for (size_t i = 1; i < target_types.size(); i++) { targets_str = targets_str + TargetToStr(target_types[i]); } @@ -292,7 +305,7 @@ void ParseInputCommand() { target_types.push_back(TARGET(kUnk)); std::set valid_ops; - for (int i = 0; i < target_types.size(); i++) { + for (size_t i = 0; i < target_types.size(); i++) { auto ops = supported_ops_target[static_cast(target_types[i])]; valid_ops.insert(ops.begin(), ops.end()); } @@ -309,7 +322,7 @@ void CheckIfModelSupported() { auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; valid_ops.insert( valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { auto target = valid_places[i].target; auto ops = supported_ops_target[static_cast(target)]; valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); @@ -331,7 +344,7 @@ void CheckIfModelSupported() { std::set unsupported_ops; std::set input_model_ops; - for (int index = 0; index < cpp_prog.BlocksSize(); index++) { + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { auto current_block = cpp_prog.GetBlock(index); for (size_t i = 0; i < current_block->OpsSize(); ++i) { auto& op_desc = *current_block->GetOp(i); @@ -355,13 +368,13 @@ void CheckIfModelSupported() { unsupported_ops_str = unsupported_ops_str + ", " + *op_str; } std::vector targets = {}; - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { targets.push_back(valid_places[i].target); } std::sort(targets.begin(), targets.end()); targets.erase(unique(targets.begin(), targets.end()), targets.end()); std::string targets_str = TargetToStr(targets[0]); - for (int i = 1; i < targets.size(); i++) { + for 
(size_t i = 1; i < targets.size(); i++) { targets_str = targets_str + "," + TargetToStr(targets[i]); } diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..36d5891eb5cfbc33b839626d0913538c9c02592f --- /dev/null +++ b/lite/api/opt_base.cc @@ -0,0 +1,457 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/api/opt_base.h" +#include "all_kernel_faked.cc" // NOLINT + +namespace paddle { +namespace lite_api { + +void OptBase::SetModelDir(const std::string& model_path) { + opt_config_.set_model_dir(model_path); +} + +void OptBase::SetModelFile(const std::string& model_path) { + opt_config_.set_model_file(model_path); +} + +void OptBase::SetParamFile(const std::string& param_path) { + opt_config_.set_param_file(param_path); +} + +void OptBase::SetModelType(std::string optimize_out_type) { + if (optimize_out_type == "protobuf") { + model_type_ = LiteModelType::kProtobuf; + } else if (optimize_out_type == "naive_buffer") { + model_type_ = LiteModelType::kNaiveBuffer; + } else { + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; + } +} + +void OptBase::SetPassesInternal( + const std::vector& passes_internal) { + opt_config_.set_passes_internal(passes_internal); +} + +void OptBase::SetValidPlaces(const std::string& valid_places) { + valid_places_.clear(); + auto target_reprs = lite::Split(valid_places, ","); + for (auto& target_repr : target_reprs) { + if (target_repr == "arm") { + valid_places_.emplace_back( + Place{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kARM), PRECISION(kInt32), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kARM), PRECISION(kInt64), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kARM), PRECISION(kAny), DATALAYOUT(kNCHW)}); + } else if (target_repr == "opencl") { + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel + } else if (target_repr == "x86") { + valid_places_.emplace_back(TARGET(kX86)); + } else if (target_repr == "npu") { + valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "xpu") { + valid_places_.emplace_back(TARGET(kXPU)); + } else if (target_repr == "rknpu") { + valid_places_.emplace_back(TARGET(kRKNPU)); + valid_places_.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); + } else if (target_repr == "apu") { + valid_places_.emplace_back( + Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}); + } else { + LOG(FATAL) << 
lite::string_format( + "Wrong target '%s' found, please check the command flag " + "'valid_targets'", + target_repr.c_str()); + } + } + CHECK(!valid_places_.empty()) + << "At least one target should be set, should set the " + "command argument 'valid_targets'"; +} + +void OptBase::SetOptimizeOut(const std::string& lite_out_name) { + lite_out_name_ = lite_out_name; +} + +void OptBase::RecordModelInfo(bool record_strip_info) { + record_strip_info_ = record_strip_info; +} + +void OptBase::Run() { + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info_); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + lite_out_name_, model_type_, record_strip_info_); + auto resulted_model_name = + record_strip_info_ ? "information of striped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into :" << lite_out_name_ << "successfully"; + } +} + +void OptBase::RunOptimize(const std::string& model_dir_path, + const std::string& model_path, + const std::string& param_path, + const std::string& model_type, + const std::string& valid_places, + const std::string& optimized_out_path) { + SetModelDir(model_dir_path); + SetModelFile(model_path); + SetParamFile(param_path); + SetModelType(model_type); + SetValidPlaces(valid_places); + SetOptimizeOut(optimized_out_path); + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info_); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + lite_out_name_, model_type_, record_strip_info_); + auto resulted_model_name = + record_strip_info_ ? "information of striped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into :" << lite_out_name_ << "successfully"; + } +} +// collect ops info of modelset +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void OptBase::SetModelSetDir(const std::string& model_set_path) { + model_set_dir_ = model_set_path; +} +void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { + // 1. mkdir of outputed optimized model set. + lite::MkDirRecur(lite_out_name_); + auto model_dirs = lite::ListDir(model_set_dir_, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; + } + + // 2. optimize each model in inputed model set dir. 
+ std::string model_file = opt_config_.model_file(); + std::string param_file = opt_config_.param_file(); + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({model_set_dir_, name}, "/"); + std::string output_model_dir = + lite::Join({lite_out_name_, name}, "/"); + + if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { + auto model_file_path = + lite::Join({input_model_dir, model_file}, "/"); + auto param_file_path = + lite::Join({input_model_dir, param_file}, "/"); + } + + std::cout << "Start optimize model: " << input_model_dir; + + opt_config_.set_model_dir(input_model_dir); + opt_config_.set_model_file(model_file); + opt_config_.set_param_file(param_file); + + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + lite_out_name_, model_type_, record_strip_info); + + std::cout << "Optimize done. "; + } + + // 3. if record_strip_info = true, we will record striping info + if (record_strip_info) { + // Collect all models information + CollectModelMetaInfo( + lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo( + lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); + std::cout << "Record the information of stripped models into :" + << lite_out_name_ << "successfully"; + } +} + +void OptBase::PrintHelpInfo() { + const std::string opt_version = lite::version(); + const char help_info[] = + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n" + " Valid arguments of Paddle-Lite opt are listed below:\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n" + " Arguments of help information:\n" + " `help()` Print help infomation\n" + "\n" + " Arguments of model transformation:\n" + " `set_model_dir(model_dir)`\n" + " `set_model_file(model_file_path)`\n" + " `set_param_file(param_file_path)`\n" + " `set_model_type(protobuf|naive_buffer)`: naive_buffer by " + "default\n" + " `set_lite_out(output_optimize_model_dir)`\n" + " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" + " `record_model_info(false|true)`: refer to whether to record ops " + "info for striping lib, false by default`\n" + " `run() : start model transformation`\n" + " eg. `opt.set_model_dir(\"./mobilenetv1\"); " + "opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); " + "opt.run();`\n" + "\n" + " You can also transform model through a single input argument:\n" + " `run_optimize(model_dir, model_file_path, param_file_path, " + "model_type, valid_places, lite_out_name) `\n" + " eg. 
`opt.run_optimize(\"./mobilenetv1\", \"\", \"\", " + "\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`" + "\n" + " Arguments of checking model and printing ops information:\n" + " `print_all_ops()` Display all the valid operators of " + "Paddle-Lite\n" + " `print_supported_ops` Display supported operators of valid " + "places\n" + " `check_if_model_supported()` Check if the input model is " + "supported\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n"; + std::cout << "opt version:" << opt_version << std::endl << help_info; +} + +void OptBase::PrintExecutableBinHelpInfo() { + const std::string opt_version = lite::version(); + const char help_info[] = + "At least one argument should be inputed. Valid arguments are listed " + "below:\n" + " Arguments of model optimization:\n" + " `--model_dir=`\n" + " `--model_file=`\n" + " `--param_file=`\n" + " `--optimize_out_type=(protobuf|naive_buffer)`\n" + " `--optimize_out=`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--record_tailoring_info=(true|false)`\n" + " Arguments of model checking and ops information:\n" + " `--print_all_ops=true` Display all the valid operators of " + "Paddle-Lite\n" + " `--print_supported_ops=true " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display valid operators of input targets\n" + " `--print_model_ops=true --model_dir= " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display operators in the input model\n"; + std::cout << "paddlelite opt version:" << opt_version << std::endl + << help_info << std::endl; +} + +// 2. Print supported info of inputed ops +void OptBase::PrintOpsInfo(const std::set& valid_ops) { + std::vector lite_supported_targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kRKNPU", + "kAPU", + "kAny", + "kUnk"}; + // Get the lengh of the first column: maximum length of the op_type + size_t maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + // Print the first row: OP_nam taget1 target2 ... + std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + std::cout << std::setw(10) << lite_supported_targets[i].substr(1); + } + std::cout << std::endl; + // Print the name of supported ops and mark if it's supported by each target + // print the support info of inputed ops: valid_ops + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + lite_supported_targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } +} + +void OptBase::DisplayKernelsInfo() { // Display kernel information + std::cout << ::paddle::lite::KernelRegistry::Global().DebugString(); +} +void OptBase::PrintAllOps() { + // 1. 
Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < supported_ops_target.size(); i++) { + auto ops = supported_ops_target[i]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 2. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +void OptBase::PrintSupportedOps() { + // 1. Get the valid hardware targets + std::vector target_types = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + target_types.push_back(valid_places_[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (size_t i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + // 2. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 3. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +// test whether this model is supported +void OptBase::CheckIfModelSupported(bool print_ops_info) { + // 1. parse valid places and valid targets + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (size_t i = 0; i < valid_places_.size(); i++) { + auto target = valid_places_[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = opt_config_.model_dir() + "/__model__"; + if (!(opt_config_.model_file()).empty() && + !(opt_config_.param_file()).empty()) { + prog_path = opt_config_.model_file(); + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. 
Print ops_info of input model and check if this model is supported + if (print_ops_info) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + targets.push_back(valid_places_[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (size_t i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (print_ops_info) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} +} // namespace lite_api +} // namespace paddle diff --git a/lite/api/opt_base.h b/lite/api/opt_base.h new file mode 100644 index 0000000000000000000000000000000000000000..d162b4b511fc6cf56f1346c2c6bf02a3168095a8 --- /dev/null +++ b/lite/api/opt_base.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines Opt and basic functions about model transformation. + */ + +#ifndef PADDLE_LITE_OPT_H_ // NOLINT +#define PADDLE_LITE_OPT_H_ +#include +#include +#include +#include +#include +// stores the map that records the source_file path of each kernel. +#include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" +// version of Paddle-lite +#include "lite/core/version.h" +// model parser functions to pre-load model to verify if this model is supported +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" +#include "lite/utils/string.h" +// recorded all the ops supported by paddle-lite +#include "supported_kernel_op_info.h" // NOLINT + +namespace paddle { +namespace lite_api { + +/// The PaddlePredictor defines the basic interfaces for different kinds of +/// predictors. +class LITE_API OptBase { + public: + OptBase() = default; + void SetModelSetDir(const std::string &model_set_path); + void SetModelDir(const std::string &model_dir_path); + void SetModelFile(const std::string &model_path); + void SetParamFile(const std::string ¶m_path); + void SetValidPlaces(const std::string &valid_places); + void SetOptimizeOut(const std::string &lite_out_name); + void RecordModelInfo(bool record_strip_info = true); + // set optimized_model type + void SetModelType(std::string model_type = "naive_buffer"); + // internal inference for developer, not recommanded. 
+ // choose methods of model optimizing. + void SetPassesInternal(const std::vector &passes_internal = {}); + // transform and save the optimized model + void Run(); + void RunOptimize(const std::string &model_dir_path = "", + const std::string &model_path = "", + const std::string ¶m_path = "", + const std::string &model_type = "", + const std::string &valid_places = "", + const std::string &optimized_out_path = ""); + // fuctions of printing info + // 1. help info + // 1.1 Print help info for opt python api + void PrintHelpInfo(); + // 1.2 Print help info for executable opt bin + void PrintExecutableBinHelpInfo(); + // 2. PrintOpsInfo + void PrintOpsInfo(const std::set &valid_ops = + {}); // print supported ops on target_types + void PrintAllOps(); // print all ops + void PrintSupportedOps(); // print ops supported on valid_places_ + void DisplayKernelsInfo(); // Display kernel information + // 3. Check if this model is supported + void CheckIfModelSupported(bool print_ops_info = true); + + private: + CxxConfig opt_config_; + // valid places for the optimized_model + std::vector valid_places_; + // filename of the optimized_model + std::string lite_out_name_; + // type of the optimized_model, kNaiveBuffer default. + LiteModelType model_type_{LiteModelType::kNaiveBuffer}; + // Dir path of a set of models, this should be combined with model + std::string model_set_dir_; + bool record_strip_info_{false}; + void RunOptimizeFromModelSet(bool record_strip_info = false); +}; + +} // namespace lite_api +} // namespace paddle + +#endif // NOLINT diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 9f071cf7780e27defdd1fcd6be02844618165fb6..bfeff4879820f132a331e9bff56a5f9c494fe775 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,7 @@ // limitations under the License. 
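The OptBase class declared in opt_base.h above is the C++ backing for the opt tooling. As a rough illustration only (not taken from this patch; the model path and output name are invented), a caller might drive it like this, using only methods declared in that header:

#include "lite/api/opt_base.h"

int main() {
  paddle::lite_api::OptBase opt;
  opt.SetModelDir("./mobilenet_v1");       // hypothetical uncombined model directory
  opt.SetValidPlaces("arm");               // comma-separated targets, e.g. "arm,opencl"
  opt.SetModelType("naive_buffer");        // the default output format
  opt.SetOptimizeOut("mobilenet_v1_opt");  // hypothetical output name
  opt.RecordModelInfo(true);               // record ops/kernels info for library tailoring
  opt.Run();                               // transform and save the optimized model
  return 0;
}
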
#include "lite/api/paddle_api.h" +#include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" @@ -38,6 +39,7 @@ void Tensor::Resize(const shape_t &shape) { tensor(raw_tensor_)->Resize(shape); } +// Tensor::data template <> const float *Tensor::data() const { return ctensor(raw_tensor_)->data(); @@ -47,15 +49,19 @@ const int8_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } template <> +const uint8_t *Tensor::data() const { + return ctensor(raw_tensor_)->data(); +} +template <> const int64_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } - template <> const int32_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } +// Tensor::mutable_data template <> int *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); @@ -69,6 +75,10 @@ int8_t *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); } template <> +uint8_t *Tensor::mutable_data(TargetType type) const { + return tensor(raw_tensor_)->mutable_data(type); +} +template <> int64_t *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); } @@ -116,18 +126,22 @@ void Tensor::CopyToCpu(T *data) const { template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const uint8_t *); template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const uint8_t *); + template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); -template void Tensor::CopyToCpu(int8_t *) const; template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(int *) const; +template void Tensor::CopyToCpu(int8_t *) const; +template void Tensor::CopyToCpu(uint8_t *) const; shape_t Tensor::shape() const { return ctensor(raw_tensor_)->dims().Vectorize(); @@ -153,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } +std::unique_ptr PaddlePredictor::GetMutableTensor( + const std::string &name) { + LOG(FATAL) + << "The GetMutableTensor API is only supported by CxxConfig predictor."; + return nullptr; +} + +std::vector PaddlePredictor::GetParamNames() { + std::vector null_result = {}; + LOG(FATAL) + << "The GetParamNames API is only supported by CxxConfig predictor."; + return null_result; +} + void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, LiteModelType model_type, bool record_info) { @@ -190,6 +218,68 @@ void ConfigBase::set_threads(int threads) { #endif } +#ifdef LITE_WITH_MLU +void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) { + mlu_core_version_ = core_version; +} +void CxxConfig::set_mlu_core_number(int core_number) { + mlu_core_number_ = core_number; +} +void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { + mlu_input_layout_ = layout; +} +void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { + mlu_use_first_conv_ = use_first_conv; +} +void CxxConfig::set_mlu_first_conv_mean(const std::vector &mean) { + mlu_first_conv_mean_ = mean; +} +void CxxConfig::set_mlu_first_conv_std(const 
std::vector &std) { + mlu_first_conv_std_ = std; +} +lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { + return mlu_core_version_; +} +int CxxConfig::mlu_core_number() const { return mlu_core_number_; } +DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } +bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } +const std::vector &CxxConfig::mlu_first_conv_mean() const { + return mlu_first_conv_mean_; +} +const std::vector &CxxConfig::mlu_first_conv_std() const { + return mlu_first_conv_std_; +} +#endif + +void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { +#ifdef LITE_WITH_XPU + lite::Context::SetWorkspaceL3Size(l3_size); +#else + LOG(WARNING) << "The invoking of the function " + "'set_xpu_workspace_l3_size_per_thread' is ignored, please " + "rebuild it with LITE_WITH_XPU=ON."; +#endif +} + +void CxxConfig::set_xpu_dev_per_thread(int dev_no) { +#ifdef LITE_WITH_XPU + lite::Context::SetDev(dev_no); +#else + LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " + "ignored, please rebuild it with LITE_WITH_XPU=ON."; +#endif +} + +void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) { +#ifdef LITE_WITH_XPU + lite::Context::_multi_encoder_precision = precision; +#else + LOG(WARNING) << "The invoking of the function " + "'set_xpu_multi_encoder_precision' is " + "ignored, please rebuild it with LITE_WITH_XPU=ON."; +#endif +} + // set model data in combined format, `set_model_from_file` refers to loading // model from file, set_model_from_buffer refers to loading model from memory // buffer diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 307eeb74e8b4cdc3b2d6188eb18490e4dcf89b8f..b9fb3daa1a8e6f6548704ac4352fa4334e85d3b8 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -86,6 +86,8 @@ class LITE_API PaddlePredictor { virtual std::vector GetInputNames() = 0; // Get output names virtual std::vector GetOutputNames() = 0; + // Get output names + virtual std::vector GetParamNames(); // Get Input by name virtual std::unique_ptr GetInputByName(const std::string& name) = 0; @@ -93,6 +95,9 @@ class LITE_API PaddlePredictor { /// Get a readonly tensor, return null if no one called `name` exists. virtual std::unique_ptr GetTensor( const std::string& name) const = 0; + /// Get a mutable tensor, return null if on one called `name` exists + /// internal infereces API, not recommanded. + virtual std::unique_ptr GetMutableTensor(const std::string& name); /// Persist the optimized model to disk. This API is only supported by /// CxxConfig, and the persisted model can be reused for MobileConfig. @@ -113,18 +118,27 @@ class LITE_API ConfigBase { std::string model_dir_; int threads_{1}; PowerMode mode_{LITE_POWER_NO_BIND}; + // to save subgraph model for npu/xpu/... 
+ std::string subgraph_model_cache_dir_{""}; public: explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); // set Model_dir void set_model_dir(const std::string& x) { model_dir_ = x; } const std::string& model_dir() const { return model_dir_; } - // set Power_mode - void set_power_mode(PowerMode mode); - PowerMode power_mode() const { return mode_; } // set Thread void set_threads(int threads); int threads() const { return threads_; } + // set Power_mode + void set_power_mode(PowerMode mode); + PowerMode power_mode() const { return mode_; } + // set subgraph_model_dir + void set_subgraph_model_cache_dir(std::string subgraph_model_cache_dir) { + subgraph_model_cache_dir_ = subgraph_model_cache_dir; + } + const std::string& subgraph_model_cache_dir() const { + return subgraph_model_cache_dir_; + } }; /// CxxConfig is the config for the Full feature predictor. @@ -132,10 +146,22 @@ class LITE_API CxxConfig : public ConfigBase { std::vector valid_places_; std::string model_file_; std::string param_file_; + std::vector passes_internal_{}; bool model_from_memory_{false}; #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_CUDA + bool multi_stream_{false}; +#endif +#ifdef LITE_WITH_MLU + lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; + int mlu_core_number_{1}; + DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; + bool mlu_use_first_conv_{false}; + std::vector mlu_first_conv_mean_; + std::vector mlu_first_conv_std_; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -149,7 +175,16 @@ class LITE_API CxxConfig : public ConfigBase { param_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; } - + // internal inference to choose passes for model optimizing, + // it's designed for internal developer and not recommanded + // for comman users. + void set_passes_internal( + const std::vector& passes_internal = {}) { + passes_internal_ = passes_internal; + } + const std::vector& get_passes_internal() const { + return passes_internal_; + } const std::vector& valid_places() const { return valid_places_; } std::string model_file() const { return model_file_; } std::string param_file() const { return param_file_; } @@ -163,6 +198,44 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif +#ifdef LITE_WITH_CUDA + void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } + bool multi_stream() const { return multi_stream_; } +#endif + +#ifdef LITE_WITH_MLU + // set MLU core version, which is used when compiling MLU kernels + void set_mlu_core_version(lite_api::MLUCoreVersion core_version); + // set MLU core number, which is used when compiling MLU kernels + void set_mlu_core_number(int core_number); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); + // whether use MLU's first conv kernel. 
First conv is a special kernel + // provided by MLU, its input is uint8, and also needs two 3-dimentional + // vectors which save all inputs' mean and std values + void set_mlu_use_first_conv(bool use_first_conv); + // set the 3-dimentional mean vector used by MLU's first conv + void set_mlu_first_conv_mean(const std::vector& mean); + // set the 3-dimentional std vector used by MLU's first conv + void set_mlu_first_conv_std(const std::vector& std); + + lite_api::MLUCoreVersion mlu_core_version() const; + int mlu_core_number() const; + DataLayoutType mlu_input_layout() const; + bool mlu_use_first_conv() const; + const std::vector& mlu_first_conv_mean() const; + const std::vector& mlu_first_conv_std() const; +#endif + + // XPU only, set the size of the workspace memory from L3 cache for the + // current thread. + void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); + // XPU only, specify the target device ID for the current thread. + // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread + void set_xpu_dev_per_thread(int dev_no = 0); + void set_xpu_multi_encoder_precision(const std::string& precision = "int16"); }; /// MobileConfig is the config for the light weight predictor, it will skip @@ -206,7 +279,7 @@ class LITE_API MobileConfig : public ConfigBase { }; template -std::shared_ptr CreatePaddlePredictor(const ConfigT&); +LITE_API std::shared_ptr CreatePaddlePredictor(const ConfigT&); } // namespace lite_api } // namespace paddle diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 9213a24e5c0614550a098c4de8d97b6cf6695177..832867df079efa1baebf08da4c0d8e37958460f1 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -15,9 +15,6 @@ #include "lite/api/paddle_api.h" #include #include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/utils/cp_logging.h" #include "lite/utils/io.h" DEFINE_string(model_dir, "", ""); @@ -39,11 +36,11 @@ TEST(CxxApi, run) { auto inputs = predictor->GetInputNames(); LOG(INFO) << "input size: " << inputs.size(); - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { LOG(INFO) << "inputnames: " << inputs[i]; } auto outputs = predictor->GetOutputNames(); - for (int i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); i++) { LOG(INFO) << "outputnames: " << outputs[i]; } auto input_tensor = predictor->GetInputByName(inputs[0]); diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h index e99127e233bc4adf159a6a567dfb15f6fd784a27..5ce6a9ac9433d720c005d84712ed181d075c61b4 100644 --- a/lite/api/paddle_lite_factory_helper.h +++ b/lite/api/paddle_lite_factory_helper.h @@ -18,20 +18,27 @@ */ #pragma once -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); +// some platform-independent defintion + +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif + +#define USE_LITE_OP(op_type__) \ + extern int touch_op_##op_type__(); \ + int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__(); #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ int 
op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); + UNUSED = touch_##op_type__##target__##precision__##layout__##alias__(); -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ +#define USE_MIR_PASS(name__) \ + extern bool mir_pass_registry##name__##_fake(); \ + static bool mir_pass_usage##name__ UNUSED = \ mir_pass_registry##name__##_fake(); #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 2cced919e601f8ecb79ce262a2b083d5b6862da9..9bc63e78aae92556a312eb36c3415f9d57c2239a 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -24,9 +24,9 @@ namespace lite_api { size_t Place::hash() const { std::hash h; size_t hash = h(static_cast(target)); - hash = lite::hash_combine(hash, static_cast(precision)); - hash = lite::hash_combine(hash, static_cast(layout)); - hash = lite::hash_combine(hash, static_cast(device)); + lite::CombineHash(static_cast(precision), &hash); + lite::CombineHash(static_cast(layout), &hash); + lite::CombineHash(static_cast(device), &hash); return hash; } @@ -45,6 +45,21 @@ std::string Place::DebugString() const { return os.str(); } +const std::string& ActivationTypeToStr(ActivationType act) { + static const std::string act2string[] = {"unk", + "Relu", + "Relu6", + "PRelu", + "LeakyRelu", + "Sigmoid", + "Tanh", + "Swish", + "Exp"}; + auto x = static_cast(act); + CHECK_LT(x, static_cast(ActivationType::NUM)); + return act2string[x]; +} + const std::string& TargetToStr(TargetType target) { static const std::string target2string[] = {"unk", "host", @@ -56,7 +71,10 @@ const std::string& TargetToStr(TargetType target) { "fpga", "npu", "xpu", - "bm"}; + "bm", + "mlu", + "rknpu", + "apu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -96,7 +114,10 @@ const std::string& TargetRepr(TargetType target) { "kFPGA", "kNPU", "kXPU", - "kBM"}; + "kBM", + "kMLU", + "kRKNPU", + "kAPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -138,6 +159,9 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kNPU), TARGET(kXPU), TARGET(kBM), + TARGET(kMLU), + TARGET(kAPU), + TARGET(kRKNPU), TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 7da52adc7fb6fdd70de3b098508e4622496bed7d..7066656f18ec0693048223f5f1201e77a1b0a37d 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -49,12 +49,15 @@ enum class TargetType : int { kCUDA = 3, kARM = 4, kOpenCL = 5, + kAny = 6, // any target kFPGA = 7, kNPU = 8, kXPU = 9, kBM = 10, - kAny = 6, // any target - NUM = 11, // number of fields. + kMLU = 11, + kRKNPU = 12, + kAPU = 13, + NUM = 14, // number of fields. 
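+ // Note: the new backends (kMLU, kRKNPU, kAPU) are appended before NUM, and NUM is bumped to match. The TargetToStr/TargetRepr string tables above are indexed by this enum and guarded with CHECK_LT against NUM, so they must be kept in sync with any new entry here.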
}; enum class PrecisionType : int { kUnk = 0, @@ -88,6 +91,8 @@ typedef enum { LITE_POWER_RAND_LOW = 5 } PowerMode; +typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion; + enum class ActivationType : int { kIndentity = 0, kRelu = 1, @@ -96,7 +101,12 @@ enum class ActivationType : int { kLeakyRelu = 4, kSigmoid = 5, kTanh = 6, - kSwish = 7 + kSwish = 7, + kExp = 8, + kAbs = 9, + kHardSwish = 10, + kReciprocal = 11, + NUM = 12, }; static size_t PrecisionTypeLength(PrecisionType type) { @@ -148,6 +158,8 @@ _ForEachPrecisionType(DefinePrecisionTypeTrait); #define PRECISION(item__) paddle::lite_api::PrecisionType::item__ #define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__ +const std::string& ActivationTypeToStr(ActivationType act); + const std::string& TargetToStr(TargetType target); const std::string& PrecisionToStr(PrecisionType precision); diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index a2e13e156370090bfb9b9390a3389859b88fac3e..5165c1419a9fffb110b93744fe656f89fa013fe4 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -24,7 +24,7 @@ USE_MIR_PASS(generate_program_pass); USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); USE_MIR_PASS(runtime_context_assign_pass); -USE_MIR_PASS(graph_visualze); +USE_MIR_PASS(graph_visualize_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass); @@ -33,16 +33,28 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); USE_MIR_PASS(identity_scale_eliminate_pass); +USE_MIR_PASS(identity_dropout_eliminate_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); -USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); +USE_MIR_PASS(lite_elementwise_activation_fuse_pass); USE_MIR_PASS(lite_quant_dequant_fuse_pass); USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_layout_cast_pass); +USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(kernel_place_correct_pass) +USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); +USE_MIR_PASS(mlu_subgraph_pass); +USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); +USE_MIR_PASS(apu_subgraph_pass); +USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(lite_scale_activation_fuse_pass); +USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); +USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); +USE_MIR_PASS(__xpu__fc_fuse_pass); diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index 43178a37c663bb09acb7c025e021cbc91bf0cc5d..5dfecf8c619d8cf9be7a03fa46b4e86a6e641a29 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -2,6 +2,27 @@ if (NOT LITE_WITH_PYTHON) return() endif() +# to create setup.py for packeting whl for Paddle-Lite and opt +execute_process( + COMMAND git describe --tags --exact-match + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_TAG + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) +if(APPLE) + 
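+ # macOS gets its own template (setup_mac.py.in, added later in this patch): the generated setup.py bundles .dylib dependencies and patches install names with install_name_tool, whereas the default template relies on patchelf for .so files.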
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +else() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +endif() add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..72a75d9caaa79fa96e52e8603ae6886aac341009 --- /dev/null +++ b/lite/api/python/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +if os.name =='nt': + current_path = os.path.abspath(os.path.dirname(__file__)) + third_lib_path = current_path + os.sep + 'libs' + os.environ['path'] = third_lib_path+ ';' + os.environ['path'] + sys.path.insert(0, third_lib_path) diff --git a/lite/api/python/bin/paddle_lite_opt b/lite/api/python/bin/paddle_lite_opt new file mode 100644 index 0000000000000000000000000000000000000000..0d506df370841b14bffa48e789908873f6f35df2 --- /dev/null +++ b/lite/api/python/bin/paddle_lite_opt @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# Copyright @ 2020 Baidu. All rights reserved. +""" python wrapper file for Paddle-Lite opt tool """ +from __future__ import print_function +import paddlelite.lite as lite +import argparse + + +def main(): + """ main funcion """ + a=lite.Opt() + parser = argparse.ArgumentParser() + parser.add_argument("--model_dir", type=str, required=False,\ + help="path of the model. This option will be ignored if model_file and param_file exist") + parser.add_argument("--model_file", type=str, required=False,\ + help="model file path of the combined-param model.") + parser.add_argument("--param_file", type=str, required=False,\ + help="param file path of the combined-param model.") + parser.add_argument("--optimize_out_type", type=str, required=False,default="naive_buffer",\ + choices=['protobuf', 'naive_buffer'], \ + help="store type of the output optimized model. 
protobuf/naive_buffer.") + parser.add_argument("--optimize_out", type=str, required=False,\ + help="path of the output optimized model") + parser.add_argument("--valid_targets", type=str, required=False,default="arm",\ + help="The targets this model optimized for, should be one of (arm,opencl, x86), splitted by space.") + + # arguments of help information + parser.add_argument("--print_supported_ops", type=str, default="false",\ + help="{true, false}\ + Print supported operators on the inputed target") + parser.add_argument("--print_all_ops", type=str, default="false",\ + help="{true, false}\ + Print all the valid operators of Paddle-Lite") + parser.add_argument("--print_model_ops", type=str, default="false",\ + help="{true, false}\ + Print operators in the input model") + parser.add_argument("--display_kernels", type=str, default="false",\ + help="{true, false}\ + Display kernel information") + + # arguments of strip lib according to input model + parser.add_argument("--record_tailoring_info", type=str, default="false",\ + help="{true, false}\ + Record kernels and operators information of the optimized model \ + for tailoring compiling, information are stored into optimized \ + model path as hidden files") + parser.add_argument("--model_set", type=str, required=False,\ + help="path of the models set. This option will be used to specific \ + tailoring") + + args = parser.parse_args() + """ input opt params """ + if args.model_dir is not None: + a.set_model_dir(args.model_dir) + if args.model_set is not None: + a.set_modelset_dir(args.model_set) + if args.model_file is not None: + a.set_model_file(args.model_file) + if args.param_file is not None: + a.set_param_file(args.param_file) + if args.optimize_out_type is not None: + a.set_model_type(args.optimize_out_type) + if args.optimize_out is not None: + a.set_optimize_out(args.optimize_out) + if args.valid_targets is not None: + a.set_valid_places(args.valid_targets) + if args.param_file is not None: + a.set_param_file(args.param_file) + if args.record_tailoring_info == "true": + a.record_model_info(True) + """ print ops info """ + if args.print_all_ops == "true": + a.print_all_ops() + return 0 + if args.print_supported_ops == "true": + a.print_supported_ops() + return 0 + if args.display_kernels == "true": + a.display_kernels_info() + return 0 + if args.print_model_ops == "true": + a.check_if_model_supported(True); + return 0 + if ((args.model_dir is None) and (args.model_file is None or args.param_file is None) and (args.model_set is None)) or (args.optimize_out is None): + a.executablebin_help() + return 1 + else: + a.run() + return 0 +if __name__ == "__main__": + main() diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index eabb6b150b93a722282118c3932676cd1aee5da8..f9343d3347b5565034e15ef8984191d19895ae9a 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -1,9 +1,28 @@ set(PYBIND_DEPS pybind python paddle_api_light paddle_api) if (NOT LITE_ON_TINY_PUBLISH) - set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() -lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +if(WIN32) + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(lite_pybind ${os_dependency_modules}) +elseif(APPLE) + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) + 
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds") + set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + set_target_properties(lite_pybind PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(lite_pybind custom_linker_map) +else() + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + set_target_properties(lite_pybind PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(lite_pybind custom_linker_map) +endif(WIN32) + if (LITE_ON_TINY_PUBLISH) set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") endif() diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 2dfe0c49490ecd13e8a3ce480807bdf3875348b7..853153e4d4c61c3d1fd045b43f4f1799c19f078f 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -26,13 +26,11 @@ #ifndef LITE_ON_TINY_PUBLISH #include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" +#include "lite/api/opt_base.h" #endif #include "lite/api/light_api.h" #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" #include "lite/core/tensor.h" namespace py = pybind11; @@ -49,11 +47,34 @@ using lite_api::TargetType; using lite_api::PrecisionType; using lite_api::DataLayoutType; using lite_api::Place; +using lite_api::MLUCoreVersion; using lite::LightPredictorImpl; +using lite_api::OptBase; #ifndef LITE_ON_TINY_PUBLISH using lite::CxxPaddleApiImpl; static void BindLiteCxxPredictor(py::module *m); +void BindLiteOpt(py::module *m) { + py::class_ opt_base(*m, "Opt"); + opt_base.def(py::init<>()) + .def("set_model_dir", &OptBase::SetModelDir) + .def("set_modelset_dir", &OptBase::SetModelSetDir) + .def("set_model_file", &OptBase::SetModelFile) + .def("set_param_file", &OptBase::SetParamFile) + .def("set_valid_places", &OptBase::SetValidPlaces) + .def("set_optimize_out", &OptBase::SetOptimizeOut) + .def("set_model_type", &OptBase::SetModelType) + .def("record_model_info", &OptBase::RecordModelInfo) + .def("set_passes_internal", &OptBase::SetPassesInternal) + .def("run", &OptBase::Run) + .def("run_optimize", &OptBase::RunOptimize) + .def("help", &OptBase::PrintHelpInfo) + .def("executablebin_help", &OptBase::PrintExecutableBinHelpInfo) + .def("print_supported_ops", &OptBase::PrintSupportedOps) + .def("display_kernels_info", &OptBase::DisplayKernelsInfo) + .def("print_all_ops", &OptBase::PrintAllOps) + .def("check_if_model_supported", &OptBase::CheckIfModelSupported); +} #endif static void BindLiteLightPredictor(py::module *m); static void BindLiteCxxConfig(py::module *m); @@ -61,6 +82,7 @@ static void BindLiteMobileConfig(py::module *m); static void BindLitePowerMode(py::module *m); static void BindLitePlace(py::module *m); static void BindLiteTensor(py::module *m); +static void BindLiteMLUCoreVersion(py::module *m); void BindLiteApi(py::module *m) { BindLiteCxxConfig(m); @@ -68,6 +90,7 @@ void BindLiteApi(py::module *m) { BindLitePowerMode(m); BindLitePlace(m); BindLiteTensor(m); + BindLiteMLUCoreVersion(m); #ifndef LITE_ON_TINY_PUBLISH BindLiteCxxPredictor(m); #endif @@ -102,6 +125,7 @@ void BindLiteCxxConfig(py::module *m) { .def("param_file", &CxxConfig::param_file) .def("set_valid_places", &CxxConfig::set_valid_places) 
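+ // Each binding in this chain forwards to the CxxConfig method of the same name, so the Python-side CxxConfig mirrors the C++ configuration API (including the newly added set_passes_internal).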
.def("set_model_buffer", &CxxConfig::set_model_buffer) + .def("set_passes_internal", &CxxConfig::set_passes_internal) .def("model_from_memory", &CxxConfig::model_from_memory); #ifdef LITE_WITH_ARM cxx_config.def("set_threads", &CxxConfig::set_threads) @@ -109,6 +133,14 @@ void BindLiteCxxConfig(py::module *m) { .def("set_power_mode", &CxxConfig::set_power_mode) .def("power_mode", &CxxConfig::power_mode); #endif +#ifdef LITE_WITH_MLU + cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version) + .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number) + .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout) + .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv) + .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean) + .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std); +#endif } // TODO(sangoly): Should MobileConfig be renamed to LightConfig ?? @@ -140,6 +172,12 @@ void BindLitePowerMode(py::module *m) { .value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW); } +void BindLiteMLUCoreVersion(py::module *m) { + py::enum_(*m, "MLUCoreVersion") + .value("LITE_MLU_220", MLUCoreVersion::MLU_220) + .value("LITE_MLU_270", MLUCoreVersion::MLU_270); +} + void BindLitePlace(py::module *m) { // TargetType py::enum_(*m, "TargetType") @@ -150,6 +188,9 @@ void BindLitePlace(py::module *m) { .value("OpenCL", TargetType::kOpenCL) .value("FPGA", TargetType::kFPGA) .value("NPU", TargetType::kNPU) + .value("MLU", TargetType::kMLU) + .value("RKNPU", TargetType::kRKNPU) + .value("APU", TargetType::kAPU) .value("Any", TargetType::kAny); // PrecisionType @@ -230,6 +271,20 @@ void BindLiteTensor(py::module *m) { DO_GETTER_ONCE(data_type__, name__##_data) DATA_GETTER_SETTER_ONCE(int8_t, int8); +#ifdef LITE_WITH_MLU + tensor.def("set_uint8_data", + [](Tensor &self, + const std::vector &data, + TargetType type = TargetType::kHost) { + if (type == TargetType::kHost) { + self.CopyFromCpu(data.data()); + } + }, + py::arg("data"), + py::arg("type") = TargetType::kHost); + + DO_GETTER_ONCE(uint8_t, "uint8_data"); +#endif DATA_GETTER_SETTER_ONCE(int32_t, int32); DATA_GETTER_SETTER_ONCE(float, float); #undef DO_GETTER_ONCE diff --git a/lite/api/python/pybind/pybind.h b/lite/api/python/pybind/pybind.h index ca05f24b32fd0b0418d9cf595fe6134b34fa725f..15609957e05391be54466262f962e151594ef383 100644 --- a/lite/api/python/pybind/pybind.h +++ b/lite/api/python/pybind/pybind.h @@ -22,11 +22,15 @@ namespace lite { namespace pybind { void BindLiteApi(pybind11::module *m); +void BindLiteOpt(pybind11::module *m); -PYBIND11_MODULE(lite_core, m) { +PYBIND11_MODULE(lite, m) { m.doc() = "C++ core of Paddle-Lite"; BindLiteApi(&m); +#ifndef LITE_ON_TINY_PUBLISH + BindLiteOpt(&m); +#endif } } // namespace pybind diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in new file mode 100644 index 0000000000000000000000000000000000000000..cf89a72332b4621424a17a347f80f2706aa274f1 --- /dev/null +++ b/lite/api/python/setup.py.in @@ -0,0 +1,97 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +files = os.listdir('${PADDLE_BINARY_DIR}') +INFERENCE_LITE_LIB_PATH = '' +for file in files: + if file.find('inference_lite_lib') == 0: + INFERENCE_LITE_LIB_PATH = '${PADDLE_BINARY_DIR}/' + file + break +LITE_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']} + +# copy scripts of paddlelite +shutil.copy('${PADDLE_SOURCE_DIR}/lite/api/python/bin/paddle_lite_opt', LITE_PATH) + +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/libs/' +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + if os.name != 'nt': + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + else: + PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll'] + shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll'] +# link lite.so to paddlelite.libs +if os.name != 'nt': + COMMAND = "patchelf --set-rpath '$ORIGIN/libs/' " + LITE_PATH + "/lite.so" + if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + + + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. + 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +if os.name == 'nt': + # fix the path separator under windows + fix_package_dir = {} + for k, v in PACKAGE_DIR.items(): + fix_package_dir[k] = v.replace('/', '\\') + PACKAGE_DIR = fix_package_dir + + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + scripts=['lite/paddle_lite_opt'], + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/python/setup_mac.py.in b/lite/api/python/setup_mac.py.in new file mode 100644 index 0000000000000000000000000000000000000000..b4d53e8400ecf06c59951478817e20421e04ee82 --- /dev/null +++ b/lite/api/python/setup_mac.py.in @@ -0,0 +1,76 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# copy scripts of paddlelite +shutil.copy('${PADDLE_SOURCE_DIR}/lite/api/python/bin/paddle_lite_opt', LITE_PATH) +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' + +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib'] + +# link lite.so to paddlelite.libs +COMMAND = "install_name_tool -id \"@loader_path/libs/\" ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. + 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + scripts=['lite/paddle_lite_opt'], + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_classify_lite_bm.cc b/lite/api/test_classify_lite_bm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7ebc80ade073f92fe17c3e375063e2c180b7c13 --- /dev/null +++ b/lite/api/test_classify_lite_bm.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include //NOLINT +#include +#include "lite/api/cxx_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/op_registry.h" + +DEFINE_string(input_img_txt_path, + "", + "if set input_img_txt_path, read the img filename as input."); + +namespace paddle { +namespace lite { + +const int g_batch_size = 1; +const int g_thread_num = 1; + +void instance_run() { + lite::Predictor predictor; + std::vector passes; + std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector( + {g_batch_size, 3, FLAGS_im_height, FLAGS_im_width}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + if (FLAGS_input_img_txt_path.empty()) { + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + } else { + for (int j = 0; j < g_batch_size; j++) { + std::fstream fs(FLAGS_input_img_txt_path, std::ios::in); + if (!fs.is_open()) { + LOG(FATAL) << "open input_img_txt error."; + } + for (int i = 0; i < item_size / g_batch_size; i++) { + fs >> data[i]; + } + data += j * item_size / g_batch_size; + } + } + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + auto out = predictor.GetOutputs(); + FILE* fp = fopen("result.txt", "wb"); + for (int i = 0; i < out.size(); i++) { + auto* out_data = out[i]->data(); + LOG(INFO) << out[i]->numel(); + for (int j = 0; j < out[i]->numel(); j++) { + fprintf(fp, "%f\n", out_data[j]); + } + } + fclose(fp); +} + +void TestModel(const std::vector& valid_places) { + std::vector> instances_vec; + for (int i = 0; i < g_thread_num; ++i) { + instances_vec.emplace_back(new std::thread(&instance_run)); + } + for (int i = 0; i < g_thread_num; ++i) { + instances_vec[i]->join(); + } +} + +TEST(Classify, test_bm) { + std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + + TestModel(valid_places); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/api/test_googlenet_lite.cc b/lite/api/test_googlenet_lite.cc index 8ff7a49af9cbce09d205bb8633a913410beb91c3..4a46a93ebee1770dbbaa100dd7ae913756b7907f 100644 --- a/lite/api/test_googlenet_lite.cc +++ b/lite/api/test_googlenet_lite.cc @@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -61,15 +61,15 @@ TEST(CXXApi, test_lite_googlenet) { << " ms in average."; auto out = predictor->GetOutput(0); std::vector results( - {0.00034298553, 0.0008200012, 0.0005046297, 0.000839279, - 0.00052616704, 0.0003447803, 0.0010877076, 0.00081762316, - 0.0003941339, 0.0011430943, 0.0008892841, 0.00080191303, - 
0.0004442384, 0.000658702, 0.0026721435, 0.0013686896, - 0.0005618166, 0.0006556497, 0.0006984528, 0.0014619455}); + {0.00034298553f, 0.0008200012f, 0.0005046297f, 0.000839279f, + 0.00052616704f, 0.0003447803f, 0.0010877076f, 0.00081762316f, + 0.0003941339f, 0.0011430943f, 0.0008892841f, 0.00080191303f, + 0.0004442384f, 0.000658702f, 0.0026721435f, 0.0013686896f, + 0.0005618166f, 0.0006556497f, 0.0006984528f, 0.0014619455f}); for (size_t i = 0; i < results.size(); ++i) { EXPECT_NEAR(out->data()[i * 51], results[i], 1e-5); } - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); } diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index 71752c942bb53e7f2ed289ac0d965ae1d1007c55..79c3bbd73c7336aa0973a6bd820dee5b115a1fa1 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -15,8 +15,15 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include +#include "lite/backends/x86/port.h" +#endif #include +#include // for eval DEFINE_string(model_dir, "", "model dir"); @@ -43,5 +50,31 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + } // namespace lite } // namespace paddle diff --git a/lite/api/test_inceptionv4_lite_x86.cc b/lite/api/test_inceptionv4_lite_x86.cc index e986784809951390889e17f766302fc5ea459465..44c5de6018dcf3fbdb31602c2dd791b9d24515bd 100644 --- a/lite/api/test_inceptionv4_lite_x86.cc +++ b/lite/api/test_inceptionv4_lite_x86.cc @@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -62,20 +62,20 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { std::vector> results; // i = 1 results.emplace_back(std::vector( - {0.0011684548, 0.0010390386, 0.0011301535, 0.0010133048, - 0.0010259597, 0.0010982729, 0.00093195855, 0.0009141837, - 0.00096620916, 0.00089982944, 0.0010064574, 0.0010474789, - 0.0009782845, 0.0009230255, 0.0010548076, 0.0010974824, - 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); + {0.0011684548f, 0.0010390386f, 0.0011301535f, 0.0010133048f, + 0.0010259597f, 0.0010982729f, 0.00093195855f, 0.0009141837f, + 0.00096620916f, 0.00089982944f, 0.0010064574f, 0.0010474789f, + 0.0009782845f, 0.0009230255f, 0.0010548076f, 0.0010974824f, + 0.0010612885f, 0.00089107914f, 0.0010112736f, 0.00097655767f})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); 
++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_mobilenetv1_lite_x86.cc b/lite/api/test_mobilenetv1_lite_x86.cc index 67dc1b2436988c7d0d853c945fecce27ef2d329f..8280fae733754969828b97b5565f9ab05797552b 100644 --- a/lite/api/test_mobilenetv1_lite_x86.cc +++ b/lite/api/test_mobilenetv1_lite_x86.cc @@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -62,19 +62,19 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { std::vector> results; // i = 1 results.emplace_back(std::vector( - {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, - 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, - 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, - 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, - 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); + {0.00019130898f, 9.467885e-05f, 0.00015971427f, 0.0003650665f, + 0.00026431272f, 0.00060884043f, 0.0002107942f, 0.0015819625f, + 0.0010323516f, 0.00010079765f, 0.00011006987f, 0.0017364529f, + 0.0048292773f, 0.0013995157f, 0.0018453331f, 0.0002428986f, + 0.00020211363f, 0.00013668182f, 0.0005855956f, 0.00025901722f})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_mobilenetv2_lite_x86.cc b/lite/api/test_mobilenetv2_lite_x86.cc index 95e88abcc8e59c6808ea2dc44cf7d1bdd53ac9d0..bd8abf83c6f333e9fb4438df7494a27384c9252f 100644 --- a/lite/api/test_mobilenetv2_lite_x86.cc +++ b/lite/api/test_mobilenetv2_lite_x86.cc @@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -63,19 +63,19 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { std::vector> results; // i = 1 results.emplace_back(std::vector( - {0.00017082224, 5.699624e-05, 0.000260885, 0.00016412718, - 0.00034818667, 0.00015230637, 0.00032959113, 0.0014772735, - 0.0009059976, 9.5378724e-05, 5.386537e-05, 0.0006427285, - 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, - 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); + {0.00017082224f, 5.699624e-05f, 0.000260885f, 0.00016412718f, + 0.00034818667f, 0.00015230637f, 0.00032959113f, 0.0014772735f, + 0.0009059976f, 9.5378724e-05f, 5.386537e-05f, 0.0006427285f, + 0.0070957416f, 0.0016094646f, 0.0018807327f, 0.00010506048f, + 6.823785e-05f, 0.00012269315f, 0.0007806194f, 0.00022354358f})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < 
results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_resnet50_lite_x86.cc b/lite/api/test_resnet50_lite_x86.cc index 3f9b59d714de611ef0a84cfc3b283d0dddd5c294..4520cb7ba74a1d9eb66fdcb9824e60805bb6a95b 100644 --- a/lite/api/test_resnet50_lite_x86.cc +++ b/lite/api/test_resnet50_lite_x86.cc @@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -63,19 +63,19 @@ TEST(Resnet50, test_resnet50_lite_x86) { std::vector> results; // i = 1 results.emplace_back(std::vector( - {0.00024139918, 0.00020566184, 0.00022418296, 0.00041731037, - 0.0005366107, 0.00016948722, 0.00028638865, 0.0009257241, - 0.00072681636, 8.531815e-05, 0.0002129998, 0.0021168243, - 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, - 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); + {0.00024139918f, 0.00020566184f, 0.00022418296f, 0.00041731037f, + 0.0005366107f, 0.00016948722f, 0.00028638865f, 0.0009257241f, + 0.00072681636f, 8.531815e-05f, 0.0002129998f, 0.0021168243f, + 0.006387163f, 0.0037145028f, 0.0012812682f, 0.00045948103f, + 0.00013535398f, 0.0002483765f, 0.00076759676f, 0.0002773295f})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index 013fd82b19bc22ace22184389249a7b2d9bf237e..3840bac99798a48509822bf80786712e8510070b 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -82,7 +82,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { std::vector> results; // i = 1 - results.emplace_back(std::vector({0.5030127, 0.496987})); + results.emplace_back(std::vector({0.5030127f, 0.496987f})); auto out = predictor->GetOutput(0); std::vector out_shape = out->shape(); diff --git a/lite/api/test_resnet50_lite_bm.cc b/lite/api/test_yolov3_lite_bm.cc similarity index 76% rename from lite/api/test_resnet50_lite_bm.cc rename to lite/api/test_yolov3_lite_bm.cc index 62a58704f4245b8618540ea7109447dd99d0bfea..d70ecf3c03955286244aa13cfe65f19569a55930 100644 --- a/lite/api/test_resnet50_lite_bm.cc +++ b/lite/api/test_yolov3_lite_bm.cc @@ -33,11 +33,15 @@ namespace lite { void TestModel(const std::vector& valid_places) { lite::Predictor predictor; std::vector passes; - passes.push_back("bm_subgraph_pass"); - predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); + predictor.Build(FLAGS_model_dir, + FLAGS_model_dir + "/model", + FLAGS_model_dir + "/params", + valid_places, + passes); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({1, 3, FLAGS_im_height, FLAGS_im_width}))); auto* data = input_tensor->mutable_data(); auto item_size = 
input_tensor->dims().production(); if (FLAGS_input_img_txt_path.empty()) { @@ -53,6 +57,12 @@ void TestModel(const std::vector& valid_places) { fs >> data[i]; } } + auto* image_tensor = predictor.GetInput(1); + image_tensor->Resize(DDim(std::vector({1, 2}))); + data = image_tensor->mutable_data(); + data[0] = FLAGS_im_height; + data[1] = FLAGS_im_width; + for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); } @@ -68,20 +78,18 @@ void TestModel(const std::vector& valid_places) { << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - auto* out_data = out->data(); + auto out = predictor.GetOutputs(); FILE* fp = fopen("result.txt", "wb"); - for (int i = 0; i < out->numel(); i++) { - fprintf(fp, "%f\n", out_data[i]); + for (int i = 0; i < out.size(); i++) { + auto* out_data = out[i]->data(); + for (int j = 0; j < out[i]->numel(); j++) { + fprintf(fp, "%f\n", out_data[j]); + } } fclose(fp); } -TEST(ResNet50, test_bm) { +TEST(Yolov3, test_bm) { std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index 8e51f3778d30ba9fcfde493c3e27ecc973e66a59..3cd8416d5e2293642abc68e457465c8a836f790b 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include +#ifdef PADDLE_WITH_TESTING #include +#endif #include #include #include "lite/api/cxx_api.h" @@ -28,11 +30,10 @@ DEFINE_int32(batch, 1, "batch"); namespace paddle { namespace lite { -namespace test_transformer { +namespace test_transformer { std::vector inputed_lines; - -void LoadInputLines(const char* filename) { +void load_input_lines(const char* filename) { static const int max_line_buf_size = 100 * 1024 * 1024; char* line_buffer = (char*)calloc(max_line_buf_size, sizeof(char)); // NOLINT FILE* input_file = fopen(filename, "r"); @@ -49,7 +50,7 @@ void LoadInputLines(const char* filename) { line_buffer = NULL; fclose(input_file); } -void Split2(const std::string& main_str, +void split2(const std::string& main_str, std::vector& str_list, // NOLINT const std::string& delimiter) { size_t pre_pos = 0; @@ -75,19 +76,19 @@ void Split2(const std::string& main_str, } } // NOLINT -void PadBatchInput(std::vector& input_lines, // NOLINT - int pad_idx, - int n_head, - Tensor* src_word, - Tensor* src_pos, - Tensor* src_attn_bias, - Tensor* trg_word, - Tensor* init_scores, - Tensor* init_idx, - Tensor* trg_bias, - int line_start, - int batch_size, - int bos_idx) { +void pad_batch_input(std::vector& input_lines, // NOLINT + int pad_idx, + int n_head, + Tensor* src_word, + Tensor* src_pos, + Tensor* src_attn_bias, + Tensor* trg_word, + Tensor* init_scores, + Tensor* init_idx, + Tensor* trg_bias, + int line_start, + int batch_size, + int bos_idx) { int max_len = 0; int max_line = input_lines.size(); @@ -98,27 +99,27 @@ void PadBatchInput(std::vector& input_lines, // NOLINT std::vector split_str; - test_transformer::Split2(cur_line, split_str, " "); + test_transformer::split2(cur_line, split_str, " "); batch_lines.push_back(split_str); max_len = max_len >= split_str.size() ? 
max_len : split_str.size(); } - src_word->Resize(std::vector({batch_size, max_len, 1})); - src_pos->Resize(std::vector({batch_size, max_len, 1})); + src_word->Resize(std::vector({batch_size, max_len})); + src_pos->Resize(std::vector({batch_size, max_len})); src_attn_bias->Resize( std::vector({batch_size, n_head, max_len, max_len})); trg_bias->Resize( - std::vector({batch_size, n_head, 1, max_len})); - float* src_word_data = src_word->mutable_data(); - float* src_pos_data = src_pos->mutable_data(); + std::vector({batch_size, n_head, max_len, max_len})); + auto* src_word_data = src_word->mutable_data(); + auto* src_pos_data = src_pos->mutable_data(); float* src_bias_data = src_attn_bias->mutable_data(); float* trg_bias_data = trg_bias->mutable_data(); for (int i = 0; i < batch_size; ++i) { std::vector cur_words = batch_lines[i]; int fill_len = cur_words.size(); int src_bias_start = i * n_head * max_len * max_len; - int trg_bias_start = i * n_head * max_len; + int trg_bias_start = i * n_head * max_len * max_len; for (int j = 0; j < fill_len; ++j) { src_word_data[i * max_len + j] = (atoi(cur_words[j].c_str())); src_pos_data[i * max_len + j] = j; @@ -137,22 +138,24 @@ void PadBatchInput(std::vector& input_lines, // NOLINT int value_ind = j % max_len + src_bias_start; src_bias_data[j] = src_bias_data[value_ind]; } - for (int j = trg_bias_start; j < trg_bias_start + n_head * max_len; ++j) { + for (int j = trg_bias_start; + j < trg_bias_start + n_head * max_len * max_len; + ++j) { int value_ind = j % max_len + trg_bias_start; trg_bias_data[j] = trg_bias_data[value_ind]; } } - trg_word->Resize(std::vector({batch_size, 1, 1})); - auto* trg_word_data = trg_word->mutable_data(); - for (int i = 0; i < batch_size; ++i) { + trg_word->Resize(std::vector({batch_size, max_len})); + auto* trg_word_data = trg_word->mutable_data(); + for (int i = 0; i < batch_size * max_len; ++i) { trg_word_data[i] = bos_idx; } init_scores->Resize(std::vector({batch_size, 1})); init_idx->Resize(std::vector({batch_size})); float* score_data = init_scores->mutable_data(); - float* idx_data = init_idx->mutable_data(); + auto* idx_data = init_idx->mutable_data(); for (int i = 0; i < init_scores->numel(); ++i) { score_data[i] = 0; } @@ -175,21 +178,25 @@ void PadBatchInput(std::vector& input_lines, // NOLINT void TestModel(const std::vector& valid_places, const Place& preferred_place, bool use_npu = false) { +#ifdef LITE_WITH_ARM DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); +#endif lite::Predictor predictor; std::string test_data_path = FLAGS_input; - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); + predictor.Build("", + FLAGS_model_dir + "/__model__", + FLAGS_model_dir + "/weights", + valid_places); + // predictor.Build(FLAGS_model_dir, "", "", valid_places); int n_head = 8; int batch_size = FLAGS_batch; int bos_idx = 0; int eos_idx = 1; - LOG(INFO) << "reading"; - test_transformer::LoadInputLines(test_data_path.c_str()); - LOG(INFO) << "reading finished"; + test_transformer::load_input_lines(test_data_path.c_str()); auto* trg_bias = predictor.GetInput(6); auto* src_word = predictor.GetInput(0); @@ -205,28 +212,31 @@ void TestModel(const std::vector& valid_places, auto start = GetCurrentUS(); for (int i = 0; i < FLAGS_repeats; ++i) { - auto start_i = GetCurrentUS(); - PadBatchInput(test_transformer::inputed_lines, - eos_idx, - n_head, - src_word, // src_word - src_pos, // src_pos - src_bias, // src_bias - trg_word, // trg_word - init_score, // init_score - 
init_idx, // init_idx - trg_bias, // trg_bias - i * batch_size, - batch_size, - bos_idx); - LOG(INFO) << "src_word:" << src_word->dims(); - auto start_ii = GetCurrentUS(); - LOG(INFO) << i << "->ii:" << (start_ii - start_i) / 1000.0; + pad_batch_input(test_transformer::inputed_lines, + eos_idx, + n_head, + src_word, // src_word + src_pos, // src_pos + src_bias, // src_bias + trg_word, // trg_word + init_score, // init_score + init_idx, // init_idx + trg_bias, // trg_bias + i * batch_size, + batch_size, + bos_idx); predictor.Run(); - auto start_iii = GetCurrentUS(); - LOG(INFO) << i << "->iii:" << (start_iii - start_ii) / 1000.0; - auto* outs = predictor.GetOutputs(); - LOG(INFO) << "out:" << (*outs)[0].dims(); + auto* outs = predictor.GetOutput(0); + auto o_data = outs->data(); + auto lod = outs->lod(); + for (int i = 0; i < outs->numel(); ++i) { + LOG(INFO) << o_data[i]; + } + for (size_t i = 0; i < lod.size(); ++i) { + for (size_t j = 0; j < lod[i].size(); ++j) { + LOG(INFO) << lod[i][j]; + } + } } LOG(INFO) << "================== Speed Report ==================="; @@ -234,25 +244,18 @@ void TestModel(const std::vector& valid_places, << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; - - auto* outs = predictor.GetOutputs(); - for (auto out : *outs) { - LOG(INFO) << "======" - << "here"; - LOG(INFO) << out; - } - LOG(INFO) << "======" - << "hereggg"; } -TEST(OcrAttention, test_arm) { +} // namespace lite +} // namespace paddle +using namespace paddle::lite; // NOLINT +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kInt64)}, Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)}, }); TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); } - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index e3517464812a24c9911e824c53841efc05dd2bc5..7f0d53f976ace17ee8d95e62e62d56f5cb974881 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -6,4 +6,7 @@ add_subdirectory(fpga) add_subdirectory(host) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(apu) +add_subdirectory(rknpu) diff --git a/lite/backends/apu/CMakeLists.txt b/lite/backends/apu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9956256a6d88f01f63b08f8604a98eeb213f424f --- /dev/null +++ b/lite/backends/apu/CMakeLists.txt @@ -0,0 +1,6 @@ +if(NOT LITE_WITH_APU) + return() +endif() + +lite_cc_library(neuron_adapter SRCS neuron_adapter.cc) +lite_cc_library(device_apu SRCS device.cc DEPS neuron_adapter) diff --git a/lite/backends/apu/device.cc b/lite/backends/apu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4cee74488da2db3cc279b24b423d47d4e01e10b --- /dev/null +++ b/lite/backends/apu/device.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/apu/device.h" +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace apu { + +NeuronCompilation* Device::Build(NeuronModel* model) { + VLOG(3) << "[APU] Compile model"; + NeuronCompilation* compilation = NULL; + int neuron_errCode = NeuronCompilation_create(model, &compilation); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode; + return nullptr; + } + neuron_errCode = NeuronCompilation_finish(compilation); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] compile failed! " << neuron_errCode; + return nullptr; + } + VLOG(3) << "[APU] Build done"; + return compilation; +} + +} // namespace apu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/apu/device.h b/lite/backends/apu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..8c6e6268f4be8c08bc4cfe2a929db448200b9c8e --- /dev/null +++ b/lite/backends/apu/device.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/backends/apu/neuron_adapter.h" + +namespace paddle { +namespace lite { +namespace apu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + NeuronCompilation* Build(NeuronModel* model); +}; + +} // namespace apu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/apu/neuron_adapter.cc b/lite/backends/apu/neuron_adapter.cc new file mode 100644 index 0000000000000000000000000000000000000000..953c92d1828848bd030a65cb2a8af0eac0674ca1 --- /dev/null +++ b/lite/backends/apu/neuron_adapter.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/apu/neuron_adapter.h" +#include +#include +#include + +namespace paddle { +namespace lite { +NeuronAdapter* NeuronAdapter::Global() { + static NeuronAdapter adapter; + return &adapter; +} + +NeuronAdapter::NeuronAdapter() { + CHECK(InitHandle()) << "Fail to initialize the Neuron Adapter library!"; + InitFunctions(); +} + +bool NeuronAdapter::InitHandle() { + const std::vector paths = { + "libneuron_adapter.so", +#if defined(__aarch64__) + "/vendor/lib64/libneuron_adapter.so", + "/system/lib64/libneuron_adapter.so", + "/system/vendor/lib64/libneuron_adapter.so", +#else + "/vendor/lib/libneuron_adapter.so", + "/system/lib/libneuron_adapter.so", + "/system/vendor/lib/libneuron_adapter.so", +#endif + }; + std::string target_lib = "Unknown"; + for (auto path : paths) { + handle_ = dlopen(path.c_str(), RTLD_LAZY); + if (handle_ != nullptr) { + target_lib = path; + break; + } + } + VLOG(4) << "Load the Neuron Adapter library from " << target_lib; + if (handle_ != nullptr) { + return true; + } else { + return false; + } +} + +void NeuronAdapter::InitFunctions() { + CHECK(handle_ != nullptr) << "The library handle can't be null!"; + +#define PADDLE_DLSYM(neuron_adapter_func) \ + do { \ + neuron_adapter_func##_ = \ + (neuron_adapter_func##_Type)dlsym(handle_, #neuron_adapter_func); \ + if (neuron_adapter_func##_ == nullptr) { \ + LOG(FATAL) << "Cannot find the " << #neuron_adapter_func \ + << " symbol in libneuron_adapter.so!"; \ + break; \ + } \ + VLOG(4) << "Loaded the " << #neuron_adapter_func \ + << " symbol successfully."; \ + } while (false) + + PADDLE_DLSYM(Neuron_getVersion); + PADDLE_DLSYM(NeuronModel_create); + PADDLE_DLSYM(NeuronModel_free); + PADDLE_DLSYM(NeuronModel_finish); + PADDLE_DLSYM(NeuronModel_addOperand); + PADDLE_DLSYM(NeuronModel_setOperandValue); + PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams); + PADDLE_DLSYM(NeuronModel_addOperation); + PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs); + PADDLE_DLSYM(NeuronCompilation_create); + PADDLE_DLSYM(NeuronCompilation_free); + PADDLE_DLSYM(NeuronCompilation_finish); + PADDLE_DLSYM(NeuronExecution_create); + PADDLE_DLSYM(NeuronExecution_free); + PADDLE_DLSYM(NeuronExecution_setInput); + PADDLE_DLSYM(NeuronExecution_setOutput); + PADDLE_DLSYM(NeuronExecution_compute); + +#undef PADDLE_DLSYM +} + +} // namespace lite +} // namespace paddle + +int Neuron_getVersion(uint32_t* version) { + return paddle::lite::NeuronAdapter::Global()->Neuron_getVersion()(version); +} + +int NeuronModel_create(NeuronModel** model) { + return paddle::lite::NeuronAdapter::Global()->NeuronModel_create()(model); +} + +void NeuronModel_free(NeuronModel* model) { + return paddle::lite::NeuronAdapter::Global()->NeuronModel_free()(model); +} + +int NeuronModel_finish(NeuronModel* model) { + return paddle::lite::NeuronAdapter::Global()->NeuronModel_finish()(model); +} + +int NeuronModel_addOperand(NeuronModel* model, const NeuronOperandType* type) { + return paddle::lite::NeuronAdapter::Global()->NeuronModel_addOperand()(model, + type); +} + +int NeuronModel_setOperandValue(NeuronModel* model, + int32_t index, + const void* buffer, + size_t length) { + return paddle::lite::NeuronAdapter::Global()->NeuronModel_setOperandValue()( + model, index, buffer, length); +} + +int NeuronModel_setOperandSymmPerChannelQuantParams( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant) { + return paddle::lite::NeuronAdapter::Global() + ->NeuronModel_setOperandSymmPerChannelQuantParams()( + 
model, index, channelQuant); +} + +int NeuronModel_addOperation(NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs) { + return paddle::lite::NeuronAdapter::Global()->NeuronModel_addOperation()( + model, type, inputCount, inputs, outputCount, outputs); +} + +int NeuronModel_identifyInputsAndOutputs(NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs) { + return paddle::lite::NeuronAdapter::Global() + ->NeuronModel_identifyInputsAndOutputs()( + model, inputCount, inputs, outputCount, outputs); +} + +int NeuronCompilation_create(NeuronModel* model, + NeuronCompilation** compilation) { + return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_create()( + model, compilation); +} + +void NeuronCompilation_free(NeuronCompilation* compilation) { + return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_free()( + compilation); +} + +int NeuronCompilation_finish(NeuronCompilation* compilation) { + return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_finish()( + compilation); +} + +int NeuronExecution_create(NeuronCompilation* compilation, + NeuronExecution** execution) { + return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()( + compilation, execution); +} + +void NeuronExecution_free(NeuronExecution* execution) { + return paddle::lite::NeuronAdapter::Global()->NeuronExecution_free()( + execution); +} + +int NeuronExecution_setInput(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length) { + return paddle::lite::NeuronAdapter::Global()->NeuronExecution_setInput()( + execution, index, type, buffer, length); +} + +int NeuronExecution_setOutput(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length) { + return paddle::lite::NeuronAdapter::Global()->NeuronExecution_setOutput()( + execution, index, type, buffer, length); +} + +int NeuronExecution_compute(NeuronExecution* execution) { + return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()( + execution); +} diff --git a/lite/backends/apu/neuron_adapter.h b/lite/backends/apu/neuron_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..c08db73279ea3969300c8f298016a976e30a7ac4 --- /dev/null +++ b/lite/backends/apu/neuron_adapter.h @@ -0,0 +1,191 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "NeuronAdapter.h" // NOLINT +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +class NeuronAdapter final { + public: + static NeuronAdapter *Global(); + // Platform APIs + using Neuron_getVersion_Type = int (*)(uint32_t *); + using NeuronModel_create_Type = int (*)(NeuronModel **); + using NeuronModel_free_Type = void (*)(NeuronModel *); + using NeuronModel_finish_Type = int (*)(NeuronModel *); + using NeuronModel_addOperand_Type = int (*)(NeuronModel *, + const NeuronOperandType *); + using NeuronModel_setOperandValue_Type = int (*)(NeuronModel *, + int32_t, + const void *, + size_t); + using NeuronModel_setOperandSymmPerChannelQuantParams_Type = + int (*)(NeuronModel *, int32_t, const NeuronSymmPerChannelQuantParams *); + using NeuronModel_addOperation_Type = int (*)(NeuronModel *, + NeuronOperationType, + uint32_t, + const uint32_t *, + uint32_t, + const uint32_t *); + using NeuronModel_identifyInputsAndOutputs_Type = int (*)( + NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *); + using NeuronCompilation_create_Type = int (*)(NeuronModel *, + NeuronCompilation **); + using NeuronCompilation_free_Type = void (*)(NeuronCompilation *); + using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *); + using NeuronExecution_create_Type = int (*)(NeuronCompilation *, + NeuronExecution **); + using NeuronExecution_free_Type = void (*)(NeuronExecution *); + using NeuronExecution_setInput_Type = int (*)(NeuronExecution *, + int32_t, + const NeuronOperandType *, + const void *, + size_t); + using NeuronExecution_setOutput_Type = int (*)( + NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t); + using NeuronExecution_compute_Type = int (*)(NeuronExecution *); + + Neuron_getVersion_Type Neuron_getVersion() { + CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!"; + return Neuron_getVersion_; + } + + NeuronModel_create_Type NeuronModel_create() { + CHECK(NeuronModel_create_ != nullptr) << "Cannot load NeuronModel_create!"; + return NeuronModel_create_; + } + + NeuronModel_free_Type NeuronModel_free() { + CHECK(NeuronModel_free_ != nullptr) << "Cannot load NeuronModel_free!"; + return NeuronModel_free_; + } + + NeuronModel_finish_Type NeuronModel_finish() { + CHECK(NeuronModel_finish_ != nullptr) << "Cannot load NeuronModel_finish!"; + return NeuronModel_finish_; + } + + NeuronModel_addOperand_Type NeuronModel_addOperand() { + CHECK(NeuronModel_addOperand_ != nullptr) + << "Cannot load NeuronModel_addOperand!"; + return NeuronModel_addOperand_; + } + + NeuronModel_setOperandValue_Type NeuronModel_setOperandValue() { + CHECK(NeuronModel_setOperandValue_ != nullptr) + << "Cannot load NeuronModel_setOperandValue!"; + return NeuronModel_setOperandValue_; + } + + NeuronModel_setOperandSymmPerChannelQuantParams_Type + NeuronModel_setOperandSymmPerChannelQuantParams() { + CHECK(NeuronModel_setOperandSymmPerChannelQuantParams_ != nullptr) + << "Cannot load NeuronModel_setOperandSymmPerChannelQuantParams!"; + return NeuronModel_setOperandSymmPerChannelQuantParams_; + } + + NeuronModel_addOperation_Type NeuronModel_addOperation() { + CHECK(NeuronModel_addOperation_ != nullptr) + << "Cannot load NeuronModel_addOperation!"; + return NeuronModel_addOperation_; + } + + NeuronModel_identifyInputsAndOutputs_Type + NeuronModel_identifyInputsAndOutputs() { + CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr) + << "Cannot load NeuronModel_identifyInputsAndOutputs!"; + return 
NeuronModel_identifyInputsAndOutputs_;
+  }
+
+  NeuronCompilation_create_Type NeuronCompilation_create() {
+    CHECK(NeuronCompilation_create_ != nullptr)
+        << "Cannot load NeuronCompilation_create!";
+    return NeuronCompilation_create_;
+  }
+
+  NeuronCompilation_free_Type NeuronCompilation_free() {
+    CHECK(NeuronCompilation_free_ != nullptr)
+        << "Cannot load NeuronCompilation_free!";
+    return NeuronCompilation_free_;
+  }
+
+  NeuronCompilation_finish_Type NeuronCompilation_finish() {
+    CHECK(NeuronCompilation_finish_ != nullptr)
+        << "Cannot load NeuronCompilation_finish!";
+    return NeuronCompilation_finish_;
+  }
+
+  NeuronExecution_create_Type NeuronExecution_create() {
+    CHECK(NeuronExecution_create_ != nullptr)
+        << "Cannot load NeuronExecution_create!";
+    return NeuronExecution_create_;
+  }
+
+  NeuronExecution_free_Type NeuronExecution_free() {
+    CHECK(NeuronExecution_free_ != nullptr)
+        << "Cannot load NeuronExecution_free!";
+    return NeuronExecution_free_;
+  }
+
+  NeuronExecution_setInput_Type NeuronExecution_setInput() {
+    CHECK(NeuronExecution_setInput_ != nullptr)
+        << "Cannot load NeuronExecution_setInput!";
+    return NeuronExecution_setInput_;
+  }
+
+  NeuronExecution_setOutput_Type NeuronExecution_setOutput() {
+    CHECK(NeuronExecution_setOutput_ != nullptr)
+        << "Cannot load NeuronExecution_setOutput!";
+    return NeuronExecution_setOutput_;
+  }
+
+  NeuronExecution_compute_Type NeuronExecution_compute() {
+    CHECK(NeuronExecution_compute_ != nullptr)
+        << "Cannot load NeuronExecution_compute!";
+    return NeuronExecution_compute_;
+  }
+
+ private:
+  NeuronAdapter();
+  NeuronAdapter(const NeuronAdapter &) = delete;
+  NeuronAdapter &operator=(const NeuronAdapter &) = delete;
+  bool InitHandle();
+  void InitFunctions();
+  void *handle_{nullptr};
+  Neuron_getVersion_Type Neuron_getVersion_{nullptr};
+  NeuronModel_create_Type NeuronModel_create_{nullptr};
+  NeuronModel_free_Type NeuronModel_free_{nullptr};
+  NeuronModel_finish_Type NeuronModel_finish_{nullptr};
+  NeuronModel_addOperand_Type NeuronModel_addOperand_{nullptr};
+  NeuronModel_setOperandValue_Type NeuronModel_setOperandValue_{nullptr};
+  NeuronModel_setOperandSymmPerChannelQuantParams_Type
+      NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
+  NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
+  NeuronModel_identifyInputsAndOutputs_Type
+      NeuronModel_identifyInputsAndOutputs_{nullptr};
+  NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
+  NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
+  NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
+  NeuronExecution_create_Type NeuronExecution_create_{nullptr};
+  NeuronExecution_free_Type NeuronExecution_free_{nullptr};
+  NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
+  NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
+  NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
+};
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt
index 6f6f7e7aa71ba5067d831a2bcc2b7b933205fbe0..aecec295ae0269fb34a3c4fa38e396bdf98d4418 100644
--- a/lite/backends/arm/math/CMakeLists.txt
+++ b/lite/backends/arm/math/CMakeLists.txt
@@ -68,6 +68,8 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
             gemv_arm_int8.cc
             conv3x3s1_direct_fp32.cc
             conv3x3s2_direct_fp32.cc
+            conv3x3s1p01_depthwise_fp32_relu.cc
+            conv3x3s2p01_depthwise_fp32_relu.cc
             conv3x3s1p01_depthwise_fp32.cc
             conv3x3s2p01_depthwise_fp32.cc
             conv3x3s1px_depthwise_fp32.cc
@@ -123,5
+125,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) anchor_generator.cc split_merge_lod_tenosr.cc reduce_prod.cc + lstm.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 634021cc3ce82bbb5fba72123b38457ab0c7ac06..1d01642100109d14a413ad5e274606c88bf0005a 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/backends/arm/math/activation.h" +#include #include #include "lite/backends/arm/math/funcs.h" @@ -700,6 +701,76 @@ void act_rsqrt(const float* din, float* dout, int size, int threads) { } } +template <> +void act_square(const float* din, float* dout, int size, int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = ptr_in[0] * ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_hard_swish(const float* din, + float* dout, + int size, + float threshold, + float scale, + float offset, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) * + ptr_in[0] / scale; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_reciprocal(const float* din, + float* dout, + int size, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = 1.0 / ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_abs(const float* din, float* dout, int size, int threads) { + for (int i = 0; i < size; ++i) { + dout[0] = (din[0] > 0 ? din[0] : -din[0]); + din++; + dout++; + } +} + +#ifdef LITE_WITH_TRAIN +template <> +void act_square_grad(const float* din, + const float* dout_grad, + float* din_grad, + int size, + int threads) { + const float* ptr_out_grad = dout_grad; + float* ptr_in_grad = din_grad; + for (int i = 0; i < size; ++i) { + ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; + ptr_out_grad++; + ptr_in_grad++; + din++; + } +} +#endif + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index bb8189eef0d81a92caf2aaf73e401e20d9c80155..50f60f300bbab9b9f0bcad222f31699b7bfadeab 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -69,6 +69,29 @@ void act_hard_sigmoid(const T* din, template void act_rsqrt(const T* din, T* dout, int size, int threads); +template +void act_square(const T* din, T* dout, int size, int threads); + +template +void act_hard_swish(const T* din, + T* dout, + int size, + float threshold, + float scale, + float offset, + int threads); +template +void act_reciprocal(const T* din, T* dout, int size, int threads); + +template +void act_abs(const T* din, T* dout, int size, int threads); + +#ifdef LITE_WITH_TRAIN +template +void act_square_grad( + const T* din, const T* dout_grad, T* din_grad, int size, int threads); +#endif + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/argmax.cc b/lite/backends/arm/math/argmax.cc index 3ca6d97c4d8ab97ca58e9859bfd753f7bf7f05ad..4177ad0ae05a5f29be56e9e277c0161841ba6124 100644 --- a/lite/backends/arm/math/argmax.cc +++ b/lite/backends/arm/math/argmax.cc @@ -53,7 +53,7 @@ void argmax_func(const lite::Tensor *input, std::greater>()); // out - float *out_ptr = output->mutable_data() + n * out_channel + k; + int64_t *out_ptr = 
output->mutable_data() + n * out_channel + k; *out_ptr = vec[0].second; } } diff --git a/lite/backends/arm/math/beam_search.cc b/lite/backends/arm/math/beam_search.cc index f93fcc0d601cc076163e4d6fb1e31fc58e7035a8..32b7d3bfeba6107493d62a0c9be14a3c15ce7692 100644 --- a/lite/backends/arm/math/beam_search.cc +++ b/lite/backends/arm/math/beam_search.cc @@ -70,7 +70,7 @@ void PruneEndBeams(const Tensor *pre_ids, std::vector> *items, size_t lod_level, int end_id) { - auto *pre_ids_data = pre_ids->data(); + auto *pre_ids_data = pre_ids->data(); auto &high_level = abs_lod[lod_level]; for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { size_t src_prefix_start = high_level[src_idx]; @@ -152,10 +152,10 @@ std::vector> SelectTopBeamSizeItems(const Tensor *pre_ids, // find the current candidates // auto abs_lod = framework::ToAbsOffset(scores->lod()); auto abs_lod = scores->lod(); - auto *pre_ids_data = pre_ids->data(); + auto *pre_ids_data = pre_ids->data(); auto *pre_scores_data = pre_scores->data(); - auto *ids_data = ids ? ids->data() : nullptr; + auto *ids_data = ids ? ids->data() : nullptr; auto *scores_data = scores->data(); size_t num_seqs = abs_lod[lod_level].size() - 1; @@ -236,7 +236,7 @@ void beam_search(const Tensor *pre_ids, if (parent_idx) { parent_idx->Resize(dims); } - auto *selected_ids_data = selected_ids->mutable_data(); + auto *selected_ids_data = selected_ids->mutable_data(); auto *selected_scores_data = selected_scores->mutable_data(); auto *parent_idx_data = parent_idx ? parent_idx->mutable_data() : nullptr; diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc index 65f93453388d7f41d73669f583d189bec9035bb5..e54d70ffbb119d0a91b82f67b77c9d778dea17bf 100644 --- a/lite/backends/arm/math/concat.cc +++ b/lite/backends/arm/math/concat.cc @@ -16,46 +16,3 @@ #include #include #include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void concat_func(const std::vector &input, - const int axis, - lite::Tensor *output) { - int64_t concat_input_size = 1; - int64_t num_cancats = 1; - auto dim_0 = input[0]->dims(); - size_t num = input.size(); - for (int i = axis + 1; i < dim_0.size(); i++) { - concat_input_size *= dim_0[i]; - } - for (int i = 0; i < axis; i++) { - num_cancats *= dim_0[i]; - } - float *dst_ptr = output->mutable_data(); - const int out_concat_axis = output->dims()[axis]; - int64_t offset_concat_axis = 0; - int64_t out_sum = out_concat_axis * concat_input_size; - for (int n = 0; n < num; n++) { - auto dims = input[n]->dims(); - const float *src_ptr = input[n]->data(); - int64_t in_concat_axis = dims[axis]; - float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; - int64_t in_sum = in_concat_axis * concat_input_size; - for (int i = 0; i < num_cancats; i++) { - std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum); - dout_ptr += out_sum; - src_ptr += in_sum; - } - offset_concat_axis += in_concat_axis; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/concat.h b/lite/backends/arm/math/concat.h index 4c6159e9e09b66edde812e5098e1263963f3e4da..44e8bf73e220f94dca4ba6713debfae77029867a 100644 --- a/lite/backends/arm/math/concat.h +++ b/lite/backends/arm/math/concat.h @@ -25,9 +25,39 @@ namespace lite { namespace arm { namespace math { -void concat_func(const std::vector &input, +template +void concat_func(const std::vector& input, const int axis, - lite::Tensor *output); + 
lite::Tensor* output) { + size_t num = input.size(); + auto dim_0 = input[0]->dims(); + int64_t concat_input_size = 1; + int64_t num_cancats = 1; + for (int i = axis + 1; i < dim_0.size(); i++) { + concat_input_size *= dim_0[i]; + } + for (int i = 0; i < axis; i++) { + num_cancats *= dim_0[i]; + } + + auto* dst_ptr = output->mutable_data(); + const int out_concat_axis = output->dims()[axis]; + int64_t offset_concat_axis = 0; + int64_t out_sum = out_concat_axis * concat_input_size; + for (int n = 0; n < num; n++) { + auto dims = input[n]->dims(); + auto* src_ptr = input[n]->data(); + int64_t in_concat_axis = dims[axis]; + auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; + int64_t in_sum = in_concat_axis * concat_input_size; + for (int i = 0; i < num_cancats; i++) { + std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum); + dout_ptr += out_sum; + src_ptr += in_sum; + } + offset_concat_axis += in_concat_axis; + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc index d1992f62bbfa9e15ab4d39565f7fe3555e17b215..35d9eeaee1b69bed423cd3b489217c71575b3079 100644 --- a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, int tile_h = (hout + 5) / 6; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, @@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, int tile_h = (hout + 1) / 2; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, @@ -628,8 +632,10 @@ void 
conv_compute_2x2_3x3_small(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, int tile_h = (hout + 1) / 2; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc index 66d61413fc43fd518e0b34c7bc8d7b7bf5cc72a7..b024d69507101e902dc45fb83668e00dc718a6b0 100644 --- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -91,23 +91,20 @@ void conv_depthwise_3x3s1_fp32(const float *din, bool flag_bias, const operators::ActivationParam act_param, ARMContext *ctx) { + bool has_active = act_param.has_active; + bool flag_relu = false; + bool relu6 = false; + if (has_active) { + if (act_param.active_type == lite_api::ActivationType::kRelu) { + flag_relu = true; + } else { + relu6 = true; + } + } if (pad == 0) { if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s1p0_bias(dout, din, weights, bias, @@ -120,25 +117,57 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s1p0_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s1p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s1p0_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } if (pad == 1) { if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s1p1_bias(dout, din, weights, bias, @@ -151,6 +180,51 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s1p1_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s1p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s1p1_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + 
num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } } @@ -1924,223 +1998,169 @@ void act_switch_3x3s1p1(const float *din_ptr0, float *vbias, int cnt, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vsix] "w"(vsix), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vscale] "w"(vscale), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + 
"v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #else @@ -2159,153 +2179,117 @@ void act_switch_3x3s1p1(const float *din_ptr0, float bias_val, int cnt, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [six_ptr] "r"(vsix), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [scale_ptr] "r"(vscale), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [six_ptr] "r"(vsix), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [scale_ptr] "r"(vscale), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -2575,278 +2559,214 @@ void act_switch_3x3s1p1_s(const float *din_ptr0, float32x4_t vzero, float32x4_t wbias, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { #ifdef __aarch64__ - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); #else - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; #endif - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + 
"q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kRelu6: + case lite_api::ActivationType::kRelu6: /* 0 <= din <= 6 */ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [vsix] "w"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [six_ptr] "r"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [six_ptr] "r"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kLeakyRelu: + case lite_api::ActivationType::kLeakyRelu: /*din = din >= 0 ? 
din : din * scale*/ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [vscale] "w"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); - break; -#else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [scale_ptr] "r"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; -#endif - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { -#ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [scale_ptr] "r"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } /** @@ -2987,262 +2907,198 @@ void act_switch_3x3s1p0(const float *din_ptr0, int cnt, int remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); - - switch 
(act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU6 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU6 "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vsix] "w"(vsix), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_LEAKY_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vscale] "w"(vscale), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] 
"+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU6 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #else @@ -3262,191 +3118,146 @@ void act_switch_3x3s1p0(const float *din_ptr0, int cnt, int remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU6 "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] 
"w"(wr1), - [wr2] "w"(wr2), - [six_ptr] "r"(vsix), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_LEAKY_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_LEAKY_RELU - "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [scale_ptr] "r"(vscale), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + 
"q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_LEAKY_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -3694,287 +3505,220 @@ void act_switch_3x3s1p0_s(const float *din_ptr0, unsigned int *vmask_ptr, float bias_val, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { #ifdef __aarch64__ - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); #else - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; #endif - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), 
- [vzero] "w"(vzero), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kRelu6: + case lite_api::ActivationType::kRelu6: /* 0 <= din <= 6 */ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [vsix] "w"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [six_ptr] "r"(vsix), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] 
"w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kLeakyRelu: + case lite_api::ActivationType::kLeakyRelu: /*din = din >= 0 ? din : din * scale*/ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [vscale] "w"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; -#else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [scale_ptr] "r"(vscale), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; -#endif - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { -#ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + 
"q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } /** diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9dd4d2fd1e30d9b82a8db64a4872095af3f9768 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc @@ -0,0 +1,2418 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#ifdef __aarch64__ +#define INIT_S1 \ + "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ + "movi v21.4s, #0x0\n" /* out0 = 0 */ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ + +#define LEFT_COMPUTE_S1 \ + "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], 
%[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + +#define LEFT_RESULT_S1 \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, 
%[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define MID_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* 
outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ + "3: \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, %[vzero].16b, v18.16b \n" \ + "bif v1.16b, %[vzero].16b, v19.16b \n" \ + "bif v2.16b, %[vzero].16b, v18.16b \n" \ + "bif v3.16b, %[vzero].16b, v19.16b \n" \ + \ + "bif v4.16b, %[vzero].16b, v18.16b \n" \ + "bif v5.16b, %[vzero].16b, v19.16b \n" \ + "bif v6.16b, %[vzero].16b, v18.16b \n" \ + "bif v7.16b, %[vzero].16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, %[vzero].16b, v18.16b \n" \ + "bif v9.16b, %[vzero].16b, v19.16b \n" \ + "bif v10.16b, %[vzero].16b, v18.16b \n" \ + "bif v11.16b, %[vzero].16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += 
din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 
{v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + 
"bif v0.16b, %[zero].16b, %[mask].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask].16b\n" \ + "bif v2.16b, %[zero].16b, %[mask].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ + "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[zero].4s\n" \ + "fmax v13.4s, v13.4s, %[zero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v7.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" 
\ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p1_bias_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + int cnt_col = tile_w - 1; + + unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in); + const unsigned int remian_idx[4] = {0, 1, 2, 3}; + + if (remain == 0 && size_pad_right == 5) { + size_pad_right = 1; + cnt_col -= 1; + remain = 4; + } else if (remain == 0 && size_pad_right == 6) { + size_pad_right = 2; + cnt_col -= 1; + remain = 4; + } + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 > h_in) { + switch (i + 5 - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + // unsigned int* rst_mask = rmask; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + int hs = -1; + int he = 3; + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + int h_cnt = (h_out + 1) >> 1; + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_cnt; ++j) { + const float *dr0 = din_channel + hs * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + if (hs == -1) { + dr0 = zero; + } + + switch (he - h_in) { + case 2: + dr2 = zero; + doutr1 = trash_buf; + case 1: + dr3 = zero; + default: + break; + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = doutr1; + doutr1 += w_out; + hs += 2; + he += 2; + } // end of processing heights + } // end of processing channels + } // end of processing 
batchs +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p0_bias_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + + unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); + const int remian_idx[4] = {0, 1, 2, 3}; + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 >= h_in) { + switch (i + 5 - h_in) { + case 4: + din_ptr1 = zero_ptr; + case 3: + din_ptr2 = zero_ptr; + case 2: + din_ptr3 = zero_ptr; + case 1: + din_ptr4 = zero_ptr; + case 0: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 3 >= h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + case 0: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#endif // __aarch64__ + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 3 >= h_in) { + switch (j + 3 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + doutr1 = trash_buf; + case 0: + dr3 = zero_ptr; + if (j + 2 > h_out) { + doutr1 = trash_buf; + } + default: + break; + } + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc index 55ea94949ba93396c97be5e3ea66d6e29ce95429..c998ddc3a34c2f6194a5156b7d04b7a9db3fbcef 100644 --- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -508,6 +508,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE RELU STORE : [r0] "+r"(inr0), @@ -541,6 +543,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kRelu6: @@ -593,6 +596,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE RELU RELU6 STORE : [r0] "+r"(inr0), @@ -626,6 +631,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -678,6 +684,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE LEAKY_RELU STORE : [r0] "+r"(inr0), @@ -711,6 +719,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; default: @@ -768,6 +777,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE STORE : [r0] "+r"(inr0), @@ -801,6 +812,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif } } @@ -988,6 +1000,8 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, w8, vbias, act_param); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else act_switch_3x3s1(inr0, inr1, @@ -1008,6 +1022,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, vbias, vbias, act_param); +#endif #endif outl[0] += 4; outl[1] += 4; diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc index 3e5569365119b97397c6d42f48bacd2552b248e5..d2e8f66a609d44d2c69228f3b9a343fdf91296a8 100644 --- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -91,23 +91,20 @@ void conv_depthwise_3x3s2_fp32(const float* din, bool flag_bias, const operators::ActivationParam act_param, ARMContext* ctx) { - 
if (pad == 0) { - if (w_in > 7) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); + bool has_active = act_param.has_active; + bool flag_relu = false; + bool relu6 = false; + if (has_active) { + if (act_param.active_type == lite_api::ActivationType::kRelu) { + flag_relu = true; } else { - conv_depthwise_3x3s2p0_bias_s(dout, + relu6 = true; + } + } + if (pad == 0) { + if (w_in > 8) { + if (relu6) { + conv_depthwise_3x3s2p0_bias(dout, din, weights, bias, @@ -120,25 +117,57 @@ void conv_depthwise_3x3s2_fp32(const float* din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s2p0_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s2p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s2p0_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } if (pad == 1) { if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, @@ -151,6 +180,51 @@ void conv_depthwise_3x3s2_fp32(const float* din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s2p1_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s2p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s2p1_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } } @@ -476,7 +550,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "st1 {v16.4s}, [%[outptr0]], #16 \n" \ "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ - "fmul v12.4s, v16.4s, v22.4s \n" \ + "fmul v12.4s, v17.4s, v22.4s \n" \ \ "ld1 {v20.4s}, [%[inptr3]] \n" \ "ld1 {v21.4s}, [%[inptr4]] \n" \ @@ -552,6 +626,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, "ld1 {v20.4s}, [%[inptr3]] \n" \ "ld1 {v21.4s}, [%[inptr4]] \n" \ \ + "fadd v17.4s, v17.4s, v14.4s \n" \ "bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \ "ext v10.16b, v0.16b, v15.16b, #4 \n" \ "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ @@ -977,207 +1052,158 @@ void act_switch_3x3s2p1(const float* din0_ptr, int cnt, int cnt_remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - 
[w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2 - MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [six_ptr] "r"(vsix), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU - MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU - RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [scale_ptr] "r"(vscale), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] 
"+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU + MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_LEAKY_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -1569,249 +1595,191 @@ void act_switch_3x3s2p0(const float* din0_ptr, int cnt, int cnt_remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] 
"w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2 - MID_RESULT_S2_RELU6 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU6 - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [six_ptr] "r"(vsix), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2 - MID_RESULT_S2_LEAKY_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_LEAKY_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [scale_ptr] "r"(vscale), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - 
"v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU6 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_LEAKY_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2f0243279fd1be27349bfeb97a3a61eed3eff4d --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc @@ -0,0 +1,1735 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
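+
+// A minimal scalar sketch of what the fused kernels in this file compute
+// (documentation only, not built; the loop and index names are illustrative).
+// Each output element of the 3x3, stride-2 depthwise convolution gets the
+// per-channel bias added and, when flag_relu is set, a ReLU applied, with
+// out-of-range taps treated as zero (pad 0 or 1 is handled implicitly):
+//
+//   float acc = flag_bias ? bias[c] : 0.f;
+//   for (int kh = 0; kh < 3; ++kh) {
+//     for (int kw = 0; kw < 3; ++kw) {
+//       int ih = oh * 2 + kh - pad;
+//       int iw = ow * 2 + kw - pad;
+//       if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
+//         acc += din[((n * ch_in + c) * h_in + ih) * w_in + iw] *
+//                weights[c * 9 + kh * 3 + kw];
+//       }
+//     }
+//   }
+//   dout[((n * ch_in + c) * h_out + oh) * w_out + ow] =
+//       flag_relu ? std::max(acc, 0.f) : acc;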
+ +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#ifdef __aarch64__ +#define INIT_S2 \ + "prfm pldl1keep, [%[inptr0]] \n" \ + "prfm pldl1keep, [%[inptr1]] \n" \ + "prfm pldl1keep, [%[inptr2]] \n" \ + "prfm pldl1keep, [%[inptr3]] \n" \ + "prfm pldl1keep, [%[inptr4]] \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" + +#define LEFT_COMPUTE_S2 \ + "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ + "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ + "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ + \ + "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ + \ + "sub %[inptr0], %[inptr0], #4 \n" \ + "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ + \ + "sub %[inptr2], %[inptr2], #4 \n" \ + "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ + \ + "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define LEFT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] 
\n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, #4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define MID_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %w[remain], #1 \n" \ + "blt 4f \n" \ + "3: \n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ + "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ + "ld1 {v0.4s}, [%[outptr0]] \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" \ + "ld1 {v1.4s}, [%[outptr1]] \n" + +#define RIGHT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define LEFT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "fmax v16.4s, 
v16.4s, %[vzero].4s \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define COMPUTE_S_S2 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v9.16b, v11.16b, #12 \n" \ + "ext v7.16b, v9.16b, v13.16b, #12 \n" \ + "ext v8.16b, v9.16b, v15.16b, #12 \n" \ + \ + "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ + "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ + "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ + "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" + +#define RESULT_S_S2 \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + "fmax v4.4s, v4.4s, v9.4s \n" 
\ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + "and v4.16b, %[bias].16b, %[bias].16b \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v10.16b, v9.16b, #4 \n" \ + "ext v7.16b, v12.16b, v9.16b, #4 \n" \ + "ext v8.16b, v14.16b, v9.16b, #4 \n" \ + \ + "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ + "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ + "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ + "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v16.4s \n" + +#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + "st1 {v4.4s}, [%[out]] \n" + +#else +#define INIT_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + \ + "vdup.32 q3, %[bias] @ and \n" + +#define LEFT_COMPUTE_S2 \ + "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ + "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ + "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ + \ + "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ + "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ + "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define LEFT_RESULT_S2 \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" \ + "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define MID_RESULT_S2 \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %[remain], #1 \n" \ + "blt 3f \n" \ + \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ + "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RIGHT_RESULT_S2 \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define LEFT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define COMPUTE_S_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ + "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ + "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu\n" \ + \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#endif + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + * w_in > 7 + */ +void conv_depthwise_3x3s2p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 2; + int size_right_remain = w_in - (7 + cnt_col * 8); + if 
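// A sketch of how the column tiling below is meant to work (illustrative
// reading, not part of the patch): each asm block computes 4 outputs from 8
// de-interleaved input columns (ld2 puts even columns {0,2,4,6} in one vector
// and odd columns {1,3,5,7} in the other, matching right_pad_idx). The left
// block also absorbs pad_w = 1 and advances the input pointer by a net 7
// columns (the armv7 path does this with an explicit 4-byte rewind); every
// middle block advances it by 8 and reads one extra lookahead column for the
// ext, which is why a full extra block is only taken when at least 9 columns
// remain. vmask_rp1/vmask_rp2 zero input lanes past w_in in the masked tail,
// and wmask (built from out_pad_idx) picks which of the 4 tail outputs are kept.
// Worked example, assuming w_in = 16, pad = 1, stride = 2 (so w_out = 8):
//   cnt_col = (8 >> 2) - 2 = 0, size_right_remain = 16 - 7 = 9
//   9 >= 9 -> cnt_col = 1, size_right_remain = 1, cnt_remain = 8 % 4 = 0
// i.e. one left block plus one middle block cover all 8 outputs and the masked
// tail is skipped. On the aarch64 path each row iteration reads five input
// rows (dr0..dr4) and writes two output rows, since neighbouring stride-2
// output rows share one input row.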
(size_right_remain >= 9) { + cnt_col++; + size_right_remain -= 8; + } + int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + din3_ptr = dr2; + din4_ptr = dr3; + dr0 = dr3; + dr1 = dr4; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + default: + break; + } + } + //! 
process output pad + if (i / 2 + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! 
process bottom pad + if (i + 2 > h_in) { + switch (i + 2 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = cnt_col; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + int hs = -1; + int he = 2; + float out_buf[4]; + for (int j = 0; j < h_out; ++j) { + const float* dr0 = din_channel + hs * w_in; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + if (hs == -1) { + dr0 = zeros; + } + if (he > h_in) { + dr2 = zeros; + } + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + 
"v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + hs += 2; + he += 2; + } + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +// w_in > 7 +void conv_depthwise_3x3s2p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + + unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in); + size_right_remain = 8 - size_right_remain; + + if (cnt_remain == 0 && size_right_remain == 0) { + cnt_remain = 4; + tile_w -= 1; + size_right_remain = 8; + } + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + 
bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + dr0 = dr4; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i * 2 + 5 > h_in) { + switch (i * 2 + 5 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + case 0: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_out; i++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + //! 
process bottom pad + if (i * 2 + 3 > h_in) { + switch (i * 2 + 3 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = tile_w; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + float out_buf[4]; + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + for (int j = 0; j < h_out; j++) { + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + if (j * 2 + 2 >= h_in) { + switch (j + 2 - h_in) { + case 1: + din1_ptr = zero_ptr; + case 0: + din2_ptr = zero_ptr; + default: + break; + } + } + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] 
"w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + } + } + } +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc index 4617d40f4372f6589f20b50205fb307cdc705808..4bb8554202b8feeea48b07e2057ea5d20606ab8e 100644 --- a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -113,9 +113,9 @@ namespace math { "fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \ "fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \ "bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \ - "bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \ - "bif v19.16b, v6.16b, v5.16b \n" /* choose*/ \ - "bif v19.16b, v8.16b, v7.16b \n" /* choose*/ + "bif v20.16b, v4.16b, v3.16b \n" /* choose*/ \ + "bif v21.16b, v6.16b, v5.16b \n" /* choose*/ \ + "bif v22.16b, v8.16b, v7.16b \n" /* choose*/ #define STORE /* save result */ \ "str q19, [%[outc0]], #16\n" \ "str q20, [%[outc1]], #16\n" \ diff --git a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc index c778896550de73f888979c8337731a0b9967b5dd..0ac1705de76102c92c9e63d64721aa2467baaf04 100644 --- a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc +++ b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc @@ -102,7 +102,7 @@ void conv_depthwise_5x5s2_int8(Dtype* dout, if (h + hout_r_block > hout) { h_kernel = hout - h; } - int hs = h - padh; + int hs = h * 2 - padh; int he = hs + h_kernel * 2 + 3; #pragma omp parallel for num_threads(threads) diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index 85404d6a6e2e6246677857be8231e15afa86210d..c4fb51021e5b0288a4bc1fd476764348fdc7e450 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -703,7 +703,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", 
"v2", "v3", @@ -722,7 +724,7 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -734,7 +736,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -753,7 +757,7 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -765,7 +769,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -785,7 +791,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -812,14 +820,14 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", "v1", "v2", "v3", "v20"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v20"); #else asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1006,7 +1014,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1026,7 +1036,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1039,7 +1049,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1059,7 +1071,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -1072,7 +1084,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1092,7 +1106,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -1120,7 +1136,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20"); #else asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), @@ -1128,7 +1144,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1373,7 +1389,9 @@ inline void act_switch_c4_fp32(const float* 
din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1403,7 +1421,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1418,7 +1436,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1448,7 +1468,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -1463,7 +1483,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1493,7 +1515,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -1523,7 +1547,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1544,7 +1570,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1929,7 +1955,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1963,7 +1991,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1982,7 +2020,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -2012,7 +2052,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -2031,7 +2081,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -2076,7 +2128,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -2112,7 +2166,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -2146,7 +2202,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + 
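// Why "cc" and "memory" are being added to these clobber lists (a reading
// based on the usual GCC/Clang extended-asm rules): the blocks execute
// subs/cmp, which rewrite the condition flags, and they store results through
// pointers that are only register operands, so without "cc"/"memory" the
// compiler may keep stale values cached in registers or reorder surrounding
// loads/stores across the asm. Minimal illustrative pattern (not taken from
// this file):
//   int n = 4;
//   asm volatile("1: subs %[n], %[n], #1 \n bne 1b"
//                : [n] "+r"(n)
//                :
//                : "cc", "memory");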
"q5", + "q6", + "q7", + "q15"); #endif } } @@ -2744,8 +2810,18 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT float32x4_t bias, bool is_relu) { #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.f); asm volatile(NCHWC4_TRANS_INT32 "subs %w[cnt], %w[cnt], #1\n" + /* data >= -127 */ + "fcmge v4.4s, v16.4s, %[vmax].4s \n" + "fcmge v5.4s, v18.4s, %[vmax].4s \n" + "fcmge v6.4s, v17.4s, %[vmax].4s \n" + "fcmge v7.4s, v19.4s, %[vmax].4s \n" + "bif v16.16b, %[vmax].16b, v4.16b \n" + "bif v18.16b, %[vmax].16b, v5.16b \n" + "bif v17.16b, %[vmax].16b, v6.16b \n" + "bif v19.16b, %[vmax].16b, v7.16b \n" /* fp32-int32 */ "fcvtas v4.4s, v16.4s\n" "fcvtas v5.4s, v18.4s\n" @@ -2773,7 +2849,10 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT [doutc3r0] "+r"(dout3), [ptr_din] "+r"(din), [cnt] "+r"(cnt) - : [scale] "w"(scale), [bias] "w"(bias), [relu] "r"(is_relu) + : [scale] "w"(scale), + [vmax] "w"(vmax), + [bias] "w"(bias), + [relu] "r"(is_relu) : "cc", "memory", "v0", @@ -2799,6 +2878,7 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "v20", "v31"); #else + float vmax[4] = {-127.f, -127.f, -127.f, -127.f}; asm volatile(NCHWC4_TRANS_INT32 /* set 0.5 offset */ "vmov.f32 q2, #0.5\n" @@ -2815,11 +2895,21 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "vbif.f32 q3, q14, q7 @ get right offset\n" "vbif.f32 q4, q14, q8 @ get right offset\n" "vbif.f32 q5, q14, q9 @ get right offset\n" + "vld1.32 {d28-d29}, [%[vmax]] \n" /* add offset */ "vadd.f32 q10, q2, q10\n" "vadd.f32 q11, q3, q11\n" "vadd.f32 q12, q4, q12\n" "vadd.f32 q13, q5, q13\n" + /* data >= -127 */ + "vcge.f32 q6, q10, q14 @ q10 >= vmax \n" + "vcge.f32 q7, q11, q14 @ q11 >= vmax \n" + "vcge.f32 q8, q12, q14 @ q12 >= vmax \n" + "vcge.f32 q9, q13, q14 @ q13 >= vmax \n" + "vbif q10, q14, q6 @ choose \n" + "vbif q11, q14, q7 @ choose \n" + "vbif q12, q14, q8 @ choose \n" + "vbif q13, q14, q9 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q6, q10 @ cvt to int32\n" "vcvt.s32.f32 q7, q11 @ cvt to int32\n" @@ -2836,7 +2926,7 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "vqmovn.s16 d14, q12 @ cnt to int8\n" "vqmovn.s16 d15, q13 @ cnt to int8\n" "subs %[cnt], %[cnt], #1\n" - /* store */ + /* store data*/ "vld1.32 {d4-d7}, [%[ptr_din]]!\n" "vst1.32 {d12[0]}, [%[doutc0r0]]!\n" "vst1.32 {d13[0]}, [%[doutc1r0]]!\n" @@ -2850,7 +2940,10 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT [doutc3r0] "+r"(dout3), [ptr_din] "+r"(din), [cnt] "+r"(cnt) - : [scale] "w"(scale), [bias] "w"(bias), [relu] "r"(is_relu) + : [scale] "w"(scale), + [bias] "w"(bias), + [relu] "r"(is_relu), + [vmax] "r"(vmax) : "cc", "memory", "q2", @@ -2989,8 +3082,10 @@ template <> inline int8_t cvt_kernel(int din, float scale, float bias, bool flag_relu) { if (flag_relu) { return saturate_cast(round(LITEMAX(din * scale + bias, 0))); + } else { + auto tmp = saturate_cast(round(din * scale + bias)); + return tmp < -127 ? 
-127 : tmp; } - return saturate_cast(round(din * scale + bias)); } template <> @@ -3362,7 +3457,27 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT float32x4_t bias1, bool is_relu) { #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.f); asm volatile(INT32_NCHWC8_TO_NCHW_FP32 /* fp32-int32 */ + /* data >= -127 */ + "fcmge v10.4s, v16.4s, %[vmax].4s \n" + "fcmge v11.4s, v17.4s, %[vmax].4s \n" + "fcmge v14.4s, v18.4s, %[vmax].4s \n" + "fcmge v15.4s, v19.4s, %[vmax].4s \n" + "fcmge v20.4s, v8.4s, %[vmax].4s \n" + "fcmge v21.4s, v9.4s, %[vmax].4s \n" + "fcmge v22.4s, v12.4s, %[vmax].4s \n" + "fcmge v23.4s, v13.4s, %[vmax].4s \n" + /* choose data */ + "bif v16.16b, %[vmax].16b, v10.16b \n" + "bif v17.16b, %[vmax].16b, v11.16b \n" + "bif v18.16b, %[vmax].16b, v14.16b \n" + "bif v19.16b, %[vmax].16b, v15.16b \n" + "bif v8.16b, %[vmax].16b, v20.16b \n" + "bif v9.16b, %[vmax].16b, v21.16b \n" + "bif v12.16b, %[vmax].16b, v22.16b \n" + "bif v13.16b, %[vmax].16b, v23.16b \n" + /* fp32 - int32 */ "fcvtas v10.4s, v16.4s\n" "fcvtas v11.4s, v17.4s\n" "fcvtas v14.4s, v18.4s\n" @@ -3413,6 +3528,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT [scale1] "w"(scale1), [bias0] "w"(bias0), [bias1] "w"(bias1), + [vmax] "w"(vmax), [relu] "r"(is_relu) : "cc", "memory", @@ -3442,6 +3558,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "v23", "v31"); #else + float vmax[4] = {-127.f, -127.f, -127.f, -127.f}; asm volatile(INT32_NCHWC8_TO_NCHW_FP32 /* set +-0.5 offset */ "vmov.f32 q10, #-0.5\n" "vmov.f32 q9, #0.5\n" @@ -3475,7 +3592,18 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "vmov.f32 q9, #0.5\n" "vcgt.f32 q11, q7, q8 @ get mask > 0, in0\n" "vbif.f32 q9, q10, q11 @ get right offset\n" + "vld1.32 {d22-d23}, [%[vmax]] \n" "vadd.f32 q7, q7, q9\n" + /* data >= -127 */ + "vcge.f32 q8, q0, q11 @ q10 >= vmax \n" + "vcge.f32 q9, q2, q11 @ q10 >= vmax \n" + "vcge.f32 q10, q4, q11 @ q10 >= vmax \n" + /* choose data */ + "vbif q0, q11, q8 @ choose \n" + "vcge.f32 q8, q6, q11 @ q10 >= vmax \n" + "vbif q2, q11, q9 @ choose \n" + "vbif q4, q11, q10 @ choose \n" + "vbif q6, q11, q8 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q8, q0 @ cvt to int32\n" "vcvt.s32.f32 q9, q2 @ cvt to int32\n" @@ -3486,6 +3614,17 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "vqmovn.s32 d4, q9 @ cnt to int16\n" "vqmovn.s32 d8, q10 @ cnt to int16\n" "vqmovn.s32 d12, q11 @ cnt to int16\n" + /* data >= -127 */ + "vld1.32 {d22-d23}, [%[vmax]] \n" + "vcge.f32 q8, q1, q11 @ q10 >= vmax \n" + "vcge.f32 q9, q3, q11 @ q10 >= vmax \n" + "vcge.f32 q10, q5, q11 @ q10 >= vmax \n" + /* choose data */ + "vbif q1, q11, q8 @ choose \n" + "vcge.f32 q8, q7, q11 @ q10 >= vmax \n" + "vbif q3, q11, q9 @ choose \n" + "vbif q5, q11, q10 @ choose \n" + "vbif q7, q11, q8 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q8, q1 @ cvt to int32\n" "vcvt.s32.f32 q9, q3 @ cvt to int32\n" @@ -3529,6 +3668,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT [scale1] "w"(scale1), [bias0] "w"(bias0), [bias1] "w"(bias1), + [vmax] "r"(vmax), [relu] "r"(is_relu) : "cc", "memory", diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 4c5f284a19f615382ea04904184427f569f95ff3..72d887ce4e630057286d98c86970def4a9efdb04 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -207,6 +207,118 @@ void conv_depthwise_5x5s2_int8(Dtype* dout, int padh, ARMContext* ctx); +void conv_depthwise_3x3s1p0_bias_relu(float* dout, + 
const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 96d0893bc0f0a1c145f4e58dd2caecfba78786ab..4fcef3813b792808414415fa874e14f5ef253fcd 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -573,6 +573,22 @@ template void conv_im2col_gemm_int8(const int8_t* i_data, ARMContext* ctx, const float* scale); +template void im2col(const float* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + float* data_col); + void conv_depthwise_3x3_fp32(const void* din, void* dout, int num, @@ -613,6 +629,26 @@ void conv_depthwise_3x3_fp32(const void* din, act_param, ctx); } else { +#ifdef __aarch64__ + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + act_param, + ctx); +#else +#ifdef LITE_WITH_ARM_CLANG + LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, " + "this can run in basic"; +#else conv_3x3s1_depthwise_fp32(reinterpret_cast(din), reinterpret_cast(dout), 
num, @@ -627,6 +663,8 @@ void conv_depthwise_3x3_fp32(const void* din, param, act_param, ctx); +#endif +#endif } } else if (stride == 2) { if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index 60f74b7feecc91a2fe8262a1fea4dce26430031d..28a2fb7e2a42a27e9ecd3d42b25f9942b481004e 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -359,6 +359,24 @@ void conv_compute_2x2_3x3_small(const float* input, const float* bias, const operators::ConvParam& param, ARMContext* ctx); + +template +void im2col(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + Dtype* data_col); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 186ad19735799dcb91641354af4b4f09692bfce9..4d08c1e957d43b5b748ffdb90fd14a07a61d0183 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast(const float* dinx, } } +template <> +void elementwise_add_grad(const float* dout_grad, + float* x_grad, + int num) { + int cnt = num >> 4; + int remain = num & 0x0f; +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* x_data = x_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + vst1q_f32(x_data, din0); + vst1q_f32(x_data + 4, din1); + vst1q_f32(x_data + 8, din2); + vst1q_f32(x_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* x_data = x_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + x_data[i] = out_data[i]; + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_add_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad != nullptr) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post >> 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum += out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_sub(const float* dinx, const float* diny, @@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast(const float* dinx, } } } +// we assume the formula is x-y +template <> +void elementwise_sub_grad(const float* dout_grad, + float* x_grad, + float* y_grad, + int num) { + if (x_grad != nullptr) { + 
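// Semantics of the gradient helpers above, as a scalar sketch (illustrative
// only; pre x n x post is the broadcast layout used in these functions):
// d(x + y)/dx = d(x + y)/dy = 1, so x_grad is a straight copy of dout_grad,
// and a broadcast y of length n accumulates the upstream gradient over the
// other axes:
//   for (int j = 0; j < n; ++j) {
//     float s = 0.f;
//     for (int i = 0; i < pre; ++i)
//       for (int k = 0; k < post; ++k)
//         s += dout_grad[(i * n + j) * post + k];
//     y_grad[j] = s;
//   }
// For x - y the same reduction runs with the sign flipped, which is why the
// x side below simply reuses elementwise_add_grad.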
elementwise_add_grad(dout_grad, x_grad, num); + } + if (y_grad != nullptr) { + int cnt = num >> 4; + int remain = num & 0x0f; + float32x4_t minus = vdupq_n_f32(-1); +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* y_data = y_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + din0 = vmulq_f32(din0, minus); + din1 = vmulq_f32(din1, minus); + din2 = vmulq_f32(din2, minus); + din3 = vmulq_f32(din3, minus); + vst1q_f32(y_data, din0); + vst1q_f32(y_data + 4, din1); + vst1q_f32(y_data + 8, din2); + vst1q_f32(y_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* y_data = y_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + y_data[i] = -out_data[i]; + } + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_sub_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad != nullptr) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post << 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum -= out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_mul(const float* dinx, diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h index f8273a5bb39505b03e911b5699cc10c5be755619..06ecab08edcaf06614de94b99084be2ee80647aa 100644 --- a/lite/backends/arm/math/elementwise.h +++ b/lite/backends/arm/math/elementwise.h @@ -13,11 +13,161 @@ // limitations under the License. 
#pragma once - +#include +#include +#include +#include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace arm { namespace math { +template +void elementwise_broadcast_common(T const* x_data, + T const* y_data, + T* out_data, + std::vector x_real_dim, + std::vector y_real_dim, + std::vector out_real_dim, + std::string type, + bool is_xsize_large = false) { + int out_size = 1; + int max_dim = out_real_dim.size(); + std::vector index_array(max_dim, 0); + for (int i = 0; i < max_dim; ++i) { + out_size *= out_real_dim[i]; + } + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = 0; + for (int i = 0; i < max_dim; i++) { + if (x_real_dim[i] > 1) { + x_index = x_index * x_real_dim[i] + index_array[i]; + } + } + y_index = 0; + for (int i = 0; i < max_dim; i++) { + if (y_real_dim[i] > 1) { + y_index = y_index * y_real_dim[i] + index_array[i]; + } + } + + if (type == "add") { + out_data[out_index] = x_data[x_index] + y_data[y_index]; + } + if (type == "mul") { + out_data[out_index] = x_data[x_index] * y_data[y_index]; + } + } + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_real_dim[i]) { + index_array[i] -= out_real_dim[i]; + } else { + break; + } + } +} +template +void elementwise_compute_basic(const operators::ElementwiseParam& param, + const std::string elt_type, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + // do elementwise add/sub/max... 
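// Index layout for the loops below, with an assumed example: for X of shape
// {2, 3, 4, 5}, Y of shape {3, 4} and axis = 1, the code above gives
// batch = 2 (dims before axis), channels = 3 * 4 = 12 (Y's element count) and
// num = 5 (trailing dims of X), so each Y element is applied along one
// "channel" slice:
//   out[(i * channels + j) * num + k] = x[(i * channels + j) * num + k] OP y[j];
// A hedged usage sketch (only the wiring; X/Y/Out tensors prepared elsewhere):
//   operators::ElementwiseParam param;
//   param.axis = 1;
//   elementwise_compute_basic<float>(param, "add", "relu");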
+ if (elt_type == "add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + } + } + } + } else if (elt_type == "mul") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "max") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = std::max(*din_ptr, diny_data); + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + } + // do activation relu/sigmod... + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? 
*dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Activation type: " << elt_type; + } + } +} template void elementwise_add(const T* dinx, const T* diny, T* dout, int num); @@ -33,6 +183,13 @@ template void elementwise_add_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_add_grad(const T* dout, T* dinx, int num); + +template +void elementwise_add_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_sub(const T* dinx, const T* diny, T* dout, int num); @@ -47,6 +204,13 @@ template void elementwise_sub_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num); + +template +void elementwise_sub_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_mul(const T* dinx, const T* diny, T* dout, int num); diff --git a/lite/backends/arm/math/gemm_prepacked_int8.cc b/lite/backends/arm/math/gemm_prepacked_int8.cc index d7e04bfc60b1214bd1e77738efa420d3e25e1456..08f88105e052322e13390b7482fed7d8dd15089b 100644 --- a/lite/backends/arm/math/gemm_prepacked_int8.cc +++ b/lite/backends/arm/math/gemm_prepacked_int8.cc @@ -572,6 +572,25 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, #define GEMM_INT8_INT8_OUT \ GEMM_TRANS_INT32_TO_FP32 \ GEMM_INT8_RELU \ + "ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v16.4s, v8.4s\n" \ + "fcmge v1.4s, v17.4s, v8.4s\n" \ + "fcmge v2.4s, v18.4s, v8.4s\n" \ + "fcmge v3.4s, v19.4s, v8.4s\n" \ + "fcmge v4.4s, v20.4s, v8.4s\n" \ + "fcmge v5.4s, v21.4s, v8.4s\n" \ + "fcmge v6.4s, v22.4s, v8.4s\n" \ + "fcmge v7.4s, v23.4s, v8.4s\n" \ + /* choose data */ \ + "bif v16.16b, v8.16b, v0.16b \n" \ + "bif v17.16b, v8.16b, v1.16b \n" \ + "bif v18.16b, v8.16b, v2.16b \n" \ + "bif v19.16b, v8.16b, v3.16b \n" \ + "bif v20.16b, v8.16b, v4.16b \n" \ + "bif v21.16b, v8.16b, v5.16b \n" \ + "bif v22.16b, v8.16b, v6.16b \n" \ + "bif v23.16b, v8.16b, v7.16b \n" \ "fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \ @@ -580,6 +599,24 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \ "fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \ "fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \ + /* data >= -127 */ \ + "fcmge v16.4s, v24.4s, v8.4s\n" \ + "fcmge v17.4s, v25.4s, v8.4s\n" \ + "fcmge v18.4s, v26.4s, v8.4s\n" \ + "fcmge v19.4s, v27.4s, v8.4s\n" \ + "fcmge v20.4s, v28.4s, v8.4s\n" \ + "fcmge v21.4s, v29.4s, v8.4s\n" \ + "fcmge v22.4s, v30.4s, v8.4s\n" \ + "fcmge v23.4s, v31.4s, v8.4s\n" \ + /* choose data */ \ + "bif v24.16b, v8.16b, v16.16b\n" \ + "bif v25.16b, v8.16b, v17.16b\n" \ + "bif v26.16b, v8.16b, v18.16b\n" \ + "bif v27.16b, v8.16b, v19.16b\n" \ + "bif v28.16b, v8.16b, v20.16b\n" \ + "bif v29.16b, v8.16b, v21.16b\n" \ + "bif v30.16b, v8.16b, v22.16b\n" \ + "bif v31.16b, v8.16b, v23.16b\n" \ "sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ "fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \ "sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \ @@ -648,7 +685,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "v9","v10","v11","v12","v13","v14", "v15","v16","v17","v18","v19","v20", "v21","v22","v23","v24","v25","v26", - "v27","v28","v29","v30","v31","cc"); + 
"v27","v28","v29","v30","v31","cc", "memory"); // clang-format on } @@ -665,6 +702,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, int k, int rem) { // clang-format off + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -676,13 +714,14 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), - [scale] "r"(scale) + [scale] "r"(scale), + [vmax] "r"(vmax) : "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12", "v13","v14","v15","v16","v17", "v18","v19","v20","v21","v22", "v23","v24","v25","v26","v27", - "v28","v29","v30","v31","cc"); + "v28","v29","v30","v31","cc", "memory"); // clang-format on } @@ -1179,6 +1218,25 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, #define GEMM_SDOT_INT8_OUT \ GEMM_SDOT_CVT_INT32_TO_FP32 \ GEMM_SDOT_RELU \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v8.4s, v6.4s\n" \ + "fcmge v1.4s, v9.4s, v6.4s\n" \ + "fcmge v2.4s, v10.4s, v6.4s\n" \ + "fcmge v3.4s, v11.4s, v6.4s\n" \ + "fcmge v4.4s, v12.4s, v6.4s\n" \ + "fcmge v5.4s, v13.4s, v6.4s\n" \ + "fcmge v7.4s, v14.4s, v6.4s\n" \ + /* choose data */ \ + "bif v8.16b, v6.16b, v0.16b\n" \ + "fcmge v0.4s, v15.4s, v6.4s\n" \ + "bif v9.16b, v6.16b, v1.16b\n" \ + "bif v10.16b, v6.16b, v2.16b\n" \ + "bif v11.16b, v6.16b, v3.16b\n" \ + "bif v12.16b, v6.16b, v4.16b\n" \ + "bif v13.16b, v6.16b, v5.16b\n" \ + "bif v14.16b, v6.16b, v7.16b\n" \ + "bif v15.16b, v6.16b, v0.16b \n" \ "fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \ "fcvtas v1.4s, v9.4s\n" /* 01, cvt to int */ \ "fcvtas v2.4s, v10.4s\n" /* 02, cvt to int */ \ @@ -1194,7 +1252,30 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "sqxtn2 v12.8h, v4.4s\n" /* 11, cvt int32 to int16 */ \ "sqxtn v13.4h, v5.4s\n" /* 12, cvt int32 to int16 */ \ "sqxtn v14.4h, v6.4s\n" /* 20, cvt int32 to int16 */ \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ "sqxtn2 v14.8h, v7.4s\n" /* 21, cvt int32 to int16 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v16.4s, v6.4s\n" \ + "fcmge v1.4s, v17.4s, v6.4s\n" \ + "fcmge v2.4s, v18.4s, v6.4s\n" \ + "fcmge v3.4s, v19.4s, v6.4s\n" \ + "fcmge v4.4s, v20.4s, v6.4s\n" \ + "fcmge v5.4s, v21.4s, v6.4s\n" \ + "fcmge v7.4s, v22.4s, v6.4s\n" \ + "fcmge v8.4s, v23.4s, v6.4s\n" \ + "fcmge v9.4s, v24.4s, v6.4s\n" \ + /* choose data */ \ + "bif v16.16b, v6.16b, v0.16b\n" \ + "fcmge v0.4s, v25.4s, v6.4s\n" \ + "bif v17.16b, v6.16b, v1.16b\n" \ + "bif v18.16b, v6.16b, v2.16b\n" \ + "bif v19.16b, v6.16b, v3.16b\n" \ + "bif v20.16b, v6.16b, v4.16b\n" \ + "bif v21.16b, v6.16b, v5.16b\n" \ + "bif v22.16b, v6.16b, v7.16b\n" \ + "bif v23.16b, v6.16b, v8.16b\n" \ + "bif v24.16b, v6.16b, v9.16b\n" \ + "bif v25.16b, v6.16b, v0.16b\n" \ "fcvtas v0.4s, v16.4s\n" /* 22, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 30, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 31, cvt to int */ \ @@ -1214,7 +1295,22 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "sqxtn v19.4h, v6.4s\n" /* 42, cvt int32 to int16 */ \ "sqxtn v20.4h, v7.4s\n" /* 50, cvt int32 to int16 */ \ "sqxtn2 v20.8h, v8.4s\n" /* 51, cvt int32 to int16 */ \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ "sqxtn v21.4h, v9.4s\n" /* 52, cvt int32 to int16 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v26.4s, v6.4s\n" \ + "fcmge v1.4s, v27.4s, v6.4s\n" \ + "fcmge v2.4s, v28.4s, v6.4s\n" \ + "fcmge v3.4s, v29.4s, v6.4s\n" \ + "fcmge v4.4s, v30.4s, v6.4s\n" \ + "fcmge v5.4s, v31.4s, 
v6.4s\n" \ + /* choose data */ \ + "bif v26.16b, v6.16b, v0.16b\n" \ + "bif v27.16b, v6.16b, v1.16b\n" \ + "bif v28.16b, v6.16b, v2.16b\n" \ + "bif v29.16b, v6.16b, v3.16b\n" \ + "bif v30.16b, v6.16b, v4.16b\n" \ + "bif v31.16b, v6.16b, v5.16b\n" \ "fcvtas v0.4s, v26.4s\n" /* 60, cvt to int */ \ "fcvtas v1.4s, v27.4s\n" /* 61, cvt to int */ \ "fcvtas v2.4s, v28.4s\n" /* 62, cvt to int */ \ @@ -1318,6 +1414,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, int k, int tail) { // clang-format off + float32_t vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -1331,7 +1428,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, [c_ptr5] "+r"(c_ptr5), [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) + : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu), [vmax] "r"(vmax) : "cc","memory","v0","v1","v2","v3", "v4","v5","v6","v7","v8","v9","v10", "v11","v12","v13","v14","v15","v16","v17", @@ -1614,6 +1711,24 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "vadd.f32 q3, q11, q3\n" /* r21, add offset */ \ "vadd.f32 q4, q12, q4\n" /* r30, add offset */ \ "vadd.f32 q5, q13, q5\n" /* r31, add offset */ \ + "vld1.32 {d12-d13}, [%[vmax]]\n" /* set q4 = -127 \n"*/ \ + "vcge.f32 q7, q8, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q10, q9, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q11, q0, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q12, q1, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q13, q2, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q14, q3, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q15, q4, q6\n" /* @ q8 >= -127 \n */ \ + /* choose data */ \ + "vbif q8, q6, q7\n" /* @ choose */ \ + "vcge.f32 q7, q5, q6\n" /* @ q8 >= -127 \n */ \ + "vbif q9, q6, q10\n" /* @ choose */ \ + "vbif q0, q6, q11\n" /* @ choose */ \ + "vbif q1, q6, q12\n" /* @ choose */ \ + "vbif q2, q6, q13\n" /* @ choose */ \ + "vbif q3, q6, q14\n" /* @ choose */ \ + "vbif q4, q6, q15\n" /* @ choose */ \ + "vbif q5, q6, q7\n" /* @ choose */ \ "vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \ "vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \ "vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \ @@ -1682,7 +1797,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "q14", "q15", "r0", - "cc"); + "cc", + "memory"); } template <> @@ -1697,6 +1813,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, bool is_relu, int k, int rem) { + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -1708,6 +1825,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), + [vmax] "r"(vmax), [scale] "r"(scale) : "q0", "q1", @@ -1726,7 +1844,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "q14", "q15", "r0", - "cc"); + "cc", + "memory"); } #endif // __aarch64__ // NOLINT diff --git a/lite/backends/arm/math/gemv_arm_int8.cc b/lite/backends/arm/math/gemv_arm_int8.cc index dab42cdeca28d40622590632985603ce8eab1fb9..98c50de9e370fbe39c35156bf631b35362ff21b4 100644 --- a/lite/backends/arm/math/gemv_arm_int8.cc +++ b/lite/backends/arm/math/gemv_arm_int8.cc @@ -79,6 +79,7 @@ inline void write_gemv_out(const int* in, for (int i = 0; i < size; ++i) { out[0] = saturate_cast(roundf(*(in++) * *(scale++) + *(bias++))); + out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127 if (flag_relu) { out[0] = out[0] > 0 ? 
out[0] : 0; } @@ -87,6 +88,7 @@ inline void write_gemv_out(const int* in, } else { for (int i = 0; i < size; ++i) { out[0] = saturate_cast(roundf(*(in++) * *(scale++))); + out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127 if (flag_relu) { out[0] = out[0] > 0 ? out[0] : 0; } diff --git a/lite/backends/arm/math/increment.cc b/lite/backends/arm/math/increment.cc index 583ff52077e720510e66fcdb9604d1dc8992a90d..62c4f41eacda0356ca3967af877244856b3156d7 100644 --- a/lite/backends/arm/math/increment.cc +++ b/lite/backends/arm/math/increment.cc @@ -20,18 +20,7 @@ namespace paddle { namespace lite { namespace arm { -namespace math { -void increment(const float* input, - const int n, - const float step, - float* out, - Context* ctx) { - for (int i = 0; i < n; i++) { - out[i] = input[i] + step; - } -} - -} // namespace math +namespace math {} // namespace math } // namespace arm } // namespace lite } // namespace paddle diff --git a/lite/backends/arm/math/increment.h b/lite/backends/arm/math/increment.h index 028db0fd55e9507aa4f161339e4a8b0cd2e59ffe..ec6217d105bb73b5ab230518876471af91880d2d 100644 --- a/lite/backends/arm/math/increment.h +++ b/lite/backends/arm/math/increment.h @@ -21,11 +21,16 @@ namespace paddle { namespace lite { namespace arm { namespace math { -void increment(const float* input, +template +void increment(const T* input, const int n, const float step, - float* out, - Context* ctx); + T* out, + Context* ctx) { + for (int i = 0; i < n; i++) { + out[i] = input[i] + static_cast(step); + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc index fd9126ab48c8f829c82d0c78a338074c695f0b9c..214c386d553e3d5548bb4750c3130191a650830f 100644 --- a/lite/backends/arm/math/layout.cc +++ b/lite/backends/arm/math/layout.cc @@ -358,6 +358,8 @@ void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { "v14", "v15"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C8 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -375,6 +377,7 @@ void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { [stride_w] "+r"(stride_w) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel @@ -478,6 +481,8 @@ void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { "v10", "v11"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C4 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -491,6 +496,7 @@ void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { [stride] "+r"(stride) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } for (int i = 0; i < remain; i++) { @@ -593,6 +599,8 @@ void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { "v14", "v15"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C8 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -610,6 +618,7 @@ void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { [stride_w] "+r"(stride_w) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } for (int i = 0; i < remain; i++) { diff --git a/lite/backends/arm/math/lstm.cc b/lite/backends/arm/math/lstm.cc new file mode 100644 index 0000000000000000000000000000000000000000..cd8e012a287437ac9527ca510f927be30d825f0c --- /dev/null +++ 
b/lite/backends/arm/math/lstm.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/lstm.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void add_bias_rowwise(Tensor* input, + const Tensor* bias, + int start_w, + int end_w) { + auto in_dim = input->dims(); + int width = input->numel() / in_dim[0]; + int w_adds = width < end_w ? width : end_w; + float* i_data = input->mutable_data(); + const float* b_data = bias->data(); + for (int i = 0; i < in_dim[0]; ++i) { + for (int w = start_w; w < w_adds; ++w) { + i_data[w] += b_data[w]; + } + i_data += width; + } +} +void vector_dot( + float* out, const float* in, const float* v1, int size, const float* v2) { + int loop = size >> 2; + int remain = size & 3; + const float* in_ptr = in; + float* out_ptr = out; + const float* v1_ptr = v1; + const float* v2_ptr = v2; + for (int i = 0; i < loop; ++i) { + float32x4_t in = vld1q_f32(in_ptr); + float32x4_t data1 = vld1q_f32(v1_ptr); + if (!v2) { + // in_out * v1 + float32x4_t out = vmulq_f32(in, data1); + vst1q_f32(out_ptr, out); + in_ptr += 4; + v1_ptr += 4; + out_ptr += 4; + } else { + // in_out + v1 * v2 + float32x4_t data2 = vld1q_f32(v2_ptr); + float32x4_t out = vmlaq_f32(in, data1, data2); + vst1q_f32(out_ptr, out); + in_ptr += 4; + v1_ptr += 4; + out_ptr += 4; + v2_ptr += 4; + } + } + for (int i = 0; i < remain; ++i) { + if (!v2) { + out_ptr[i] = in_ptr[i] * v1_ptr[i]; + } else { + out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/lstm.h b/lite/backends/arm/math/lstm.h new file mode 100644 index 0000000000000000000000000000000000000000..e04581b055a93ac09da5ec6d5d57263fa2ad6261 --- /dev/null +++ b/lite/backends/arm/math/lstm.h @@ -0,0 +1,137 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
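The vector_dot helper added in lstm.cc above has two modes, selected by whether v2 is null: a plain elementwise product, or a fused in + v1 * v2. A minimal scalar sketch of that contract (illustrative only; the NEON path above is the shipped implementation):

// Scalar reference for vector_dot (illustrative name, same semantics).
static void vector_dot_ref(float* out, const float* in, const float* v1,
                           int size, const float* v2 = nullptr) {
  for (int i = 0; i < size; ++i) {
    out[i] = v2 ? in[i] + v1[i] * v2[i]  // fused: in + v1 * v2
                : in[i] * v1[i];         // plain elementwise product
  }
}

LstmUnitFunctor in lstm.h below uses both forms, e.g. vector_dot(state, value_in, value_ig, frame_size) for the candidate times input-gate product, then vector_dot(state, state, prev_state, frame_size, value_fg) to add the forget-gate contribution.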
+ +#pragma once + +#include +#include +#include "lite/backends/arm/math/activation.h" +#include "lite/core/tensor.h" +#include "lite/utils/logging.h" +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void add_bias_rowwise(Tensor* input, + const Tensor* bias, + int start_w, + int end_w); + +inline float* row_offset(Tensor& input, int start) { // NOLINT + auto in_dim = input.dims(); + int width = input.numel() / in_dim[0]; + int offset = start < in_dim[0] ? start * width : input.numel(); + return input.mutable_data() + offset; +} +template +struct LstmMetaValue { + T* gate_value; + T* prev_state_value; + T* state_value; + T* state_active_value; + T* output_value; + T* check_ig; + T* check_fg; + T* check_og; +}; + +template +void activation( + const T* din, T* dout, int size, std::string act_str, int threads) { + if (act_str == "sigmoid") { + act_sigmoid(din, dout, size, threads); + } else if (act_str == "tanh") { + act_tanh(din, dout, size, threads); + } else if (act_str == "relu") { + act_relu(din, dout, size, threads); + } else { + LOG(FATAL) << "unsupport activation " << act_str; + } +} + +void vector_dot(float* out, + const float* in, + const float* v1, + int size, + const float* v2 = nullptr); + +template +struct LstmUnitFunctor { + static void compute(LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + std::string gate_act, + std::string cell_act, + std::string cand_act, + int threads) { + for (int b = 0; b < batch_size; ++b) { + const int temp_len = frame_size; + float zero_ptr[temp_len]; // NOLINT + memset(zero_ptr, 0, sizeof(float) * temp_len); + + T* value_in = value.gate_value; + T* value_ig = value_in + frame_size; + T* value_fg = value_ig + frame_size; + T* value_og = value_fg + frame_size; + T* state = value.state_value; + T* state_act = value.state_active_value; + T* output = value.output_value; + + T* check_i = value.check_ig ? value.check_ig : zero_ptr; + T* check_f = value.check_fg ? value.check_fg : zero_ptr; + T* check_o = value.check_og ? value.check_og : zero_ptr; + T* prev_state = + value.prev_state_value ? 
value.prev_state_value : zero_ptr; + + activation(value_in, value_in, frame_size, gate_act, threads); + vector_dot(value_ig, value_ig, prev_state, frame_size, check_i); + vector_dot(value_fg, value_fg, prev_state, frame_size, check_f); + activation(value_ig, value_ig, frame_size, cell_act, threads); + activation(value_fg, value_fg, frame_size, cell_act, threads); + vector_dot(state, value_in, value_ig, frame_size); + vector_dot(state, state, prev_state, frame_size, value_fg); + + for (int i = 0; i < frame_size; ++i) { + if (cell_clip > 0.0) { + if (state[i] < -1.0 * cell_clip) { + state[i] = -1.0 * cell_clip; + } + if (state[i] > cell_clip) { + state[i] = cell_clip; + } + } + } + + vector_dot(value_og, value_og, state, frame_size, check_o); + activation(value_og, value_og, frame_size, cell_act, threads); + activation(state, state_act, frame_size, cand_act, threads); + vector_dot(value.output_value, value_og, state_act, frame_size); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index cb9c049d81aee73b65bacd27a64138779d1532cc..2e869f2df3a292b264dae948f13c64e05854d052 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -72,6 +72,7 @@ void pack_trans_m4(float *out, int mmax, int k0, int kmax); + void sgemm_prepacked_4x4(bool is_transB, int M, int N, @@ -154,6 +155,20 @@ void sgemm_prepacked_4x8(bool is_transB, bool has_bias, const operators::ActivationParam act_param, ARMContext *ctx); +// for kA53 +void sgemm_prepacked_6x8_a53(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float *C, + int ldc, + const float *bias, + bool has_bias, + int is_relu, + ARMContext *ctx); #endif // __aarch64__ /** @@ -300,6 +315,44 @@ void sgemm_prepack(bool is_transB, has_bias, act_param, ctx); + } else if (ctx->arch() == kA53) { + auto act_type = act_param.active_type; + bool has_act = act_param.has_active; + bool act_flag = + (has_act == false) || + (has_act == true && act_type == lite_api::ActivationType::kRelu); + bool has_beta = fabsf(beta) > 1e-8f ? true : false; + bool a53_sgemm = act_flag && !has_beta; + if (a53_sgemm) { + sgemm_prepacked_6x8_a53(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + C, + ldc, + bias, + has_bias, + static_cast(has_act), + ctx); + } else { + sgemm_prepacked_6x8(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + act_param, + ctx); + } } else { sgemm_prepacked_6x8(is_transB, M, @@ -2289,6 +2342,29 @@ void sgemm_prepacked_8x12(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); int threads = ctx->threads(); + + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK)); x_block /= NBLOCK; @@ -2837,7 +2913,172 @@ void sgemm_prepacked_8x12(bool is_transB, "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "11: \n" /* check if relu */ + + "11: \n" /* check activation */ + "cmp %w[flag_act], #1 \n" /* check if has relu */ + "bne 12f \n" /* jump if no relu */ + "movi v0.4s, #0 \n" /* for relu*/ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ + "fmax v12.4s, v12.4s, v0.4s \n" /* relu*/ + "fmax v13.4s, v13.4s, v0.4s \n" /* relu*/ + "fmax v14.4s, v14.4s, v0.4s \n" /* relu*/ + "fmax v15.4s, v15.4s, v0.4s \n" /* relu*/ + "fmax v16.4s, v16.4s, v0.4s \n" /* relu*/ + "fmax v17.4s, v17.4s, v0.4s \n" /* relu*/ + "fmax v18.4s, v18.4s, v0.4s \n" /* relu*/ + "fmax v19.4s, v19.4s, v0.4s \n" /* relu*/ + "fmax v20.4s, v20.4s, v0.4s \n" /* relu*/ + "fmax v21.4s, v21.4s, v0.4s \n" /* relu*/ + "fmax v22.4s, v22.4s, v0.4s \n" /* relu*/ + "fmax v23.4s, v23.4s, v0.4s \n" /* relu*/ + "fmax v24.4s, v24.4s, v0.4s \n" /* relu*/ + "fmax v25.4s, v25.4s, v0.4s \n" /* relu*/ + "fmax v26.4s, v26.4s, v0.4s \n" /* relu*/ + "fmax v27.4s, v27.4s, v0.4s \n" /* relu*/ + "fmax v28.4s, v28.4s, v0.4s \n" /* relu*/ + "fmax v29.4s, v29.4s, v0.4s \n" /* relu*/ + "fmax v30.4s, v30.4s, v0.4s \n" /* relu*/ + "fmax v31.4s, v31.4s, v0.4s \n" /* relu*/ + "b 20f \n" /* relu end */ + //! no act + "12: \n" /* no relu */ + "cmp %w[flag_act], #0 \n" /* check no act */ + "beq 20f \n" /* no act end */ + //! 
relu6 + "cmp %w[flag_act], #2 \n" /* check if has relu6 */ + "bne 13f \n" /* jump if no relu6 */ + "movi v0.4s, #0 \n" /* for relu6 */ + "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ + "fmax v12.4s, v12.4s, v0.4s \n" /* relu6 */ + "fmax v13.4s, v13.4s, v0.4s \n" /* relu6 */ + "fmax v14.4s, v14.4s, v0.4s \n" /* relu6 */ + "fmax v15.4s, v15.4s, v0.4s \n" /* relu6 */ + "fmax v16.4s, v16.4s, v0.4s \n" /* relu6 */ + "fmax v17.4s, v17.4s, v0.4s \n" /* relu6 */ + "fmax v18.4s, v18.4s, v0.4s \n" /* relu6 */ + "fmax v19.4s, v19.4s, v0.4s \n" /* relu6 */ + "fmax v20.4s, v20.4s, v0.4s \n" /* relu6 */ + "fmax v21.4s, v21.4s, v0.4s \n" /* relu6 */ + "fmax v22.4s, v22.4s, v0.4s \n" /* relu6 */ + "fmax v23.4s, v23.4s, v0.4s \n" /* relu6 */ + "fmax v24.4s, v24.4s, v0.4s \n" /* relu6 */ + "fmax v25.4s, v25.4s, v0.4s \n" /* relu6 */ + "fmax v26.4s, v26.4s, v0.4s \n" /* relu6 */ + "fmax v27.4s, v27.4s, v0.4s \n" /* relu6 */ + "fmax v28.4s, v28.4s, v0.4s \n" /* relu6 */ + "fmax v29.4s, v29.4s, v0.4s \n" /* relu6 */ + "fmax v30.4s, v30.4s, v0.4s \n" /* relu6 */ + "fmax v31.4s, v31.4s, v0.4s \n" /* relu6 */ + "fmin v8.4s, v8.4s, v1.4s \n" /* relu6 */ + "fmin v9.4s, v9.4s, v1.4s \n" /* relu6 */ + "fmin v10.4s, v10.4s, v1.4s \n" /* relu6 */ + "fmin v11.4s, v11.4s, v1.4s \n" /* relu6 */ + "fmin v12.4s, v12.4s, v1.4s \n" /* relu6 */ + "fmin v13.4s, v13.4s, v1.4s \n" /* relu6 */ + "fmin v14.4s, v14.4s, v1.4s \n" /* relu6 */ + "fmin v15.4s, v15.4s, v1.4s \n" /* relu6 */ + "fmin v16.4s, v16.4s, v1.4s \n" /* relu6 */ + "fmin v17.4s, v17.4s, v1.4s \n" /* relu6 */ + "fmin v18.4s, v18.4s, v1.4s \n" /* relu6 */ + "fmin v19.4s, v19.4s, v1.4s \n" /* relu6 */ + "fmin v20.4s, v20.4s, v1.4s \n" /* relu6 */ + "fmin v21.4s, v21.4s, v1.4s \n" /* relu6 */ + "fmin v22.4s, v22.4s, v1.4s \n" /* relu6 */ + "fmin v23.4s, v23.4s, v1.4s \n" /* relu6 */ + "fmin v24.4s, v24.4s, v1.4s \n" /* relu6 */ + "fmin v25.4s, v25.4s, v1.4s \n" /* relu6 */ + "fmin v26.4s, v26.4s, v1.4s \n" /* relu6 */ + "fmin v27.4s, v27.4s, v1.4s \n" /* relu6 */ + "fmin v28.4s, v28.4s, v1.4s \n" /* relu6 */ + "fmin v29.4s, v29.4s, v1.4s \n" /* relu6 */ + "fmin v30.4s, v30.4s, v1.4s \n" /* relu6 */ + "fmin v31.4s, v31.4s, v1.4s \n" /* relu6 */ + "b 20f \n" /* relu6 end */ + //! 
leakey relu + "13: \n" /* otherwise is leakey relu */ + "movi v0.4s, #0 \n" /* for leakey relu */ + "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ + "bif v11.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v12.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v12.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v13.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v13.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v14.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v14.4s, v1.4s \n" /* vmulq_f32 */ + "bif v12.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v13.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v14.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v15.4s, v1.4s \n" /* vmulq_f32 */ + "bif v15.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ + "bif v16.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v17.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v18.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ + "bif v19.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ + "bif v24.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v25.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v26.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ + "bif v27.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ + "bif v28.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v29.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v30.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ 
+ "fmul v3.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ + "bif v31.16b, v3.16b, v2.16b \n" /* choose*/ + "20: \n" /* act end */ + "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ @@ -2861,7 +3102,9 @@ void sgemm_prepacked_8x12(bool is_transB, [c_ptr7] "+r"(c_ptr7) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13", @@ -2884,13 +3127,6 @@ void sgemm_prepacked_8x12(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float *dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } void sgemm_prepacked_4x4(bool is_transB, @@ -2911,6 +3147,28 @@ void sgemm_prepacked_4x4(bool is_transB, auto workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } const int n_block = 4; const int m_block = 4; //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 @@ -3137,7 +3395,51 @@ void sgemm_prepacked_4x4(bool is_transB, "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ - "11: \n" /* check if relu */ + "11: \n" /* check activation */ + "cmp %w[flag_act], #1 \n" /* check if has relu */ + "bne 12f \n" /* jump if no relu */ + "movi v0.4s, #0 \n" /* for relu*/ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ + "b 20f \n" /* relu end */ + //! no act + "12: \n" /* no relu */ + "cmp %w[flag_act], #0 \n" /* check no act */ + "beq 20f \n" /* no act end */ + //! relu6 + "cmp %w[flag_act], #2 \n" /* check if has relu6 */ + "bne 13f \n" /* jump if no relu6 */ + "movi v0.4s, #0 \n" /* for relu6 */ + "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ + + "fmin v8.4s, v8.4s, v1.4s \n" /* relu6*/ + "fmin v9.4s, v9.4s, v1.4s \n" /* relu6*/ + "fmin v10.4s, v10.4s, v1.4s \n" /* relu6*/ + "fmin v11.4s, v11.4s, v1.4s \n" /* relu6*/ + "b 20f \n" /* relu6 end */ + //! 
leakey relu + "13: \n" /* otherwise is leakey relu */ + "movi v0.4s, #0 \n" /* for leakey relu */ + "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v12.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v13.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ + "bif v11.16b, v13.16b, v12.16b \n" /* choose*/ + "20: \n" /* act end */ "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ @@ -3153,7 +3455,9 @@ void sgemm_prepacked_4x4(bool is_transB, [c_ptr3] "+r"(c_ptr3) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11"); @@ -3169,13 +3473,6 @@ void sgemm_prepacked_4x4(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float *dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } #else // __aarch64__ /** @@ -3206,6 +3503,28 @@ void sgemm_prepacked_6x8(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); @@ -3223,6 +3542,8 @@ void sgemm_prepacked_6x8(bool is_transB, tail_pre = KBLOCK; } + //! merge tail_pre and flag_act + tail_pre = (tail_pre << 2 | flag_act); bool flag_p_remain = false; int remain = 0; @@ -3456,13 +3777,14 @@ void sgemm_prepacked_6x8(bool is_transB, "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" + "subs %[k], %[k], #1 @ k--\n" "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" + "bne 1b @ jump to main loop\n" + "0: @ process tail\n" + "sub %[tails], %[tails], #4 @ tail--\n" + "cmp %[tails], #4 @ cmp with act bits\n" + "blt 3f @ jump to tail = 1\n" /* Unroll 0*/ "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
@ load b2\n" "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" @@ -3471,9 +3793,10 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "subs %[tails], %[tails], #1 @ tail--\n" + "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" @@ -3482,16 +3805,17 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 4f @ jump to tail==2\n" + "blt 4f @ jump to tail==2\n" /* Unroll 1*/ "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" + "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" @@ -3500,8 +3824,9 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 5f @ jump to tail==3\n" + "blt 5f @ jump to tail==3\n" /* Unroll 2 */ + "sub %[tails], %[tails], #4 @ tail--\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4,a5, a0,a1\n" "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" @@ -3579,7 +3904,99 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "2: @ check relu\n" + "2: @ check activation\n" + //! relu + "cmp %[tails], #1 @ check if has relu\n" + "bne 6f @ jump if not relu \n" + "vmov.u32 q0, #0 @ for relu\n" + "vmax.f32 q4, q4, q0 @ for relu\n" + "vmax.f32 q5, q5, q0 @ for relu\n" + "vmax.f32 q6, q6, q0 @ for relu\n" + "vmax.f32 q7, q7, q0 @ for relu\n" + "vmax.f32 q8, q8, q0 @ for relu\n" + "vmax.f32 q9, q9, q0 @ for relu\n" + "vmax.f32 q10, q10, q0 @ for relu\n" + "vmax.f32 q11, q11, q0 @ for relu\n" + "vmax.f32 q12, q12, q0 @ for relu\n" + "vmax.f32 q13, q13, q0 @ for relu\n" + "vmax.f32 q14, q14, q0 @ for relu\n" + "vmax.f32 q15, q15, q0 @ for relu\n" + "b 10f @ relu end\n" + "6: @ no relu \n" + "cmp %[tails], #0 @ check no act\n" + "beq 10f @ no act end \n" + //! 
relu6 + "cmp %[tails], #2 @ check if has relu6\n" + "bne 7f @ jump if no relu6 \n" + "vmov.u32 q0, #0 @ for relu6\n" + "vmax.f32 q4, q4, q0 @ for relu6\n" + "vmax.f32 q5, q5, q0 @ for relu6\n" + "vmax.f32 q6, q6, q0 @ for relu6\n" + "vmax.f32 q7, q7, q0 @ for relu6\n" + "vmax.f32 q8, q8, q0 @ for relu6\n" + "vmax.f32 q9, q9, q0 @ for relu6\n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" + "vmax.f32 q10, q10, q0 @ for relu6\n" + "vmax.f32 q11, q11, q0 @ for relu6\n" + "vmax.f32 q12, q12, q0 @ for relu6\n" + "vmax.f32 q13, q13, q0 @ for relu6\n" + "vmax.f32 q14, q14, q0 @ for relu6\n" + "vmax.f32 q15, q15, q0 @ for relu6\n" + + "vmin.f32 q4, q4, q1 @ for relu6\n" + "vmin.f32 q5, q5, q1 @ for relu6\n" + "vmin.f32 q6, q6, q1 @ for relu6\n" + "vmin.f32 q7, q7, q1 @ for relu6\n" + "vmin.f32 q8, q8, q1 @ for relu6\n" + "vmin.f32 q9, q9, q1 @ for relu6\n" + "vmin.f32 q10, q10, q1 @ for relu6\n" + "vmin.f32 q11, q11, q1 @ for relu6\n" + "vmin.f32 q12, q12, q1 @ for relu6\n" + "vmin.f32 q13, q13, q1 @ for relu6\n" + "vmin.f32 q14, q14, q1 @ for relu6\n" + "vmin.f32 q15, q15, q1 @ for relu6\n" + "b 10f @ relu6 end \n" + //! leakey relu + "7: @ otherwise is leakey relu\n" + "vmov.u32 q0, #0 @ for leakey relu \n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" + "vcge.f32 q2, q4, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q4, q1 @ vmulq_f32 \n" + "vbif q4, q3, q2 @ choose \n" + "vcge.f32 q2, q5, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q5, q1 @ vmulq_f32 \n" + "vbif q5, q3, q2 @ choose \n" + "vcge.f32 q2, q6, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q6, q1 @ vmulq_f32 \n" + "vbif q6, q3, q2 @ choose \n" + "vcge.f32 q2, q7, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q7, q1 @ vmulq_f32 \n" + "vbif q7, q3, q2 @ choose \n" + "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" + "vbif q8, q3, q2 @ choose \n" + "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" + "vbif q9, q3, q2 @ choose \n" + "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" + "vbif q10, q3, q2 @ choose \n" + "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" + "vbif q11, q3, q2 @ choose \n" + "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" + "vbif q12, q3, q2 @ choose \n" + "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" + "vbif q13, q3, q2 @ choose \n" + "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" + "vbif q14, q3, q2 @ choose \n" + "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" + "vbif q15, q3, q2 @ choose \n" + "10: @ act end \n" "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d16-d19}, [%[c_ptr2]]! 
@ store r2\n" @@ -3597,7 +4014,8 @@ void sgemm_prepacked_6x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r" (alpha) : "q0","q1","q2","q3","q4", "q5","q6","q7","q8","q9","q10","q11", "q12","q13","q14","q15","cc","memory"); @@ -3616,11 +4034,470 @@ void sgemm_prepacked_6x8(bool is_transB, } } } - if (act_param.has_active) { +} + +/** + * \brief gemm with ablock = 6, bblock = 8, output 6x8, optimize for a53 arch + * @param A + * @param B + * @param C + * @param M + * @param N + * @param K + * @param threads + * @param workspace + */ +void sgemm_prepacked_6x8_a53(bool is_transB, + int M, + int N, + int K, + const float* A_packed, + const float* B, + int ldb, + float* C, + int ldc, + const float* bias, + bool has_bias, + int is_relu, + ARMContext* ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto* workspace = ctx->workspace_data(); + int threads = ctx->threads(); + //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = + (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); + x_block /= NBLOCK; + x_block *= NBLOCK; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + NBLOCK - 1) / NBLOCK; + x_block *= NBLOCK; + x_block = x_block < NBLOCK ? NBLOCK : x_block; + + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + int tail_pre = (K & (KBLOCK - 1)); + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + //! merge tail_pre and flag_act + tail_pre = (tail_pre << 2 | is_relu); + bool flag_p_remain = false; + int remain = 0; + + //! apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; + remain = xmax - x0 - (bblocks - 1) * NBLOCK; + if (remain > 0) { + flag_p_remain = true; + } + //! 
load bpanel + auto b_pannel = static_cast(workspace); + if (is_transB) { + loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); + } else { + loadb(b_pannel, B, ldb, 0, K, x0, xmax); + } #pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float* dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); + for (unsigned int y = 0; y < M; y += MBLOCK_OTH) { + unsigned int ymax = y + MBLOCK_OTH; + if (ymax > M) { + ymax = M; + } + float* c_ptr0 = C + y * ldc + x0; + float* c_ptr1 = c_ptr0 + ldc; + float* c_ptr2 = c_ptr1 + ldc; + float* c_ptr3 = c_ptr2 + ldc; + float* c_ptr4 = c_ptr3 + ldc; + float* c_ptr5 = c_ptr4 + ldc; + + float* pout0 = c_ptr0; + float* pout1 = c_ptr1; + float* pout2 = c_ptr2; + float* pout3 = c_ptr3; + float* pout4 = c_ptr4; + float* pout5 = c_ptr5; + + float bias_local[6] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + bias_local[4] = bias[y + 4]; + bias_local[5] = bias[y + 5]; + } + + float cout0[NBLOCK]; + float cout1[NBLOCK]; + float cout2[NBLOCK]; + float cout3[NBLOCK]; + float cout4[NBLOCK]; + float cout5[NBLOCK]; + + const float* a_ptr_l = A_packed + y * K; + const float* b_ptr = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 5) >= ymax) { + switch ((y + 5) - ymax) { + case 4: + c_ptr1 = cout1; + case 3: + c_ptr2 = cout2; + case 2: + c_ptr3 = cout3; + case 1: + c_ptr4 = cout4; + case 0: + c_ptr5 = cout5; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + pout4 = c_ptr4; + pout5 = c_ptr5; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + c_ptr4 = cout4; + c_ptr5 = cout5; + } + const float* a_ptr = a_ptr_l; + int tails = tail_pre; + int k = k_pre; + + // clang-format off + asm volatile( + // sgemm 6x8 for a53 + "vld1.32 {d2-d3}, [%[bias_ptr]] \n" /* load bias0-3 to d2,d3 */ + "vdup.i32 q4, d2[0] \n" /* set out00 to bias0 */ + "vld1.32 {d0-d1}, [%[a_ptr] :64] \n" /* load a00-a30 to d0,d1 */ + "vdup.i32 q5, d2[0] \n" /* set out01 to bias0 */ + "vld1.32 {d4-d5}, [%[b_ptr] :128] \n" /* load b00-b03 to d4,d5 */ + "vdup.i32 q6, d2[1] \n" /* set out10 to bias1 */ + "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ + "vdup.i32 q7, d2[1] \n" /* set out11 to bias1 */ + "ldr r1, [%[a_ptr], #0x14] \n" /* load a50 to r1 */ + "vdup.i32 q8, d3[0] \n" /* set out20 to bias2 */ + "vldr d6, [%[bias_ptr], #0x10] \n" /* load bias 4,5 to d6 */ + "pld [%[a_ptr], #0x40] \n" /* pre load apanel */ + "vdup.i32 q9, d3[0] \n" /* set out21 to bias2 */ + "pld [%[b_ptr], #0x40] \n" /* pre load bpanel */ + "vdup.i32 q10, d3[1] \n" /* set out30 to bias3 */ + "pld [%[a_ptr], #0x80] \n" /* pre load apanel */ + "vdup.i32 q11, d3[1] \n" /* set out31 to bias3 */ + "pld [%[b_ptr], #0x80] \n" /* pre load bpanel */ + "vdup.i32 q12, d6[0] \n" /* set out40 to bias4 */ + "vdup.i32 q13, d6[0] \n" /* set out41 to bias4 */ + "pld [%[a_ptr], #0xC0] \n" /* pre load apanel */ + "vdup.i32 q14, d6[1] \n" /* set out50 to bias5 */ + "pld [%[b_ptr], #0XC0] \n" /* pre load bpanel */ + "vdup.i32 q15, d6[1] \n" /* set out51 to bias5 */ + "cmp %[k], #0 \n" /* check k loop */ + "beq 6f \n" /* k==0, branch to 6 */ + "1:\n" + /* Unroll 0 */ + "vldr d6, [%[b_ptr], #0x10] \n" /* load b04, b05 to d6 */ + "vmov d2, r0, r1 \n" /* mov a40, a50 to d2 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "ldr r0, [%[b_ptr], #0x18] \n" /* load b06 to r0 */ + 
"vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "ldr r1, [%[b_ptr], #0x1C] \n" /* load b07 to r1 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vldr d3, [%[a_ptr], #0x18] \n" /* load a01, a11 to d3 */ + "vmov d7, r0, r1 \n" /* mov b06, b07 to d7 */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "pld [%[a_ptr], #0x100] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vldr d4, [%[b_ptr], #0x20] \n" /* load b10, b11 to d4 */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "ldr r0, [%[b_ptr], #0x28] \n" /* load b12 to r0 */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "ldr r1, [%[b_ptr], #0x2C] \n" /* load b13 to r1 */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vldr d0, [%[a_ptr], #0x20] \n" /* load a21, a31 to d0 */ + "vmov d5, r0, r1 \n" /* mov b12, b13 to d5 */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "ldr r0, [%[a_ptr], #0x28] \n" /* load a41 to r0 */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "ldr r1, [%[a_ptr], #0x2C] \n" /* load a51 to r1 */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + /* Unroll 1 */ + "vldr d6, [%[b_ptr], #0x30] \n" /* load b14, b15 to d6 */ + "vmov d1, r0, r1 \n" /* mov a41, a51 to d1 */ + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "ldr r0, [%[b_ptr], #0x38] \n" /* load b16 to r0 */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "ldr r1, [%[b_ptr], #0x3C] \n" /* load b17 to r1 */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ + "vldr d2, [%[a_ptr], #0x30] \n" /* load a02, a12 to d0 */ + "vmov d7, r0, r1 \n" /* mov b16, b17 to d7 */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "pld [%[b_ptr], #0x100] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vldr d4, [%[b_ptr], #0x40] \n" /* load b20, b21 to d4 */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "ldr r0, [%[b_ptr], #0x48] \n" /* load b22 to r0 */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "ldr r1, [%[b_ptr], #0x4C] \n" /* load b23 to r1 */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vldr d3, [%[a_ptr], #0x38] \n" /* load a22, a32 to d3 */ + "vmov d5, r0, r1 \n" /* mov b22, b23 to d5 */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "ldr r0, [%[a_ptr], #0x40] \n" /* load a42 to r0 */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "ldr r1, [%[a_ptr], #0x44] \n" /* load a52 to r1 */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + /* Unroll 2 */ + "vldr d6, [%[b_ptr], #0x50] \n" /* load b24, b25 to d6 */ + "vmov d0, r0, r1 \n" /* mov a42, a52 to d0 */ + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "ldr r0, [%[b_ptr], #0x58] \n" /* load b26 to r0 */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "ldr r1, [%[b_ptr], #0x5C] \n" /* load b27 to r1 */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vldr d1, [%[a_ptr], #0x48] \n" /* load a03, a13 to d1 */ + "vmov d7, r0, r1 \n" /* mov b26, b27 to d7 */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "pld [%[a_ptr], #0x140] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vldr d4, [%[b_ptr], #0x60] \n" /* load b30, b31 to d4 */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "ldr r0, [%[b_ptr], #0x68] \n" /* load b32 to r0 */ + 
"vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ + "ldr r1, [%[b_ptr], #0x6C] \n" /* load b33 to r1 */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vldr d2, [%[a_ptr], #0x50] \n" /* load a23, a33 to d2 */ + "vmov d5, r0, r1 \n" /* mov b32, b33 to d5 */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "ldr r0, [%[a_ptr], #0x58] \n" /* load a43 to r0 */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "ldr r1, [%[a_ptr], #0x5C] \n" /* load a53 to r1 */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + "add %[a_ptr], %[a_ptr], #0x60 \n" /* aptr += 96 */ + /* Unroll 3 */ + "vldr d6, [%[b_ptr], #0x70] \n" /* load b34, b35 to d6 */ + "vmov d3, r0, r1 \n" /* mov a43, a53 to d3 */ + "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ + "ldr r0, [%[b_ptr], #0x78] \n" /* load b36 to r0 */ + "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ + "ldr r1, [%[b_ptr], #0x7C] \n" /* load b37 to r1 */ + "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ + "add %[b_ptr], %[b_ptr], #0x80 \n" /* bptr += 108 */ + "vldr d0, [%[a_ptr], #0x00] \n" /* load a00, a10 to d0 */ + "vmov d7, r0, r1 \n" /* mov b36, b37 to d7 */ + "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ + "pld [%[b_ptr], #0xC0] \n" /* pre load bpanel */ + "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ + "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ + "vldr d4, [%[b_ptr], #0x00] \n" /* load b00, b01 to d4 */ + "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ + "ldr r0, [%[b_ptr], #0x08] \n" /* load b02 to r0 */ + "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ + "ldr r1, [%[b_ptr], #0x0C] \n" /* load b03 to r1 */ + "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ + "subs %[k], %[k], #1 \n" /* loop k -= 1 */ + "vldr d1, [%[a_ptr], #0x08] \n" /* load a20, a30 to d1 */ + "vmov d5, r0, r1 \n" /* mov b02, b03 to d5 */ + "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ + "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ + "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ + "ldr r1, [%[a_ptr], #0x14] \n" /* load a50 to r1 */ + "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ + "bne 1b \n" /* branch to k loop */ + "6:\n" + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "blt 3f \n" /* branch to tail == 1 */ + /* Tail Unroll 0 */ + "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ + "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "vld1.32 {d3}, [%[a_ptr] :64]! \n" /* load a01, a11 to d3 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b04-b07 to d6,d7 */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b10-b13 to d4,d5 */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "vld1.32 {d0-d1}, [%[a_ptr] :64]! 
\n" /* load a21-a51 to d0,d1 */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b14-b17 to d6,d7 */ + "blt 4f \n" /* branch to tail == 2 */ + /* Tail Unroll 1 */ + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b20-b23 to d4,d5 */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "vld1.32 {d2-d3}, [%[a_ptr] :64]! \n" /* load a02-a32 to d2,d3 */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b24-b27 to d6,d7 */ + "blt 5f \n" /* branch to tail == 3 */ + /* Tail Unroll 2 */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vld1.32 {d0-d1}, [%[a_ptr] :64]! \n" /* a42a52a03a13 to d0,d1 */ + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b30-b33 to d4,d5 */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "vld1.32 {d2-d3}, [%[a_ptr] :64]! \n" /* load a23-a53 to d2,d3 */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b34-b37 to d6,d7 */ + /* Tail Unroll 3 */ + "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ + "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ + "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ + "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ + "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ + "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ + "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ + "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ + "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ + "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ + "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ + "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ + "b 2f \n" /* branch to check relu */ + /* tails==1 final tail */ + "3:\n" + "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ + "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
\n" /* load b04-b07 to d6,d7 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + "b 2f \n" /* branch to check relu */ + /* tails==2 final tail */ + "4:\n" + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + "b 2f \n" /* branch to check relu */ + /* tails==3 final tail */ + "5:\n" + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "vld1.32 {d0}, [%[a_ptr] :64]! \n" /* load a42, a52 to d0 */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + /* relu */ + "2:\n" + "cmp %[tails], #1 \n" /* cmp tail is relu */ + "bne 0f \n" /* no relu branch to end */ + "vmov.i32 q0, #0 \n" /* mov 0.f to q0 */ + "vmax.f32 q4, q4, q0 \n" /* out00 relu */ + "vmax.f32 q5, q5, q0 \n" /* out01 relu */ + "vmax.f32 q6, q6, q0 \n" /* out10 relu */ + "vmax.f32 q7, q7, q0 \n" /* out11 relu */ + "vmax.f32 q8, q8, q0 \n" /* out20 relu */ + "vmax.f32 q9, q9, q0 \n" /* out21 relu */ + "vmax.f32 q10, q10, q0 \n" /* out30 relu */ + "vmax.f32 q11, q11, q0 \n" /* out31 relu */ + "vmax.f32 q12, q12, q0 \n" /* out40 relu */ + "vmax.f32 q13, q13, q0 \n" /* out41 relu */ + "vmax.f32 q14, q14, q0 \n" /* out50 relu */ + "vmax.f32 q15, q15, q0 \n" /* out51 relu */ + "0:\n" + "vst1.32 {d8-d11}, [%[c_ptr0]]! \n" /* store out0 to cptr0 */ + "vst1.32 {d12-d15}, [%[c_ptr1]]! \n" /* store out1 to cptr1 */ + "vst1.32 {d16-d19}, [%[c_ptr2]]! \n" /* store out2 to cptr2 */ + "vst1.32 {d20-d23}, [%[c_ptr3]]! \n" /* store out3 to cptr3 */ + "vst1.32 {d24-d27}, [%[c_ptr4]]! \n" /* store out4 to cptr4 */ + "vst1.32 {d28-d31}, [%[c_ptr5]]! 
\n" /* store out5 to cptr5 */ + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3), + [c_ptr4] "+r"(c_ptr4), + [c_ptr5] "+r"(c_ptr5), + [k] "+r"(k), + [tails] "+r"(tails) + : [bias_ptr] "r"(bias_local) + : "r0", "r1", "q0","q1","q2","q3","q4", + "q5","q6","q7","q8","q9","q10","q11", + "q12","q13","q14","q15","cc","memory"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + *pout4++ = cout4[i]; + *pout5++ = cout5[i]; + } + } + } } } } @@ -3642,6 +4519,28 @@ void sgemm_prepacked_4x8(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); @@ -3786,13 +4685,13 @@ void sgemm_prepacked_4x8(bool is_transB, "vmla.f32 q15, q3, q4\n" /* cr31 += beta * c_r31 */ "11: \n" /* check loop count */ "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load a0~a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load b1\n" - "cmp %[k], #0 @ check weather k is bigger than " + "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[k], #0 @ check weather k is bigger than " "0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" + "beq 0f @ jump to tail\n" + "1: @ main loop for k\n" /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" + "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" @@ -3920,8 +4819,76 @@ void sgemm_prepacked_4x8(bool is_transB, "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" /*aptr - 16*/ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "2: @ check relu\n" + "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" + "2: @ check relu\n" + //! relu + "cmp %[flag_act], #1 @ check if has relu\n" + "bne 6f @ jump if not relu \n" + "vmov.u32 q0, #0 @ for relu\n" + "vmax.f32 q8, q8, q0 @ for relu\n" + "vmax.f32 q9, q9, q0 @ for relu\n" + "vmax.f32 q10, q10, q0 @ for relu\n" + "vmax.f32 q11, q11, q0 @ for relu\n" + "vmax.f32 q12, q12, q0 @ for relu\n" + "vmax.f32 q13, q13, q0 @ for relu\n" + "vmax.f32 q14, q14, q0 @ for relu\n" + "vmax.f32 q15, q15, q0 @ for relu\n" + "b 10f @ relu end\n" + "6: @ no relu \n" + "cmp %[flag_act], #0 @ check no act\n" + "beq 10f @ no act end \n" + //! 
relu6 + "cmp %[flag_act], #2 @ check if has relu6\n" + "bne 7f @ jump if no relu6 \n" + "vmov.u32 q0, #0 @ for relu6\n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" + "vmax.f32 q8, q8, q0 @ for relu6\n" + "vmax.f32 q9, q9, q0 @ for relu6\n" + "vmax.f32 q10, q10, q0 @ for relu6\n" + "vmax.f32 q11, q11, q0 @ for relu6\n" + "vmax.f32 q12, q12, q0 @ for relu6\n" + "vmax.f32 q13, q13, q0 @ for relu6\n" + "vmax.f32 q14, q14, q0 @ for relu6\n" + "vmax.f32 q15, q15, q0 @ for relu6\n" + + "vmin.f32 q8, q8, q1 @ for relu6\n" + "vmin.f32 q9, q9, q1 @ for relu6\n" + "vmin.f32 q10, q10, q1 @ for relu6\n" + "vmin.f32 q11, q11, q1 @ for relu6\n" + "vmin.f32 q12, q12, q1 @ for relu6\n" + "vmin.f32 q13, q13, q1 @ for relu6\n" + "vmin.f32 q14, q14, q1 @ for relu6\n" + "vmin.f32 q15, q15, q1 @ for relu6\n" + "b 10f @ relu6 end \n" + //! leakey relu + "7: @ otherwise is leakey relu\n" + "vmov.u32 q0, #0 @ for leakey relu \n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" + "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" + "vbif q8, q3, q2 @ choose \n" + "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" + "vbif q9, q3, q2 @ choose \n" + "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" + "vbif q10, q3, q2 @ choose \n" + "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" + "vbif q11, q3, q2 @ choose \n" + "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" + "vbif q12, q3, q2 @ choose \n" + "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" + "vbif q13, q3, q2 @ choose \n" + "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" + "vbif q14, q3, q2 @ choose \n" + "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" + "vbif q15, q3, q2 @ choose \n" + "10: @ act end \n" "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d24-d27}, [%[c_ptr2]]! 
@ store r2\n" @@ -3935,7 +4902,9 @@ void sgemm_prepacked_4x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "q0","q1","q2","q3", "q4","q5","q6","q7","q8","q9","q10", "q11","q12","q13","q14","q15","cc","memory"); @@ -3951,13 +4920,6 @@ void sgemm_prepacked_4x8(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float* dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } #endif // __aarch64__ diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 07cbd00378c082e311e194c7b22b6d3cb195a63a..fdcbc7394b1be9e438686f91dfa407065d24f91a 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -21,6 +21,17 @@ namespace paddle { namespace lite { namespace arm { namespace math { + +int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + void pooling_basic(const float* din, float* dout, int num, @@ -67,7 +78,6 @@ void pooling_basic(const float* din, } } else if (pooling_type == "avg") { // Pooling_average_include_padding - // Pooling_average_exclude_padding for (int n = 0; n < num; ++n) { float* dout_batch = dout + n * chout * size_channel_out; const float* din_batch = din + n * chin * size_channel_in; @@ -89,15 +99,27 @@ void pooling_basic(const float* din, #pragma omp parallel for for (int ind_c = 0; ind_c < chin; ++ind_c) { for (int ind_h = 0; ind_h < hout; ++ind_h) { - int sh = ind_h * stride_h; - int eh = sh + kernel_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > hin ? hin : eh - pad_h; + int sh, eh; + if (adaptive) { + sh = AdaptStartIndex(ind_h, hin, hout); + eh = AdaptEndIndex(ind_h, hin, hout); + } else { + sh = ind_h * stride_h; + eh = sh + kernel_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > hin ? hin : eh - pad_h; + } for (int ind_w = 0; ind_w < wout; ++ind_w) { - int sw = ind_w * stride_w; - int ew = sw + kernel_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > win ? win : ew - pad_w; + int sw, ew; + if (adaptive) { + sw = AdaptStartIndex(ind_w, win, wout); + ew = AdaptEndIndex(ind_w, win, wout); + } else { + sw = ind_w * stride_w; + ew = sw + kernel_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > win ? 
win : ew - pad_w; + } float result = static_cast(0); int dst_ind = (ind_n * chout + ind_c) * size_channel_out + ind_h * wout + ind_w; @@ -906,7 +928,9 @@ void pooling1x1s2p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1021,7 +1045,9 @@ void pooling2x2s2_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1104,7 +1130,9 @@ void pooling2x2s2_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1117,6 +1145,9 @@ void pooling2x2s2_avg(const float* din, int w_unroll_size = wout / 4; int w_unroll_remian = wout - w_unroll_size * 4; float32x4_t vcoef = vdupq_n_f32(0.25f); // divided by 4 + auto zero_ptr = + static_cast(TargetMalloc(TARGET(kARM), win * sizeof(float))); + memset(zero_ptr, 0, win * sizeof(float)); for (int n = 0; n < num; ++n) { float* data_out_batch = data_out + n * chout * size_channel_out; @@ -1132,7 +1163,7 @@ void pooling2x2s2_avg(const float* din, auto dr0 = r0; auto dr1 = r1; if (h * S + K - P > hin) { - dr1 = r0; + dr1 = zero_ptr; } int cnt_num = w_unroll_size; if (w_unroll_size > 0) { @@ -1178,6 +1209,7 @@ void pooling2x2s2_avg(const float* din, } } } + TargetFree(TARGET(kARM), zero_ptr); } void pooling3x3s1p1_max(const float* din, @@ -1188,7 +1220,9 @@ void pooling3x3s1p1_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1331,7 +1365,9 @@ void pooling3x3s1p1_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1389,7 +1425,13 @@ void pooling3x3s1p1_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -1401,7 +1443,11 @@ void pooling3x3s1p1_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom >= 1) { + coef_h = 1.f / 3; + } else { + coef_h = 0.5f; + } } default: break; @@ -1477,8 +1523,12 @@ void pooling3x3s1p1_avg(const float* din, int st = wstart > 0 ? 
wstart : 0; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } + } } if (exclusive) { @@ -1509,7 +1559,9 @@ void pooling3x3s1p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1646,7 +1698,9 @@ void pooling3x3s1p0_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1692,7 +1746,13 @@ void pooling3x3s1p0_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -1704,7 +1764,11 @@ void pooling3x3s1p0_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom >= 1) { + coef_h = 1.f / 3; + } else { + coef_h = 0.5f; + } } default: break; @@ -1776,8 +1840,12 @@ void pooling3x3s1p0_avg(const float* din, int st = wstart > 0 ? wstart : 0; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } + } } if (exclusive) { @@ -1811,7 +1879,9 @@ void pooling3x3s2p1_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1955,7 +2025,9 @@ void pooling3x3s2p1_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -2015,7 +2087,13 @@ void pooling3x3s2p1_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -2027,7 +2105,11 @@ void pooling3x3s2p1_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom == 0) { + coef_h = 1.f / 2; + } else { + coef_h = 1.f / 3; + } } default: break; @@ -2102,8 +2184,12 @@ void pooling3x3s2p1_avg(const float* din, float coef = coef_h / 3.f; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } + } } int st = wstart > 0 ? 
wstart : 0; @@ -2135,7 +2221,9 @@ void pooling3x3s2p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { const int K = 3; const int P = 0; const int S = 2; @@ -2261,7 +2349,9 @@ void pooling3x3s2p0_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { const int K = 3; const int P = 0; const int S = 2; @@ -2303,11 +2393,33 @@ void pooling3x3s2p0_avg(const float* din, case 2: dr1 = zero_ptr; dr2 = zero_ptr; - coef_h = 1.f; + if (exclusive) { + coef_h = 1.f; + } else { + if (pad_bottom >= 2) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.0f; + } + } break; case 1: dr2 = zero_ptr; - coef_h = 0.5f; + if (exclusive) { + if (fabsf(coef_h - 0.5f) < 1e-6f) { + coef_h = 1.f; + } else { + coef_h = 0.5f; + } + } else { + if (pad_bottom >= 1) { + coef_h = 1.0f / 3; + } else { + coef_h = 0.5f; + } + } break; default: break; @@ -2366,22 +2478,34 @@ void pooling3x3s2p0_avg(const float* din, dr2 -= 8; } // deal with right pad - int rem = win - (w_unroll_size * 4) * S; - int wstart = 0; + int wstart = w_unroll_size * 4 * S - P; for (int j = 0; j < w_unroll_remian; ++j) { - int wend = std::min(wstart + K, rem); - float coef = coef_h / (wend - wstart); + int wend = wstart + K; // std::min(wstart + K, win); + float coef = coef_h / 3.f; + if (wstart + K > win) { + wend = win; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } + } + } + int st = wstart > 0 ? wstart : 0; + if (exclusive) { + coef = coef_h / (wend - st); + } float tmp = 0.f; - for (int i = wstart; i < wend; i++) { - tmp += dr0[i]; - tmp += dr1[i]; - tmp += dr2[i]; + for (int i = 0; i < wend - st; i++) { + tmp += dr0[i] + dr1[i] + dr2[i]; } - tmp *= coef; - *(dr_out++) = tmp; + *(dr_out++) = tmp * coef; + dr0 += S - (st - wstart); + dr1 += S - (st - wstart); + dr2 += S - (st - wstart); wstart += S; } - r0 = r2; r1 = r0 + win; r2 = r1 + win; diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h index 701732cb453bfc9f2e970c83c8d713e70a205434..7bbffa8e2f4594da4be589569efc0ef18b8dd0da 100644 --- a/lite/backends/arm/math/pooling.h +++ b/lite/backends/arm/math/pooling.h @@ -72,7 +72,9 @@ void pooling1x1s2p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling2x2s2_max(const float* din, float* dout, @@ -82,7 +84,9 @@ void pooling2x2s2_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling2x2s2_avg(const float* din, float* dout, @@ -93,7 +97,9 @@ void pooling2x2s2_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s1p1_max(const float* din, float* dout, @@ -103,7 +109,9 @@ void pooling3x3s1p1_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p1_avg(const float* din, float* dout, @@ -114,7 +122,9 @@ void pooling3x3s1p1_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p1_max(const float* din, float* dout, @@ -124,7 +134,9 @@ void pooling3x3s2p1_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void 
pooling3x3s1p0_max(const float* din, float* dout, @@ -134,7 +146,9 @@ void pooling3x3s1p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p0_avg(const float* din, float* dout, @@ -145,7 +159,9 @@ void pooling3x3s1p0_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p1_avg(const float* din, float* dout, @@ -156,7 +172,9 @@ void pooling3x3s2p1_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p0_max(const float* din, float* dout, @@ -166,7 +184,9 @@ void pooling3x3s2p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s2p0_avg(const float* din, float* dout, @@ -177,7 +197,9 @@ void pooling3x3s2p0_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/reduce_mean.cc b/lite/backends/arm/math/reduce_mean.cc index 56104550d8d68e53ad9a2ac3148887d67480d6f6..a84eef2970b2837159609c1ded1ca0d9991ccfc6 100644 --- a/lite/backends/arm/math/reduce_mean.cc +++ b/lite/backends/arm/math/reduce_mean.cc @@ -198,6 +198,23 @@ void reduce_mean_hw(const float* src, reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); } +template <> +void mean_grad(const float* out_grad, float* in_grad, int size) { + float grad = out_grad[0] / size; + float32x4_t grad_v = vdupq_n_f32(grad); + int loop = size >> 2; + int remain = size & 3; + +#pragma omp parallel for + for (int i = 0; i < loop; ++i) { + vst1q_f32(in_grad, grad_v); + in_grad += 4; + } + for (int i = 0; i < remain; ++i) { + in_grad[i] = grad; + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/reduce_mean.h b/lite/backends/arm/math/reduce_mean.h index 277ed209c058b5b4be76ce18a00683610e6afb7a..aaa9ff42c18d0cfa6a7cf11408dfba06a9444adc 100644 --- a/lite/backends/arm/math/reduce_mean.h +++ b/lite/backends/arm/math/reduce_mean.h @@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, int height_in, int width_in); +template +void mean_grad(const T* out_grad, T* in_grad, int size); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc index 7f2169a6456bb04bda228cf62b89a125e4e2bb2f..aab1058b9dd66522a0793fc151c54707505d1fbb 100644 --- a/lite/backends/arm/math/scale.cc +++ b/lite/backends/arm/math/scale.cc @@ -27,31 +27,576 @@ void scale( int remain = num % 16; float32x4_t vscale = vdupq_n_f32(scale); float32x4_t vbias = vdupq_n_f32(bias); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : 
[vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + dout++; + din++; + } + } +} + +template <> +void scale_relu( + const float* din, float* dout, int num, float scale, float bias) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b\n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b\n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! 
@ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? *dout : 0.f; + dout++; + din++; + } + } +} + +template <> +void scale_relu6(const float* din, + float* dout, + int num, + float scale, + float bias, + float alpha) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t valpha = vdupq_n_f32(alpha); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "fmin v8.4s, v8.4s, %[valpha].4s \n" + "fmin v9.4s, v9.4s, %[valpha].4s \n" + "fmin v10.4s, v10.4s, %[valpha].4s \n" + "fmin v11.4s, v11.4s, %[valpha].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vmin.f32 q8, q8, %q[valpha] @ relu \n" + "vmin.f32 q9, q9, %q[valpha] @ relu \n" + "vmin.f32 q10, q10, %q[valpha] @ relu \n" + "vmin.f32 q11, q11, %q[valpha] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? (*dout < alpha ? 
*dout : alpha) : 0.f; + dout++; + din++; + } + } +} + +template <> +void scale_leaky_relu(const float* din, + float* dout, + int num, + float scale, + float bias, + float alpha) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t valpha = vdupq_n_f32(alpha); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fcmge v12.4s, v8.4s, %[vzero].4s \n" + "fmul v16.4s, v8.4s, %[valpha].4s \n" + + "fcmge v13.4s, v9.4s, %[vzero].4s \n" + "fmul v17.4s, v9.4s, %[valpha].4s \n" + + "fcmge v14.4s, v10.4s, %[vzero].4s \n" + "fmul v18.4s, v10.4s, %[valpha].4s \n" + + "fcmge v15.4s, v11.4s, %[vzero].4s \n" + "fmul v19.4s, v11.4s, %[valpha].4s \n" + + "bif v8.16b, v16.16b, v12.16b \n" /* choose*/ + "bif v9.16b, v17.16b, v13.16b \n" /* choose*/ + "bif v10.16b, v18.16b, v14.16b \n" /* choose*/ + "bif v11.16b, v19.16b, v15.16b \n" /* choose*/ + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vcge.f32 q12, q8, %q[vzero] @ relu \n" + "vmul.f32 q14, q8, %q[valpha] @ mul \n" + "vcge.f32 q13, q9, %q[vzero] @ relu \n" + "vmul.f32 q15, q9, %q[valpha] @ mul \n" + "vbif q8, q14, q12 @ choose \n" + "vbif q9, q15, q13 @ choose \n" + + "vcge.f32 q12, q10, %q[vzero] @ relu \n" + "vmul.f32 q14, q10, %q[valpha] @ mul \n" + "vcge.f32 q13, q11, %q[vzero] @ relu \n" + "vmul.f32 q15, q11, %q[valpha] @ mul \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + + "vbif q10, q14, q12 @ choose \n" + "vbif q11, q15, q13 @ choose \n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? 
*dout : (*dout * alpha); + dout++; + din++; + } + } +} + +template <> +void scale(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void scale_relu(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + vsum1 = vmaxq_s32(vsum1, vzero); + vsum2 = vmaxq_s32(vsum2, vzero); + vsum3 = vmaxq_s32(vsum3, vzero); + vsum4 = vmaxq_s32(vsum4, vzero); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? 
*dout_ptr : 0; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void scale_relu6( + const int* din, int* dout, int num, int scale, int bias, int alpha) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); + int32x4_t valpha = vdupq_n_s32(alpha); #pragma omp parallel for for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); + vsum1 = vmaxq_s32(vsum1, vzero); + vsum2 = vmaxq_s32(vsum2, vzero); + vsum3 = vmaxq_s32(vsum3, vzero); + vsum4 = vmaxq_s32(vsum4, vzero); - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); + vsum1 = vminq_s32(vsum1, valpha); + vsum2 = vminq_s32(vsum2, valpha); + vsum3 = vminq_s32(vsum3, valpha); + vsum4 = vminq_s32(vsum4, valpha); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); } + + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? (*dout_ptr > alpha ? 
alpha : *dout_ptr) : 0; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void scale_leaky_relu( + const int* din, int* dout, int num, int scale, int bias, int alpha) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); + int32x4_t valpha = vdupq_n_s32(alpha); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + uint32x4_t v1 = vcgeq_s32(vsum1, vzero); + uint32x4_t v2 = vcgeq_s32(vsum2, vzero); + uint32x4_t v3 = vcgeq_s32(vsum3, vzero); + uint32x4_t v4 = vcgeq_s32(vsum4, vzero); + + int32x4_t v11 = vmulq_s32(vsum1, valpha); + int32x4_t v21 = vmulq_s32(vsum2, valpha); + int32x4_t v31 = vmulq_s32(vsum3, valpha); + int32x4_t v41 = vmulq_s32(vsum4, valpha); + + vsum1 = vbslq_s32(v1, vsum1, v11); + vsum2 = vbslq_s32(v2, vsum2, v21); + vsum3 = vbslq_s32(v3, vsum3, v31); + vsum4 = vbslq_s32(v4, vsum4, v41); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); for (int i = 0; i < remain; i++) { *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? *dout_ptr : (*dout_ptr) * alpha; dout_ptr++; din_ptr++; } diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h index a86528c9df18cd6ef807bc116686b766ad905d82..bbdb596bc8f45c247a24f9833680c8a510c1e904 100644 --- a/lite/backends/arm/math/scale.h +++ b/lite/backends/arm/math/scale.h @@ -13,14 +13,41 @@ // limitations under the License. 
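+// Illustrative usage of the fused scale + activation helpers declared below.
+// This is only a sketch: the argument values are made up, and the actual
+// element-wise behavior follows the template specializations added in scale.cc.
+//
+//   using namespace paddle::lite::arm::math;
+//   scale(din, dout, num, 2.f, 0.5f);                  // dout[i] = din[i] * 2 + 0.5
+//   scale_relu(din, dout, num, 2.f, 0.5f);             // ... then max(x, 0)
+//   scale_relu6(din, dout, num, 2.f, 0.5f, 6.f);       // ... then min(max(x, 0), 6)
+//   scale_leaky_relu(din, dout, num, 2.f, 0.5f, 0.1f); // ... x >= 0 ? x : 0.1 * x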
#pragma once - +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace arm { namespace math { +template +void scale_compute_basic(const operators::ScaleParam& param) { + const dtype* x_data = param.x->data(); + dtype* output_data = param.output->mutable_data(); + DDim x_dims = param.x->dims(); + DDim output_dims = param.output->dims(); + bool bias_after_scale = param.bias_after_scale; + float scale = param.scale; + float bias = param.bias; + if (!bias_after_scale) { + bias *= scale; + } + for (int i = 0; i < output_dims.production(); i++) { + output_data[i] = static_cast(x_data[i] * scale + bias); + } +} + +template +void scale(const T* din, T* dout, int num, T scale, T bias); + +template +void scale_relu(const T* din, T* dout, int num, T scale, T bias); + +template +void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha); + template -void scale(const T* din, T* dout, int num, float scale, float bias); +void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha); template void scale(const T* din, diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index d17ce0dea4640899482ba9dd87d0646ca2de705d..a7d4322326c9413878264400ba8118b510fade10 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -983,10 +983,12 @@ void sgemv_trans(const int M, "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ - "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ - "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \ + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \ + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12,q13\n" \ + /*"vmla.f32 q0, q4, q6 @ mul add\n" */ \ + /*"vmla.f32 q1, q4, q8 @ mul add\n" */ \ "vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \ "subs %[cnt], #1 @ sub loop count \n" \ diff --git a/lite/backends/arm/math/topk.cc b/lite/backends/arm/math/topk.cc index c9239134e1c3988f5f9c39af6a69fec52fa0904f..83986dc1505098b0a23cdff31297e325fcb109a1 100644 --- a/lite/backends/arm/math/topk.cc +++ b/lite/backends/arm/math/topk.cc @@ -26,7 +26,7 @@ bool comp_func(std::pair a, std::pair b) { void topk(const float* in_data, float* out_val, - int* out_ind, + int64_t* out_ind, int m, int n, int k, @@ -34,7 +34,7 @@ void topk(const float* in_data, for (int i = 0; i < m; i++) { const float* in_tmp = in_data + i * n; float* out_val_tmp = out_val + i * k; - int* out_ind_tmp = out_ind + i * k; + int64_t* out_ind_tmp = out_ind + i * k; std::vector> vec; for (int j = 0; j < n; j++) { vec.push_back(std::make_pair(in_tmp[j], j)); diff --git a/lite/backends/arm/math/topk.h b/lite/backends/arm/math/topk.h index 5bf472e1af497398309689151f0d5354b3a48f27..a6716623228e6df0598410f52de56db58be7a8dc 100644 --- a/lite/backends/arm/math/topk.h +++ b/lite/backends/arm/math/topk.h @@ -22,7 +22,7 @@ namespace math { void topk(const float* din, float* out_val, - int* out_ind, + int64_t* out_ind, int m, int n, int k, diff --git a/lite/backends/arm/math/type_trans.cc b/lite/backends/arm/math/type_trans.cc index c50abb741ded487efa03d7d46baf2c6f13a8791d..c7c2da678bf55c45c2a2702ed413cf6bfc135c6a 100644 --- a/lite/backends/arm/math/type_trans.cc +++ b/lite/backends/arm/math/type_trans.cc @@ -40,13 +40,11 @@ void fp32_to_int8(const float* din, int cnt = inner_size / 16; int remain = inner_size & 15; int64_t loop_size = outer_size * axis_size; - #pragma omp parallel for for (int j = 0; j < loop_size; ++j) { float inv_scale = 1.f / scale[j % axis_size]; float32x4_t vzero = vdupq_n_f32(0.f); float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vmax = vdupq_n_f32(-127.f); float32x4_t vpoff = vdupq_n_f32(0.5f); float32x4_t vnoff = vdupq_n_f32(-0.5f); const float* din_c = din + j * inner_size; @@ -56,6 +54,7 @@ void fp32_to_int8(const float* din, const float* din_ptr = din_c; signed char* dout_ptr = dout_c; #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.0); asm volatile( "ldp q0, q1, [%[in]], #32 \n" "ldp q2, q3, [%[in]], #32 \n" @@ -64,16 +63,19 @@ void fp32_to_int8(const float* din, "fmul v5.4s, v1.4s, %[scale].4s \n" "fmul v6.4s, v2.4s, %[scale].4s \n" "fmul v7.4s, v3.4s, %[scale].4s \n" + /* data >= -127 */ "fcmge v8.4s, v4.4s, %[vmax].4s \n" "fcmge v9.4s, v5.4s, %[vmax].4s \n" "fcmge v10.4s, v6.4s, %[vmax].4s \n" "fcmge v11.4s, v7.4s, %[vmax].4s \n" + /* choose data */ "bif v4.16b, %[vmax].16b, v8.16b \n" "bif v5.16b, %[vmax].16b, v9.16b \n" "bif v6.16b, %[vmax].16b, v10.16b \n" "bif v7.16b, %[vmax].16b, v11.16b \n" "ldp q0, q1, [%[in]], #32 \n" "subs %[cnt], %[cnt], #1 \n" + /* fp32 - int32 */ "FCVTAS v8.4s, v4.4s \n" "FCVTAS v9.4s, v5.4s \n" "FCVTAS v10.4s, v6.4s \n" @@ -89,7 +91,9 @@ void fp32_to_int8(const float* din, "bne 0b \n" : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(vscale), [vmax] "w"(vmax) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -102,6 +106,7 @@ void fp32_to_int8(const float* din, "v10", "v11"); #else + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d4-d7}, [%[din]]! 
@ load in8~in16\n" @@ -113,23 +118,27 @@ void fp32_to_int8(const float* din, "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vld1.32 {d0-d1}, [%[vmax]] @ set q0 = -127 \n" "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" - "vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n" - "vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n" - "vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n" - "vbif q4, %q[vmax], q8 @ choose \n" - "vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n" - "vbif q5, %q[vmax], q9 @ choose \n" - "vbif q6, %q[vmax], q10 @ choose \n" - "vbif q7, %q[vmax], q8 @ choose \n" + /* data >= -127 */ + "vcge.f32 q8, q4, q0 @ q4 >= -127 \n" + "vcge.f32 q9, q5, q0 @ q4 >= -127 \n" + "vcge.f32 q10, q6, q0 @ q4 >= -127 \n" + "vcge.f32 q11, q7, q0 @ q4 >= -127 \n" + /* choose data */ + "vbif q4, q0, q8 @ choose \n" + "vbif q5, q0, q9 @ choose \n" + "vbif q6, q0, q10 @ choose \n" + "vbif q7, q0, q11 @ choose \n" + /* fp32 - int32 */ "vcvt.s32.f32 q0, q4 @ cvt to int32\n" "vcvt.s32.f32 q1, q5 @ cvt to int32\n" "vcvt.s32.f32 q2, q6 @ cvt to int32\n" @@ -150,9 +159,22 @@ void fp32_to_int8(const float* din, : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), - [vzero] "w"(vzero), - [vmax] "w"(vmax) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10"); + [vmax] "r"(vmax), + [vzero] "w"(vzero) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11"); #endif } const float* din_r = din_c + 16 * cnt; @@ -203,7 +225,7 @@ void fp32_to_int16(const float* din, "bne 0b \n" : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v8", "v9"); + : "cc", "memory", "v0", "v1", "v4", "v5", "v8", "v9"); #else asm volatile( "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" @@ -232,7 +254,7 @@ void fp32_to_int16(const float* din, [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), [vzero] "w"(vzero) - : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); + : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); #endif } const float* din_r = din_c + 8 * cnt; @@ -294,7 +316,9 @@ void int8_to_fp32(const int8_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -335,7 +359,7 @@ void int8_to_fp32(const int8_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif // __aarch64__ } const signed char* din_r = din_c + 16 * cnt; @@ -394,7 +418,18 @@ void int16_to_fp32(const int16_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); + : "cc", + "memory", + "v0", + "v1", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); #else asm volatile( "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" @@ -422,7 +457,7 @@ void int16_to_fp32(const int16_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif // __aarch64__ } const int16_t* din_r = din_c + 16 * cnt; @@ -473,7 +508,9 @@ void int32_to_fp32(const int* din, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -506,7 +543,9 @@ void int32_to_fp32(const int* din, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -551,41 +590,53 @@ void int32_to_int8(const int* din, const int* din_ptr = din_c; int8_t* dout_ptr = dout_c; #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.0); asm volatile( "0: \n" "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - + /* int32 - fp32 */ "scvtf v4.4s, v0.4s \n" "scvtf v5.4s, v1.4s \n" "scvtf v6.4s, v2.4s \n" "scvtf v7.4s, v3.4s \n" - + /* mul scale */ "fmul v0.4s, v4.4s, %[scale].4s \n" "fmul v1.4s, v5.4s, %[scale].4s \n" "fmul v2.4s, v6.4s, %[scale].4s \n" "fmul v3.4s, v7.4s, %[scale].4s \n" - + /* data >= -127 */ + "fcmge v4.4s, v0.4s, %[vmax].4s \n" + "fcmge v5.4s, v1.4s, %[vmax].4s \n" + "fcmge v6.4s, v2.4s, %[vmax].4s \n" + "fcmge v7.4s, v3.4s, %[vmax].4s \n" + /* choose data */ + "bif v0.16b, %[vmax].16b, v4.16b \n" + "bif v1.16b, %[vmax].16b, v5.16b \n" + "bif v2.16b, %[vmax].16b, v6.16b \n" + "bif v3.16b, %[vmax].16b, v7.16b \n" + /* fp32 - int32 */ "fcvtas v4.4s, v0.4s \n" "fcvtas v5.4s, v1.4s \n" "fcvtas v6.4s, v2.4s \n" "fcvtas v7.4s, v3.4s \n" - + /* int32 - int16 */ "sqxtn v0.4h, v4.4s \n" "sqxtn2 v0.8h, v5.4s \n" "sqxtn v1.4h, v6.4s \n" "sqxtn2 v1.8h, v7.4s \n" - + /* int16 - int8 */ "sqxtn v2.8b, v0.8h \n" "sqxtn2 v2.16b, v1.8h \n" - + /* store */ "st1 {v2.16b}, [%[out]], #16 \n" "subs %[loop], %[loop], #1 \n" "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + : [scale] "w"(vscale), [vmax] "w"(vmax) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 
"v5", "v6", "v7"); #else + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" @@ -607,9 +658,21 @@ void int32_to_int8(const int* din, "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" + "vld1.32 {d8-d9}, [%[vmax]] @ set q4 = -127 \n" "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" + /* data >= -127 */ + "vcge.f32 q8, q0, q4 @ q0 >= -127 \n" + "vcge.f32 q9, q1, q4 @ q1 >= -127 \n" + "vcge.f32 q10, q2, q4 @ q2 >= -127 \n" + "vcge.f32 q11, q3, q4 @ q3 >= -127 \n" + /* choose data */ + "vbif q0, q4, q8 @ choose \n" + "vbif q1, q4, q9 @ choose \n" + "vbif q2, q4, q10 @ choose \n" + "vbif q3, q4, q11 @ choose \n" + /* fp32 - int32 */ "vcvt.s32.f32 q4, q0 @ cvt to int32\n" "vcvt.s32.f32 q5, q1 @ cvt to int32\n" "vcvt.s32.f32 q6, q2 @ cvt to int32\n" @@ -628,9 +691,12 @@ void int32_to_int8(const int* din, : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) : [vscale] "w"(vscale), [vzero] "w"(vzero), + [vmax] "r"(vmax), [vnoff] "w"(vnoff), [vpoff] "w"(vpoff) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -648,6 +714,7 @@ void int32_to_int8(const int* din, int8_t* dout_r = dout_c + 16 * cnt; for (int i = 0; i < remain; ++i) { dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); + dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i]; } } } @@ -682,7 +749,7 @@ float compute_max_kernel(const float* din, int64_t size) { "bne 0b \n" : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); #else asm volatile( "vld1.32 {d0-d3}, [%[in]]! 
@ load 8 float\n" @@ -703,7 +770,7 @@ float compute_max_kernel(const float* din, int64_t size) { : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif float32x2_t vmax_p = vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val)); diff --git a/lite/backends/bm/target_wrapper.cc b/lite/backends/bm/target_wrapper.cc index c75c71452269167064c248418098bcb285d09055..6dab2a574d9c270573c00688768ad45a767abeae 100644 --- a/lite/backends/bm/target_wrapper.cc +++ b/lite/backends/bm/target_wrapper.cc @@ -24,16 +24,17 @@ std::map TargetWrapperBM::bm_hds_; size_t TargetWrapperBM::num_devices() { int count = 0; - bm_dev_getcount(&count); + bm_status_t ret = bm_dev_getcount(&count); + CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: " + << static_cast(ret); return count; } +int TargetWrapperBM::GetDevice() { return device_id_; } void TargetWrapperBM::SetDevice(int id) { - /* - if (id < 0 || (size_t)id >= num_devices()) { - LOG(FATAL) << "Failed with invalid device id " << id; - } - */ + if (id < 0 || (size_t)id >= num_devices()) { + LOG(FATAL) << "Failed with invalid device id " << id; + } device_id_ = id; if (bm_hds_.find(id) == bm_hds_.end()) { bm_handle_t bm_handle; diff --git a/lite/backends/bm/target_wrapper.h b/lite/backends/bm/target_wrapper.h index 2674ffe161582fbd2fe0dfcabbe8e349d13f847f..db65b598b51206959ab08128177897d434b3fb58 100644 --- a/lite/backends/bm/target_wrapper.h +++ b/lite/backends/bm/target_wrapper.h @@ -31,6 +31,7 @@ class TargetWrapper { static size_t maximum_stream() { return 0; } static void SetDevice(int id); + static int GetDevice(); static void CreateStream(stream_t* stream) {} static void DestroyStream(const stream_t& stream) {} diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index 35f5f0ce2d93db59cbb856d8008e6f3138633e42..0689bb706ab3bac4b8b97059017181ef24dd8ee4 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) + +lite_cc_library(cuda_context SRCS context.cc DEPS device_info) add_subdirectory(math) diff --git a/lite/backends/cuda/context.cc b/lite/backends/cuda/context.cc new file mode 100644 index 0000000000000000000000000000000000000000..4bac4c442c28848d38bd434d045c7888a1a92ac8 --- /dev/null +++ b/lite/backends/cuda/context.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
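The int32_to_int8 hunks above extend both the aarch64 and armv7 paths with a floor at -127 before the saturating narrow, and the scalar tail gets the same clamp. A minimal scalar sketch of that quantization step, assuming a per-tensor scale (the helper name is illustrative, not part of the library):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar equivalent of the vectorized int32 -> int8 path:
// scale, round to nearest, saturate to int8, then floor at -127 so the
// symmetric quantized range stays within [-127, 127].
inline int8_t QuantizeInt32ToInt8(int32_t v, float scale) {
  int32_t r = static_cast<int32_t>(roundf(static_cast<float>(v) * scale));
  r = std::min(127, std::max(-127, r));
  return static_cast<int8_t>(r);
}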
+ +#include "lite/backends/cuda/context.h" + +namespace paddle { +namespace lite {} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/context.h b/lite/backends/cuda/context.h new file mode 100644 index 0000000000000000000000000000000000000000..5bed30a9603c6f6a48169ae31d66c989bd891836 --- /dev/null +++ b/lite/backends/cuda/context.h @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { + +template +class Context; + +using CUDAContext = Context; + +// Only works with CUDA kernels. +template <> +class Context { + public: + typename Env::Devs& devs = + Env::Global(); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() { + if (devs.size() > 0) { + cublas_fp32_ = std::make_shared>(); + } else { + LOG(INFO) << "No cuda device(s) found, CUDAContext init failed."; + } + } + void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or current target is not exit!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + if (io_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "data stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + io_stream_id = 0; + } + if (exec_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "exec stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + exec_stream_id = 0; + } + + exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; + io_stream_ = devs[dev_id].io_streams()[io_stream_id]; + + exec_stream_id_ = exec_stream_id; + io_stream_id_ = io_stream_id; + need_sync_ = false; + } + void CopySharedTo(CUDAContext* ctx) { + CHECK(ctx); + CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; + ctx->cublas_fp32_ = cublas_fp32_; + } + + const cudaStream_t& exec_stream() const { return exec_stream_; } + void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } + + const cudaStream_t& io_stream() const { return io_stream_; } + void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } + + std::shared_ptr> cublas_fp32() { return cublas_fp32_; } + void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { + cublas_fp32_ = cublas_fp32; + } + + const std::vector& input_events() { return input_events_; } + void SetInputEvents(const std::vector& input_events) { + input_events_.clear(); + input_events_.assign(input_events.begin(), input_events.end()); + } + + const std::vector& output_events() { return output_events_; } + void SetOutputEvents(const std::vector& output_events) { + output_events_.clear(); + 
output_events_.assign(output_events.begin(), output_events.end()); + } + + std::vector all_exec_streams() { + int dev_id = TargetWrapper::GetCurDevice(); + return devs[dev_id].exec_streams(); + } + + void SetSyncStreams(const std::vector& nums) { + sync_streams_.clear(); + std::vector exec_streams = all_exec_streams(); + for (size_t i = 0; i < nums.size(); ++i) { + CHECK(nums[i] >= 0 && nums[i] < static_cast(exec_streams.size())) + << "streams id is not valid"; + sync_streams_.push_back(exec_streams[nums[i]]); + } + InitSyncEvents(nums.size()); + } + + void InitSyncEvents(const int num) { + sync_events_.clear(); + for (int i = 0; i < num; ++i) { + cudaEvent_t eve; + TargetWrapperCuda::CreateEventWithFlags(&eve); + sync_events_.push_back(eve); + } + } + + void SetNeedSync(bool sync) { need_sync_ = sync; } + bool need_sync() const { return need_sync_; } + + void Sync() { + CHECK_EQ(sync_streams_.size(), sync_events_.size()); + for (size_t i = 0; i < sync_events_.size(); ++i) { + TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]); + TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]); + } + } + + std::string name() const { return "CUDAContext"; } + + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ = const_cast(context).cublas_fp32(); + return *this; + } + + private: + int device_id_; + // overall information + int exec_stream_id_; + int io_stream_id_; + cudaStream_t exec_stream_; + cudaStream_t io_stream_; + + // not thread-safe, should allocate for each thread. + std::shared_ptr> cublas_fp32_; + + // kernel information + std::vector input_events_; + std::vector output_events_; + // multi stream sync. + std::vector sync_streams_; + std::vector sync_events_; + bool need_sync_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index fafd74ae7a43d1a769456edfe408c71593d21201..d26b1188c0878916986575b72cc978926ba5a1f6 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,7 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) diff --git a/lite/backends/cuda/math/batched_gemm.cc b/lite/backends/cuda/math/batched_gemm.cc index e81510927615daa88e7f5bef3ce7b8421d8f6539..bc605e39fb2acdc53c1f2ac9da738a24f29330c8 100644 --- a/lite/backends/cuda/math/batched_gemm.cc +++ b/lite/backends/cuda/math/batched_gemm.cc @@ -33,6 +33,9 @@ bool BatchedGemm::init(const bool trans_a, } cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + if (A_ != nullptr) { + cudaFree(A_); + } cudaMalloc(reinterpret_cast(&A_), 3 * max_batch_size * sizeof(float *)); return true; diff --git a/lite/backends/cuda/math/elementwise.cu b/lite/backends/cuda/math/elementwise.cu index 8f0ebd1f97a03f03b568de694b986e9540f07c55..63e710b358e9c22a769b4bc2c945aa4ba39478af 100644 --- a/lite/backends/cuda/math/elementwise.cu +++ b/lite/backends/cuda/math/elementwise.cu @@ -13,6 +13,7 @@ // limitations under the License. 
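For reference, a short usage sketch of the CUDAContext added above, assuming Env for the CUDA target has already been initialized and at least one device is visible; InitOnce is normally driven by ContextScheduler, so this standalone sequence is only illustrative:

#include "lite/backends/cuda/context.h"

// Sketch: set up a context on device 0, run work on its exec stream, then
// make the exec stream wait on the streams registered for synchronization.
void RunOnDeviceZero() {
  paddle::lite::CUDAContext ctx;
  ctx.InitOnce();                          // creates the shared cuBLAS handle
  ctx.Init(/*dev_id=*/0, /*exec_stream_id=*/0, /*io_stream_id=*/0);

  cudaStream_t exec = ctx.exec_stream();   // launch kernels on this stream
  // ... enqueue kernels on `exec` ...

  ctx.SetSyncStreams({0});                 // exec streams this context must wait on
  ctx.SetNeedSync(true);
  if (ctx.need_sync()) ctx.Sync();         // record events, then wait on them
}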
#include "lite/backends/cuda/math/elementwise.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -62,6 +63,52 @@ __global__ void elementwise_relu_kernel(const size_t total, } } +template +__global__ void elementwise_abs_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; + Dtype temp; +#if __CUDA_ARCH__ >= 350 + temp = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); + +#else + temp = binary_calc(x_data[tid], y_data[idx], type); +#endif + out_data[tid] = temp > 0 ? temp : -temp; + } +} + +template +__global__ void elementwise_tanh_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; + Dtype temp; +#if __CUDA_ARCH__ >= 350 + temp = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); + +#else + temp = binary_calc(x_data[tid], y_data[idx], type); +#endif + out_data[tid] = tanh(temp); + } +} + template __global__ void elementwise_add_kernel(const size_t total, const Dtype* x_data, @@ -135,19 +182,30 @@ void elementwise(const Dtype* x_data, } template -void elementwise_relu(const Dtype* x_data, - const Dtype* y_data, - Dtype* out_data, - int pre, - int n, - int post, - BinaryOperation type, - cudaStream_t stream) { +void elementwise_act(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + std::string act, + BinaryOperation type, + cudaStream_t stream) { int num = pre * n * post; int thread = 256; int block = (num + thread - 1) / thread; - elementwise_relu_kernel<<>>( - num, x_data, y_data, out_data, pre, n, post, type); + if (act == "relu") { + elementwise_relu_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); + } else if (act == "tanh") { + elementwise_tanh_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); + } else if (act == "abs") { + elementwise_abs_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); + } else { + LOG(FATAL) << "not supported activate type: " << act; + } } template void elementwise(const float*, @@ -159,14 +217,15 @@ template void elementwise(const float*, BinaryOperation, cudaStream_t); -template void elementwise_relu(const float*, - const float*, - float*, - int, - int, - int, - BinaryOperation, - cudaStream_t); +template void elementwise_act(const float* x_data, + const float* y_data, + float* out_data, + int pre, + int n, + int post, + std::string act, + BinaryOperation type, + cudaStream_t stream); template void elementwise_add(int num, diff --git a/lite/backends/cuda/math/elementwise.h b/lite/backends/cuda/math/elementwise.h index ce45d0544e5a55a9cdc34bdfacc2b48157f5a198..46412de2358ff092742f12f73037d4f7c7ce84ab 100644 --- a/lite/backends/cuda/math/elementwise.h +++ b/lite/backends/cuda/math/elementwise.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include #include "lite/backends/cuda/math/utils.h" namespace paddle { @@ -33,14 +34,15 @@ void elementwise(const Dtype* x_data, cudaStream_t stream); template -void elementwise_relu(const Dtype* x_data, - const Dtype* y_data, - Dtype* out_data, - int pre, - int n, - int post, - BinaryOperation type, - cudaStream_t stream); +void elementwise_act(const Dtype* x_data, + const Dtype* y_data, + 
Dtype* out_data, + int pre, + int n, + int post, + std::string act, + BinaryOperation type, + cudaStream_t stream); template void elementwise_add(int num, diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b6aa9c7d160ad6c8b60b132e4a2bbd7ae1e0b9ff..78aa689ff767e8a454dec3aa48a97ecefafdbe7a 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -29,6 +29,7 @@ enum class BinaryOperation { kADD = 0, kMUL = 1, kDIV = 2, + kSUB = 3, }; template @@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x, if (type == BinaryOperation::kADD) return x + y; if (type == BinaryOperation::kMUL) return x * y; if (type == BinaryOperation::kDIV) return x / y; + if (type == BinaryOperation::kSUB) return x - y; } template diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h index 5b57ddf0043c59219aded9836cc0b1ad982eec2d..3eeee84c1c46a65782e38b998bcd8142e08cbec1 100644 --- a/lite/backends/cuda/target_wrapper.h +++ b/lite/backends/cuda/target_wrapper.h @@ -39,13 +39,26 @@ class TargetWrapper { static void CreateStream(stream_t* stream) {} static void DestroyStream(const stream_t& stream) {} - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} + static void CreateEvent(event_t* event) { cudaEventCreate(event); } + static void CreateEventWithFlags( + event_t* event, unsigned int flags = cudaEventDisableTiming) { + cudaEventCreateWithFlags(event, flags); + } + static void DestroyEvent(const event_t& event) { cudaEventDestroy(event); } static void RecordEvent(const event_t& event) {} + static void RecordEvent(const event_t& event, const stream_t& stream) { + cudaEventRecord(event, stream); + } static void SyncEvent(const event_t& event) {} - static void StreamSync(const stream_t& stream) {} + static void StreamSync(const stream_t& stream) { + cudaStreamSynchronize(stream); + } + static void StreamSync(const stream_t& stream, const event_t& event) { + cudaStreamWaitEvent(stream, event, 0); + } + static void DeviceSync() { cudaDeviceSynchronize(); } static void* Malloc(size_t size); static void Free(void* ptr); diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index cbc65e41e2912df10fca00169cdc64ea832e7d03..004536fc8d1a6a64e97907f6a79db5a82bcd16c5 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -125,7 +125,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) { inline void save_float(float* data, const std::string& name, int len) { static int counter = 0; - std::string old_string = std::to_string(counter); + std::string old_string = paddle::lite::to_string(counter); std::string new_string = std::string(3 - old_string.length(), '0') + old_string; diff --git a/lite/backends/mlu/CMakeLists.txt b/lite/backends/mlu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..29c90b422044be4e6a7aa9f4a8da45018a41f11a --- /dev/null +++ b/lite/backends/mlu/CMakeLists.txt @@ -0,0 +1,7 @@ +if(NOT LITE_WITH_MLU) + return() +endif() + +message (STATUS "Lite with mlu backend") + +lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib) diff --git a/lite/backends/mlu/mlu_utils.h b/lite/backends/mlu/mlu_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..08dd355e8100a48363704168d264f6116ae58a79 --- /dev/null +++ b/lite/backends/mlu/mlu_utils.h @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +/* + * This file contains some MLU specific uitls. + */ + +#define CNRT_CALL(msg) \ + CHECK_EQ(static_cast(msg), CNRT_RET_SUCCESS) \ + << (msg) \ + << " MLU CNRT: " << cnrtGetErrorStr(static_cast(msg)) + +#define CNML_CALL(msg) \ + CHECK_EQ(static_cast(msg), CNML_STATUS_SUCCESS) \ + << (msg) << " MLU CNML: " \ + << ::paddle::lite::mlu::CnmlErrorInfo(static_cast(msg)) + +namespace paddle { +namespace lite { +namespace mlu { + +static const char* CnmlErrorInfo(int error) { + switch (error) { +#define LITE_CNML_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_CNML_ERROR_INFO(CNML_STATUS_NODEVICE); + LITE_CNML_ERROR_INFO(CNML_STATUS_SUCCESS); + LITE_CNML_ERROR_INFO(CNML_STATUS_DOMAINERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDARG); + LITE_CNML_ERROR_INFO(CNML_STATUS_LENGTHERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_OUTOFRANGE); + LITE_CNML_ERROR_INFO(CNML_STATUS_RANGEERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_OVERFLOWERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_UNDERFLOWERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDPARAM); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADALLOC); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADTYPEID); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADCAST); + LITE_CNML_ERROR_INFO(CNML_STATUS_UNSUPPORT); +#undef LITE_CNML_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +} // namespace mlu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..2385f69246a163830e0df855082d728da2743e02 --- /dev/null +++ b/lite/backends/mlu/target_wrapper.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
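The event and stream helpers added to the CUDA TargetWrapper earlier in this change (CreateEventWithFlags, RecordEvent(event, stream), StreamSync(stream, event)) are the primitives behind CUDAContext::Sync(); a hedged sketch of the underlying pattern with the raw CUDA runtime API, assuming both streams already exist:

#include <cuda_runtime.h>

// Sketch: make `consumer` wait for all work queued on `producer` so far,
// without blocking the host thread.
void WaitOnProducer(cudaStream_t producer, cudaStream_t consumer) {
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);  // as in CreateEventWithFlags
  cudaEventRecord(done, producer);                          // RecordEvent(event, stream)
  cudaStreamWaitEvent(consumer, done, 0);                   // StreamSync(stream, event)
  cudaEventDestroy(done);                                   // safe: release is deferred until the wait completes
}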
+ +#include "lite/backends/mlu/target_wrapper.h" + +#include + +#include "lite/backends/mlu/mlu_utils.h" + +namespace paddle { +namespace lite { +namespace mlu { + +void cnrtMemcpyHtoD(void* dst, const void* src, size_t size) { + CNRT_CALL(cnrtMemcpy( + dst, const_cast(src), size, CNRT_MEM_TRANS_DIR_HOST2DEV)) + << " cnrt memcpy htod failed"; +} + +void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) { + CNRT_CALL(cnrtMemcpy( + dst, const_cast(src), size, CNRT_MEM_TRANS_DIR_DEV2HOST)) + << " cnrt memcpy dtoh failed"; +} + +} // namespace mlu + +size_t TargetWrapperMlu::num_devices() { + uint32_t dev_count = 0; + CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed"; + LOG(INFO) << "Current MLU device count: " << dev_count; + return dev_count; +} + +void* TargetWrapperMlu::Malloc(size_t size) { + void* ptr{}; + CNRT_CALL(cnrtMalloc(&ptr, size)) << " cnrt malloc failed"; + // LOG(INFO) << "Malloc mlu ptr: " << ptr << " with size: " << size; + return ptr; +} + +void TargetWrapperMlu::Free(void* ptr) { + CNRT_CALL(cnrtFree(ptr)) << " cnrt free failed"; +} + +void TargetWrapperMlu::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + // LOG(INFO) << "dst: " << dst << " src: " << src << " size: " << size + //<< " dir: " << (int)dir; + switch (dir) { + case IoDirection::DtoD: { + std::unique_ptr cpu_tmp_ptr(new char[size]); + mlu::cnrtMemcpyDtoH(cpu_tmp_ptr.get(), src, size); + mlu::cnrtMemcpyHtoD(dst, cpu_tmp_ptr.get(), size); + break; + } + case IoDirection::HtoD: + mlu::cnrtMemcpyHtoD(dst, src, size); + break; + case IoDirection::DtoH: + mlu::cnrtMemcpyDtoH(dst, src, size); + break; + default: + LOG(FATAL) << "Unsupported IoDirection" << static_cast(dir); + } +} + +// void TargetWrapperMlu::MemcpyAsync(void* dst, +// const void* src, +// size_t size, +// IoDirection dir, +// const stream_t& stream) { +// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync."; +// MemcpySync(dst, src, size, dir); +// } + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..2d9e10806f78e56f50b04d408dab219c923456fc --- /dev/null +++ b/lite/backends/mlu/target_wrapper.h @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/backends/mlu/mlu_utils.h" +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperMlu = TargetWrapper; + +template <> +class TargetWrapper { + public: + using queue_t = cnrtQueue_t; + + static size_t num_devices(); + static size_t maxinum_queue() { return 0; } // TODO(zhangshijin): fix out it. 
+ + static size_t GetCurDevice() { return 0; } + + static void CreateQueue(queue_t* queue) {} + static void DestroyQueue(const queue_t& queue) {} + + static void QueueSync(const queue_t& queue) {} + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); + // static void MemcpyAsync(void* dst, + // const void* src, + // size_t size, + // IoDirection dir, + // const queue_t& queue); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index d62ac9cad3e5ab4e6f63e3b667e3fa93e244fec1..f9803aa8810ada33b9eecafe1502515501514e41 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -14,15 +14,50 @@ #include "lite/backends/npu/device.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/io.h" namespace paddle { namespace lite { namespace npu { -std::unique_ptr Device::Build( - std::string& model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes // NOLINT +bool WriteToOMFile(const domi::ModelBufferData& om_model_buff, + std::string om_file_path) { + FILE* fp; + fp = fopen(om_file_path.c_str(), "wb"); + CHECK(fp != nullptr) << om_file_path << " open failed!"; + + uint32_t write_size = + (uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp); + CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !"; + + fclose(fp); + return true; +} + +bool ReadFromOMFile(domi::ModelBufferData* om_model_buff, + std::string om_file_path) { + FILE* fp; + fp = fopen(om_file_path.c_str(), "rb"); + CHECK(fp != nullptr) << om_file_path << " open failed!"; + + fseek(fp, 0, SEEK_END); + uint32_t model_length = (uint32_t)ftell(fp); + fseek(fp, 0, SEEK_SET); + om_model_buff->data = malloc(model_length); + om_model_buff->length = model_length; + uint32_t read_size = + (uint32_t)fread(om_model_buff->data, 1, model_length, fp); + CHECK_EQ(read_size, model_length) << "read om file failed !"; + + fclose(fp); + return true; +} + +std::shared_ptr Device::Build( + const std::string model_name, // NOLINT + std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + const std::string model_cache_full_dir = "" // NOLINT ) { VLOG(3) << "[NPU] Build model"; // Build the HiAI IR graph to the HiAI om model @@ -32,24 +67,34 @@ std::unique_ptr Device::Build( om_model.SetGraph(ir_graph); domi::HiaiIrBuild ir_build; domi::ModelBufferData om_model_buf; - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return nullptr; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - ir_build.ReleaseModelBuff(om_model_buf); - return nullptr; + + if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) { + VLOG(3) << "Will read om model from " << model_cache_full_dir; + ReadFromOMFile(&om_model_buf, model_cache_full_dir); + } else { + if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { + LOG(WARNING) << "[NPU] CreateModelBuff failed!"; + return nullptr; + } + if (!ir_build.BuildIRModel(om_model, om_model_buf)) { + LOG(WARNING) << "[NPU] BuildIRModel failed!"; + ir_build.ReleaseModelBuff(om_model_buf); + return nullptr; + } + if (!model_cache_full_dir.empty()) { + VLOG(3) << "Will write om model to " << model_cache_full_dir; + WriteToOMFile(om_model_buf, model_cache_full_dir); + } } + // Create a HiAI model manager client to load the HiAI om model - 
std::unique_ptr model_client( + std::shared_ptr model_client( new hiai::AiModelMngerClient()); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - model_name = "model_" + std::to_string(model_count_++) + ".om"; auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index 411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a..fa8469bf2ebe8e148080f0dc82b4cdf62dc9f75a 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -40,18 +40,18 @@ class Device { // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. - std::unique_ptr Build( - std::string& model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes // NOLINT - ); // NOLINT + std::shared_ptr Build( + const std::string model_name, // NOLINT + std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + const std::string model_cache_name // NOLINT + ); // NOLINT private: int freq_level_{3}; int framework_type_{0}; int model_type_{0}; int device_type_{0}; - int model_count_{0}; }; } // namespace npu diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt index dd7f6b417e0d6416eec9bb3e60ef088432776112..0ac8cf310370f34ae5743113efe1d71579979daf 100644 --- a/lite/backends/opencl/CMakeLists.txt +++ b/lite/backends/opencl/CMakeLists.txt @@ -2,17 +2,16 @@ if (NOT LITE_WITH_OPENCL) return() endif() +lite_cc_library(opencl_kernels_source_cc SRCS opencl_kernels_source.cc) lite_cc_library(cl_wrapper SRCS cl_wrapper.cc) lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper) -lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility) +lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility opencl_kernels_source_cc) lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime) -lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor) +lite_cc_library(cl_half SRCS cl_half.cc) +lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor cl_half) lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) -lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) +lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) add_dependencies(cl_wrapper opencl_clhpp) diff --git a/lite/backends/opencl/cl_caller.cc b/lite/backends/opencl/cl_caller.cc index 6b9cab1056beaa6f516a0d3a202a7816c911f1b2..8421c784d5da224eacaaa9461b737eed1b4bdd4e 100644 --- a/lite/backends/opencl/cl_caller.cc +++ b/lite/backends/opencl/cl_caller.cc @@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context, int width = cl_image.image_dims()[0]; int height = cl_image.image_dims()[1]; - float* image_data = new float[height * width * 4]; + uint16_t* image_data = new uint16_t[height * 
width * 4]; cl::Image* image = cl_image.cl_image(); cl::array origin = {0, 0, 0}; cl::array region = { @@ -46,9 +46,8 @@ static void CopyImageData(CLContext* context, delete[] image_data; } -bool InitOpenCLRuntime(std::string cl_path) { +bool InitOpenCLRuntime() { auto* runtime = CLRuntime::Global(); - runtime->set_cl_path(cl_path); return runtime->IsInitSuccess(); } diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h index 1817db9f6bd6d9ecf21978b8293bd9534328de0f..d1f1429e44f8872852797dadcbf2f82c1c9c0269 100644 --- a/lite/backends/opencl/cl_caller.h +++ b/lite/backends/opencl/cl_caller.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace lite { -bool InitOpenCLRuntime(std::string cl_path); +bool InitOpenCLRuntime(); } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 0fcb99486eac57e36ee548b809f8f141e0807db8..67d679fdd596b109b714bf7ba3cd45b2632b9420 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,8 +38,7 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, return *(it->second); } - auto program = CLRuntime::Global()->CreateProgram( - GetContext(), CLRuntime::Global()->cl_path() + "/cl_kernel/" + file_name); + auto program = CLRuntime::Global()->CreateProgram(GetContext(), file_name); VLOG(3) << " --- begin build program -> " << program_key << " --- "; CLRuntime::Global()->BuildProgram(program.get(), options); @@ -55,19 +51,20 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, void CLContext::AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options) { + const std::string &options, + const std::string &time_stamp) { cl_int status{CL_SUCCESS}; VLOG(3) << " --- to get program " << file_name << " --- "; auto program = GetProgram(file_name, options); VLOG(3) << " --- end get program --- "; VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; - std::unique_ptr kernel( + std::shared_ptr kernel( new cl::Kernel(program, kernel_name.c_str(), &status)); CL_CHECK_FATAL(status); VLOG(3) << " --- end create kernel --- "; kernels_.emplace_back(std::move(kernel)); STL::stringstream kernel_key; - kernel_key << kernel_name << options; + kernel_key << kernel_name << options << time_stamp; kernel_offset_[kernel_key.str()] = kernels_.size() - 1; } @@ -122,5 +119,115 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } +cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divisor) { + int preferred_lws = 0; +#if 1 + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; +#else + auto gws2 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws0 = global_work_size[2]; +#endif + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + 
} + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; + } + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; + } + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; + } +#if 1 + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; +#else + return cl::NDRange{static_cast(gws2), + static_cast(gws1), + static_cast(gws0)}; +#endif +} +cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + size_t max_work_size, + int divisor) { + int preferred_lws = 0; +#if 0 + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; +#else + auto gws2 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws0 = global_work_size[2]; +#endif + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; + } + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; + } + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; + } +#if 0 + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; +#else + return cl::NDRange{static_cast(gws2), + static_cast(gws1), + static_cast(gws0)}; +#endif +} + +bool CLContext::IsArmMali() { + return CLRuntime::Global()->GetGpuType() == GpuType::ARM_MALI; +} + +cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, + size_t max_work_size) { + int preferred_lws = 0; + int divisor = 2; + + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; + + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; + } + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; + } + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? 
gws0 / 2 : 1; + } + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index a28f82f40ecd70a38fcd179e3c7dedfb02a6bcd1..69ae11a8d71cc8c3dcae2b7ba81b4e19b44d1abe 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -27,6 +27,22 @@ namespace lite { class CLContext { public: + ~CLContext() { + GetCommandQueue().finish(); + for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { + // Note(ysh329): Don't need `clReleaseKernel` + kernels_[kidx].reset(); + } + kernels_.clear(); + kernel_offset_.clear(); + for (auto &p : programs_) { + // Note(ysh329): Dont't need `clReleaseProgram` + p.second.reset(); + } + programs_.clear(); + LOG(INFO) << "release cl::Program, cl::Kernel finished."; + } + cl::CommandQueue &GetCommandQueue(); cl::Context &GetContext(); @@ -36,7 +52,8 @@ class CLContext { void AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options = ""); + const std::string &options = "", + const std::string &time_stamp = ""); cl::Kernel &GetKernel(const int index); @@ -44,9 +61,21 @@ class CLContext { cl::NDRange DefaultWorkSize(const CLImage &image); + cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); + + cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divitor = 2); + cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + size_t max_work_size, + int divitor = 2); + bool IsArmMali(); + // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, + // size_t max_work_size); + private: std::unordered_map> programs_; - std::vector> kernels_; + std::vector> kernels_; std::map kernel_offset_; }; diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc index 70f47b47946641edf4d023437b48d46cae93ca6e..17c879269cb745481cd2b474833e71f7417e7bad 100644 --- a/lite/backends/opencl/cl_functions_test.cc +++ b/lite/backends/opencl/cl_functions_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include @@ -26,22 +25,18 @@ limitations under the License. 
*/ #include "lite/core/tensor.h" #include "lite/utils/cp_logging.h" -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); - namespace paddle { namespace lite { TEST(cl_test, runtime_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); runtime->platform(); runtime->device(); runtime->command_queue(); auto &context = runtime->context(); - auto program = runtime->CreateProgram( - context, - runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl"); + auto program = + runtime->CreateProgram(context, "buffer/elementwise_add_kernel.cl"); auto event = runtime->CreateEvent(context); const std::string build_option("-DCL_DTYPE_float"); CHECK(runtime->BuildProgram(program.get(), build_option)); @@ -50,7 +45,6 @@ TEST(cl_test, runtime_test) { TEST(cl_test, context_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); CLContext context; context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float"); context.AddKernel( @@ -62,7 +56,6 @@ TEST(cl_test, context_test) { TEST(cl_test, kernel_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); std::unique_ptr context(new CLContext); context->AddKernel( "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); @@ -107,21 +100,23 @@ TEST(cl_test, kernel_test) { size_t width = in_image.ImageWidth(); size_t height = in_image.ImageHeight(); auto global_work_size = cl::NDRange{width, height}; - cl::Event event; status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); + kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); CL_CHECK_FATAL(status); status = context->GetCommandQueue().finish(); CL_CHECK_FATAL(status); +#if 0 double start_nanos = event.getProfilingInfo(); double stop_nanos = event.getProfilingInfo(); double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; +#endif + LOG(INFO) << out_image; } TEST(cl_test, target_wrapper_buffer_test) { - bool inited = InitOpenCLRuntime(FLAGS_cl_path); + bool inited = InitOpenCLRuntime(); CHECK(inited) << "Fail to initialize OpenCL runtime."; std::unique_ptr context(new CLContext); std::string kernel_name = "elementwise_add"; diff --git a/lite/backends/opencl/cl_half.cc b/lite/backends/opencl/cl_half.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f27cae549c30eb7295a7c9490d9fb106883dda7 --- /dev/null +++ b/lite/backends/opencl/cl_half.cc @@ -0,0 +1,518 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "lite/backends/opencl/cl_half.h" + +namespace paddle { +namespace lite { + +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + +static const uint32_t mantissatable[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, + 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, + 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, + 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, + 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, + 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, + 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, + 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, + 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, + 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, + 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, + 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, + 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, + 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, + 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, + 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, + 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, + 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, + 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, + 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, + 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, + 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, + 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 
0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, + 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, + 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, + 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, + 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, + 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, + 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, + 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, + 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, + 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, + 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, + 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, + 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, + 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, + 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, + 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, + 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, + 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, + 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, + 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, + 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, + 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, + 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, + 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, + 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, + 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, + 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, + 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, + 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, + 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, + 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, + 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, + 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, + 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, + 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, + 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, + 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, + 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, + 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, + 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, + 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, + 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, + 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, + 0x380b8000, 
0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, + 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, + 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, + 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, + 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, + 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, + 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, + 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, + 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, + 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, + 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, + 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, + 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, + 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, + 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, + 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, + 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, + 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, + 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, + 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, + 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, + 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, + 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, + 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, + 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, + 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, + 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, + 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, + 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, + 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, + 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, + 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, + 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, + 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, + 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, + 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, + 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, + 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, + 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, + 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, + 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, + 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, + 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, + 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, + 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, + 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, + 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, + 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, + 0x38538000, 
0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, + 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, + 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, + 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, + 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, + 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, + 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, + 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, + 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, + 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, + 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, + 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, + 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, + 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, + 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, + 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, + 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, + 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, + 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, + 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, + 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, + 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, + 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, + 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, + 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, + 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, + 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, + 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, + 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, + 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, + 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, + 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, + 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, + 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, + 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, + 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, + 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, + 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, + 0x380dc000, 
0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, + 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, + 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, + 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, + 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, + 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, + 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, + 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, + 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, + 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, + 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, + 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, + 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, + 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, + 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, + 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, + 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, + 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, + 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, + 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, + 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, + 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, + 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, + 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, + 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, + 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, + 0x3831c000, 
0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, + 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, + 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, + 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, + 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, + 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, + 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, + 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, + 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, + 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, + 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, + 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, + 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, + 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, + 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, + 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, + 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, + 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, + 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, + 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, + 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, + 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, + 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, + 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, + 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, + 0x3855c000, 
0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, + 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, + 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, + 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, + 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, + 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, + 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, + 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, + 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, + 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, + 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, + 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, + 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, + 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, + 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, + 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, + 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, + 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, + 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, + 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, + 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, + 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, + 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, + 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, + 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, + 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, + 0x3879c000, 
0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, + 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, + 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, + 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, + 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, + 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, + 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, + 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, + 0x387fc000, 0x387fe000}; + +static const uint16_t offsettable[64] = { + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; + +static const uint32_t exponenttable[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, + 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, + 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, + 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, + 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, + 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, + 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, + 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; + +static const uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, + 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, + 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, + 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, + 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, + 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static const uint8_t shifttable[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 
0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; + +half_t Float2Half(float f) { + uint32_t v = *reinterpret_cast(&f); + return basetable[(v >> 23) & 0x1ff] + + ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); +} + +float Half2Float(half_t h) { + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + + exponenttable[h >> 10]; + return *reinterpret_cast(&v); +} + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { + for (int i = 0; i < count; ++i) { + h_array[i] = Float2Half(f_array[i]); + } +} + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { + for (int i = 0; i < count; ++i) { + f_array[i] = Half2Float(h_array[i]); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl b/lite/backends/opencl/cl_half.h similarity index 52% rename from lite/backends/opencl/cl_kernel/image/relu_kernel.cl rename to lite/backends/opencl/cl_half.h index 43a27067c2f2c418d314f9bce95bccbbb51a9be0..0dcf325db2bc13b8fff68f1e777d4680d937abce 100644 --- a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl +++ b/lite/backends/opencl/cl_half.h @@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 
or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include <cl_common.h> +#pragma once +#include <cstdint> -__kernel void relu(__read_only image2d_t input, - __write_only image2d_t output) { +namespace paddle { +namespace lite { - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height +typedef uint16_t half_t; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; +half_t Float2Half(float f); - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = max((CL_DTYPE4)(0.0f), in); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); -} +float Half2Float(half_t h); + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_image.cc b/lite/backends/opencl/cl_image.cc index b67f4040bff4cac15624c1440ca741d2b9dfa6ba..1e21b3d03a4a231f4bb171e83f4038e7922fe19a 100644 --- a/lite/backends/opencl/cl_image.cc +++ b/lite/backends/opencl/cl_image.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "lite/backends/opencl/cl_image.h" +#include +#include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_utility.h" #include "lite/utils/cp_logging.h" @@ -24,7 +26,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { int width = cl_image.image_dims_[0]; int height = cl_image.image_dims_[1]; - float* image_data = new float[height * width * 4]; + uint16_t* image_data = new uint16_t[height * width * 4]; cl::Image* image = cl_image.cl_image(); cl::array<size_t, 3> origin = {0, 0, 0}; @@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { int stride = cl_image.numel() / 20; stride = stride > 0 ?
stride : 1; - os << " dims: " << cl_image.tensor_dims_ << "\n"; + os << " dims: "; // << cl_image.tensor_dims_ << "\n"; for (int i = 0; i < cl_image.numel(); i += stride) { os << tensor_data[i] << " "; } @@ -123,7 +125,7 @@ void CLImage::InitCLImage(const cl::Context& context, VLOG(3) << " begin init cl image "; image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); - float* image_data = new float[image_dims_.production() * 4]; + uint16_t* image_data = new uint16_t[image_dims_.production() * 4]; VLOG(3) << " convert to image "; converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc index 402f710d7a226de089134b4abc41dc41027e0da1..2cfcc5dc81576973ef20fc0855131472ec2c0977 100644 --- a/lite/backends/opencl/cl_image_converter.cc +++ b/lite/backends/opencl/cl_image_converter.cc @@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterDefault::NCHWToImage(float *nchw, - float *image, + half_t *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -69,11 +69,11 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, if (c < C) { // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = *p; + image[i2] = Float2Half(*p); i2 += 4; p++; } else { - image[i2] = 0.0; + image[i2] = Float2Half(0.f); i2 += 4; } } @@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, } } -void CLImageConverterDefault::ImageToNCHW(float *image, +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = image[i2]; + *p = Half2Float(image[i2]); i2 += 4; p++; } @@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterFolder::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, for (size_t h = 0; h < tdim[0]; h++) { for (size_t w = 0; w < tdim[1]; w++) { - image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * tdim[1] + w]); } } } } -void CLImageConverterFolder::ImageToNCHW(float *image, +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { for (size_t w = 0; w < W; w++) { - p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); } } } @@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterNWBlock::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; auto image_dim = InitImageDimInfoWith(tensor_dim); @@ -257,10 +258,10 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; 
if (n < N) { - image[index] = *p; + image[index] = Float2Half(*p); p++; } else { - image[index] = 0.0; + image[index] = Float2Half(0.f); } if (index >= (width * height * 4)) { LOG(INFO) << " index out of range "; @@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, VLOG(3) << " init done"; } -void CLImageConverterNWBlock::ImageToNCHW(float *image, +void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image, for (size_t w = 0; w < W; ++w) { size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; - *p = image[index]; + *p = Half2Float(image[index]); p++; if (index >= (width * height * 4)) { LOG(INFO) << " index out of range "; @@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterDWBlock::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, if (c < C) { // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = *p; + image[i2] = Float2Half(*p); i2 += 4; p++; } else { @@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, } } -void CLImageConverterDWBlock::ImageToNCHW(float *image, +void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = image[i2]; + *p = Half2Float(image[i2]); i2 += 4; p++; } @@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterNormal::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, default_converter.NCHWToImage(tensor, image, tensor_dim); } -void CLImageConverterNormal::ImageToNCHW(float *image, +void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( } void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) {} -void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, +void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) {} diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h index 962eb8d3ef35bdb603aa4a56181b1124885d5506..bb8602f6adae377f21c8fe92448e8feae64a773f 100644 --- a/lite/backends/opencl/cl_image_converter.h +++ b/lite/backends/opencl/cl_image_converter.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "lite/backends/opencl/cl_half.h" #include "lite/core/tensor.h" namespace paddle { @@ -24,10 +25,10 @@ class CLImageConverterBase { virtual ~CLImageConverterBase() {} virtual void NCHWToImage(float *nchw, - float *image, + half_t *image, const DDim &tensor_dim) = 0; - virtual void ImageToNCHW(float *image, + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, const DDim &tensor_dim) = 0; @@ -37,8 +38,8 @@ class CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim) override; + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index b8dbf62c06f85ef6237378d8ceab37f8fa2cd69f..a14748c69f3eafce515c90f2b8a226703fe5883d 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -91,11 +91,7 @@ void gemm_batch_naive(__global const CL_DTYPE* a, c0 += a0 * b0; } -#ifdef RELU cur_c[row * N + col] = activation(c0); -#else - cur_c[row * N + col] = c0; -#endif } @@ -103,7 +99,7 @@ void gemm_batch_naive(__global const CL_DTYPE* a, // a: filter_d // b: x_d // c: output_d - +#if 0 // TODO(ysh239): cause CL_OUT_OF_HOST_MEMORY on some devices(such 
as snapdragon 855) //#define PRINT_KERNEL __kernel void gemm_batch(__global const CL_DTYPE* Aptr, @@ -213,7 +209,7 @@ void gemm_batch(__global const CL_DTYPE* Aptr, } } } - +#endif // fc_gemv_naive: keep for check // used for fc with M = 1 @@ -259,7 +255,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, const int col = get_global_id(0) << 2; // gws[0]: [0, N >> 2) height of B == N if (col + 3 < N) { - CL_DTYPE4 c0 = 0.0f; + half4 c0 = 0.0f; if (bias) { c0.x = bias[col]; c0.y = bias[col+1]; @@ -270,11 +266,12 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, // main loop of K int p = 0; for (; p < K - 3; p += 4) { - CL_DTYPE4 a0 = vload4(0, a + p); - CL_DTYPE4 b0 = vload4(0, b + p * N + col); - CL_DTYPE4 b1 = vload4(0, b + (p+1) * N + col); - CL_DTYPE4 b2 = vload4(0, b + (p+2) * N + col); - CL_DTYPE4 b3 = vload4(0, b + (p+3) * N + col); + half4 a0 = convert_half4(vload4(0, a + p)); + + half4 b0 = convert_half4(vload4(0, b + p * N + col)); + half4 b1 = convert_half4(vload4(0, b + (p+1) * N + col)); + half4 b2 = convert_half4(vload4(0, b + (p+2) * N + col)); + half4 b3 = convert_half4(vload4(0, b + (p+3) * N + col)); c0 += a0.x * b0; c0 += a0.y * b1; @@ -283,21 +280,21 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } // compute left K - CL_DTYPE4 b2 = 0.0f, - b1 = 0.0f, - b0 = 0.0f, - a0 = 0.0f; + half4 b2 = 0.0f, + b1 = 0.0f, + b0 = 0.0f, + a0 = 0.0f; switch (K - p) { case 3: { - b2 = vload4(0, b + (p+2) * N + col); + b2 = convert_half4(vload4(0, b + (p+2) * N + col)); a0.z = a[p + 2]; } case 2: { - b1 = vload4(0, b + (p+1) * N + col); + b1 = convert_half4(vload4(0, b + (p+1) * N + col)); a0.y = a[p + 1]; } case 1: { - b0 = vload4(0, b + (p) * N + col); + b0 = convert_half4(vload4(0, b + (p) * N + col)); a0.x = a[p]; } } @@ -308,7 +305,8 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, // store res #ifdef RELU if (col % 4 == 0) { - vstore4(fmax(c0, (CL_DTYPE4)0.f), 0, c + col); + float4 act_res = convert_float4(fmax(c0, (half4)0.f)); + vstore4(act_res, 0, c + col); } else { switch (col % 4) { case 3: @@ -321,7 +319,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } #else if (col % 4 == 0) { - vstore4(c0, 0, c + col); + vstore4(convert_float4(c0), 0, c + col); } else { switch (col % 4) { case 3: @@ -336,10 +334,10 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } else { const int left_col = N - col; for (int col_offset = 0; col_offset < left_col; ++col_offset) { - CL_DTYPE c0 = bias ? bias[col] : 0; + half c0 = bias ? bias[col] : 0; for (int p = 0; p < K; ++p) { - CL_DTYPE b0 = *(b + p * N + col + col_offset); - CL_DTYPE a0 = *(a + p); + half b0 = *(b + p * N + col + col_offset); + half a0 = *(a + p); c0 += a0 * b0; } #ifdef RELU @@ -366,18 +364,18 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, const int col = get_global_id(1) << 2; // id: [0, N>>2) width of out == N if (row+3 < M && col+3 < N) { - CL_DTYPE bias0 = bias ? bias[col] : 0, - bias1 = bias ? bias[col+1] : 0, - bias2 = bias ? bias[col+2] : 0, - bias3 = bias ? bias[col+3] : 0; + CL_COMPUTE_DTYPE bias0 = bias ? bias[col] : 0, + bias1 = bias ? bias[col+1] : 0, + bias2 = bias ? bias[col+2] : 0, + bias3 = bias ? 
bias[col+3] : 0; - CL_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, - c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, - c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, - c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; + CL_COMPUTE_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, + c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, + c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, + c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; for (int p = 0; p < K; ++p) { - CL_DTYPE + CL_COMPUTE_DTYPE a00 = *(a + row * K + p), a10 = *(a + (row + 1) * K + p), a20 = *(a + (row + 2) * K + p), @@ -407,7 +405,7 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl index fe71f4c6ff8856ca679f2e6b29fc20a0d64da9ac..8d3456fa66973b04eaf24a04a42615790a133ddb 100644 --- a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl @@ -15,6 +15,8 @@ limitations under the License. */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define CL_DTYPE float +#include + __kernel void im2col(__global const CL_DTYPE* data_im, const int img_offset, const int col_chw, diff --git a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl deleted file mode 100644 index 532f947dd342b1ee4db69a084111a97ec014237f..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -// buffer -> image2d -__kernel void buffer_to_image2d(__global CL_DTYPE *in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_C, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c0 = out_c * 4 + 0; - const int in_c1 = out_c * 4 + 1; - const int in_c2 = out_c * 4 + 2; - const int in_c3 = out_c * 4 + 3; - const int in_h = out_h; - const int in_w = out_w; - - int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - CL_DTYPE4 output = (CL_DTYPE4)0.0f; - output.x = convert_float(in[input_pos0]); - if(out_C - 4 * out_c >= 2){ - output.y = convert_float(in[input_pos1]); - } - if(out_C - 4 * out_c >= 3){ - output.z = convert_float(in[input_pos2]); - } - if(out_C - 4 * out_c >= 4){ - output.w = convert_float(in[input_pos3]); - } - write_imagef(output_image, output_pos, output); -} - -// buffer -> image2d_nw -__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_N, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2) { - const int out_n = get_global_id(0); - const int out_w = get_global_id(1); - const int out_ch = get_global_id(2); - - const int out_c = out_ch / out_H; - const int out_h = out_ch % out_H; - - const int in_c = out_c; // index of c in h direction - - const int in_n0 = out_n * 4 + 0; - const int in_n1 = out_n * 4 + 1; - const int in_n2 = out_n * 4 + 2; - const int in_n3 = out_n * 4 + 3; - - const int in_h = out_h; - const int in_w = out_w; - - int input_pos0 = in_n0 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_n * out_W + out_w; - output_pos.y = out_ch; - - CL_DTYPE4 output = (CL_DTYPE4)0.0f; - output.x = convert_float(in[input_pos0]); - if (out_N - 4 * out_n >= 2) { - output.y = convert_float(in[input_pos1]); - } - if (out_N - 4 * out_n >= 3) { - output.z = convert_float(in[input_pos2]); - } - if (out_N - 4 * out_n >= 4) { - output.w = convert_float(in[input_pos3]); - } - write_imagef(output_image, output_pos, output); -} - - - -// image2d -> buffer -__kernel void image2d_to_buffer(__read_only image2d_t input, - __private const int in_width, - __private const int in_height, - __global CL_DTYPE* out, - __private const int size_ch, - __private const int size_block, - __private const int size_batch, - __private const int C) { - const int in_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = get_global_id(2); - const int in_n = in_nh / in_height; - const int in_h = in_nh % in_height; - - const sampler_t sampler = - 
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const int pos_x = mad24(in_c, in_width, in_w); - CL_DTYPE4 in = read_imagef(input, sampler, (int2)(pos_x, in_nh)); - - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; - out[index] = convert_float(in.x); - if (C - 4 * in_c >= 2) { - out[index + size_ch] = convert_float(in.y); - } - if(C - 4 * in_c >= 3) { - out[index + size_ch * 2] = convert_float(in.z); - } - if(C - 4 * in_c >= 4) { - out[index + size_ch * 3] = convert_float(in.w); - } -} - -// image2d -> buffer -__kernel void image2d_to_buffer_2d(__private const int in_height, - __private const int in_width, - __read_only image2d_t input, - __global CL_DTYPE* out) { - const int in_w = get_global_id(1); - const int in_h = get_global_id(2); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - CL_DTYPE4 in = read_imagef(input, sampler, (int2)(in_w, in_h)); - - const int index = (in_h * in_width + in_w) * 4; - out[index] = convert_float(in.x); - out[index + 1] = convert_float(in.y); - out[index + 2] = convert_float(in.z); - out[index + 3] = convert_float(in.w); -} diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index c127c6cec79cb2eb8d82ce6aa6190b23d373ff64..b427eb70d6cdbb5cd495e970fb77c4790bc01723 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once ///////////////////////////////// @@ -29,11 +28,15 @@ limitations under the License. */ #ifdef CL_DTYPE_float #define CL_DTYPE float #define CL_DTYPE_CHAR f +#define CL_COMPUTE_DTYPE half +#define CL_COMPUTE_DTYPE_CHAR h #endif #ifdef CL_DTYPE_half #define CL_DTYPE half #define CL_DTYPE_CHAR h +#define CL_COMPUTE_DTYPE half +#define CL_COMPUTE_DTYPE_CHAR h #endif ///////////////////////////////// @@ -43,6 +46,7 @@ limitations under the License. */ #define GET_VEC_TYPE(type__, size__) type__##size__ #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) +#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4) ///////////////////////////////// // CONVERT_TYPE_TO @@ -103,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in #endif #ifdef RELU6 - output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); + in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); + output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in); #endif return output; } diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..a4070f747aec43f7a0ed097f9b15186cafd32476 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -0,0 +1,139 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void relu(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = max((CL_DTYPE4)(0.0f), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + +__kernel void relu6(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); + in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + +__kernel void sigmoid(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out; + + out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x)))); + out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y)))); + out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z)))); + out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w)))); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void leaky_relu(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; + if (in.x < 0.0f) { + in.x = s_val.x; + } + if (in.y < 0.0f) { + in.y = s_val.y; + } + if (in.z < 0.0f) { + in.z = s_val.z; + } + if (in.w < 0.0f) { + in.w = s_val.w; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + +__kernel void tanh_act(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + 
exp(-in)); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void exp_act(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = exp(in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void swish(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} diff --git a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..515bf57487ffd93959929ea93f76b0fdd888c4a5 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + + +__kernel void bilinear_interp(__read_only image2d_t input, + __write_only image2d_t output, + __private const float scale_h, + __private const float scale_w, + __private const float align_delta, + __private const int in_dims_h, + __private const int in_dims_w, + __private const int out_dims_h, + __private const int out_dims_w){ + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + + int2 output_pos; + output_pos.x = c * out_dims_w + w; + output_pos.y = nh; + + // calculate center pixel's pos + int out_n = nh / out_dims_h; + int out_h = nh % out_dims_h; + float center_w = (w + align_delta) * scale_w - align_delta; + float center_h = (out_h + align_delta) * scale_h - align_delta; + + int floor_w = (int)center_w; + int floor_h = (int)center_h; + int ceil_w = floor_w + 1; + int ceil_h = floor_h + 1; + if (floor_w < 0){ + floor_w = 0; + } + if (floor_h < 0){ + floor_h = 0; + } + if (ceil_w > in_dims_w - 1) { + ceil_w = in_dims_w - 1; + } + if (ceil_h > in_dims_h - 1) { + ceil_h = in_dims_h- 1; + } + CL_DTYPE wight0_w = center_w - floor_w; + CL_DTYPE wight0_h = center_h - floor_h; + CL_DTYPE wight1_w = 1.0 - wight0_w; + CL_DTYPE wight1_h = 1.0 - wight0_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + // get left up pixel data + int2 left_up; + left_up.x = c * in_dims_w + floor_w; + left_up.y = out_n * in_dims_h + ceil_h; + CL_DTYPE4 left_up_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_up); + + + // get left down pixel data + int2 left_down; + left_down.x = c * in_dims_w + floor_w; + left_down.y = out_n * in_dims_h + floor_h; + CL_DTYPE4 left_down_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_down); + + // get right up pixel data + int2 right_up; + right_up.x = c * in_dims_w + ceil_w; + right_up.y = out_n * in_dims_h + ceil_h; + CL_DTYPE4 right_up_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_up); + + // get right down pixel's data + int2 right_down; + right_down.x = c * in_dims_w + ceil_w; + right_down.y = out_n * in_dims_h + floor_h; + CL_DTYPE4 right_down_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_down); + + // calculate output data + CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h + + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h; + + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out); +} diff --git a/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..72b0b66f9737ce0ca9c740e6d4e399d06eaf2cd8 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void decode_center_size(__read_only image2d_t prior_box_image, + __read_only image2d_t prior_box_var_image, + __read_only image2d_t target_box_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H){ + const int out_c = get_global_id(0); + const int out_nh = get_global_id(1); + const int out_h = out_nh % out_H; + const int out_n = 1; + + const int prior_box_n = 1; + const int prior_box_c = 0; + const int prior_box_h = out_h; + + const int prior_box_var_n = 1; + const int prior_box_var_c = 0; + const int prior_box_var_h = out_h; + + const int target_box_n = 1; + const int target_box_c = out_c; + const int target_box_h = out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + int2 prior_box_pos; + int2 prior_box_var_pos; + int2 target_box_pos; + int2 output_pos; + + prior_box_pos.x = prior_box_c * 4; + prior_box_pos.y = prior_box_n * prior_box_h; + + prior_box_var_pos.x = prior_box_var_c * 4; + prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; + + target_box_pos.x = target_box_c * 4; + target_box_pos.y = target_box_n * target_box_h; + + output_pos.x = out_c * 4; + output_pos.y = out_n * out_h; + + CL_DTYPE4 prior_box_input[4]; + CL_DTYPE4 prior_box_var_input[4]; + CL_DTYPE4 target_box_input[4]; + + prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 0, prior_box_pos.y)); + prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 1, prior_box_pos.y)); + prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 2, prior_box_pos.y)); + prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 3, prior_box_pos.y)); + + prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y)); + prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y)); + prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y)); + prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y)); + + target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 0,target_box_pos.y)); + target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 1, target_box_pos.y)); + target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 2, target_box_pos.y)); + target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 3, target_box_pos.y)); + + CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x; + CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x; + CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2; + CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2; + + CL_DTYPE4 target_box_center_x; + CL_DTYPE4 target_box_center_y; + CL_DTYPE4 target_box_width; + CL_DTYPE4 target_box_height; + CL_DTYPE4 output[4]; + + output[0] = 0.0f; + output[1] = 0.0f; + output[2] = 0.0f; + output[3] = 0.0f; + + target_box_center_x.x = 
prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; + target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; + target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width; + target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; + + output[0].x = target_box_center_x.x - target_box_width.x/(half)2; + output[1].x = target_box_center_y.x - target_box_height.x/(half)2; + output[2].x = target_box_center_x.x + target_box_width.x/(half)2; + output[3].x = target_box_center_y.x + target_box_height.x/(half)2; + + if(out_C - out_c * 4 >= 2){ + target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; + target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; + target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; + target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; + output[0].y = target_box_center_x.y - target_box_width.y/(half)2; + output[1].y = target_box_center_y.y - target_box_height.y/(half)2; + output[2].y = target_box_center_x.y + target_box_width.y/(half)2; + output[3].y = target_box_center_y.y + target_box_height.y/(half)2; + } + if(out_C - out_c * 4 >= 3){ + target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; + target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; + target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; + target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; + output[0].z = target_box_center_x.z - target_box_width.z/(half)2; + output[1].z = target_box_center_y.z - target_box_height.z/(half)2; + output[2].z = target_box_center_x.z + target_box_width.z/(half)2; + output[3].z = target_box_center_y.z + target_box_height.z/(half)2; + } + if(out_C - out_c * 4 >= 4){ + target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; + target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; + target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width; + target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; + output[0].w = target_box_center_x.w - target_box_width.w/(half)2; + output[1].w = target_box_center_y.w - target_box_height.w/(half)2; + output[2].w = target_box_center_x.w + target_box_width.w/(half)2; + output[3].w = target_box_center_y.w + target_box_height.w/(half)2; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); +} diff --git a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl index f0335116f87aac34740dd22ac68f2b6265e62445..40cc52d54d0a9847ea71b017bdd3c633c74faa89 100644 --- a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl 
+++ b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,50 +12,153 @@ limitations under the License. */ #include __kernel void concat2(__read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, - int axis_size, int flag, int width) { - const int x = get_global_id(0); // image_width cxw/4 - const int y = get_global_id(1); // image_height nxh + __read_only image2d_t input1, + __write_only image2d_t output, + int flag, int C_0, int out_C, int out_W, int width) { + const int out_w = get_global_id(0); // image_width cxw/4 + const int out_c = get_global_id(1); // image_width cxw/4 + const int out_nh = get_global_id(2); // image_height nxh const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int xx = x / width; - if (flag == 0){ - xx = y / width; + if (flag == 1){ // by channel + int c_in = out_c; + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + CL_DTYPE4 output_data; + for (int i = 0; i < 4; i++) { + int c = out_c * 4 + i; + if (c >= out_C) { + break; + } + int c_in; + CL_DTYPE4 input_data; + if (c < C_0) { + c_in = c; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos); + } else { + c_in = c - C_0; + int2 input_pos; + input_pos.x = (c_in / 4) * out_W + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos); + } + int value_offset = c_in % 4; + CL_DTYPE value; + if (value_offset == 0) { + value = input_data.x; + } else if (value_offset == 1) { + value = input_data.y; + } else if (value_offset == 2) { + value = input_data.z; + } else if (value_offset == 3) { + value = input_data.w; + } + if (i == 0) { + output_data.x = value; + } else if (i == 1) { + output_data.y = value; + } else if (i == 2) { + output_data.z = value; + } else if (i == 3) { + output_data.w = value; + } + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data); + }else if (flag == 2){ // by height, width == n + int2 input_pos; + input_pos.x = out_c * out_W + out_w; + int h = out_nh / width; + CL_DTYPE4 input; + if (h < C_0){ + input_pos.y = out_nh; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos); + }else{ + input_pos.y = (h - C_0) * width; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos); + } + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input); + }else if (flag == 3){ // by width, width == C + int2 input_pos; + input_pos.y = out_nh; + CL_DTYPE4 input; + if (out_w < C_0){ + input_pos.x = out_c * out_W + out_w; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos); + }else{ + input_pos.x = out_c * out_W + (out_w - C_0); + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos); + } + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, 
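// For the channel-wise case (flag == 1) above, each output RGBA lane is gathered
// independently: output channel c is read from input0 when c < C_0 and from
// input1 at channel c - C_0 otherwise; the value sits in lane (c_in % 4) of the
// source pixel at x = (c_in / 4) * out_W + out_w. A scalar sketch of that
// mapping (plain C, illustrative names):
//
//   void concat_source(int c, int C_0, int *use_second, int *c_in, int *lane) {
//     *use_second = (c >= C_0);             // 0 -> read input0, 1 -> read input1
//     *c_in = *use_second ? c - C_0 : c;    // channel inside the chosen input
//     *lane = *c_in % 4;                    // x/y/z/w lane within the RGBA pixel
//   }
//
// concat_mul below is the scatter counterpart: it copies one pre-concat input
// into the output image at an offset of C_0 along the concatenation axis.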
output, output_pos, input); } - if (xx < axis_size){ - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y)); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); - }else{ - int new_val = xx - axis_size; - new_val *= width; - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(new_val, y)); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); - } - // WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } -__kernel void concat_mul(__read_only image2d_t input0, - __write_only image2d_t output, - int axis_size, int flag, int width, int start) { - const int x = get_global_id(0); // image_width cxw/4 - const int y = get_global_id(1); // image_height nxh +__kernel void concat_mul(__read_only image2d_t input, + __write_only image2d_t output, + int flag, int C_0, int out_C, int out_W, int in_W, int width) { + const int in_w = get_global_id(0); // image_width cxw/4 + const int in_c = get_global_id(1); // image_width cxw/4 + const int in_nh = get_global_id(2); // image_height nxh const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int xx = x / width; - if (flag == 0){ - xx = y / width; - } - - if (xx < axis_size && xx >= start){ - xx -= start; - xx *= width; - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(xx, y)); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); + int2 input_pos; + int2 output_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + if (flag == 1){ // by channel + CL_DTYPE4 output_data; + for (int i = 0; i < 4; i++) { + int c_out = C_0 + in_c * 4 + i; + if (c_out >= out_C) { + break; + } + int2 output_pos; + output_pos.x = (c_out / 4) * in_W + in_w; + output_pos.y = in_nh; + CL_DTYPE val; + if (i == 0) { + val = input_data.x; + } else if (i == 1) { + val = input_data.y; + } else if (i == 2) { + val = input_data.z; + } else if (i == 3) { + val = input_data.w; + } + if (c_out % 4 == 0){ + output_data.x = val; + }else if (c_out % 4 == 1){ + output_data.y = val; + }else if (c_out % 4 == 2){ + output_data.z = val; + }else if (c_out % 4 == 3){ + output_data.w = val; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data); + } + }else if (flag == 2){ // by height, width == n + int2 output_pos; + output_pos.x = in_c * in_W + in_w; + output_pos.y = in_nh + C_0 * width; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data); + }else if (flag == 3){ // by width, width == C + int2 output_pos; + output_pos.y = in_nh; + output_pos.x = in_c * out_W + (in_w + C_0); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data); } - } diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl similarity index 60% rename from lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl rename to lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 37e03e802c56d3de9ba08e97c9dfb62f8cd76e9a..1c808da68ddc923e12234bc4b6ac99b35bfffb0b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,29 +1,30 @@ #include -__kernel void conv2d_1x1(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv2d_1x1_opt( + __private const int global_size_dim0, + 
__private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int input_c_origin, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w) { - CL_DTYPE zero = 0.0f; + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c_block, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { + const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -79,14 +80,9 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, CL_DTYPE4 output3 = 0.0f; #endif - int max_w_bound = input_c * input_width; - int burndary_index = input_c * 4 - input_c_origin; - bool burndary_index_w = - burndary_index == 1 || burndary_index == 2 || burndary_index == 3; - bool burndary_index_z = burndary_index == 2 || burndary_index == 3; - bool burndary_index_y = burndary_index == 3; - - for (int i = 0; i < input_c; ++i) { + int max_w_bound = input_c_block * input_width; + int burndary_index = input_c_block * 4 - input_c_origin; + for (int i = 0; i < input_c_block; ++i) { // ------------0--------------- int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); @@ -101,34 +97,73 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2)); CL_DTYPE4 weight3 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3)); - int bound_gap = max_w_bound - pos_in.x - 1; - bool outof_bound = bound_gap < input_width && bound_gap >= 0; - input0.w = select(input0.w, zero, outof_bound && burndary_index_w); - input0.z = select(input0.z, zero, outof_bound && burndary_index_z); - input0.y = select(input0.y, zero, outof_bound && burndary_index_y); + if ((max_w_bound - pos_in.x - 1) < input_width && + (max_w_bound - pos_in.x - 1) >= 0) { + if (burndary_index == 0) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } else if (burndary_index == 1) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(0.0f, weight3, output0); + + } else if (burndary_index == 2) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } else if (burndary_index == 3) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(0.0f, weight1, output0); + output0 = 
mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } + } else { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); // -------------1-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input1.w = select(input1.w, zero, outof_bound && burndary_index_w); - input1.z = select(input1.z, zero, outof_bound && burndary_index_z); - input1.y = select(input1.y, zero, outof_bound && burndary_index_y); - - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } else if (burndary_index == 1) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(0.0f, weight3, output1); + + } else if (burndary_index == 2) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } else if (burndary_index == 3) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(0.0f, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } + } else { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } // -------------2-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, @@ -136,41 +171,71 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input2.w = select(input2.w, zero, outof_bound && burndary_index_w); - input2.z = select(input2.z, zero, outof_bound && burndary_index_z); - input2.y = select(input2.y, zero, outof_bound && burndary_index_y); - - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } else if (burndary_index == 1) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(0.0f, weight3, output2); + + } else if (burndary_index == 2) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, 
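// The image layout packs channels four to a pixel, so the last channel block may
// carry padded lanes: burndary_index = input_c_block * 4 - input_c_origin counts
// how many. Whenever a read lands in that last block, the padded lanes must not
// contribute, so their mad terms use 0.0f. Equivalent scalar masking (plain C,
// illustrative names):
//
//   // zero the trailing `pad` lanes of a 4-lane channel group before accumulating
//   void mask_padded_lanes(float lanes[4], int pad, int in_last_block) {
//     if (!in_last_block) return;
//     for (int k = 4 - pad; k < 4; ++k) lanes[k] = 0.0f;
//   }
//
// The unrolled if/else chains here spell the same thing out per burndary_index
// value (1 zeroes .w, 2 zeroes .z and .w, 3 zeroes .y, .z and .w).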
weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } else if (burndary_index == 3) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(0.0f, weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } + } else { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } // -------------3-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input3.w = - select(input3.w, - zero, - outof_bound && (burndary_index == 1 || burndary_index == 2 || - burndary_index == 3)); - input3.z = - select(input3.z, - zero, - outof_bound && (burndary_index == 2 || burndary_index == 3)); - input3.y = select(input3.y, zero, outof_bound && burndary_index == 3); - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } else if (burndary_index == 1) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(0.0f, weight3, output3); + + } else if (burndary_index == 2) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } else if (burndary_index == 3) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(0.0f, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } + } else { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } } #ifdef BATCH_NORM @@ -191,12 +256,10 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); #endif -#ifdef RELU output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); -#endif if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); @@ -215,29 +278,30 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, } } -__kernel void conv2d_1x1_simple(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv2d_1x1_simple( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_scale, + 
__read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int input_c_origin, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w) { + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -360,13 +424,11 @@ __read_only image2d_t new_scale, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); #endif - output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); - if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); } diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 8d7950d6b897df833ada56e2de5be7c6203de9ea..771765ea6063a08784ae824a757b28450d808f6d 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -27,402 +27,509 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int offset, __private const int input_c, __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ __private const int output_width, __private const int output_height, __private const int output_c, __private const int filter_channel, - __private const int filter_width, - __private const int filter_height, - __private const int group) { + __private const int filter_width, + __private const int filter_height, + __private const int group, + __private const int input_tensor_c - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); +) { - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 ouput_pos_in_one_block; + 
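// input_c counts 4-channel blocks of the packed image, while the newly added
// input_tensor_c parameter carries the real tensor channel count. Further down,
// on the last block (i == input_c - 1), only c_shr = input_tensor_c % 4 lanes are
// valid (c_shr == 0 means the block is full), so the kernel zeroes the remaining
// lanes of every sampled 3x3 neighbour before the dot products, keeping padded
// values out of the output sums. Scalar equivalent (plain C, illustrative names):
//
//   // keep the first (tensor_c % 4) lanes of the last 4-lane group, zero the rest
//   void zero_tail_lanes(float lanes[4], int tensor_c) {
//     int valid = tensor_c % 4;
//     if (valid == 0) return;               // last block fully occupied
//     for (int k = valid; k < 4; ++k) lanes[k] = 0.0f;
//   }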
ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); #else - CL_DTYPE4 output = 0.0f; + CL_DTYPE4 output = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); #endif - CL_DTYPE4 input[9]; // 3x3 region of input - if (group == 1) { - for (int i = 0; i < input_c; ++i) { // each run for 3x3 - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - - input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || 
in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = 
out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); + CL_DTYPE4 input[9]; // 3x3 region of input + if (group == 1) { + for (int i = 0; i < input_c; ++i) { // each run for 3x3 + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + + input[0] = 
select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[3] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[4] = select( + READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[5] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[7] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + if (i == input_c - 1) { + int c_shr = input_tensor_c % 4; + if (c_shr == 1) { + for (int k = 0; k < 9; k++) { + input[k].y = (half)0.f; + input[k].z = (half)0.f; + input[k].w = (half)0.f; + } + } else if (c_shr == 2) { + for (int k = 0; k < 9; k++) { + input[k].z = (half)0.f; + input[k].w = (half)0.f; + } + } else if (c_shr 
== 3) { + for (int k = 0; k < 9; k++) { + input[k].w = (half)0.f; + } + } else if (c_shr == 0) { } - } else { // group != 1 - for (int i = 0; i < 4; i++) { - int used_input_channel_num = + } + + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + CL_DTYPE4 weight_x = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_y = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_z = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_w = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += 
dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } else { // group != 1 + for (int i = 0; i < 4; i++) { + int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel; - for (int f_c = 0; f_c < filter_channel; ++f_c) { - int input_c = used_input_channel_num + f_c; - int input_block = input_c / 4; - int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, - in_pos_in_one_block.y); - input[0] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + for (int f_c = 0; f_c < filter_channel; ++f_c) { + int 
input_c = used_input_channel_num + f_c; + int input_block = input_c / 4; + int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[1] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[2] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[3] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[4] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[5] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[3] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[5] = + 
select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[6] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + input[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[7] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[7] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[8] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - CL_DTYPE tmp_out = 0; - for (int j = 0; j < 9; j++) { - int2 pos_of_weight; - pos_of_weight.x = (f_c / 4) * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; - CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - - int f_c_offset = f_c % 4; - CL_DTYPE f_value; - if (f_c_offset == 0) { - f_value = weight.x; - } else if (f_c_offset == 1) { - f_value = weight.y; - } else if (f_c_offset == 2) { - f_value = weight.z; - } else if (f_c_offset == 3) { - f_value = weight.w; - } - - int input_c_offset = input_c % 4; - CL_DTYPE input_value; - if (input_c_offset == 0) { - input_value = input[j].x; - } else if (input_c_offset == 1) { - input_value = input[j].y; - } else if (input_c_offset == 2) { - input_value = input[j].z; - } else if (input_c_offset == 3) { - input_value = input[j].w; - } - tmp_out += f_value * input_value; + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + CL_DTYPE tmp_out = 0; + for (int j = 0; j < 9; j++) { + int2 pos_of_weight; + pos_of_weight.x = (f_c / 4) * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; + CL_DTYPE4 weight = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + + int f_c_offset = f_c % 4; + CL_DTYPE f_value; + if (f_c_offset == 0) { + f_value = weight.x; + } 
else if (f_c_offset == 1) { + f_value = weight.y; + } else if (f_c_offset == 2) { + f_value = weight.z; + } else if (f_c_offset == 3) { + f_value = weight.w; } - if (i == 0) { - output.x += tmp_out; - } else if (i == 1) { - output.y += tmp_out; - } else if (i == 2) { - output.z += tmp_out; - } else if (i == 3) { - output.w += tmp_out; + int input_c_offset = input_c % 4; + CL_DTYPE input_value; + if (input_c_offset == 0) { + input_value = input[j].x; + } else if (input_c_offset == 1) { + input_value = input[j].y; + } else if (input_c_offset == 2) { + input_value = input[j].z; + } else if (input_c_offset == 3) { + input_value = input[j].w; } + tmp_out += f_value * input_value; + } + + if (i == 0) { + output.x += tmp_out; + } else if (i == 1) { + output.y += tmp_out; + } else if (i == 2) { + output.z += tmp_out; + } else if (i == 3) { + output.w += tmp_out; } } } + } - output = activation_type4(output); + output = activation_type4(output); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..79f3922e89549fc15b7a849efb0e2b6595357102 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -0,0 +1,505 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void conv2d_3x3_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select(out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < 0 || + out_batch_id * in_h + in_h_id + h >= in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, 
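// conv2d_3x3_opt tiles the output width: one work-item accumulates five output
// pixels (out_w_id0..out_w_id4, spaced item_w columns apart) for one block of
// four output channels, reusing every filter read across all five columns.
// filter_trans[k] regroups the four filter reads so that lane k of the input
// vector multiplies a vector of the four output channels, making each product a
// single mad. ch_surplus = max(0, (ch + 1) * 4 - in_ch) counts padded input
// lanes in the current channel block, and the guarded mads around this point
// skip those lanes. Index sketch for the width tiling (plain C, illustrative
// names):
//
//   // output columns covered by one work-item, given its ids and the tile width
//   void tile_columns(int item_w_id, int item_w, int cols[5]) {
//     for (int t = 0; t < 5; ++t) cols[t] = item_w_id + t * item_w;
//   }
//
// Out-of-range taps have their coordinates forced to -1 so the CLK_ADDRESS_CLAMP
// sampler returns the border value instead of a real pixel.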
filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} + +// support batch > 1 +__kernel void conv2d_3x3_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 
filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], 
output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..4ed2e072022dc4b457a86d634bf4bc21ab62bc45 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// opt version of conv5x5 +__kernel void conv2d_5x5_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = 
mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_5x5_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + 
CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], 
output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 1f99322812c13287af92b52aee6c346309ee006c..4998dc99279fffad8750ef3b6495597e9fc4ad65 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -36,10 +36,10 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, const int batch_index = out_nh / output_height; const int out_nh_in_one_batch = out_nh % output_height; - const filter_n0 = 4 * out_c + 0; - const filter_n1 = 4 * out_c + 1; - const filter_n2 = 4 * out_c + 2; - const filter_n3 = 4 * out_c + 3; + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; int2 stride_xy; stride_xy.x = stride; diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d82f4b4c96b586b6ecf948827402afd0766dcea4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// opt version of con7x7 +__kernel void conv2d_7x7_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = 
mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_7x7_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + 
CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], 
output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 14086dcd16bd1a8770f444bdcd0b6bea78e23b7e..5626fe6be7d451d4ffe22a2008affa7d82298bc3 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -12,311 +12,375 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include -__kernel void depth_conv2d_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv2d_3x3( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int dilation, - __private const int input_c, - __private const int input_width,/* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int dilation, + __private const int input_c, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - const int batch_index = out_nh / output_height; + const int batch_index = out_nh / output_height; - const int out_nh_in_one_batch = out_nh % output_height; + const int out_nh_in_one_batch = out_nh % output_height; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); #else - CL_DTYPE4 output = 0.0f; -#endif - - const int filter_width = 3; - const int filter_height = 3; - - int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - - int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); - - int filter_x = pos_in_filter_block.x ; - int filter_y = pos_in_filter_block.y ; - - CL_DTYPE4 inputs[9]; - - inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || 
in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - CL_DTYPE4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ - - - inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, 
sampler,(int2)(filter_x + 2,filter_y)); - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - for(int i = 0 ;i < 9 ; i++){ - output += inputs[i] * filters[i]; - } -#ifdef BATCH_NORM - output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = 0.0f; #endif -#ifdef RELU - output = activation_type4(output); -#endif - - - /* - - if (output_pos.x == 112 && output_pos.y == 0) { - - for (int i = 0; i < 9; ++i) { - CL_DTYPE4 input1 = inputs[i]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf \n", i, in); - } - - float4 out = (float4)(output.x, output.y, output.z, output.w); - printf(" depth wise output output4 = %v4hlf \n", out); - printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); - printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); - printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); - printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); - } - - */ - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); - + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + + CL_DTYPE4 inputs[9]; + + inputs[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[1] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[3] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + /* + 
if (output_pos.x == 112 && output_pos.y == 0) { + CL_DTYPE4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + inputs[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[5] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[7] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + for (int i = 0; i < 9; i++) { + output += inputs[i] * filters[i]; + } + + output = activation_type4(output); + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + CL_DTYPE4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" 
pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } - - __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, - __private const int ou_w_blk, - __private const int ou_nh, - __read_only image2d_t input, - __read_only image2d_t filter, + __private const int ou_w_blk, + __private const int ou_nh, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w,/* of one block */ - __private const int in_h, /* of one block */ - __private const int ou_w, - __private const int ou_h) { - - const int ou_ch_blk_id = get_global_id(0); - const int ou_w_blk_id = get_global_id(1); - const int ou_nh_id = get_global_id(2); - const int w_blk_size = 2; - - const int batch_id = ou_nh_id / ou_h; - int ou_col_id = ou_w_blk_id * w_blk_size; - int ou_row_id = ou_nh_id % ou_h; - int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - - // input pos in one block and on batch - int col_id = ou_col_id - pad; - int row_id = ou_row_id - pad; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, /* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, + __private const int ou_h) { + + const int ou_ch_blk_id = get_global_id(0); + const int ou_w_blk_id = get_global_id(1); + const int ou_nh_id = get_global_id(2); + const int w_blk_size = 2; + + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); + + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIASE_CH - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); - output[1] = output[0]; + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; #elif defined(BIASE_ELE) - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); - if (ou_col_id + 1 < ou_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); - } + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } #else - CL_DTYPE4 output[2] = {0.0f}; + CL_DTYPE4 output[2] = {0.0f}; #endif - CL_DTYPE4 inputs[12]; - - int filter_x = ou_ch_blk_id * 3; - int filter_y = 0; - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, 
filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); - - int in_x = mad24(ou_ch_blk_id, in_w, col_id); - int in_y = mad24(batch_id, in_h, row_id); - - int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); - int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); - inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); - int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); - inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); - int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); - inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); - int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); - inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); - - output[0] = mad(inputs[0], filters[0], output[0]); - output[1] = mad(inputs[1], filters[0], output[1]); - - output[0] = mad(inputs[1], filters[1], output[0]); - output[1] = mad(inputs[2], filters[1], output[1]); - - output[0] = mad(inputs[2], filters[2], output[0]); - output[1] = mad(inputs[3], filters[2], output[1]); - - - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - - - int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); - inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); - inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); - inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); - inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); - - - output[0] = mad(inputs[4], filters[3], output[0]); - output[1] = mad(inputs[5], filters[3], output[1]); - - output[0] = mad(inputs[5], filters[4], output[0]); - output[1] = mad(inputs[6], filters[4], output[1]); - - output[0] = mad(inputs[6], filters[5], output[0]); - output[1] = mad(inputs[7], filters[5], output[1]); - - - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); - inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); - inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); - inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); - inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); - - - output[0] = mad(inputs[8], filters[6], output[0]); - output[1] = mad(inputs[9], filters[6], output[1]); - - output[0] = mad(inputs[9], filters[7], output[0]); - output[1] = mad(inputs[10], filters[7], output[1]); - - output[0] = mad(inputs[10], filters[8], output[0]); - output[1] = mad(inputs[11], filters[8], output[1]); -#ifdef BATCH_NORM - CL_DTYPE4 scale = READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(ou_ch_blk_id, 0)); - CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(ou_ch_blk_id, 0)); - output[0] = mad(scale, output[0], biase); - if (ou_col_id + 1 < ou_w) { 
- output[1] = mad(scale, output[1], biase); - } -#endif - -#ifdef RELU - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); - if (ou_col_id + 1 < ou_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); - } - + CL_DTYPE4 inputs[12]; + + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); + + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); + inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); + + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); + + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); + + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); + inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); + inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); + inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); + + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); + + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); + + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); + inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); + inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); + inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); + inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); + + output[0] = mad(inputs[8], filters[6], output[0]); 
+ output[1] = mad(inputs[9], filters[6], output[1]); + + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); + + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } } - diff --git a/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl b/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..116b4452dd17e800da20238ad688daf5630d55fb --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl @@ -0,0 +1,43 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void dropout(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const float dropoutPro) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + half4 output; + + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,output_pos); + half4 dropout = (half4)(1 - dropoutPro); + output = dropout * input; + + write_imageh(output_image, output_pos, output); +} + + diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl index 17b6e8c72a82718a541841ff3c69c175649d7056..73a089d7591b98486daac2d4aaa29fe4f2192134 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl @@ -14,7 +14,8 @@ limitations under the License. 
*/ #include -__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, __write_only image2d_t outputImage) { int x = get_global_id(0); int y = get_global_id(1); @@ -29,8 +30,148 @@ __kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w) { +__kernel void channel_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// etc : 1 1 1 72 +// run time Y [value,0,0,0] * 72 +__kernel void channel_mul_d2(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias0; + int2 coords_bias1; + int2 coords_bias2; + int2 coords_bias3; + /* if (x == 0 && y == 0) { + CL_DTYPE4 b = (CL_DTYPE4){0, 0, 0, 0}; + #define PPI(j, k) \ + b = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2){j, k}); \ + printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ + convert_float(b.y), convert_float(b.z), convert_float(b.w)); + for (int i = 0; i < 73; ++i) { + PPI(i, 0); + } + #undef PPI + }*/ + coords_bias0.x = x / w * 4; + coords_bias0.y = 0; + coords_bias1.x = x / w * 4 + 1; + coords_bias1.y = 0; + coords_bias2.x = x / w * 4 + 2; + coords_bias2.y = 0; + coords_bias3.x = x / w * 4 + 3; + coords_bias3.y = 0; + CL_DTYPE4 biase0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 biase1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 biase2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 biase3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + /* if (x == 0 && y == 0) { + printf("bias0={ %f , %f , %f , %f }\n ", + convert_float(biase0.x), convert_float(biase0.y), + convert_float(biase0.z), convert_float(biase0.w)); + printf("bias1={ %f , %f , %f , %f }\n ", + convert_float(biase1.x), convert_float(biase1.y), + convert_float(biase1.z), convert_float(biase1.w)); + printf("bias2={ %f , %f , %f , %f }\n ", + convert_float(biase2.x), convert_float(biase2.y), + convert_float(biase2.z), convert_float(biase2.w)); + printf("bias3={ %f , %f , %f , %f }\n ", + convert_float(biase3.x), convert_float(biase3.y), + convert_float(biase3.z), convert_float(biase3.w)); + }*/ + CL_DTYPE4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 output = mad(in, biase, 0); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// c 1 1 +__kernel void channel_mul_d3(__global image2d_t input, + __global image2d_t bias, + 
__write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, +__global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +#if 0 // TODO(ysh329): comment code below +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + + +__kernel void channel_mul_d1(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { int x = get_global_id(0); int y = get_global_id(1); @@ -52,8 +193,88 @@ __kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w, int h) { + +// #define DEBUG +__kernel void channel_mul_d2_nc(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + +#ifdef DEBUG + printf("x:%d y:%d\n", x, y); +#endif + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + + int2 coords_bias0 = (int2)(x / w * 4, 0); + int2 coords_bias1 = (int2)(x / w * 4 + 1, 0); + int2 coords_bias2 = (int2)(x / w * 4 + 2, 0); + int2 coords_bias3 = (int2)(x / w * 4 + 3, 0); + + CL_DTYPE4 b0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 b1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 b2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 b3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + + CL_DTYPE4 biase = {b0.x, b1.x, b2.x, b3.x}; + CL_DTYPE4 output = mad(in, biase, 0); + +#ifdef DEBUG + if (x == 0 && y == 0) { + printf("w:%d\n", w); + + printf("biase:%.1f %.1f %.1f %.1f\n", biase.x, biase.y, biase.z, biase.w); + 
printf("output:%.1f %.1f %.1f %.1f\n", output.x, output.y, output.z, output.w); + + coords.x = 0; + coords.y = 0; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 0; + coords.y = 1; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 1; + coords.y = 0; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 1; + coords.y = 1; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + + coords_bias.x = 0; + coords_bias.y = 0; + biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + coords_bias.x = 1; + coords_bias.y = 0; + biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + coords_bias.x = 2; + coords_bias.y = 0; + biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + } +#endif + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + + +__kernel void channel_mul_d2_hw(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w, + int h) { int x = get_global_id(0); int y = get_global_id(1); @@ -75,8 +296,11 @@ __kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w) { + +__kernel void channel_mul_d4(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { int x = get_global_id(0); int y = get_global_id(1); @@ -97,4 +321,4 @@ __kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } - +#endif diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..6ed6af298f23bcfb396aefe7593ccfd52c732937 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void elementwise_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = activation_type4(in - biase); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage,coords,output); + } + +__kernel void channel_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in - (CL_DTYPE4)(biase.x); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); + } + +__kernel void width_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output; + + output.x = in.x - biase.x; + output.y = in.y - biase.x; + output.z = in.z - biase.x; + output.w = in.w - biase.x; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..360d8c753ef64b1da2ff2aeebddd94ff0f41db96 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl @@ -0,0 +1,168 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +__kernel void grid_sampler(__read_only image2d_t input, + __read_only image2d_t grid, + __write_only image2d_t output, + __private const int out_height, + __private const int out_width) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2) * 4; + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords1, coords2, outpoints; + coords1.x = out_h / 4 * 2; + coords1.y = out_n * out_width + out_w; + coords2.x = coords1.x + 1; + coords2.y = coords1.y; + outpoints.x = out_c * out_width + out_w; + outpoints.y = out_n * out_height + out_h; + + CL_DTYPE4 g1 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords1); + CL_DTYPE4 g2 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords2); + + // x + float x = (g1.x + 1) * (out_width - 1) * 0.5; + float y = (g2.x + 1) * (out_height - 1) * 0.5; + int x0 = floor(x); + int y0 = floor(y); + int x_p = out_c * out_width + x0; + int y_p = out_n * out_height + y0; + + float xs = x - x0; + float xe = x0 + 1 - x; + float ys = y - y0; + float ye = y0 + 1 - y; + + CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + CL_DTYPE4 out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val); + + // y + x = (g1.y + 1) * (out_width - 1) / 2; + y = (g2.y + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val); + + // z + x = (g1.z + 1) * (out_width - 1) / 2; + y = (g2.z + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height 
+ y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val); + + // w + x = (g1.w + 1) * (out_width - 1) / 2; + y = (g2.w + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val); +} diff --git a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..3e3d65394f9924edac735084c2fe5ce550f20684 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// onnx/pytorch instancenorm by lijian +__kernel void instance_norm_onnx(__private const int in_width, + __private const int in_height, + __private const int in_c_group, + __private const int local_work_size_x, + __private const int local_work_size_y, + __private const float epsilon, + __read_only image2d_t input, + __write_only image2d_t output) { + const int out_cn = get_global_id(0); + const int n = out_cn / in_c_group; + const int c = out_cn % in_c_group; + const int w = get_local_id(1); + const int h = get_local_id(2); + const int local_id = w * local_work_size_y + h; + const int local_total_size = local_work_size_x * local_work_size_y; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef LOCAL_MEM_128 + __local float4 shared_mem[128]; +#elif defined(LOCAL_MEM_64) + __local float4 shared_mem[64]; +#else + __local float4 shared_mem[256]; +#endif + int xOffset = c * in_width; + int yOffset = n * in_height; + float4 sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + sum += read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)); + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 mean_val = shared_mem[0]; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val; + sum += temp * temp; + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon)); + + float4 s = 1 / sigma; + + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex); + float4 in_val = read_imagef(input, sampler, intout_pos); + half4 out_val = convert_half4((in_val - mean_val) * s); +#ifdef RELU + out_val = activation(out_val); +#endif + write_imageh(output, intout_pos, out_val); + } + } +} + + +// paddle instancenorm by zhangxi +__kernel void instance_norm_paddle(__read_only image2d_t input, + __write_only image2d_t output, + __read_only image2d_t scale, + __read_only image2d_t bias, + const float epsilon, + const int in_h, + const int in_w){ + __local CL_DTYPE4 saved_mean[1024]; + __local CL_DTYPE4 saved_variance[1024]; + const int lid = get_local_id(0); + const int lsize = get_local_size(0); + 
const int gidx = get_group_id(0); + const int gidy = get_group_id(1); + const int spatial_size = in_h * in_w; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 mean = (CL_DTYPE4)(0.f, 0.f, 0.f, 0.f); + CL_DTYPE4 variance = (CL_DTYPE4)(0.f, 0.f, 0.f, 0.f); + CL_DTYPE4 vepsilon = (CL_DTYPE4)(epsilon, epsilon, epsilon, epsilon); + const int x_offset = gidx * in_w; + const int y_offset = gidy * in_h; + int2 coor; + for (int i = lid; i < spatial_size; i += lsize) { + coor.x = i % in_w + x_offset; + coor.y = i / in_w + y_offset; + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + mean += pixel; + variance += pixel * pixel; + } + saved_mean[lid] = mean; + saved_variance[lid] = variance; + barrier(CLK_LOCAL_MEM_FENCE); + + //! do reduction + int dynamic_size = lsize >> 1; + for (; dynamic_size > 0; dynamic_size >>= 1){ + if (lid < dynamic_size) { + saved_mean[lid] += saved_mean[lid + dynamic_size]; + saved_variance[lid] += saved_variance[lid + dynamic_size]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + mean = saved_mean[0] / spatial_size; + variance = saved_variance[0] / spatial_size - mean * mean; + variance = rsqrt(variance + vepsilon); + + //! do instance norm + coor.x = gidx; + coor.y = gidy; + CL_DTYPE4 vscale = READ_IMG_TYPE(CL_DTYPE_CHAR, scale, sampler, coor); + vscale *= variance; + CL_DTYPE4 vbias = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coor); + for (int i = lid; i < spatial_size; i += lsize) { + coor.x = i % in_w + x_offset; + coor.y = i / in_w + y_offset; + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + pixel = (pixel - mean) * vscale + vbias; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, coor, pixel); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..4c90981eb97f864b2c7ffa3b01e61b23aa4444de --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl @@ -0,0 +1,322 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// #define DEBUG +//////////////////////////////////////////////////////// +// buffer -> image2d +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d(__global CL_DTYPE* in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f); + output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE); + + if (out_C - 4 * out_c >= 2) { + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE); + } + if (out_C - 4 * out_c >= 3) { + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE); + } + if (out_C - 4 * out_c >= 4) { + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE); + } + +#ifdef DEBUG + if (out_w > 2045) { + printf( + "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f " + "%.2f\n", + out_w, + out_C - 4 * out_c, + (float)(in[input_pos0]), + (float)(in[input_pos1]), + (float)(in[input_pos2]), + (float)(in[input_pos3])); + printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", + out_c, + out_w, + out_nh, + output_pos.x, + output_pos.y, + (float)(output.x), + (float)(output.y), + (float)(output.z), + (float)(output.w)); + } +#endif + + WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); +} + +//////////////////////////////////////////////////////// +// image2d -> buffer +//////////////////////////////////////////////////////// +__kernel void image2d_to_buffer(__read_only image2d_t input, + __private const int in_width, + __private const int in_height, + __global CL_DTYPE* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE( + CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); + +#ifdef DEBUG + if (in_w > 2045) { + printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", + in_c, + in_w, + in_nh, + pos_x, + in_nh, + (float)(in.x), + (float)(in.y), + (float)(in.z), + (float)(in.w)); + } +#endif + + const int index = + in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); + if (C - 4 * in_c >= 2) { + out[index + 
size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); + } + if (C - 4 * in_c >= 3) { + out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); + } + if (C - 4 * in_c >= 4) { + out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); + } +} + +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +//////////////////////////////////////////////////////// +// buffer -> image2d_nw +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_N, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + const int out_n = get_global_id(0); + const int out_w = get_global_id(1); + const int out_ch = get_global_id(2); + + const int out_c = out_ch / out_H; + const int out_h = out_ch % out_H; + + const int in_c = out_c; // index of c in h direction + + const int in_n0 = out_n * 4 + 0; + const int in_n1 = out_n * 4 + 1; + const int in_n2 = out_n * 4 + 2; + const int in_n3 = out_n * 4 + 3; + + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n0 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_n * out_W + out_w; + output_pos.y = out_ch; + + CL_DTYPE4 output = (CL_DTYPE4)0.0f; + output.x = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos0]); + if (out_N - 4 * out_n >= 2) { + output.y = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos1]); + } + if (out_N - 4 * out_n >= 3) { + output.z = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos2]); + } + if (out_N - 4 * out_n >= 4) { + output.w = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos3]); + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} +#endif + +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +// image2d -> buffer +__kernel void image2d_to_buffer_2d(__private const int in_height, + __private const int in_width, + __read_only image2d_t input, + __global CL_DTYPE* out) { + const int in_w = get_global_id(1); + const int in_h = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(in_w, in_h)); + + const int index = (in_h * in_width + in_w) * 4; + out[index] = CONVERT_TYPE_TO(CL_DTYPE, in.x); + out[index + 1] = CONVERT_TYPE_TO(CL_DTYPE, in.y); + out[index + 2] = CONVERT_TYPE_TO(CL_DTYPE, in.z); + out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w); +} +#endif + +//////////////////////////////////////////////////////// +// buffer -> image2d (divide by 255 to normalize) +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d_with_pre255(__global uchar* in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + 
const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; + output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; + if (out_C - 4 * out_c >= 2) { + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; + } + if (out_C - 4 * out_c >= 3) { + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; + } + if (out_C - 4 * out_c >= 4) { + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; + } + WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); +} + +//////////////////////////////////////////////////////// +// image2d -> buffer (multiply by 255 to de-normalize) +//////////////////////////////////////////////////////// +__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, + __private const int in_width, + __private const int in_height, + __global uchar* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + CL_COMPUTE_DTYPE4 in = + READ_IMG_TYPE( + CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * + 255; + +#ifdef DEBUG + printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", + in_c, + in_w, + in_nh, + pos_x, + in_nh, + in.x, + in.y, + in.z, + in.w); +#endif + + const int index = + in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = convert_uchar_sat(in.x); + if (C - 4 * in_c >= 2) { + out[index + size_ch] = convert_uchar_sat(in.y); + } + if (C - 4 * in_c >= 3) { + out[index + size_ch * 2] = convert_uchar_sat(in.z); + } + if (C - 4 * in_c >= 4) { + out[index + size_ch * 3] = convert_uchar_sat(in.w); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..655a2657e07c419d4e50aed0e78cb8c37afa4b2a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl @@ -0,0 +1,159 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void lrn(__read_only image2d_t input, + __write_only image2d_t output, + __private const int out_C, + __private const int out_W, + __private const int local_size, + __private const float k, + __private const float alpha, + __private const float beta){ + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const int out_c0 = out_c * 4; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int out_c1 = out_c0 + 1; + const int out_c2 = out_c0 + 2; + const int out_c3 = out_c0 + 3; + + const int pad = (local_size - 1) / 2; + int start = out_c0 - pad; + int end = out_c0 + pad; + start = start > 0 ? start : 0; + end = end < out_C - 1 ? end : out_C - 1; + float square0 = 0.0; + float square1 = 0.0; + float square2 = 0.0; + float square3 = 0.0; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square0 += input_data.x * input_data.x; + break; + case 1: + square0 += input_data.y * input_data.y; + break; + case 2: + square0 += input_data.z * input_data.z; + break; + case 3: + square0 += input_data.w * input_data.w; + break; + } + } + start = out_c1 - pad; + end = out_c1 + pad; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square1 += input_data.x * input_data.x; + break; + case 1: + square1 += input_data.y * input_data.y; + break; + case 2: + square1 += input_data.z * input_data.z; + break; + case 3: + square1 += input_data.w * input_data.w; + break; + } + } + start = out_c2 - pad; + end = out_c2 + pad; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square2 += input_data.x * input_data.x; + break; + case 1: + square2 += input_data.y * input_data.y; + break; + case 2: + square2 += input_data.z * input_data.z; + break; + case 3: + square2 += input_data.w * input_data.w; + break; + } + } + start = out_c3 - pad; + end = out_c3 + pad; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square3 += input_data.x * input_data.x; + break; + case 1: + square3 += input_data.y * input_data.y; + break; + case 2: + square3 += input_data.z * input_data.z; + break; + case 3: + square3 += input_data.w * input_data.w; + break; + } + } + int2 out_pos; + out_pos.x = out_c * out_W + out_w; + out_pos.y = out_nh; + CL_DTYPE4 in_val = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, out_pos); + + float4 out_val; + out_val.x = in_val.x / (pow(k + alpha * (square0), beta)); + if (out_c1 < out_C){ + out_val.y = in_val.y / (pow(k + alpha * (square1), beta)); + } + if (out_c2 < out_C){ + out_val.z = in_val.z / (pow(k + alpha * (square2), beta)); + } + if 
(out_c3 < out_C){ + out_val.w = in_val.w / (pow(k + alpha * (square3), beta)); + } + CL_DTYPE4 out_data = CONVERT_TYPE_TO(out_val, CL_DTYPE4); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, out_pos, out_data); +} diff --git a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl index b74449d9c8a02551cd74d366849768b4a91a4dce..1df1f0c18b7abb7e715716856dbec7c7d4d5108a 100644 --- a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl @@ -12,26 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output, - __private const float scale_h, __private const float scale_w, - __private const int in_dims_h, __private const int out_dims_h, - __private const int in_dims_w, __private const int out_dims_w) { - const int c = get_global_id(0); - const int w = get_global_id(1); - const int nh = get_global_id(2); - int2 output_pos; - output_pos.x = c * out_dims_w + w; - output_pos.y = nh; - int out_n = nh / out_dims_h; - int out_h = nh % out_dims_h; - int2 input_pos; - input_pos.x = c * in_dims_w + w / scale_w; - input_pos.y = out_n * in_dims_h + out_h / scale_h; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); - write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data); +#include + + +__kernel void nearest_interp(__read_only image2d_t input, + __write_only image2d_t output, + __private const float scale_h, + __private const float scale_w, + __private const int in_dims_h, + __private const int out_dims_h, + __private const int in_dims_w, + __private const int out_dims_w) { + + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + + int2 output_pos; + output_pos.x = c * out_dims_w + w; + output_pos.y = nh; + + int out_n = nh / out_dims_h; + int out_h = nh % out_dims_h; + + int2 input_pos; + input_pos.x = c * in_dims_w + w / scale_w; + input_pos.y = out_n * in_dims_h + out_h / scale_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(input_pos.x, input_pos.y)); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_pos.x , output_pos.y), input_data); } diff --git a/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..e65aad3d19bc674aff2f71d2403e611cd247abf1 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void pad2d_constant( + __read_only image2d_t input, __write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + if (x < 0 || y < 0 || x >= in_width || y >= in_height) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, (CL_DTYPE4)(pad_value)); + } else { + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); + } +} + +__kernel void pad2d_reflect( + __read_only image2d_t input, __write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + x = abs(x); + y = abs(y); + x = x < in_width ? x : 2 * in_width - 2 - x; + y = y < in_height ? y : 2 * in_height - 2 - y; + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); +} + +__kernel void pad2d_edge( + __read_only image2d_t input, __write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + x = x > 0 ? x : 0; + x = x < in_width ? x : in_width - 1; + y = y > 0 ? y : 0; + y = y < in_height ? 
y : in_height - 1; + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); +} diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl index 775166261d01dc639cd5af8cee49f7e7fb30cb19..f64c2b5e7b21d81a50acd485938ca4f74c3f013b 100644 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl @@ -15,17 +15,17 @@ limitations under the License. */ #include __kernel void pool_max(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -37,18 +37,19 @@ __kernel void pool_max(__read_only image2d_t input, int start_h = out_h * stride_h - pad_top; int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h,0); + start_h = max(start_h, 0); int start_w = out_w * stride_w - pad_left; int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w,0); + start_w = max(start_w, 0); const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + CL_DTYPE4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); max_value = max(max_value, tmp); } } @@ -58,17 +59,17 @@ __kernel void pool_max(__read_only image2d_t input, } __kernel void pool_avg(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -90,10 +91,121 @@ __kernel void pool_avg(__read_only image2d_t input, for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + sum += READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, 
(int2)(pos_in_x + x, pos_in_y + y)); } } CL_DTYPE4 avg = sum / (ksize_h * ksize_w); const int pos_out_x = mad24(out_c, out_width, out_w); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg); } + +__kernel void pool_avg_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // = 1 for global pooling + const int out_nh = get_global_id(2); // = n, since out_height == 1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // accumulate in float4 rather than CL_DTYPE4: with fp16 the running sum + // loses precision once many elements (e.g. 2048) are added + float4 sum = (float4)(0.0f); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y = 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + half4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + + sum.x = convert_float(tmp.x) + sum.x; + sum.y = convert_float(tmp.y) + sum.y; + sum.z = convert_float(tmp.z) + sum.z; + sum.w = convert_float(tmp.w) + sum.w; + } + } + const float global_size_div = 1.0f / (in_height * in_width); + half4 avg; + avg.x = convert_half((sum.x * global_size_div)); + avg.y = convert_half((sum.y * global_size_div)); + avg.z = convert_half((sum.z * global_size_div)); + avg.w = convert_half((sum.w * global_size_div)); + +#ifdef DEBUG + if (out_c == 0) { + printf("\033[31msum.x= %f \033 \n ", sum.x); + printf("sum.y=%f \n ", sum.y); + printf("sum.z=%f \n ", sum.z); + printf("sum.w=%f \n ", sum.w); + + printf("in_height=%d \n ", in_height); + printf("in_width=%d \n ", in_width); + printf("ksize_h=%d \n ", ksize_h); + printf("ksize_w=%d \n ", ksize_w); + printf("stride_h=%d \n ", stride_h); + printf("stride_w=%d \n ", stride_w); + printf("pad_top=%d \n ", pad_top); + printf("pad_left=%d \n ", pad_left); + printf("out_width=%d \n ", out_width); + printf("out_height=%d \n ", out_height); + printf("avg.x=%f \n ", convert_float(avg.x)); + printf("avg.y=%f \n ", convert_float(avg.y)); + printf("avg.z=%f \n ", convert_float(avg.z)); + printf("avg.w=%f \n ", convert_float(avg.w)); + } +#endif + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), avg); +} +__kernel void pool_max_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // = 1 for global pooling + const int out_nh = get_global_id(2); // = n, since out_height == 1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y 
= 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + max_value = max(max_value, + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_x + x, pos_in_y + y))); + } + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), max_value); +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl b/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl deleted file mode 100644 index 7750bd98a29151ba2428bdafd462420393fe7433..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -__kernel void relu6(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); - in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); -} diff --git a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl index 739ff1338582b65d87dbd9c92f1ea86e0c49f0ff..dfc25063cc2e36d768f1bc4d7ff992c87fe17592 100644 --- a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl @@ -27,6 +27,6 @@ __kernel void scale(__read_only image2d_t input, CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = convert_float(scale) * in + convert_float(bias); + in = CONVERT_TYPE_TO(scale, CL_DTYPE) * in + CONVERT_TYPE_TO(bias, CL_DTYPE); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } diff --git a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl b/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl deleted file mode 100644 index d2cb8fa36e21167979172fba634a7862c932b74c..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -__kernel void sigmoid(__read_only image2d_t input, - __write_only image2d_t output) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out = 1 / (1 + exp(-in)); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); -} diff --git a/lite/backends/opencl/cl_kernel/image/slice_kernel.cl b/lite/backends/opencl/cl_kernel/image/slice_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..1ef74bb14213beaa0e83e28b99b592ac1dcc667d --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/slice_kernel.cl @@ -0,0 +1,78 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void slice(__read_only image2d_t input, __write_only image2d_t output, + __private const int start, __private const int end, + __private const int dims_w){ + + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = c * dims_w + w; + output_pos.y = nh; + + int2 input_pos; + half4 input_data; + half4 output_data; + + if (start % 4 == 0) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data = input_data; + } else if (start % 4 == 1) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.y; + output_data.y = input_data.z; + output_data.z = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.w = input_data.x; + } else if (start % 4 == 2) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.z; + output_data.y = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.z = input_data.x; + output_data.w = input_data.y; + } else if (start % 4 == 3) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.y = input_data.x; + output_data.z = input_data.y; + output_data.w = input_data.z; + } + write_imageh(output, output_pos, output_data); + +} + diff --git a/lite/backends/opencl/cl_runtime.cc 
b/lite/backends/opencl/cl_runtime.cc index 0c7b2f8575a88082f6d79a5392c4468715a701b9..d8232cda4c790646fb5a4aae7d4e00d272d3a640 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,13 +26,15 @@ CLRuntime* CLRuntime::Global() { CLRuntime::~CLRuntime() { if (command_queue_ != nullptr) { + command_queue_->flush(); command_queue_->finish(); } - // For controlling the destruction order: + // For controlling the destruction order command_queue_.reset(); context_.reset(); device_.reset(); platform_.reset(); + device_info_.clear(); } bool CLRuntime::Init() { @@ -46,6 +45,9 @@ bool CLRuntime::Init() { bool is_device_init = InitializeDevice(); is_init_success_ = is_platform_init && is_device_init; initialized_ = true; + + context_ = CreateContext(); + command_queue_ = CreateCommandQueue(context()); return initialized_; } @@ -56,7 +58,7 @@ cl::Platform& CLRuntime::platform() { cl::Context& CLRuntime::context() { if (context_ == nullptr) { - context_ = CreateContext(); + LOG(FATAL) << "context_ create failed. "; } return *context_; } @@ -68,20 +70,15 @@ cl::Device& CLRuntime::device() { cl::CommandQueue& CLRuntime::command_queue() { if (command_queue_ == nullptr) { - command_queue_ = CreateCommandQueue(context()); + LOG(FATAL) << "command_queue_ create failed. 
"; } return *command_queue_; } std::unique_ptr CLRuntime::CreateProgram( const cl::Context& context, std::string file_name) { - std::ifstream file{file_name, std::ios::binary | std::ios::ate}; - CHECK(file.is_open()) << "Can't open file from " << file_name; - auto size = file.tellg(); - CHECK(size > 0) << "size is too small."; - std::string content(size, '\0'); - file.seekg(0); - file.read(&content[0], size); + auto cl_file = opencl_kernels_files.find(file_name); + std::string content(cl_file->second.begin(), cl_file->second.end()); cl::Program::Sources sources; sources.push_back(content); auto prog = @@ -101,8 +98,8 @@ std::unique_ptr CLRuntime::CreateEvent( } bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { - std::string build_option = options + " -cl-fast-relaxed-math -I " + - CLRuntime::Global()->cl_path() + "/cl_kernel"; + /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ + std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable"; VLOG(4) << "OpenCL build_option: " << build_option; status_ = program->build({*device_}, build_option.c_str()); CL_CHECK_ERROR(status_); @@ -132,7 +129,33 @@ bool CLRuntime::InitializePlatform() { return true; } +GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) { + const std::string kMALI_PATTERN_STR = "Mali"; + const std::string kADRENO_PATTERN_STR = "QUALCOMM Adreno(TM)"; + const std::string kPOWERVR_PATTERN_STR = "PowerVR"; + + if (device_name == kADRENO_PATTERN_STR) { + LOG(INFO) << "adreno gpu"; + return GpuType::QUALCOMM_ADRENO; + } else if (device_name.find(kMALI_PATTERN_STR) != std::string::npos) { + LOG(INFO) << "mali gpu"; + return GpuType::ARM_MALI; + } else if (device_name.find(kPOWERVR_PATTERN_STR) != std::string::npos) { + LOG(INFO) << "powerVR gpu"; + return GpuType::IMAGINATION_POWERVR; + } else { + LOG(INFO) << "others gpu"; + return GpuType::UNKNOWN; + } +} + bool CLRuntime::InitializeDevice() { + // ===================== BASIC ===================== + // CL_DEVICE_TYPE_GPU + // CL_DEVICE_NAME + // CL_DEVICE_SUPPORT + // CL_DEVICE_MAX_COMPUTE_UNITS + // CL_DEVICE_MAX_CLOCK_FREQUENCY std::vector all_devices; status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); CL_CHECK_ERROR(status_); @@ -145,27 +168,228 @@ bool CLRuntime::InitializeDevice() { auto device_name = device_->getInfo(); LOG(INFO) << "Using device: " << device_name; + gpu_type_ = ParseGpuTypeFromDeviceName(device_name); + + cl_device_type device_type = device_->getInfo(); + auto device_type_to_str = [](cl_device_type t) -> std::string { + std::string t_str{""}; + switch (t) { + case CL_DEVICE_TYPE_CPU: + t_str = "CPU"; + break; + case CL_DEVICE_TYPE_GPU: + t_str = "GPU"; + break; + case CL_DEVICE_TYPE_ACCELERATOR: + t_str = "Accelerator"; + break; + case CL_DEVICE_TYPE_DEFAULT: + t_str = "Default"; + break; + default: + t_str = "Unknown"; + } + return t_str; + }; + const std::string device_version = device_->getInfo(); + LOG(INFO) << "device_version:" << device_version; + + LOG(INFO) << "device_type:" << device_type_to_str(device_type); + device_info_["CL_DEVICE_TYPE"] = device_type; + + auto max_units = device_->getInfo(); + LOG(INFO) << "The chosen device has " << max_units << " compute units."; + device_info_["CL_DEVICE_MAX_COMPUTE_UNITS"] = max_units; + + auto max_clock_freq = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_CLOCK_FREQUENCY:" << max_clock_freq; + device_info_["CL_DEVICE_MAX_CLOCK_FREQUENCY"] = max_clock_freq; + + // ===================== MEMORY ===================== + 
// CL_DEVICE_LOCAL_MEM_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHE_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + // CL_DEVICE_GLOBAL_MEM_SIZE + auto local_mem_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "The local memory size of the chosen device is " << local_mem_kb + << " KB."; + device_info_["CL_DEVICE_LOCAL_MEM_SIZE_KB"] = local_mem_kb; + + auto global_mem_cache_size_kb = + static_cast(device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE(KB):" + << global_mem_cache_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHE_SIZE_KB"] = global_mem_cache_size_kb; + + auto global_mem_cacheline_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE(KB):" + << global_mem_cacheline_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE_KB"] = + global_mem_cacheline_size_kb; + + auto global_mem_size_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_SIZE(KB):" << global_mem_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_SIZE_KB"] = global_mem_size_kb; + + // ===================== WORK_GROUP ===================== + // CL_DEVICE_MAX_WORK_GROUP_SIZE + // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS + // CL_DEVICE_MAX_WORK_ITEM_SIZES + auto max_work_group_size = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_GROUP_SIZE:" << max_work_group_size; + device_info_["CL_DEVICE_MAX_WORK_GROUP_SIZE"] = max_work_group_size; + + auto max_dims_num = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:" << max_dims_num; + device_info_["CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"] = max_dims_num; + + auto max_work_item_sizes = device_->getInfo(); + for (size_t i = 0; i < max_work_item_sizes.size(); ++i) { + LOG(INFO) << "max_work_item_sizes[" << i << "]:" << max_work_item_sizes[i]; + std::string dim_key = "CL_DEVICE_MAX_WORK_ITEM_SIZES_" + std::to_string(i); + device_info_[dim_key] = max_work_item_sizes[i]; + } + + // ===================== BUFFER ===================== + // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE + auto max_constant_buffer_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:" + << max_constant_buffer_size_kb; + device_info_["CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"] = + max_constant_buffer_size_kb; + + // ===================== IMAGE ===================== + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_IMAGE2D_MAX_HEIGHT + // CL_DEVICE_IMAGE2D_MAX_WIDTH auto image_support = device_->getInfo(); if (image_support) { LOG(INFO) << "The chosen device supports image processing."; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 1; } else { LOG(INFO) << "The chosen device doesn't support image processing!"; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 0; return false; } + + auto image2d_max_height = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_HEIGHT:" << image2d_max_height; + device_info_["CL_DEVICE_IMAGE2D_MAX_HEIGHT"] = image2d_max_height; + + auto image2d_max_width = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_WIDTH:" << image2d_max_width; + device_info_["CL_DEVICE_IMAGE2D_MAX_WIDTH"] = image2d_max_width; + + // ===================== OTHERS / EXTENSION / VERSION ===================== + // CL_DEVICE_EXTENSIONS + // CL_DEVICE_ADDRESS_BITS auto ext_data = device_->getInfo(); VLOG(4) << "The extensions supported by this device: " << ext_data; if (ext_data.find("cl_khr_fp16") != std::string::npos) { LOG(INFO) << "The chosen device supports the half data type."; + 
device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 1; } else { LOG(INFO) << "The chosen device doesn't support the half data type!"; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 0; } - auto max_units = device_->getInfo(); - LOG(INFO) << "The chosen device has " << max_units << " compute units."; - auto local_mem = device_->getInfo(); - LOG(INFO) << "The local memory size of the chosen device is " - << static_cast(local_mem) / 1024 << " KB."; + + auto address_bits = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_ADDRESS_BITS:" << address_bits; + device_info_["CL_DEVICE_ADDRESS_BITS"] = address_bits; + + auto driver_version = device_->getInfo(); + LOG(INFO) << "CL_DRIVER_VERSION:" << driver_version; + return true; } +std::map& CLRuntime::GetDeviceInfo() { + if (0 != device_info_.size()) { + return device_info_; + } + InitializeDevice(); + return device_info_; +} + +GpuType& CLRuntime::GetGpuType() { return gpu_type_; } + +void CLRuntime::GetAdrenoContextProperties( + std::vector* properties, + GPUPerfMode gpu_perf_mode, + GPUPriorityLevel gpu_priority_level) { + CHECK(properties) << "cl_context_properties is nullptr"; + properties->reserve(5); + switch (gpu_perf_mode) { + case GPUPerfMode::PERF_LOW: + LOG(INFO) << "GPUPerfMode::PERF_LOW"; + properties->push_back(CL_CONTEXT_PERF_MODE_QCOM); + properties->push_back(CL_PERF_MODE_LOW_QCOM); + break; + case GPUPerfMode::PERF_NORMAL: + LOG(INFO) << "GPUPerfMode::PERF_NORMAL"; + properties->push_back(CL_CONTEXT_PERF_MODE_QCOM); + properties->push_back(CL_PERF_MODE_NORMAL_QCOM); + break; + case GPUPerfMode::PERF_HIGH: + LOG(INFO) << "GPUPerfMode::PERF_HIGH"; + properties->push_back(CL_CONTEXT_PERF_MODE_QCOM); + properties->push_back(CL_PERF_MODE_HIGH_QCOM); + break; + default: + break; + } + switch (gpu_priority_level) { + case GPUPriorityLevel::PRIORITY_LOW: + LOG(INFO) << "GPUPriorityLevel::PRIORITY_LOW"; + properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM); + properties->push_back(CL_PRIORITY_HINT_LOW_QCOM); + break; + case GPUPriorityLevel::PRIORITY_NORMAL: + LOG(INFO) << "GPUPriorityLevel::PRIORITY_NORMAL"; + properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM); + properties->push_back(CL_PRIORITY_HINT_NORMAL_QCOM); + break; + case GPUPriorityLevel::PRIORITY_HIGH: + LOG(INFO) << "GPUPriorityLevel::PRIORITY_HIGH"; + properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM); + properties->push_back(CL_PRIORITY_HINT_HIGH_QCOM); + break; + default: + break; + } + // The properties list should be terminated with 0 + properties->push_back(0); +} + +double CLRuntime::GetCommandTime(const cl::Event& event) { + command_queue().finish(); + auto start_nanos = event.getProfilingInfo(); + auto stop_nanos = event.getProfilingInfo(); + return (stop_nanos - start_nanos) / 1000000.0; +} + +double CLRuntime::GetQueuedTime(const cl::Event& event) { + command_queue().finish(); + return (event.getProfilingInfo() - + event.getProfilingInfo()) / + 1000000.0; +} + +double CLRuntime::GetSubmitTime(const cl::Event& event) { + command_queue().finish(); + return (event.getProfilingInfo() - + event.getProfilingInfo()) / + 1000000.0; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 0859780c69cc8647c1fd54bf1ab12be29217c9e1..3eeea7d63ae8f81e7eb395bc0da70caaf94c2a79 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,15 +12,58 @@ limitations under the License. */ #pragma once #include +#include #include #include #include #include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_utility.h" +typedef enum { + UNKNOWN = 0, + QUALCOMM_ADRENO = 1, + ARM_MALI = 2, + IMAGINATION_POWERVR = 3, + OTHERS = 4, +} GpuType; + +typedef enum { + PERF_DEFAULT = 0, + PERF_LOW = 1, + PERF_NORMAL = 2, + PERF_HIGH = 3 +} GPUPerfMode; + +typedef enum { + PRIORITY_DEFAULT = 0, + PRIORITY_LOW = 1, + PRIORITY_NORMAL = 2, + PRIORITY_HIGH = 3 +} GPUPriorityLevel; + +// Adreno extensions +// Adreno performance hints +typedef cl_uint cl_perf_hint; +#define CL_CONTEXT_PERF_MODE_QCOM 0x40C2 +#define CL_PERF_MODE_HIGH_QCOM 0x40C3 +#define CL_PERF_MODE_NORMAL_QCOM 0x40C4 +#define CL_PERF_MODE_LOW_QCOM 0x40C5 + +// Adreno priority hints +typedef cl_uint cl_priority_hint; + +#define CL_PRIORITY_HINT_NONE_QCOM 0 +#define CL_CONTEXT_PRIORITY_LEVEL_QCOM 0x40C9 +#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA +#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB +#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC + namespace paddle { namespace lite { +extern const std::map> + opencl_kernels_files; + class CLRuntime { public: static CLRuntime* Global(); @@ -51,8 +91,18 @@ class CLRuntime { void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } + std::map& GetDeviceInfo(); + + GpuType& GetGpuType(); + + double GetCommandTime(const cl::Event& event); + + double GetQueuedTime(const cl::Event& event); + + double GetSubmitTime(const cl::Event& event); + private: - CLRuntime() = default; + CLRuntime() { Init(); } ~CLRuntime(); @@ -60,9 +110,28 @@ class CLRuntime { bool InitializeDevice(); + void GetAdrenoContextProperties( + std::vector* properties, + GPUPerfMode gpu_perf_mode, + GPUPriorityLevel gpu_priority_level); + std::shared_ptr CreateContext() { - auto context = std::make_shared( - std::vector{device()}, nullptr, nullptr, nullptr, &status_); + // note(ysh329): gpu perf mode and priority level of adreno gpu referred + // from xiaomi/mace. + // However, no performance gain after `PERF_HIGH` and `PRIORITY_HIGH` set. 
+ auto perf_mode = GPUPerfMode::PERF_HIGH; + auto priority_level = GPUPriorityLevel::PRIORITY_HIGH; + std::vector context_properties; + if (gpu_type_ == GpuType::QUALCOMM_ADRENO) { + GetAdrenoContextProperties( + &context_properties, perf_mode, priority_level); + } + auto context = + std::make_shared(std::vector{device()}, + context_properties.data(), + nullptr, + nullptr, + &status_); CL_CHECK_FATAL(status_); return context; } @@ -80,6 +149,12 @@ class CLRuntime { return queue; } + GpuType ParseGpuTypeFromDeviceName(std::string device_name); + + std::map device_info_; + + GpuType gpu_type_{GpuType::UNKNOWN}; + std::string cl_path_; std::shared_ptr platform_{nullptr}; diff --git a/lite/backends/opencl/cl_utility.h b/lite/backends/opencl/cl_utility.h index b7f14c15e61ba050220ef0819fa9c3d13a7b8606..dcea7aef2e3a1c1df9130b0d1670504f8dd4cd37 100644 --- a/lite/backends/opencl/cl_utility.h +++ b/lite/backends/opencl/cl_utility.h @@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } - +#ifdef LITE_WITH_LOG #define CL_CHECK_FATAL(err_code__) \ if (err_code__ != CL_SUCCESS) { \ LOG(FATAL) << string_format( \ @@ -42,5 +42,21 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } +#else +#define CL_CHECK_FATAL(err_code__) +#endif + +#ifdef LITE_WITH_PROFILE +#define EnqueueNDRangeKernel( \ + context, kernel, gws_offset, gws, lws, event_wait_list, event) \ + context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \ + kernel, gws_offset, gws, lws, event_wait_list, &event) +#else +#define EnqueueNDRangeKernel( \ + context, kernel, gws_offset, gws, lws, event_wait_list, event) \ + context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \ + kernel, gws_offset, gws, lws, event_wait_list, nullptr) +#endif + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index 310567baa539697f6a67b59f6c0e5f29ce46a80e..950f2fc442bdbbbb843ea6b15f0c2eac23c2e690 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -81,15 +82,16 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, return cl_image; } -template <> // use int16_t represents half float -void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height, - void *host_ptr) { +template <> // use uint16_t represents half float +void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16))); cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? 
CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, size_t size, IoDirection dir) { cl_int status; - cl::Event event; auto stream = CLRuntime::Global()->command_queue(); switch (dir) { case IoDirection::DtoD: @@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, 0, size, nullptr, - &event); + nullptr); CL_CHECK_FATAL(status); - event.wait(); + CLRuntime::Global()->command_queue().finish(); break; case IoDirection::HtoD: status = stream.enqueueWriteBuffer(*static_cast(dst), @@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, cl::array origin = {0, 0, 0}; cl::array region = {cl_image2d_width, cl_image2d_height, 1}; cl_int status; - cl::Event event; auto stream = CLRuntime::Global()->command_queue(); switch (dir) { case IoDirection::DtoD: @@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, origin, region, nullptr, - &event); + nullptr); CL_CHECK_FATAL(status); - event.wait(); + CLRuntime::Global()->command_queue().finish(); break; case IoDirection::HtoD: status = stream.enqueueWriteImage(*static_cast(dst), diff --git a/lite/backends/rknpu/CMakeLists.txt b/lite/backends/rknpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cec60c80759cfc02e25a82eb795746c8b93e7cfe --- /dev/null +++ b/lite/backends/rknpu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs}) diff --git a/lite/backends/rknpu/device.cc b/lite/backends/rknpu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b486259b3b328713062648df445f94735ae6380 --- /dev/null +++ b/lite/backends/rknpu/device.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/rknpu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace rknpu { + +std::unique_ptr Device::Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ) { + VLOG(3) << "[RKNPU] Build model"; + + rk_graph->SetInputsOutputs(input_nodes, output_nodes); + + std::unique_ptr exector = + std::unique_ptr(new rk::nn::Exection(rk_graph)); + + exector->Build(); + + return exector; +} + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/rknpu/device.h b/lite/backends/rknpu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..9284725aac7fbd9840aef64b7e8f411059f9ba15 --- /dev/null +++ b/lite/backends/rknpu/device.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "rknpu/rknpu_pub.h" // NOLINT + +namespace paddle { +namespace lite { +namespace rknpu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + // Build the RK IR graph to om model, return RK model exector to + // load om model and run inference. + std::unique_ptr Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ); // NOLINT + + private: +}; + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/CMakeLists.txt b/lite/backends/x86/CMakeLists.txt index 63b41ae77d0f3949e3d1de13f9db5ca99b4f1c41..1014e3f87f5190700746467f09f7bf294070a09b 100644 --- a/lite/backends/x86/CMakeLists.txt +++ b/lite/backends/x86/CMakeLists.txt @@ -8,9 +8,9 @@ lite_cc_library(target_wrapper_x86 SRCS target_wrapper.cc) if (LITE_ON_MODEL_OPTIMIZE_TOOL) return() endif(LITE_ON_MODEL_OPTIMIZE_TOOL) -lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) +lite_cc_library(dynamic_loader SRCS dynamic_loader.cc) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) -lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) +lite_cc_library(x86_cpu_info SRCS cpu_info.cc) add_subdirectory(jit) add_subdirectory(math) diff --git a/lite/backends/x86/cpu_info.cc b/lite/backends/x86/cpu_info.cc index aa097f947a0289b4a44417160fbe5d6e6db48020..276b62654f3c8b25d23e629c706e4877dabc3889 100644 --- a/lite/backends/x86/cpu_info.cc +++ b/lite/backends/x86/cpu_info.cc @@ -29,8 +29,8 @@ #include #endif // _WIN32 -#include #include +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index a05a57e93b23008e49683764b5ed669d5c425e5b..4978dfb84a4ee5770df011c54dccde59a62135b7 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ #include // NOLINT #include -#include "gflags/gflags.h" -#include "glog/logging.h" #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" @@ -262,7 +260,7 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib"); #elif defined(_WIN32) return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else diff --git a/lite/backends/x86/jit/gen/act.h b/lite/backends/x86/jit/gen/act.h index 6366cff3c85d674c8f7730dae24732bdf3571672..dd545b9fc95f9a260300bf11afb8f98e7d2ad922 100644 --- a/lite/backends/x86/jit/gen/act.h +++ b/lite/backends/x86/jit/gen/act.h @@ -14,9 +14,9 @@ #pragma once -#include #include #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/blas.h b/lite/backends/x86/jit/gen/blas.h index 39920195b245e1c44ff68ab91af94d25c949bd02..8545ea96f8dd1a4d2eeaa1748d34a859f46799c1 100644 --- a/lite/backends/x86/jit/gen/blas.h +++ b/lite/backends/x86/jit/gen/blas.h @@ -15,8 +15,9 @@ #pragma once #include -#include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -64,7 +65,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? "_Relu" : ""); - base += "_D" + std::to_string(num_); + base += "_D" + paddle::lite::to_string(num_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7cae76f9dd99cf904e831b196bd493623ff7eb1d..7bb248dd1d384af949fd3cd190df3d90d21921ef 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -14,9 +14,9 @@ #pragma once -#include #include #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/paddle_enforce.h" namespace paddle { @@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(tbl_w_)); + base += ("_W" + paddle::lite::to_string(tbl_w_)); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/gru.h b/lite/backends/x86/jit/gen/gru.h index 408f25746d85d4c56bdbd3c0728687f817c1f80f..6a468fd9ac19acbc68f2e2569e77892189f37e62 100644 --- a/lite/backends/x86/jit/gen/gru.h +++ b/lite/backends/x86/jit/gen/gru.h @@ -15,9 +15,9 @@ #pragma once #include -#include "glog/logging.h" #include "lite/backends/x86/jit/gen/act.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/hopv.h b/lite/backends/x86/jit/gen/hopv.h index 801131d6307e6ff10efaa2770fce6ac0a0f3b9d3..6fa0c041b9f45000ef12251974579020de31784a 100644 --- a/lite/backends/x86/jit/gen/hopv.h +++ b/lite/backends/x86/jit/gen/hopv.h @@ -15,8 +15,8 @@ #pragma once #include -#include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/lstm.h b/lite/backends/x86/jit/gen/lstm.h index 141419505c7ce3b8e515dbd728987640afda7fc5..22611978e081edad369612e29bdd1e8fd1634b1f 100644 --- 
a/lite/backends/x86/jit/gen/lstm.h +++ b/lite/backends/x86/jit/gen/lstm.h @@ -15,9 +15,9 @@ #pragma once #include -#include "glog/logging.h" #include "lite/backends/x86/jit/gen/act.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index 010c80fac4842e74c9b8272db472ddf6cf954771..f78df73f66532f891721c74cff9c78cc3bb61922 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -40,7 +40,7 @@ void MatMulJitCode::genCode() { for (size_t g = 0; g < groups.size(); ++g) { size_t x_offset = 0; size_t wgt_offset_tmp = 0; - for (int i = 0; i < g; ++i) { + for (size_t i = 0; i < g; ++i) { wgt_offset_tmp += groups[i] * block_len; } for (int k = 0; k < k_; ++k) { diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index b1b302b7904a5d92952f4385c483eccdc5df3592..95edc14201ac94d302ff806d0a4b8f5f50b2835c 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -17,8 +17,8 @@ #include // for malloc and free #include #include -#include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/paddle_enforce.h" namespace paddle { @@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode { std::string name() const override { std::string base = "MatMulJitCode"; - base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + - std::to_string(k_); + base = base + "_M" + paddle::lite::to_string(m_) + "_N" + + paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index 346179cfbbd0e8291dc17b266366c5df07114b7f..a00428f3e0982889665cd23b21a5978c7c239399 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -14,9 +14,9 @@ #pragma once -#include #include #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/paddle_enforce.h" namespace paddle { @@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(w_)); + base += ("_W" + paddle::lite::to_string(w_)); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/sgd.h b/lite/backends/x86/jit/gen/sgd.h index 303d94f2ab6bf823ea71b8c52b2a755558f50fbd..9c9c2cff01ab051dcd526b7f633fcd66c1af702e 100644 --- a/lite/backends/x86/jit/gen/sgd.h +++ b/lite/backends/x86/jit/gen/sgd.h @@ -15,8 +15,8 @@ #pragma once #include -#include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/vbroadcast.h b/lite/backends/x86/jit/gen/vbroadcast.h index 39bcd4965f3a24f18de7fa5a13d469b3019920f9..8b58bd4c04922319f0b18b709df4a2a6fc0c1313 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.h +++ b/lite/backends/x86/jit/gen/vbroadcast.h @@ -15,8 +15,8 @@ #pragma once #include -#include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 7d051aa6f5802844753b71fd43400e20b7f5965b..a3376be423828b25c6eda6fff30a56578c7bbbe5 100644 --- 
a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -28,6 +28,12 @@ #define posix_memalign_free free #endif +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#endif + // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); @@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; +#ifdef _WIN32 + ptr = _aligned_malloc(size, alignment); +#else PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, "GenBase Alloc %ld error!", size); +#endif PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); return ptr; } diff --git a/lite/backends/x86/jit/refer/refer.h b/lite/backends/x86/jit/refer/refer.h index 119ec7469ed21f5e74c973e3de88ed6b93b1e06a..d8c8d86911ab9a7794192aa68fb0c0571b1e4d26 100644 --- a/lite/backends/x86/jit/refer/refer.h +++ b/lite/backends/x86/jit/refer/refer.h @@ -14,7 +14,6 @@ #pragma once -#include #include #include #include @@ -22,6 +21,7 @@ #include "lite/backends/x86/jit/helper.h" #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/paddle_enforce.h" namespace paddle { diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 8d61fb3bbb97705c697fba934e6cab9424f85bad..5d7e98629cb89bd7a3fdee852507e0f381e54931 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -96,8 +96,8 @@ class BeamSearchFunctor { // : nullptr; // fill in data - std::vector low_level; - size_t low_offset = 0; + std::vector low_level; + uint64_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { @@ -265,7 +265,7 @@ class BeamSearchFunctor { // size_t num_seqs = scores->NumElements(lod_level); size_t num_seqs = scores->lod()[lod_level].size() - 1; size_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { + for (size_t i = 1; i < scores->dims().size(); i++) { seq_width *= scores->dims()[i]; } diff --git a/lite/backends/x86/math/beam_search_test.cc b/lite/backends/x86/math/beam_search_test.cc index 904870207b08d462025ecb4b84d6cf57f7b13f26..233fa03fbaa31165dae4453affb148276f8c6584 100644 --- a/lite/backends/x86/math/beam_search_test.cc +++ b/lite/backends/x86/math/beam_search_test.cc @@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* pre_scores) { // lod paddle::framework::LoD lod; - std::vector level0({0, 2, 4}); - std::vector level1({0, 1, 2, 3, 4}); + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc index 2d21adaf5d22930ff720c193696eb00c8035579d..3bc5f9f67ad96e7ec699400ff6369fe48c745b7e 100644 --- a/lite/backends/x86/math/blas.cc +++ b/lite/backends/x86/math/blas.cc @@ -23,7 +23,7 @@ namespace math { MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1); + PADDLE_ENFORCE_GT(tensor_dim.size(), 1u); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); diff 
--git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 72d0736268f342187f0be8c6348f5bed75df30ea..34b258892be05625ae88076eff175f56a53d3537 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -483,7 +483,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } template <> @@ -759,7 +759,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } else { PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); @@ -773,7 +773,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data(), + mat_out->template mutable_data(), dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_, dim_a.stride_, dim_b.stride_); diff --git a/lite/backends/x86/math/concat_and_split.cc b/lite/backends/x86/math/concat_and_split.cc index bec93dde41fdb654cfbfd20f5d9e59d1d372e3a8..df75654aebaba26b9889d97445bd889cdf2f4eb0 100644 --- a/lite/backends/x86/math/concat_and_split.cc +++ b/lite/backends/x86/math/concat_and_split.cc @@ -51,7 +51,7 @@ class ConcatFunctor { // auto cpu_place = boost::get(context.GetPlace()); // computation - auto output_data = output->mutable_data(); + auto output_data = output->template mutable_data(); int col_idx = 0; for (int j = 0; j < num; ++j) { int col_len = input_cols[j]; @@ -108,7 +108,7 @@ class SplitFunctor { int col_len = output_cols[j]; auto* out_tensor = outputs->at(j); if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->mutable_data() + k * col_len; + T* dst_ptr = out_tensor->template mutable_data() + k * col_len; std::copy_n(src_ptr + col_idx, col_len, dst_ptr); // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, // sizeof(T) * col_len); diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 366486924a8c4a5eefd6341183b4f1bc1c0277ad..941a34643669f060cdd18f38f92c39e529da7b19 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -50,8 +50,8 @@ class CrossEntropyFunctor { .reshape(batch_axis_remain) .sum(Eigen::DSizes(1))); } else { - const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(); + const T* prob_data = prob->template data(); + T* loss_data = out->template mutable_data(); const int64_t* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index 1c4c6a49f5bb804a57344c59368d18255e8a7912..b916c912ffc2a4d62b63b98fdce150b353ba087e 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -99,7 +99,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int c = 0; c < channels_col; ++c) { @@ -161,7 +161,7 @@ class Im2ColFunctordims()[1]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { @@ -235,7 +235,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { diff --git 
a/lite/backends/x86/math/im2col_cfo_cpu.h b/lite/backends/x86/math/im2col_cfo_cpu.h index 4623f045bb1cbe67605b36621efcc3285b989ad5..97579647d4ec3a9a95e033a153417cb0aaadbeb6 100644 --- a/lite/backends/x86/math/im2col_cfo_cpu.h +++ b/lite/backends/x86/math/im2col_cfo_cpu.h @@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; @@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, int output_width = col->dims()[4]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int col_matrix_width = output_width * output_height; int im_size = im_height * im_width; size_t copy_size = sizeof(T) * output_width; @@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, constexpr int prw = 1; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int im_size = im_height * im_width; int col_matrix_width = output_width * output_height; int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index a17807e8a997f0ecf908313a4cb205676e4fa4b8..cb1781db2199c1b7a12aaec80b1904f65b23b534 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -65,7 +65,7 @@ struct TensorSetConstantCPU { : tensor_(tensor), value_(value) {} template void apply() const { - auto* begin = tensor_->mutable_data(lite::TargetType::kX86); + auto* begin = tensor_->template mutable_data(lite::TargetType::kX86); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); } lite::Tensor* tensor_; @@ -126,11 +126,10 @@ struct RowwiseAdd { const T* input_data = input.data(); const T* vector_data = vector.data(); - T* output_data = output->mutable_data(); + T* output_data = output->template mutable_data(); for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t j = 0; j < size; ++j) { - output_data[i * in_dims[0] + j] = - input_data[i * in_dims[0] + j] + vector_data[j]; + output_data[i * size + j] = input_data[i * size + j] + vector_data[j]; } } } diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index 3aaca2e59370f8f2b922554ec6f378bb2a3de9b5..acfb76759f6fc9fa4122afd2388bc3adf8f5ea22 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -83,7 +83,7 @@ class ColwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), size); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -129,7 +129,7 @@ class RowwiseMean { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); auto inv_size = 1.0 / size; - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -173,7 +173,7 @@ class RowwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template 
mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { diff --git a/lite/backends/x86/math/maxouting.cc b/lite/backends/x86/math/maxouting.cc index 20b40fe7c5000cc1d0ee80c18efa5d1defc911f0..f97b16f7fb3326a6d2eb186e2984df3dbd0a0a90 100644 --- a/lite/backends/x86/math/maxouting.cc +++ b/lite/backends/x86/math/maxouting.cc @@ -35,7 +35,7 @@ class MaxOutFunctor { // c_size means the output size of each sample int c_size = fea_size * output_channels; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int new_bindex = c_size * i; @@ -72,7 +72,8 @@ class MaxOutGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int blen = fea_size * output_channels * i; diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index ab6c1edb481f914d5751149aca2595fee550ca51..4393c42157bb7667ec2218e8b76f05a2c60bcc86 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -54,8 +54,8 @@ class Pool2dFunctor { const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; - const T* input_data = input->data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + const T* input_data = input->template data(); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -137,7 +137,8 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -220,7 +221,8 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -322,7 +324,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -425,7 +427,8 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -530,7 +533,8 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + 
input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/sample_prob.h b/lite/backends/x86/math/sample_prob.h index 5312b3df10a41444c073f0cf61d69bce6fc3859a..4351df68a2630c2b8c6f7285f3955a9b06165f67 100644 --- a/lite/backends/x86/math/sample_prob.h +++ b/lite/backends/x86/math/sample_prob.h @@ -58,11 +58,11 @@ class SampleWithProb { const int64_t* label_data = L->data(); // int64_t* samples_data = // S->mutable_data(ret_dim, Target); - // T* probabilities_data = P->mutable_data(ret_dim, Target); + // T* probabilities_data = P->template mutable_data(ret_dim, Target); S->Resize({batch_size, num_sampled_classes}); auto* samples_data = S->mutable_data(Target); P->Resize({batch_size, num_sampled_classes}); - auto* probabilities_data = P->mutable_data(Target); + auto* probabilities_data = P->template mutable_data(Target); // temp sets for unique sampling std::unordered_set tmp_samples; diff --git a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc index 56fc363cb48ec5c58f4a7ee3e62a2e6bd7355021..014b213d4f10f7161dc1881d582cca93f2be58e5 100644 --- a/lite/backends/x86/math/search_fc.cc +++ b/lite/backends/x86/math/search_fc.cc @@ -42,7 +42,7 @@ class SearchFcFunctor { lite::DDim dims(std::vector({bottom.dims()[0], out_size})); const auto bottom_data = bottom.data(); - auto top_data = top->mutable_data(lite::TargetType::kX86); + auto top_data = top->template mutable_data(lite::TargetType::kX86); const auto weights = w.data(); auto blas = math::GetBlas(context); call_gemm(blas, diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index f8f1b42361832771ba04d1bdc8b3e2e05f954e29..fe7a46f9f04d49ea7b505b8e2ece6b4bdd0ec826 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -52,7 +52,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); - auto* out_data = out_value->mutable_data(); + auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); std::copy_n(in1_data, in1_value.numel(), out_data); @@ -87,7 +87,7 @@ struct SelectedRowsAddTensor { functor(context, output, 0.0); auto* in1_data = in1_value.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); for (size_t i = 0; i < in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -127,7 +127,7 @@ struct SelectedRowsAddTo { in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); auto* in1_data = in1_value.data(); - auto* in2_data = in2_value->mutable_data(); + auto* in2_data = in2_value->template mutable_data(); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); } }; @@ -161,7 +161,7 @@ struct SelectedRowsSumTo { input2->set_rows(in2_rows); auto* in2_value = input2->mutable_value(); - T* in2_data = in2_value->mutable_data(); + T* in2_data = in2_value->template mutable_data(); auto blas = math::GetBlas(context); size_t offset = 0u; for (size_t i = 0u; i != input1.size(); ++i) { @@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->mutable_data(); + auto* input2_data = input2->template mutable_data(); for (size_t i = 0; i < 
in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -279,7 +279,7 @@ struct MergeAdd { } } if (has_value_input == nullptr) { - VLOG(3) << "no input has value! just return" << std::endl; + VLOG(3) << "no input has value! just return"; return; } auto input_width = has_value_input->value().dims()[1]; @@ -305,7 +305,7 @@ struct MergeAdd { lite::DDim dims(std::vector( {static_cast(merged_row_set.size()), input_width})); out.mutable_value()->Resize(dims); - auto* out_data = out.mutable_value()->mutable_data(); + auto* out_data = out.mutable_value()->template mutable_data(); if (merged_row_set.size() == row_num && !sorted_result) { // no duplicated ids, just concat the result together @@ -385,7 +385,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); + auto* input2_data = input2->template data(); // FIXME(typhoonzero): use macro fix the below messy code. switch (op) { diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index c12c05414d717dce706590a491ccae2384f3bfe5..aa7aeac532e2fa1f90d452924b364be1896ee862 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor { public: void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index) { - const size_t* index = index_lod.data(); + const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); PADDLE_ENFORCE_EQ( @@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor { auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); + auto* dst_data = dst->template mutable_data(); const int sz = width * sizeof(T); if (is_src_index) { for (int i = 0; i < height; ++i) { diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index a70cc5bf73522f97ab312fc48553b5316dbf8376..63df008b6dfca936265019a71ac0a553c525dc73 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { // The indexed rows are based on the input index. void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index); }; @@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { // batch_lods[2] is the sort order for the input LoDTensor. 
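// ---------------------------------------------------------------------------
// Note (not part of this patch): the hunks above repeatedly rewrite
// `x->mutable_data(...)` as `x->template mutable_data<T>(...)`. A minimal,
// self-contained sketch of the dependent-name disambiguation that this
// spelling addresses; FakeTensor and FillOnes are hypothetical stand-ins,
// not Paddle-Lite types.
// #include <cstddef>
// #include <vector>
//
// struct FakeTensor {
//   template <typename T>
//   T* mutable_data(std::size_t count) {
//     buffer_.resize(count * sizeof(T));
//     return reinterpret_cast<T*>(buffer_.data());
//   }
//   std::vector<char> buffer_;
// };
//
// template <typename TensorT, typename T>
// void FillOnes(TensorT* out, std::size_t n) {
//   // `out` has a type that depends on a template parameter, so without the
//   // `template` keyword the `<` after `mutable_data` would be parsed as a
//   // less-than operator and the call would not compile.
//   T* data = out->template mutable_data<T>(n);
//   for (std::size_t i = 0; i < n; ++i) data[i] = static_cast<T>(1);
// }
//
// // Usage: FakeTensor t; FillOnes<FakeTensor, float>(&t, 4);
// ---------------------------------------------------------------------------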
batch_lods->at(2).resize(seq_info.size()); - size_t* batch_starts = batch_lods->at(0).data(); - size_t* seq2batch_idx = batch_lods->at(1).data(); + auto* batch_starts = batch_lods->at(0).data(); + auto* seq2batch_idx = batch_lods->at(1).data(); batch_starts[0] = 0; for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); @@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { } batch_starts[n + 1] = static_cast(batch_id); } - size_t* seq_order = batch_lods->at(2).data(); + auto* seq_order = batch_lods->at(2).data(); for (size_t i = 0; i < seq_info.size(); ++i) { seq_order[i] = seq_info[i].seq_idx; } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index fbb6c11a5f7a0cbae36d2f8fba0b141dadadf542..eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -22,15 +22,15 @@ namespace math { template void CopyValidData(lite::Tensor* dst_tensor, const lite::Tensor* src_tensor, - const std::vector& seq_offsets, + const std::vector& seq_offsets, int pad_seq_len, int step_width, bool norm_by_len, CopyType type, PadLayout layout) { int seq_num = seq_offsets.size() - 1; - const T* src_data = src_tensor->data(); - T* dst_data = dst_tensor->mutable_data(); + const T* src_data = src_tensor->template data(); + T* dst_data = dst_tensor->template mutable_data(); int seq_cpy_gap = step_width; int pad_cpy_gap = @@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor { "'step_width'."); // fill padding value - T* pad_data = pad_tensor->mutable_data(); + T* pad_data = pad_tensor->template mutable_data(); const T* pad_value_data = pad_value.data(); if (pad_value.numel() == 1) { fast_mem_init( diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index a3f4512042de4c7a2fc665f2fd41777d472225f5..43407014dea0ed0c78ab29da7fb8ebb0e0310566 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; enum CopyType { kSeqToPad, kPadToSeq }; -inline static size_t MaximumSequenceLength( - const std::vector& seq_offset) { - size_t seq_num = seq_offset.size() - 1; - size_t max_seq_len = 0; +inline static uint64_t MaximumSequenceLength( + const std::vector& seq_offset) { + uint64_t seq_num = seq_offset.size() - 1; + uint64_t max_seq_len = 0; for (size_t i = 0; i < seq_num; ++i) { max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } @@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( inline static void CheckDims(const lite::DDim& seq_tensor_dims, const lite::DDim& pad_tensor_dims, - const std::vector& seq_offset, + const std::vector& seq_offset, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 186b8b5543c7132867093616c83b45ae8ff27d3c..2d00ebad61840da5b14fbf12d9255394b2b2df1a 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -46,16 +46,16 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_GT(in_dims.size(), 1u); + PADDLE_ENFORCE_GT(out_dims.size(), 1u); + for (size_t i = 1; i < 
in_dims.size(); ++i) { PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); } PADDLE_ENFORCE_EQ(idx_dims, out_dims); auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int* max_index = index->mutable_data(); int64_t num_seq = out_dims[0]; @@ -95,15 +95,15 @@ class MaxSeqPoolFunctor { lite::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_GT(in_dims.size(), 1u); + PADDLE_ENFORCE_GT(out_dims.size(), 1u); + for (size_t i = 1; i < in_dims.size(); ++i) { PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); } auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int64_t num_seq = out_dims[0]; int64_t dim = output->numel() / num_seq; @@ -138,14 +138,14 @@ class MaxSeqPoolGradFunctor { auto idx_dims = index.dims(); PADDLE_ENFORCE_GT(og_dims.size(), 1); PADDLE_ENFORCE_GT(ig_dims.size(), 1); - for (int64_t i = 1; i < og_dims.size(); ++i) { + for (size_t i = 1; i < og_dims.size(); ++i) { PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); } PADDLE_ENFORCE_EQ(idx_dims, og_dims); const T* og_data = out_grad.data(); const int* max_index = index.data(); - T* ig_data = in_grad->mutable_data(); + T* ig_data = in_grad->template mutable_data(); SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); @@ -170,7 +170,7 @@ class LastSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { int64_t in_w = in_grad->numel() / in_grad->dims()[0]; PADDLE_ENFORCE(in_w == out_w); const T* out_g_data = out_grad.data(); - T* in_g_data = in_grad->mutable_data(TARGET(kX86)); + T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); @@ -288,7 +288,7 @@ class SequencePoolFunctor { auto lod = input.lod()[0]; if (pooltype == "SUM") { const T* src = input.data(); - T* dst = output->mutable_data(TARGET(kX86)); + T* dst = output->template mutable_data(TARGET(kX86)); jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index a73014767345842f09ac2ff0cd5c2e7231c1f90a..b91f43a571994bef95650361a6dc62c0465837a7 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { TEST(SequencePoolingGrad, CPU_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); 
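// ---------------------------------------------------------------------------
// Note (not part of this patch): the tests above build LoD levels such as
// {0, 2, 7, 10}, now as std::vector<uint64_t> rather than std::vector<size_t>.
// An LoD level stores cumulative offsets, so level[i + 1] - level[i] is the
// length of sequence i. SequenceLengths below is a hypothetical helper that
// makes this explicit.
// #include <cstddef>
// #include <cstdint>
// #include <vector>
//
// std::vector<uint64_t> SequenceLengths(const std::vector<uint64_t>& lod_level) {
//   std::vector<uint64_t> lengths;
//   for (std::size_t i = 0; i + 1 < lod_level.size(); ++i) {
//     lengths.push_back(lod_level[i + 1] - lod_level[i]);
//   }
//   return lengths;  // e.g. {0, 2, 7, 10} -> {2, 5, 3}: three sequences
// }
// ---------------------------------------------------------------------------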
TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); @@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { #ifdef PADDLE_WITH_CUDA TEST(SequencePoolingGrad, CUDA_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); diff --git a/lite/backends/x86/math/sequence_scale.cc b/lite/backends/x86/math/sequence_scale.cc index fad0628de15379b58847827cc3d48bf6085cbda2..25c7be0d0e2747f4f28c1d82f8855872d57726d1 100644 --- a/lite/backends/x86/math/sequence_scale.cc +++ b/lite/backends/x86/math/sequence_scale.cc @@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor { size_t seq_width = seq->dims()[1]; lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); - T* seq_data = seq->mutable_data(lite::TargetType::kX86); + T* seq_data = seq->template mutable_data(lite::TargetType::kX86); for (size_t i = 0; i < num_seq; ++i) { for (size_t j = lod[level][i] * seq_width; j < lod[level][i + 1] * seq_width; diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc index 035a7923c70f91cf27f1d845f68110f8f33cb73d..97e27fed59f4bc1a4c457ea9cf515da6caca9a1c 100644 --- a/lite/backends/x86/math/sequence_topk_avg_pooling.cc +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor { auto pos_data = pos->mutable_data(lite::TargetType::kX86); int offset = 0; - std::vector vec_out_lod; + std::vector vec_out_lod; vec_out_lod.reserve(batch_size + 1); for (int i = 0; i <= batch_size; ++i) { offset = row_lod[i]; @@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor { out->set_lod(lod_temp); auto in_data = in.data(); - auto out_data = out->mutable_data(lite::TargetType::kX86); + auto out_data = out->template mutable_data(lite::TargetType::kX86); T* sum_data = new T[max_k]; for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h index ec45377bc55154a4a36ebc5c3684ab7efeeef88e..1ba84dda42093155b10fa74a49e953d6663b8c88 100644 --- a/lite/backends/x86/math/softmax_impl.h +++ b/lite/backends/x86/math/softmax_impl.h @@ -108,8 +108,8 @@ class SoftmaxFunctor> { const int num_remain = num_classes / axis_dim; if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* in_data = X->data(); - auto* out_data = Y->mutable_data(); + const T* in_data = X->template data(); + auto* out_data = Y->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T max_val = *std::max_element(in_data, in_data + num_classes); max_val *= static_cast(-1); @@ -219,9 +219,9 @@ class SoftmaxGradFunctor> { const int num_remain = num_classes / axis_dim; if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->mutable_data(); + const T* out_data = y->template data(); + const T* out_grad = y_grad->template data(); + T* in_grad = x_grad->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T scalar; vec_mul_reduce( diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index 20b913331308c8b8c95d190b6b0b3d76ccac354b..bfc7084c9ff018101ca3dfc1d1748083b1449662 100644 --- 
a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -104,12 +104,12 @@ class Tree2ColFunctor { patch_size = processing_list.size(); // T *patch_data = - // patch->mutable_data({static_cast(patch_size), + // patch->template mutable_data({static_cast(patch_size), // static_cast(patch_elem_size)}, // cpu_place); patch->Resize({static_cast(patch_size), static_cast(patch_elem_size)}); - auto *patch_data = patch->mutable_data(lite::TargetType::kX86); + auto *patch_data = patch->template mutable_data(lite::TargetType::kX86); constant(context, patch, 0); const T *features = node_features.data(); @@ -166,12 +166,12 @@ class Col2TreeFunctor { } } // T *grad_data = - // in_grad->mutable_data({static_cast(node_count), + // in_grad->template mutable_data({static_cast(node_count), // static_cast(grad_elem_size)}, // cpu_place); in_grad->Resize({static_cast(node_count), static_cast(grad_elem_size)}); - auto *grad_data = in_grad->mutable_data(lite::TargetType::kX86); + auto *grad_data = in_grad->template mutable_data(lite::TargetType::kX86); constant(context, in_grad, 0); const T *out_g = out_grad.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 568f9952cab755c8441695e1a9266a2001d2b9a9..119d7294e9ec21e67f09776ad20d04f15b8b81ce 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -36,7 +36,7 @@ class Unpool2dMaxFunctor { int output_feasize = output_height * output_width; const T* input_data = input.data(); const int* indices_data = indices.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { @@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor { int output_feasize = output_height * output_width; const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 8fd5e8954e2010d5226d56ac4a87a44e6364c8c6..91979bb7fdcfe66d84ded3f9797144ddafc8769e 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -75,7 +75,7 @@ class Vol2ColFunctor { "mismatching."); const T* vol_data = vol.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; @@ -159,7 +159,7 @@ class Col2VolFunctor { output_width, "input_width and output_width are " "mismatching."); - T* vol_data = vol->mutable_data(); + T* vol_data = vol->template mutable_data(); const T* col_data = col.data(); for (int c = 0; c < channels_col; ++c) { diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h index 0689ec4c234509cee6f10f8e0f7dd432edae5c4e..49794b8e15a8f90a6512798baa842534df879f6b 100644 --- a/lite/backends/x86/parallel.h +++ b/lite/backends/x86/parallel.h @@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() { // Do not support nested omp parallem. num_threads = omp_in_parallel() ? 
1 : omp_get_max_threads(); #endif - return std::max(num_threads, 1L); + return std::max(num_threads, 1L); } using ThreadHandler = diff --git a/lite/backends/x86/port.h b/lite/backends/x86/port.h index c1b81159aca979efe4b46777a1cef49e44b95e27..42680bfc89f16bf7da11cebe19e3d3555de066bc 100644 --- a/lite/backends/x86/port.h +++ b/lite/backends/x86/port.h @@ -14,15 +14,15 @@ #pragma once +#include #include #include -#include #include #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" +#include "lite/utils/cp_logging.h" #if !defined(_WIN32) #include // dladdr @@ -37,7 +37,9 @@ #define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include +#define NOMINMAX // msvc max/min macro conflict with std::min/max #include +#include #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) @@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +extern struct timeval; static int gettimeofday(struct timeval *tp, void *tzp) { time_t clock; struct tm tm; diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt index 4491fdeaefe9f16265bdee2c07ebb02b86a2b038..85bef0452c41ce35c90d9bd058bb7fdefd030f3a 100644 --- a/lite/backends/xpu/CMakeLists.txt +++ b/lite/backends/xpu/CMakeLists.txt @@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) return() endif() -lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +if(LITE_WITH_XTCL) + lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +endif() +lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h index 6de18d5466da6e6b791363d2e275ea72376c78b8..a2cc3206d3d0391d89690026561f47983e9376c9 100644 --- a/lite/backends/xpu/device.h +++ b/lite/backends/xpu/device.h @@ -14,12 +14,12 @@ #pragma once -#include #include #include #include #include #include +#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { diff --git a/lite/backends/xpu/math.h b/lite/backends/xpu/math.h new file mode 100644 index 0000000000000000000000000000000000000000..48352736d45a20d9abd496d9dd10b000d3f15a28 --- /dev/null +++ b/lite/backends/xpu/math.h @@ -0,0 +1,219 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { +namespace math { + +static inline long round_half_to_even(const float src) { // NOLINT + long ret = llround(src); // NOLINT + if (fabs(fabs(round(src) - src) - 0.5) > 0) { + return ret; + } else { + if (abs(ret) % 2 == 0) { + return ret; + } else { + return ret + (ret > 0 ? 
-1 : 1); + } + } +} + +static float ieee_compliance_0(float f) { + uint32_t *ptr = reinterpret_cast(&f); + uint32_t sign = (*ptr) & 0x80000000; + uint32_t uf = 0; + // nan -> inf + if (std::isnan(f)) { + uf = (sign | 0x7F800000); + float *ptr = reinterpret_cast(&uf); + return *ptr; + } else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) { + return f; + } else { + // denormal -> +-0 + uf = 0x0; + float *ptr = reinterpret_cast(&uf); + return *ptr; + } +} + +template +static inline T fp32_to_intx(const float f, float max) { + max = ieee_compliance_0(max); + float input = ieee_compliance_0(f); + // +0 and -0 -> +0 + if (input == 0) { + input = 0.0f; + } + + float tmp = RMAX / max; + if (std::isinf(tmp)) { + uint32_t *ptr = reinterpret_cast(&input); + if ((*ptr) >> 31 & 1) { + return T(-RMAX); + } else { + return T(RMAX); + } + } + + tmp = input * tmp; + if (std::isnan(tmp)) { + return T(RMAX); + } + + tmp = ieee_compliance_0(tmp); + // early check to avoid INF or big value get into convertor func. + if (tmp > RMAX) { + return T(RMAX); + } + if (tmp < -RMAX) { + return T(-RMAX); + } + T ret = (T)round_half_to_even(tmp); + if (ret > RMAX) { + ret = T(RMAX); + } + if (ret < -RMAX) { + ret = T(-RMAX); + } + return ret; +} + +static inline int16_t fp32_to_int16(const float f, float max) { + int16_t v1 = fp32_to_intx(f, max); + return v1; +} + +static inline int ConvertFP32ToInt16(const void *input, + void *output, + float max_val, + int len) { + for (int i = 0; i < len; i++) { + static_cast(output)[i] = + fp32_to_int16(static_cast(input)[i], max_val); + } + return 0; +} + +static inline float FindMaxAbs(const float *data, int len) { + float max_f = 0.0f; + for (int i = 0; i < len; ++i) { + float max = std::abs(data[i]); + if (max > max_f) { + max_f = max; + } + } + return max_f; +} + +template +static inline void Transpose(const T *in, T *out, int h, int w) { + for (int h1 = 0; h1 < w; ++h1) { + for (int w1 = 0; w1 < h; ++w1) { + out[h1 * h + w1] = in[w1 * w + h1]; + } + } +} + +/** + * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the + * original x_dim is returned. + */ +static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return lite::DDim({1, x_dim[0]}); +} + +/** + * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the + * original y_dim is returned. + */ +static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return lite::DDim({y_dim[0], 1}); +} + +/** + * Matrix Descriptor of a memory buffer. + * + * It is used for Blas::MatMul. MatMul operator can be batched. + * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a + * `batch_size` times of GEMM. The batched GEMM could be faster base on the + * implementation of the blas library. The batch size could be zero. If any + * matrix of `matmul` has a batch size, the will be a batched GEMM, too. e.g., + * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be + * [BatchSize, H1, W2] + * + * The boolean flag, `trans`, describe the memory is the transpose of matrix or + * not. If the trans is true, the last two dims of matrix are transposed. The + * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height]. + * + * The MatDescriptor is not only the dimension or shape of a matrix, it also + * contains the layout, stride of matrix. It is clearer to have a structure than + * reuse `DDim`. 
+ */ +struct MatDescriptor { + int64_t height_; + int64_t width_; + int64_t stride_{0}; + int64_t batch_size_{0}; + bool trans_; +}; + +static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, + int num_flatten_cols, + bool trans) { + MatDescriptor retv; + if (num_flatten_cols > 1) { + auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); + retv.height_ = flatten_dim[0]; + retv.width_ = flatten_dim[1]; + } else { + if (tensor_dim.size() == 2) { + retv.height_ = tensor_dim[0]; + retv.width_ = tensor_dim[1]; + } else { + auto dim_vec = tensor_dim.Vectorize(); + retv.batch_size_ = 1; + for (size_t i = 0; i < dim_vec.size() - 2; ++i) { + retv.batch_size_ *= dim_vec[i]; + } + retv.height_ = dim_vec[dim_vec.size() - 2]; + retv.width_ = dim_vec[dim_vec.size() - 1]; + retv.stride_ = retv.height_ * retv.width_; + } + } + if (trans) { + std::swap(retv.width_, retv.height_); + } + retv.trans_ = trans; + return retv; +} + +} // namespace math +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dcbc1e275cca8c32003cbef74dfb1f6d4caee93 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/target_wrapper.h" +#include "lite/backends/xpu/xpu_header_sitter.h" + +namespace paddle { +namespace lite { + +void* TargetWrapperXPU::Malloc(size_t size) { + void* ptr{nullptr}; + xpu_malloc(&ptr, size); + return ptr; +} + +void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } + +void TargetWrapperXPU::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + switch (dir) { + case IoDirection::HtoD: + xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + break; + case IoDirection::DtoH: + xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + break; + default: + LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..c42d4139246085d8b9a367b45b60699209d0b668 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperXPU = TargetWrapper; + +template <> +class TargetWrapper { + public: + static size_t num_devices() { return 1; } + static size_t maximum_stream() { return 0; } + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/xpu_header_sitter.h b/lite/backends/xpu/xpu_header_sitter.h new file mode 100644 index 0000000000000000000000000000000000000000..875e67d57d4ba2110bfbffb7ee9d1d6a876060fa --- /dev/null +++ b/lite/backends/xpu/xpu_header_sitter.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#pragma GCC system_header +#include +#include +#include + +#if defined(LITE_WITH_XTCL) +#include +#endif + +namespace paddle { +namespace lite { + +namespace xdnn = baidu::xpu::api; + +} // namespace lite +} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 1d0558451fce67433d966d1f4bff82af26459e33..55c83cdb4d02d485054ea4d7f3b90fb9f7aa3dc1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc DEPS target_wrapper_host place X86_DEPS target_wrapper_x86 CUDA_DEPS target_wrapper_cuda + XPU_DEPS target_wrapper_xpu CL_DEPS cl_target_wrapper FPGA_DEPS fpga_target_wrapper - BM_DEPS target_wrapper_bm) + BM_DEPS target_wrapper_bm + MLU_DEPS target_wrapper_mlu) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) @@ -22,21 +24,16 @@ if (NOT LITE_ON_TINY_PUBLISH) proto_library(framework_proto SRCS framework.proto) endif() -if (LITE_WITH_X86) lite_cc_library(variable SRCS variable.cc DEPS tensor) lite_cc_library(types SRCS types.cc) -else() -lite_cc_library(variable SRCS variable.cc DEPS tensor) -lite_cc_library(types SRCS types.cc) -endif() lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) -lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ @@ -67,6 +64,13 @@ message(STATUS "commit: ${PADDLE_LITE_COMMIT}") configure_file(version.h.in version.h) 
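// ---------------------------------------------------------------------------
// Note (not part of this patch): a hypothetical caller of the int16
// quantization helpers introduced in lite/backends/xpu/math.h above. The
// weights and the QuantizeWeights wrapper are made up; FindMaxAbs and
// ConvertFP32ToInt16 are the helpers shown in the new header, which scale by
// the largest absolute value so the int16 range is fully used.
// #include <cstdint>
// #include <vector>
// #include "lite/backends/xpu/math.h"
//
// void QuantizeWeights(const std::vector<float>& weights,
//                      std::vector<int16_t>* quantized,
//                      float* max_abs) {
//   const int len = static_cast<int>(weights.size());
//   quantized->resize(weights.size());
//   *max_abs = paddle::lite::xpu::math::FindMaxAbs(weights.data(), len);
//   paddle::lite::xpu::math::ConvertFP32ToInt16(
//       weights.data(), quantized->data(), *max_abs, len);
// }
// ---------------------------------------------------------------------------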
#----------------------------------------------- NOT CHANGE ----------------------------------------------- +# A trick to generate the opencl_kernels_source.cc +#add_custom_command( +# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/gen_opencl_code.py +# ${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel +# ${CMAKE_BINARY_DIR}/lite/backends/opencl/opencl_kernels_source.cc +# OUTPUT opencl_kernels_source.cc # not a real path to the output to force it execute every time. +# ) # A trick to generate the paddle_use_kernels.h add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py @@ -86,9 +90,13 @@ add_custom_command( OUTPUT ops.h # not a real path to the output to force it execute every time. ) # generate fake kernels for memory_optimize_tool + +#-------------------------------opt---------------------------------------------------------------- +# tricks to create headfiles for opt add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py ${kernels_src_list} + ${fake_kernels_src_list} ${CMAKE_BINARY_DIR}/all_kernel_faked.cc ${CMAKE_BINARY_DIR}/kernel_src_map.h OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time. @@ -96,10 +104,12 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) + # create headfile to restore ops info sorted by suppported platforms add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py ${kernels_src_list} + ${fake_kernels_src_list} ${ops_src_list} ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. 
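// ---------------------------------------------------------------------------
// Note (not part of this patch): a hypothetical host <-> device round trip
// through the TargetWrapperXPU facade added earlier in this diff. Malloc,
// Free and MemcpySync are the members declared in
// lite/backends/xpu/target_wrapper.h; the RoundTrip helper is made up.
// #include <vector>
// #include "lite/backends/xpu/target_wrapper.h"
//
// void RoundTrip(const std::vector<float>& host_in,
//                std::vector<float>* host_out) {
//   using paddle::lite::IoDirection;
//   using paddle::lite::TargetWrapperXPU;
//   const size_t bytes = host_in.size() * sizeof(float);
//   void* device_buf = TargetWrapperXPU::Malloc(bytes);
//   TargetWrapperXPU::MemcpySync(device_buf, host_in.data(), bytes,
//                                IoDirection::HtoD);
//   host_out->resize(host_in.size());
//   TargetWrapperXPU::MemcpySync(host_out->data(), device_buf, bytes,
//                                IoDirection::DtoH);
//   TargetWrapperXPU::Free(device_buf);
// }
// ---------------------------------------------------------------------------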
diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 0f3f36768bd5a079564002cbb6464d61bd5db3aa..75971570fb078ce4e39413e5b3df629fe2a7ac3e 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index fe36f1e1ba16ad85c44136b09a0d2e5d3fadf688..731215f542567ec3ff0cc87d6990624bfa6b2bc2 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -59,6 +59,8 @@ void TestCase::CreateInstruction() { CHECK(it != kernels.end()) << "failed to create the kernel in " << place_.DebugString() << " with alias: " << alias_; + // reset final place + place_ = (*it)->place(); // prepare context (*it)->SetContext(std::move(ctx_)); instruction_.reset(new Instruction(op, std::move(*it))); @@ -74,25 +76,164 @@ void TestCase::PrepareInputsForInstruction() { const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument( place_, kernel_key, arg); - const auto* inst_type = Type::GetTensorTy(TARGET(kHost)); + const Type* inst_type = nullptr; + if (param_type->type->IsTensor()) { + inst_type = Type::GetTensorTy(TARGET(kHost)); + } else if (param_type->type->IsTensorList()) { + inst_type = Type::GetTensorListTy(TARGET(kHost)); + } else { + LOG(FATAL) << "unsupported param_type"; + } + CHECK(scope_->FindVar(var)); - const auto* shared_tensor = scope_->FindTensor((var)); if (!TargetCompatibleTo(*inst_type, *param_type->type)) { - /// Create a tensor in the instruction's scope, alloc memory and then - /// copy data there. - auto* target_tensor = inst_scope_->NewTensor(var); - CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; - target_tensor->Resize(shared_tensor->dims()); - TargetCopy(param_type->type->target(), - target_tensor->mutable_data(param_type->type->target(), - shared_tensor->memory_size()), - shared_tensor->raw_data(), - shared_tensor->memory_size()); + /// Create a tensor or tensor_array in the instruction's scope, + /// alloc memory and then copy data there. 
+ if (param_type->type->IsTensor()) { + const auto* shared_tensor = scope_->FindTensor(var); + auto* target_tensor = inst_scope_->NewTensor(var); + CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; + target_tensor->Resize(shared_tensor->dims()); + TargetCopy(param_type->type->target(), + target_tensor->mutable_data(param_type->type->target(), + shared_tensor->memory_size()), + shared_tensor->raw_data(), + shared_tensor->memory_size()); + } else if (param_type->type->IsTensorList()) { + const auto* shared_tensor_array = + scope_->FindVar(var)->GetMutable>(); + auto* target_tensor_array = + inst_scope_->Var(var)->GetMutable>(); + CHECK(!shared_tensor_array->empty()) + << "shared_tensor_array is empty yet"; + target_tensor_array->resize(shared_tensor_array->size()); + for (size_t i = 0; i < shared_tensor_array->size(); i++) { + target_tensor_array->at(i).Resize( + shared_tensor_array->at(i).dims()); + TargetCopy(param_type->type->target(), + target_tensor_array->at(i).mutable_data( + param_type->type->target(), + shared_tensor_array->at(i).memory_size()), + shared_tensor_array->at(i).raw_data(), + shared_tensor_array->at(i).memory_size()); + } + } else { + LOG(FATAL) << "not support"; + } } } } } +template +bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error) { + CHECK(a_tensor); + CHECK(b_tensor); + + CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); + + CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; + + // The baseline should output in host devices. + CHECK(b_tensor->target() == TARGET(kHost) || + b_tensor->target() == TARGET(kX86) || + b_tensor->target() == TARGET(kARM)); + + const T* a_data{}; + switch (a_tensor->target()) { + case TARGET(kX86): + case TARGET(kHost): + case TARGET(kARM): + a_data = static_cast(a_tensor->raw_data()); + break; + + default: + // Before compare, need to copy data from `target` device to host. + LOG(FATAL) << "Not supported"; + } + + CHECK(a_data); + + const T* b_data = static_cast(b_tensor->raw_data()); + + bool success = true; + for (int i = 0; i < a_tensor->dims().production(); i++) { + EXPECT_NEAR(a_data[i], b_data[i], abs_error); + if (fabsf(a_data[i] - b_data[i]) > abs_error) { + success = false; + } + } + return success; +} + +bool TestCase::CheckPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error, + PrecisionType precision_type) { + PrecisionType precision_type_t = precision_type; + if (precision_type == PRECISION(kAny)) { + precision_type_t = b_tensor->precision(); + } + CHECK(precision_type_t == b_tensor->precision()) + << "arg precision type and base tensor precision type are not matched! " + "arg precision type is: " + << PrecisionToStr(precision_type) << ", base tensor precision type is: " + << PrecisionToStr(b_tensor->precision()); + CHECK(a_tensor->precision() == b_tensor->precision()) + << "real tensor precision type and base tensor precision type are not " + "matched! 
real tensor precision type is: " + << PrecisionToStr(a_tensor->precision()) + << ", base tensor precision type is: " + << PrecisionToStr(b_tensor->precision()); + switch (precision_type_t) { + case PRECISION(kFloat): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt8): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt32): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt64): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kBool): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + default: + LOG(FATAL) << "not support type: " << PrecisionToStr(precision_type); + return false; + } +} + +bool TestCase::CheckPrecision(const std::string& var_name, + float abs_error, + PrecisionType precision_type) { + bool success = true; + if (inst_scope_->FindVar(var_name)->IsType()) { + auto a_tensor = inst_scope_->FindTensor(var_name); + auto b_tensor = base_scope_->FindTensor(var_name); + success = success && + CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + } else if (inst_scope_->FindVar(var_name)->IsType>()) { + auto a_tensor_array = + inst_scope_->FindVar(var_name)->GetMutable>(); + auto b_tensor_array = + base_scope_->FindVar(var_name)->GetMutable>(); + CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); + for (size_t i = 0; i < a_tensor_array->size(); i++) { + Tensor* a_tensor = &(a_tensor_array->at(i)); + Tensor* b_tensor = &(b_tensor_array->at(i)); + if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { + continue; + } + success = success && + CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + } + } else { + LOG(FATAL) << "unsupported var type"; + } + return success; +} + TestCase::~TestCase() { if (op_desc_->Type() == "subgraph") { // Release the subblock desc of Subgraph op diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index 85edda26e6591bada967165317de00b169a2d0cd..20a0792155f0b4ea8faa7c3fc15ea5c4767352ac 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -66,11 +66,24 @@ class TestCase { /// output. virtual void RunBaseline(Scope* scope) = 0; - /// Check the precision of the output tensors. It will compare the same tensor - /// in two scopes, one of the instruction execution, and the other for the - /// baseline. + // checkout the precision of the two tensors with type T. b_tensor is baseline template - bool CheckPrecision(const std::string& var_name, float abs_error); + bool CheckTensorPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error); + + // checkout the precision of the two tensors. b_tensor is baseline + bool CheckPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error, + PrecisionType precision_type); + + /// Check the precision of the output variables. It will compare the same + /// tensor (or all tensors of the tensor_array) in two scopes, one of the + /// instruction execution, and the other for the baseline. + bool CheckPrecision(const std::string& var_name, + float abs_error, + PrecisionType precision_type); const cpp::OpDesc& op_desc() { return *op_desc_; } @@ -78,20 +91,6 @@ class TestCase { // kernel registry. void CheckKernelConsistWithDefinition() {} - // Get the real precision of the output for check precision. When the declare - // precision obtained from the kernel is any, we should set the precision of - // the output in test case. 
- bool GetPrecisonType(const std::string& var_name, - PrecisionType* precision_type) { - auto res = precision_type_map_.find(var_name); - if (res == precision_type_map_.end()) { - return false; - } else { - *precision_type = precision_type_map_.at(var_name); - return true; - } - } - Scope& scope() { return *scope_; } Scope* baseline_scope() { return base_scope_; } @@ -120,22 +119,37 @@ class TestCase { tensor->set_persistable(is_persistable); } - // Prepare for the operator. - virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0; + /// Prepare a tensor_array in host. The tensors will be created in scope_. + /// Need to specify the targets other than X86 or ARM. + template + void SetCommonTensorList(const std::string& var_name, + const std::vector& array_tensor_dims, + const std::vector>& datas, + const std::vector& lods = {}) { + CHECK_EQ(array_tensor_dims.size(), datas.size()); + if (!lods.empty()) { + CHECK_EQ(array_tensor_dims.size(), lods.size()); + } - // Set the real precision of the output for check precision. When the declare - // precision obtained from the kernel is any, we should set the precision of - // the output in test case. - void SetPrecisionType(const std::string& var_name, - const PrecisionType& precision_type) { - auto res = precision_type_map_.find(var_name); - if (res == precision_type_map_.end()) { - precision_type_map_.insert({var_name, precision_type}); - } else { - precision_type_map_.at(var_name) = precision_type; + auto* tensor_array = + scope_->Var(var_name)->GetMutable>(); + for (int i = 0; i < array_tensor_dims.size(); i++) { + Tensor tmp; + tmp.Resize(array_tensor_dims[i]); + auto* tmp_data = tmp.mutable_data(); + memcpy(tmp_data, + datas[i].data(), + array_tensor_dims[i].production() * sizeof(T)); + if (!lods.empty()) { + tmp.set_lod(lods[i]); + } + tensor_array->push_back(tmp); } } + // Prepare for the operator. + virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0; + public: const Instruction& instruction() { return *instruction_; } @@ -152,7 +166,7 @@ class TestCase { // TODO(Superjomn) Move this method to utils or DDim? 
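// ---------------------------------------------------------------------------
// Note (not part of this patch): a sketch of how the new SetCommonTensorList
// helper above can feed a tensor_array input ("X" is a made-up variable name).
// The element types of its vector parameters are elided in this diff, so
// lite::DDim and std::vector<float> are assumed here.
// #include <vector>
// #include "lite/core/tensor.h"
//
// void BuildTensorArrayInputs() {
//   std::vector<paddle::lite::DDim> dims = {paddle::lite::DDim({2, 3}),
//                                           paddle::lite::DDim({1, 3})};
//   std::vector<std::vector<float>> datas = {{0, 1, 2, 3, 4, 5},  // tensor 0
//                                            {6, 7, 8}};          // tensor 1
//   // Inside a TestCase subclass this becomes:
//   //   SetCommonTensorList<float>("X", dims, datas);
//   // which creates a std::vector<Tensor> named "X" in the test scope.
//   (void)dims;
//   (void)datas;
// }
// ---------------------------------------------------------------------------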
bool ShapeEquals(const DDim& a, const DDim& b) { if (a.size() != b.size()) return false; - for (int i = 0; i < a.size(); i++) { + for (size_t i = 0; i < a.size(); i++) { if (a[i] != b[i]) return false; } return true; @@ -179,7 +193,6 @@ class TestCase { Scope* base_scope_{}; std::unique_ptr op_desc_; std::unique_ptr instruction_; - std::unordered_map precision_type_map_; }; class Arena { @@ -236,22 +249,7 @@ class Arena { const Type* type = tester_->instruction().kernel()->GetOutputDeclType(arg_name); auto precision_type = type->precision(); - if (precision_type == PRECISION(kAny)) { - CHECK(tester_->GetPrecisonType(var_name, &precision_type)); - } - switch (precision_type) { - case PRECISION(kFloat): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kInt8): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kInt32): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kBool): - return tester_->CheckPrecision(var_name, abs_error_); - default: - LOG(FATAL) << "not support type " << PrecisionToStr(type->precision()); - return false; - } + return tester_->CheckPrecision(var_name, abs_error_, precision_type); } private: @@ -260,49 +258,6 @@ class Arena { float abs_error_; }; -template -bool TestCase::CheckPrecision(const std::string& var_name, float abs_error) { - auto a_tensor = inst_scope_->FindTensor(var_name); - auto b_tensor = base_scope_->FindTensor(var_name); - CHECK(a_tensor); - CHECK(b_tensor); - - CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); - - CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; - - // The baseline should output in host devices. - CHECK(b_tensor->target() == TARGET(kHost) || - b_tensor->target() == TARGET(kX86) || - b_tensor->target() == TARGET(kARM)); - - const T* a_data{}; - switch (a_tensor->target()) { - case TARGET(kX86): - case TARGET(kHost): - case TARGET(kARM): - a_data = static_cast(a_tensor->raw_data()); - break; - - default: - // Before compare, need to copy data from `target` device to host. 
- LOG(FATAL) << "Not supported"; - } - - CHECK(a_data); - - const T* b_data = static_cast(b_tensor->raw_data()); - - bool success = true; - for (int i = 0; i < a_tensor->dims().production(); i++) { - EXPECT_NEAR(a_data[i], b_data[i], abs_error); - if (fabsf(a_data[i] - b_data[i]) > abs_error) { - success = false; - } - } - return success; -} - } // namespace arena } // namespace lite } // namespace paddle diff --git a/lite/core/context.cc b/lite/core/context.cc index 948aac0c794969304b585520bfb7229410555578..eb8f90d7fa90d459846b24bc93b5d26cdfc3969a 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -14,10 +14,18 @@ #include "lite/core/context.h" -#ifdef LITE_WITH_OPENCL -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_NPU +std::string Context::subgraph_model_cache_dir_{""}; // NOLINT #endif -namespace paddle { -namespace lite {} // namespace lite +#ifdef LITE_WITH_XPU +std::string Context::_multi_encoder_precision; // NOLINT +thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; +int Context::_workspace_l3_size_per_thread{0}; +#endif + +} // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index 653329e4f24b1f391ea41ed39819b60c8a598a3b..f8013ac5008e2478719b3d777a36d2bfac57ec6d 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -16,15 +16,21 @@ #include "lite/utils/any.h" #ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/blas.h" -#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/context.h" #endif #ifdef LITE_WITH_OPENCL -#include #include #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" #endif +#ifdef LITE_WITH_MLU +#include +#include +#include "lite/backends/mlu/mlu_utils.h" +#endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/xpu_header_sitter.h" +#endif #include #include @@ -36,10 +42,7 @@ #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" - -#ifdef LITE_WITH_OPENCL -DECLARE_string(cl_path); -#endif +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -49,13 +52,15 @@ class Context; using HostContext = Context; using X86Context = Context; -using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; +using APUContext = Context; using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; +using MLUContext = Context; +using RKNPUContext = Context; template <> class Context { @@ -80,6 +85,31 @@ class Context { NPUContext& operator=(const NPUContext& ctx) {} std::string name() const { return "NPUContext"; } + + static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) { + subgraph_model_cache_dir_ = subgraph_model_cache_dir; + } + static std::string SubgraphModelCacheDir() { + return subgraph_model_cache_dir_; + } + + private: + static std::string subgraph_model_cache_dir_; +}; +#endif + +#ifdef LITE_WITH_APU +template <> +class Context { + public: + Context() {} + explicit Context(const APUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(APUContext* ctx) {} + + APUContext& operator=(const APUContext& ctx) {} + std::string name() const { return "APUContext"; } }; #endif @@ -90,9 +120,7 @@ class Context { Context() {} explicit Context(const BMContext& ctx); // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { Init(0); } 
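// ---------------------------------------------------------------------------
// Note (not part of this patch): the NPU context above gains a process-wide
// directory setting, presumably so compiled subgraph models can be cached and
// reloaded instead of rebuilt each run. A hypothetical direct call, assuming
// LITE_WITH_NPU is enabled; the path is made up and NPUContext is the alias
// shown earlier in context.h.
// #include <string>
// #include "lite/core/context.h"
//
// void ConfigureNpuModelCache() {
//   paddle::lite::NPUContext::SetSubgraphModelCacheDir(
//       "/data/local/tmp/npu_cache");
// }
// ---------------------------------------------------------------------------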
- - void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); } + void InitOnce() { TargetWrapperBM::SetDevice(TargetWrapperBM::GetDevice()); } void CopySharedTo(BMContext* ctx) {} void* GetHandle() { return TargetWrapperBM::GetHandle(); } @@ -100,17 +128,72 @@ class Context { }; #endif +#ifdef LITE_WITH_RKNPU +template <> +class Context { + public: + Context() {} + explicit Context(const RKNPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(RKNPUContext* ctx) {} + + RKNPUContext& operator=(const RKNPUContext& ctx) {} + std::string name() const { return "RKNPUContext"; } +}; +#endif + #ifdef LITE_WITH_XPU template <> class Context { public: Context() {} explicit Context(const XPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} + void CopySharedTo(XPUContext* ctx) {} + static xdnn::Context* GetRawContext() { + if (_tls_raw_ctx == nullptr) { + _tls_raw_ctx = xdnn::create_context(); + CHECK(_tls_raw_ctx); + int r = xdnn::set_workspace_l3_size(_tls_raw_ctx, + _workspace_l3_size_per_thread); + if (r != 0) { + LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r + << ", _workspace_l3_size_per_thread = " + << _workspace_l3_size_per_thread; + } + } + return _tls_raw_ctx; + } + + static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { + _workspace_l3_size_per_thread = l3_size; + } + + // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread + static void SetDev(int dev_no = 0) { + const char* dev_env = getenv("LITE_XPU_DEV"); + if (dev_env) { + xpu_set_device(atoi(dev_env)); + return; + } + + xpu_set_device(dev_no); + } + std::string name() const { return "XPUContext"; } + + public: + static std::string _multi_encoder_precision; // NOLINT + + private: + static thread_local xdnn::Context* _tls_raw_ctx; + static int _workspace_l3_size_per_thread; }; #endif @@ -175,18 +258,20 @@ class Context { }; #endif -#ifdef LITE_WITH_CUDA -// Only works with CUDA kernels. 
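// ---------------------------------------------------------------------------
// Note (not part of this patch): a hypothetical per-worker-thread setup for
// the XPU context additions above (assumes LITE_WITH_XPU). GetRawContext()
// creates the thread-local xdnn context lazily, so SetWorkspaceL3Size() only
// takes effect if it runs before the first GetRawContext() call on that
// thread; 0xfffc00 is the default shown in the diff.
// #include "lite/core/context.h"
//
// void InitXpuWorkerThread() {
//   using paddle::lite::XPUContext;
//   XPUContext::SetWorkspaceL3Size(0xfffc00);
//   auto* raw_ctx = XPUContext::GetRawContext();  // creates the thread-local ctx
//   (void)raw_ctx;
// }
// ---------------------------------------------------------------------------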
+#ifdef LITE_WITH_MLU template <> -class Context { +class Context { public: - typename Env::Devs& devs = - Env::Global(); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { - cublas_fp32_ = std::make_shared>(); + typename Env::Devs& devs = Env::Global(); + + void InitOnce() {} + + MLUContext& operator=(const MLUContext& ctx) { + this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + return *this; } - void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { + + void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { CHECK_GT(devs.size(), 0UL) << "Env is not initialized or current target is not exit!"; if (dev_id >= static_cast(devs.size())) { @@ -196,77 +281,61 @@ class Context { } else { device_id_ = dev_id; } - if (io_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "data stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - io_stream_id = 0; + SetMluDevice(device_id_); + if (io_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "data queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + io_queue_id = 0; } - if (exec_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "exec stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - exec_stream_id = 0; + if (exec_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "exec queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + exec_queue_id = 0; } + io_queue_ = devs[dev_id].io_queues()[io_queue_id]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id]; - exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; - io_stream_ = devs[dev_id].io_streams()[io_stream_id]; - - exec_stream_id_ = exec_stream_id; - io_stream_id_ = io_stream_id; - } - void CopySharedTo(CUDAContext* ctx) { - CHECK(ctx); - CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; - ctx->cublas_fp32_ = cublas_fp32_; + exec_queue_id_ = exec_queue_id; + io_queue_id_ = io_queue_id; } - const cudaStream_t& exec_stream() const { return exec_stream_; } - void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } + void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } - const cudaStream_t& io_stream() const { return io_stream_; } - void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } + const cnrtQueue_t& exec_queue() const { return exec_queue_; } + void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; } - std::shared_ptr> cublas_fp32() { return cublas_fp32_; } - void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { - cublas_fp32_ = cublas_fp32; - } + const cnrtQueue_t& io_queue() const { return io_queue_; } + void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } - const std::vector& input_events() { return input_events_; } - void SetInputEvents(const std::vector& input_events) { - input_events_.clear(); - input_events_.assign(input_events.begin(), input_events.end()); + cnmlCoreVersion_t MLUCoreVersion() { + return DeviceInfo::Global().MLUCoreVersion(); } - const std::vector& output_events() { return output_events_; } - void SetOutputEvents(const std::vector& output_events) { - output_events_.clear(); - output_events_.assign(output_events.begin(), output_events.end()); - } + int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } - std::string name() const { return "CUDAContext"; } + u32_t affinity() { return affinity_; } - CUDAContext& operator=(const CUDAContext& context) { - this->Init( - context.device_id_, 
context.exec_stream_id_, context.io_stream_id_); - cublas_fp32_ = const_cast(context).cublas_fp32(); - return *this; - } + cnrtInvokeFuncParam_t forward_param() { return forward_param_; } + + int device_id() { return device_id_; } + + std::string name() const { return "MLUContext"; } private: int device_id_; // overall information - int exec_stream_id_; - int io_stream_id_; - cudaStream_t exec_stream_; - cudaStream_t io_stream_; + int exec_queue_id_; + int io_queue_id_; + cnrtQueue_t io_queue_; + cnrtQueue_t exec_queue_; - // not thread-safe, should allocate for each thread. - std::shared_ptr> cublas_fp32_; + std::vector input_notifiers_; + std::vector output_notifiers_; - // kernel information - std::vector input_events_; - std::vector output_events_; + cnrtInvokeFuncParam_t forward_param_; + u32_t affinity_ = 0x01; }; -#endif +#endif // LITE_WITH_MLU #ifdef LITE_WITH_X86 template <> @@ -292,28 +361,17 @@ class Context { template <> class Context { std::shared_ptr cl_context_; - using WaitListType = - std::unordered_map(nullptr)), - std::shared_ptr>; - std::shared_ptr cl_wait_list_; public: CLContext* cl_context() { return cl_context_.get(); } - WaitListType* cl_wait_list() { return cl_wait_list_.get(); } void InitOnce() { // Init cl runtime. CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; - CLRuntime::Global()->set_cl_path(FLAGS_cl_path); - cl_context_ = std::make_shared(); - cl_wait_list_ = std::make_shared(); } - void CopySharedTo(OpenCLContext* ctx) { - ctx->cl_context_ = cl_context_; - ctx->cl_wait_list_ = cl_wait_list_; - } + void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; } }; #endif @@ -341,7 +399,9 @@ class ContextScheduler { return *x; } - std::unique_ptr NewContext(TargetType target) { + std::unique_ptr NewContext( + TargetType target, + /*only used for cuda context*/ int exec_stream_id = 0) { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): @@ -358,7 +418,7 @@ class ContextScheduler { case TARGET(kCUDA): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &context); } break; @@ -375,6 +435,18 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_APU + case TARGET(kAPU): + kernel_contexts_[TargetType::kAPU].As().CopySharedTo( + &ctx->As()); + break; +#endif +#ifdef LITE_WITH_RKNPU + case TARGET(kRKNPU): + kernel_contexts_[TargetType::kRKNPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( @@ -398,9 +470,19 @@ class ContextScheduler { kernel_contexts_[TargetType::kBM].As().CopySharedTo( &ctx->As()); break; +#endif +#ifdef LITE_WITH_MLU + case TARGET(kMLU): { + int dev_id = TargetWrapper::GetCurDevice(); + auto& context = ctx->As(); + context.Init(dev_id); + kernel_contexts_[TargetType::kMLU].As().CopySharedTo( + &context); + LOG(INFO) << "New Context for MLU"; + } break; #endif default: -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL +#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) LOG(FATAL) << "unsupported target " << TargetToStr(target); #endif break; @@ -434,11 +516,20 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_APU + InitContext(); +#endif +#ifdef LITE_WITH_RKNPU + InitContext(); +#endif #ifdef LITE_WITH_XPU InitContext(); #endif #ifdef LITE_WITH_BM InitContext(); +#endif +#ifdef 
LITE_WITH_MLU + InitContext(); #endif } diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 6e0d743fb9d8d8af5e7168e292c1e85d76844383..ac79ede37406188f495690179b4a4886bc009d80 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -58,7 +58,7 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) thread_local lite_api::PowerMode DeviceInfo::mode_; thread_local ARMArch DeviceInfo::arch_; thread_local int DeviceInfo::mem_size_; @@ -66,6 +66,15 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; +#ifdef LITE_WITH_MLU +thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; +thread_local int DeviceInfo::mlu_core_number_{1}; +thread_local bool DeviceInfo::use_first_conv_{false}; +thread_local std::vector DeviceInfo::mean_vec_; +thread_local std::vector DeviceInfo::std_vec_; +thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; +#endif + #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -938,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) { active_ids_ = core_ids_; } else { active_ids_.resize(thread_num); - for (int i = 0; i < thread_num; ++i) { + for (uint32_t i = 0; i < thread_num; ++i) { if (i < big_core_ids_.size()) { active_ids_[i] = big_core_ids_[i]; } else { @@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { return 0; } +#ifdef LITE_WITH_MLU +void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + use_first_conv_ = use_first_conv; + mean_vec_ = mean_vec; + std_vec_ = std_vec; + input_layout_ = input_layout; +} + +cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } + +int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } + +bool DeviceInfo::UseFirstConv() { return use_first_conv_; } + +const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } + +const std::vector& DeviceInfo::StdVec() const { return std_vec_; } + +DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } + +#endif // LITE_WITH_MLU + void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); @@ -1159,6 +1207,52 @@ bool DeviceInfo::ExtendWorkspace(size_t size) { #endif // LITE_WITH_ARM +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id) { + LOG(INFO) << "Set mlu device " << device_id; + cnrtDev_t dev_handle; + CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id)); + CNRT_CALL(cnrtSetCurrentDevice(dev_handle)); +} + +void Device::Init() { + SetMluDevice(idx_); + GetInfo(); + CreateQueue(); +} + +void Device::GetInfo() {} + +void Device::CreateQueue() { + exec_queue_.clear(); + io_queue_.clear(); + for (size_t i = 0; i < max_queue_; ++i) { + cnrtQueue_t exec_queue; + cnrtQueue_t io_queue; + cnrtCreateQueue(&exec_queue); + cnrtCreateQueue(&io_queue); + exec_queue_.push_back(exec_queue); + io_queue_.push_back(io_queue); + + 
cnrtCreateQueue(&exec_queue); + exec_queue_.push_back(exec_queue); + } +} +#endif // LITE_WITH_MLU + +#ifdef LITE_WITH_BM +void Device::SetId(int device_id) { + LOG(INFO) << "Set bm device " << device_id; + TargetWrapper::SetDevice(device_id); + idx_ = device_id; +} + +void Device::Init() { SetId(idx_); } +int Device::core_num() { + return TargetWrapper::num_devices(); +} +#endif // LITE_WITH_BM + #ifdef LITE_WITH_CUDA void Device::Init() { diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 1ff8b896a70dc538d2486a24db2625c7b62c600a..f5b75039ea14f67cee9d009261b2dd1fc6b46825 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -19,11 +19,14 @@ #include #include "lite/core/tensor.h" #include "lite/utils/cp_logging.h" +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/mlu_utils.h" +#endif namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) typedef enum { kAPPLE = 0, @@ -52,6 +55,20 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); +#ifdef LITE_WITH_MLU + void SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout); + cnmlCoreVersion_t MLUCoreVersion(); + int MLUCoreNumber(); + bool UseFirstConv(); + const std::vector& MeanVec() const; + const std::vector& StdVec() const; + DataLayoutType InputLayout() const; +#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -103,6 +120,15 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; +#ifdef LITE_WITH_MLU + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; +#endif + void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); @@ -133,7 +159,10 @@ class Env { static Devs* devs = new Devs(); return *devs; } - static void Init(int max_stream = 4) { + static void Init(int max_stream = 6) { +#ifdef LITE_WITH_MLU + CNRT_CALL(cnrtInit(0)); +#endif Devs& devs = Global(); if (devs.size() > 0) { return; @@ -142,10 +171,11 @@ class Env { // Get device count count = API::num_devices(); if (count == 0) { - CHECK(false) << "No device found!"; + LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!"; } else { LOG(INFO) << "Found " << count << " device(s)"; } + CHECK_GT(max_stream, 0) << "max_stream must be greater than 0."; // create all device for (int i = 0; i < count; i++) { auto dev = Device(i, max_stream); @@ -156,6 +186,84 @@ class Env { } }; +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id); + +template <> +class Device { + public: + Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {} + void Init(); + + int id() { return idx_; } + int max_queue() { return max_queue_; } + void SetId(int idx) { idx_ = idx; } + std::string name() { return "MLU"; } + int core_num() { return 16; } + float max_memory() { return 16 * 1024; } + std::vector io_queues() { return io_queue_; } + std::vector exec_queues() { return exec_queue_; } + + private: + void CreateQueue(); + void GetInfo(); + + private: + int idx_{0}; + int max_queue_; + std::string device_name_; + float max_memory_; + + std::vector 
io_queue_; + std::vector exec_queue_; +}; + +template class Env; +#endif // LITE_WITH_MLU + +#ifdef LITE_WITH_BM +template <> +class Device { + public: + Device(int dev_id, int max_stream = 1) + : idx_(dev_id), max_stream_(max_stream) {} + void Init(); + + int id() { return idx_; } + int max_stream() { return 1; } + std::string name() { return "BM"; } + float max_memory() { return 16; } + int core_num(); + void SetId(int idx); + + int sm_version() { return 0; } + bool has_fp16() { return false; } + bool has_int8() { return false; } + bool has_hmma() { return false; } + bool has_imma() { return false; } + int runtime_version() { return 0; } + + private: + void CreateQueue() {} + void GetInfo() {} + + private: + int idx_{0}; + int max_stream_{1}; + std::string device_name_; + float max_memory_; + + int sm_version_; + bool has_fp16_; + bool has_int8_; + bool has_hmma_; + bool has_imma_; + int runtime_version_; +}; + +template class Env; +#endif + #ifdef LITE_WITH_CUDA template <> class Device { @@ -170,8 +278,8 @@ class Device { std::string name() { return device_prop_.name; } int core_num() { return device_prop_.multiProcessorCount; } float max_memory() { return device_prop_.totalGlobalMem / 1048576.; } - std::vector exec_streams() { return exec_stream_; } - std::vector io_streams() { return io_stream_; } + const std::vector& exec_streams() { return exec_stream_; } + const std::vector& io_streams() { return io_stream_; } int sm_version() { return sm_version_; } bool has_fp16() { return has_fp16_; } diff --git a/lite/core/exported_symbols.lds b/lite/core/exported_symbols.lds new file mode 100644 index 0000000000000000000000000000000000000000..f5e53027bdcfb3db1f1f452c150758894847cd00 --- /dev/null +++ b/lite/core/exported_symbols.lds @@ -0,0 +1,4 @@ +*paddle*lite* +*touch_* +*mir_pass_* +*PyInit_lite* diff --git a/lite/core/kernel.cc b/lite/core/kernel.cc index 7ec718cb3881c10dec08376419b419777c71bba6..194d736a4c0cf6fa18eae119589c5fa1fd08bca0 100644 --- a/lite/core/kernel.cc +++ b/lite/core/kernel.cc @@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type, std::string *alias, Place *place) { auto parts = Split(kernel_type, "/"); - CHECK_EQ(parts.size(), 5); + CHECK_EQ(parts.size(), 5u); *op_type = parts[0]; *alias = parts[1]; diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 18a1243c11652afc181f13f0f5a497858a30885f..cbd9e8affffcac159a8cf15136e57b4936d3ba41 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -62,6 +62,14 @@ class KernelBase { profiler_ = profiler; profile_id_ = id; } + + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = std::string("NotImpl"); +#ifdef LITE_WITH_ARM + ch->cl_event = event_; +#endif + } #endif void Launch() { @@ -83,11 +91,20 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif +#if defined(LITE_WITH_MLU) + WorkSpace::Global_MLU().AllocReset(); +#endif #ifdef LITE_WITH_PROFILE profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); Run(); + + if (is_first_epoch_for_profiler_) { + SetProfileRuntimeKernelInfo(profiler_->GetOpCharacter(profile_id_)); + is_first_epoch_for_profiler_ = false; + } profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); + #else Run(); #endif @@ -179,6 +196,11 @@ class KernelBase { #ifdef LITE_WITH_PROFILE profile::Profiler* profiler_{nullptr}; int profile_id_{-1}; + bool 
is_first_epoch_for_profiler_{true}; +#endif + +#ifdef LITE_WITH_OPENCL + cl::Event event_; #endif }; diff --git a/lite/core/lite.map b/lite/core/lite.map index 9cfd272eb6d3017a75b40481d25527d7c14478bf..bc76ef04e9d0eb58b2e702207b526f3a2911e8c5 100644 --- a/lite/core/lite.map +++ b/lite/core/lite.map @@ -1,8 +1,9 @@ { global: - *paddle*; + *paddle*lite*; *touch_*; *mir_pass_*; + *PyInit_lite*; local: *; }; diff --git a/lite/core/lite_tensor_test.cc b/lite/core/lite_tensor_test.cc index d667a9f8852d49bd850274bbb3c895e14d233f77..500dae3e283084ff8218fc758e1a7c5119eff16b 100644 --- a/lite/core/lite_tensor_test.cc +++ b/lite/core/lite_tensor_test.cc @@ -13,19 +13,49 @@ // limitations under the License. #include +#include #include "lite/core/tensor.h" namespace paddle { namespace lite { -TEST(tensor, test) { - TensorLite tensor; - DDimLite ddim({1, 8}); - tensor.Resize(ddim); +template +void test_shared_memory_tensor() { + const std::vector data({0, 1, 2, 3}); + const std::vector shape({2, 2}); + const size_t size = data.size() * sizeof(Dtype); + TensorLite init_tensor; + init_tensor.Assign(data.data(), + static_cast(shape)); + Dtype* init_raw_data = init_tensor.mutable_data(); - for (int i = 0; i < 8; i++) { - tensor.mutable_data()[i] = i; + TensorLite shared_tensor( + std::make_shared(Buffer(init_raw_data, Target, size))); + Buffer host_buffer; + host_buffer.ResetLazy(TargetType::kHost, size); + if (Target == TargetType::kHost) { + CopySync( + host_buffer.data(), init_raw_data, size, IoDirection::HtoH); + } else { + CopySync( + host_buffer.data(), init_raw_data, size, IoDirection::DtoH); } + EXPECT_EQ(std::memcmp(host_buffer.data(), data.data(), size), 0); + + shared_tensor.Resize({1, 5}); + ASSERT_DEATH(shared_tensor.mutable_data(), ""); +} + +TEST(tensor, shared_memory) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_shared_memory_tensor(); + test_shared_memory_tensor(); + test_shared_memory_tensor(); +#ifdef LITE_WITH_CUDA + test_shared_memory_tensor(); + test_shared_memory_tensor(); + test_shared_memory_tensor(); +#endif } } // namespace lite diff --git a/lite/core/memory.cc b/lite/core/memory.cc index cfb0b3ae1765864200ecf2d70107a3aa0046899c..1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -45,13 +45,23 @@ void* TargetMalloc(TargetType target, size_t size) { data = TargetWrapper::Malloc(size); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + data = TargetWrapper::Malloc(size); + break; +#endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + data = TargetWrapperXPU::Malloc(size); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown supported target " << TargetToStr(target); } return data; } -void TargetFree(TargetType target, void* data) { +void TargetFree(TargetType target, void* data, std::string free_flag) { switch (target) { case TargetType::kHost: case TargetType::kX86: @@ -66,7 +76,11 @@ void TargetFree(TargetType target, void* data) { #endif // LITE_WITH_CUDA #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: - TargetWrapperCL::Free(data); + if (free_flag == "cl_use_image2d_") { + TargetWrapperCL::FreeImage(data); + } else { + TargetWrapperCL::Free(data); + } break; #endif // LITE_WITH_OPENCL #ifdef LITE_WITH_FPGA @@ -79,6 +93,16 @@ void TargetFree(TargetType target, void* data) { TargetWrapper::Free(data); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper::Free(data); + break; +#endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case 
TargetType::kXPU: + TargetWrapperXPU::Free(data); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown type"; } @@ -110,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper::MemcpySync(dst, src, size, IoDirection::DtoD); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper::MemcpySync( + dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index 051d47bdde102f5fe058163d0c746fe3c4acf26e..a1013910019251271ddfccfbc700297c45226fe6 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once +#include #include "lite/api/paddle_place.h" #include "lite/core/target_wrapper.h" +#include "lite/utils/logging.h" #include "lite/utils/macros.h" #ifdef LITE_WITH_OPENCL @@ -29,6 +31,14 @@ #include "lite/backends/bm/target_wrapper.h" #endif // LITE_WITH_BM +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif // LITE_WITH_MLU + +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif // LITE_WITH_XPU + namespace paddle { namespace lite { @@ -38,7 +48,9 @@ LITE_API void* TargetMalloc(TargetType target, size_t size); // Free memory for a specific Target. All the targets should be an element in // the `switch` here. -void LITE_API TargetFree(TargetType target, void* data); +void LITE_API TargetFree(TargetType target, + void* data, + std::string free_flag = ""); // Copy a buffer from host to another target. void TargetCopy(TargetType target, void* dst, const void* src, size_t size); @@ -71,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { TargetWrapperCL::MemcpySync(dst, src, size, dir); break; #endif // LITE_WITH_OPENCL +#ifdef LITE_WITH_MLU + case TARGET(kMLU): + TargetWrapperMlu::MemcpySync(dst, src, size, dir); + break; +#endif #ifdef LITE_WITH_FPGA case TARGET(kFPGA): TargetWrapper::MemcpySync(dst, src, size, dir); @@ -81,6 +98,9 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { TargetWrapper::MemcpySync(dst, src, size, dir); break; #endif + default: + LOG(FATAL) + << "The copy function of this target has not been implemented yet."; } } @@ -89,17 +109,24 @@ class Buffer { public: Buffer() = default; Buffer(TargetType target, size_t size) : space_(size), target_(target) {} + Buffer(void* data, TargetType target, size_t size) + : space_(size), data_(data), own_data_(false), target_(target) {} void* data() const { return data_; } TargetType target() const { return target_; } size_t space() const { return space_; } + bool own_data() const { return own_data_; } void ResetLazy(TargetType target, size_t size) { if (target != target_ || space_ < size) { + CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetMalloc(target, size); target_ = target; space_ = size; +#ifdef LITE_WITH_OPENCL + cl_use_image2d_ = false; +#endif } } @@ -111,14 +138,15 @@ class Buffer { const size_t img_w, const size_t img_h, void* host_ptr = nullptr) { - size_t size = sizeof(T) * img_w * img_h * - 4; // 4 for RGBA, un-used for opencl Image2D if (target != target_ || cl_image2d_width_ < img_w || - cl_image2d_height_ < img_h) { + cl_image2d_height_ < img_h || host_ptr != nullptr) { + CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = 
TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); target_ = target; - space_ = size; // un-used for opencl Image2D + space_ = sizeof(T) * img_w * img_h * + 4; // un-used for opencl Image2D, 4 for RGBA, + cl_use_image2d_ = true; cl_image2d_width_ = img_w; cl_image2d_height_ = img_h; } @@ -126,8 +154,12 @@ class Buffer { #endif void Free() { - if (space_ > 0) { - TargetFree(target_, data_); + if (space_ > 0 && own_data_) { + if (!cl_use_image2d_) { + TargetFree(target_, data_); + } else { + TargetFree(target_, data_, "cl_use_image2d_"); + } } data_ = nullptr; target_ = TargetType::kHost; @@ -146,9 +178,11 @@ class Buffer { private: // memory it actually malloced. size_t space_{0}; + bool cl_use_image2d_{false}; // only used for OpenCL Image2D size_t cl_image2d_width_{0}; // only used for OpenCL Image2D size_t cl_image2d_height_{0}; // only used for OpenCL Image2D void* data_{nullptr}; + bool own_data_{true}; TargetType target_{TargetType::kHost}; }; diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 3f9fb97ee756eeac870fe5090de182d8c03d170b..8a47e0add7dac6f28b103aef2c1b9bfdd8665029 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -21,7 +21,13 @@ lite_cc_library(mir_passes fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc + fusion/scale_activation_fuse_pass.cc + fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__multi_encoder_fuse_pass.cc + fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc + fusion/__xpu__fc_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc + elimination/identity_dropout_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc static_kernel_pick_pass.cc variable_place_inference_pass.cc @@ -36,7 +42,10 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc + multi_stream_analysis_pass.cc + mlu_postprocess_pass.cc weight_quantization_preprocess_pass.cc + quantized_op_attributes_inference_pass.cc DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) # lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS @@ -69,10 +78,10 @@ set(pattern_deps mir_node mir_ssa_graph op) if (WITH_TESTING) list(APPEND pattern_deps gtest) endif() -lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) +lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) -lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) +lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher) # for mobile, unnecessary to compile the following testings. 
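The memory.h hunk above introduces a Buffer constructor that wraps caller-owned memory (recording own_data_ = false) plus guards so such a buffer is never freed or silently re-allocated. A minimal usage sketch, assuming only the declarations visible in that hunk (the three-argument constructor, own_data(), and the ResetLazy()/Free() ownership checks); the function and variable names here are illustrative, not part of the patch:

#include <vector>

#include "lite/core/memory.h"  // paddle::lite::Buffer, TargetType

void shared_buffer_sketch() {
  std::vector<float> host(4, 1.0f);
  const size_t bytes = host.size() * sizeof(float);

  // Wrap caller-owned host memory; the Buffer records own_data_ = false,
  // so it will neither free nor re-allocate this pointer.
  paddle::lite::Buffer shared(
      host.data(), paddle::lite::TargetType::kHost, bytes);

  // shared.own_data() == false, shared.space() == bytes.

  // Growing the buffer would require Free() + TargetMalloc(), which the new
  // CHECK_EQ(own_data_, true) in ResetLazy() rejects for unowned memory:
  // shared.ResetLazy(paddle::lite::TargetType::kHost, 2 * bytes);  // aborts

  // Free() only releases memory when own_data_ is true, so destroying
  // `shared` leaves `host` untouched.
}

The new lite_tensor_test.cc in this patch exercises the same behavior through TensorLite: a tensor built on a shared Buffer can be read back, but resizing it and calling mutable_data() is expected to abort (ASSERT_DEATH).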
diff --git a/lite/core/mir/dot.h b/lite/core/mir/dot.h index df70565c0775acdb61cb540598f15b7f84e0119c..a68890910ab33bd32c68efc6f06236db21909a05 100644 --- a/lite/core/mir/dot.h +++ b/lite/core/mir/dot.h @@ -27,8 +27,8 @@ #include "lite/utils/string.h" namespace paddle { -namespace inference { -namespace analysis { +namespace lite { +namespace mir { static size_t dot_node_counter{0}; @@ -162,6 +162,6 @@ class Dot { std::vector attrs_; }; -} // namespace analysis -} // namespace inference +} // namespace mir +} // namespace lite } // namespace paddle diff --git a/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc0cc47b76104b68f091b2413b703a19a1f198bc --- /dev/null +++ b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace { + +class Eliminator : public FuseBase { + public: + static bool DropoutIsTest(const Node* x) { + if (x && x->IsStmt()) { + auto* op_info = x->stmt()->op_info(); + if (op_info->HasAttr("is_test")) { + auto attr_type = op_info->GetAttrType("is_test"); + if (attr_type == paddle::lite::OpDescAPI::AttrType::INT && + op_info->GetAttr("is_test") == 1) { + return true; + } else if (attr_type == paddle::lite::OpDescAPI::AttrType::BOOLEAN && + op_info->GetAttr("is_test")) { + return true; + } + } + } + return false; + } + + void BuildPattern() override { + // the previous op's output need updat + auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block"); + // TODO(Superjomn) check has only one output + auto* x = VarNode("x")->assert_is_op_input("dropout", "X"); + auto* dropout_op = OpNode("dropout", "dropout") + ->assert_node_satisfied(Eliminator::DropoutIsTest) + ->assert_op_attr( + "dropout_implementation", "upscale_in_train"); + auto* out = VarNode("out")->assert_is_op_output("dropout", "Out"); + auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask"); + + *pre_op >> *x >> *dropout_op >> *out; + *dropout_op >> *mask; + + // The pre_op will be eliminated, and a new output-updated op will insert. 
+ x->AsIntermediate(); // x is pre_op's output, need to update + dropout_op->AsIntermediate(); + mask->AsIntermediate(); + } + + private: + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + auto& pre_op = matched.at("preop")->AsStmt(); + auto op_info = *pre_op.op_info(); + + op_info.UpdateAllOutputs(matched.at("x")->AsArg().name, + matched.at("out")->AsArg().name); + pre_op.ResetOp(op_info, graph->valid_places()); + + IR_NODE_LINK_TO(matched.at("preop"), matched.at("out")); + } +}; + +} // namespace + +class IdentityDropoutEliminatePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + Eliminator eliminator; + eliminator(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(identity_dropout_eliminate_pass, + paddle::lite::mir::IdentityDropoutEliminatePass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc index 345361047bbbad68cdd0b298a163214cbfe114fc..2e522214bfa301c488700dde06b98e0ad8ff3940 100644 --- a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc +++ b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc @@ -26,7 +26,9 @@ class Eliminator : public FuseBase { public: void BuildPattern() override { // the previous op's output need updat - auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block"); + auto* pre_op = OpNode("preop") + ->assert_is_not_op_type("conditional_block") + ->assert_is_not_op_type("scale"); // TODO(Superjomn) check has only one output auto* x = VarNode("x")->assert_is_op_input("scale", "X"); auto* scale_op = OpNode("scale", "scale") diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index e65e72cf7b367ee8477f3f783ae4d82372529864..a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -27,10 +27,13 @@ lite_cc_library(fuse_transpose_softmax_transpose DEPS pattern_matcher_high_api) lite_cc_library(fuse_interpolate SRCS interpolate_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) lite_cc_library(fuse_sequence_pool_concat SRCS sequence_pool_concat_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) +lite_cc_library(fuse_scale_activation + SRCS scale_activation_fuser.cc + DEPS pattern_matcher_high_api) set(mir_fusers fuse_fc @@ -44,6 +47,7 @@ set(mir_fusers fuse_transpose_softmax_transpose fuse_interpolate fuse_sequence_pool_concat + fuse_scale_activation CACHE INTERNAL "fusers") if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc b/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1272ae4c63c2521bf738ca8623fcde2d40014dea --- /dev/null +++ b/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUEmbeddingWithEltwiseAddFuser : public FuseBase { + public: + explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding) + : n_embedding_(n_embedding) {} + + void BuildPattern() override { + auto* ids0 = + VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput(); + auto* table0 = + VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput(); + auto* embedding0 = OpNode("embedding0", "lookup_table"); + auto* embedding_out0 = VarNode("embedding_out0") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* ids1 = + VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput(); + auto* table1 = + VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput(); + auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate(); + auto* embedding_out1 = VarNode("embedding_out1") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + + auto* ewadd01 = OpNode("ewadd01", "elementwise_add")->AsIntermediate(); + auto* ewadd01_out = VarNode("ewadd01_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + embedding0->LinksFrom({ids0, table0}); + embedding0->LinksTo({embedding_out0}); + embedding1->LinksFrom({ids1, table1}); + embedding1->LinksTo({embedding_out1}); + ewadd01->LinksFrom({embedding_out0, embedding_out1}); + ewadd01->LinksTo({ewadd01_out}); + + auto* last_ewadd_out = ewadd01_out; + for (int i = 2; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + auto table_name = paddle::lite::string_format("table%d", i); + auto embedding_name = paddle::lite::string_format("embedding%d", i); + auto embedding_out_name = + paddle::lite::string_format("embedding_out%d", i); + + auto* new_ids = VarNode(ids_name) + ->assert_is_op_input("lookup_table", "Ids") + ->AsInput(); + auto* new_table = VarNode(table_name) + ->assert_is_op_input("lookup_table", "W") + ->AsInput(); + auto* new_embedding = + OpNode(embedding_name, "lookup_table")->AsIntermediate(); + auto* new_embedding_out = VarNode(embedding_out_name) + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + + new_embedding->LinksFrom({new_ids, new_table}); + new_embedding->LinksTo({new_embedding_out}); + + auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i); + auto ewadd_out_name = ewadd_name + "_out"; + + auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate(); + auto* new_ewadd_out = VarNode(ewadd_out_name) + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out}); + new_ewadd->LinksTo({new_ewadd_out}); + last_ewadd_out = new_ewadd_out; + } + last_ewadd_out->AsOutput(); + } + + void 
InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__embedding_with_eltwise_add"); + std::vector ids_names; + std::vector table_names; + for (int i = 0; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + ids_names.push_back(matched.at(ids_name)->arg()->name); + auto table_name = paddle::lite::string_format("table%d", i); + table_names.push_back(matched.at(table_name)->arg()->name); + } + op_desc.SetInput("Ids", ids_names); + op_desc.SetInput("Tables", table_names); + auto output_name = paddle::lite::string_format( + "ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1); + op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name}); + op_desc.SetAttr("n_embedding", n_embedding_); + auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info(); + op_desc.SetAttr( + "padding_idx", embedding0_op_info->GetAttr("padding_idx")); + + auto* new_stmt = matched.at("embedding0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + for (int i = 0; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + auto table_name = paddle::lite::string_format("table%d", i); + DirectedLink(matched.at(ids_name), matched.at("embedding0")); + DirectedLink(matched.at(table_name), matched.at("embedding0")); + } + IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name)); + } + + private: + int n_embedding_; +}; + +} // namespace fusion + +class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + for (int n_embedding : {4, 3}) { + fusion::XPUEmbeddingWithEltwiseAddFuser fuser(n_embedding); + fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass, + paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("lookup_table"); diff --git a/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e6b28790e1c87f2e9e80acc99f3cf517621c477 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUFcFuser : public FuseBase { + public: + explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {} + + void BuildPattern() override { + // create nodes. + auto* x = VarNode("x")->assert_is_op_input("mul", "X"); + auto* W = VarNode("W")->assert_is_op_input("mul", "Y"); + auto* b = VarNode("b")->assert_is_persistable_var(); + auto* mul = OpNode("mul", "mul"); + auto* mul_out = VarNode("mul_out"); + auto* add = OpNode("add", "elementwise_add"); + auto* Out = VarNode("Out"); + + // create topology. + std::vector mul_inputs{W, x}; + std::vector add_inputs{mul_out, b}; + mul_inputs >> *mul >> *mul_out; + + // Some op specialities. + mul_out->AsIntermediate(); + mul->AsIntermediate(); + add->AsIntermediate(); + + if (with_relu_) { + auto* add_out = VarNode("add_out"); + auto* relu = OpNode("relu", "relu"); + std::vector relu_inputs{add_out}; + add_inputs >> *add >> *add_out; + relu_inputs >> *relu >> *Out; + add_out->AsIntermediate(); + relu->AsIntermediate(); + } else { + add_inputs >> *add >> *Out; + } + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + auto mul = matched.at("mul")->stmt()->op(); + auto* scope = mul->scope(); + + // convert W from float to int16, and transpose W + auto weight_name = matched.at("W")->arg()->name; + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + int weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1]); + memcpy( + weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t)); + + auto op_desc = GenOpDesc(matched, max_f, true); + auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc"); + auto& valid_places = mul->valid_places(); + fc_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); + + IR_NODE_LINK_TO(matched.at("W"), new_op_node); + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(matched.at("b"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("Out")); + } + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched, + float w_max, + bool transpose_w) { + cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + op_desc.mutable_inputs()->clear(); + op_desc.mutable_outputs()->clear(); + op_desc.SetType("__xpu__fc"); + op_desc.SetInput("Input", {matched.at("x")->arg()->name}); + op_desc.SetInput("W", {matched.at("W")->arg()->name}); + op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); + op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); + op_desc.SetAttr( + "in_num_col_dims", + matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + op_desc.SetAttr("w_max", w_max); + op_desc.SetAttr("transpose_w", transpose_w); + if (with_relu_) { + op_desc.SetAttr("activation_type", std::string{"relu"}); + } + return op_desc; + } + + bool with_relu_; +}; + +} // namespace 
fusion + +class XPUFcFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + + fusion::XPUFcFuser fuser(true /* with_relu */); + fuser(graph.get()); + + fusion::XPUFcFuser fuser2(false /* with_relu */); + fuser2(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("fc"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d653f87f7b5e4f71998ba1e73ac88398d89d328a --- /dev/null +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -0,0 +1,674 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/context.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs() +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUSingleEncoderFuser : public FuseBase { + public: + explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu") + : act_type_(act_type) {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + + auto* q_mul_y = + VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* q_mul = OpNode("q_mul", "mul"); + auto* q_mul_out = VarNode("q_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* q_add_y = VarNode("q_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate(); + auto* q_add_out = VarNode("q_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate(); + auto* q_reshape2_out = VarNode("q_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* q_reshape2_xshape = VarNode("q_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate(); + auto* q_transpose2_out = VarNode("q_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("scale", "X") + ->AsIntermediate(); + auto* q_transpose2_xshape = + VarNode("q_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* q_scale = 
OpNode("q_scale", "scale")->AsIntermediate(); + auto* q_scale_out = VarNode("q_scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + + auto* k_mul_y = + VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate(); + auto* k_mul_out = VarNode("k_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* k_add_y = VarNode("k_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate(); + auto* k_add_out = VarNode("k_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate(); + auto* k_reshape2_out = VarNode("k_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* k_reshape2_xshape = VarNode("k_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate(); + auto* k_transpose2_out = VarNode("k_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* k_transpose2_xshape = + VarNode("k_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate(); + auto* qk_matmul_out = VarNode("qk_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qk_mask = VarNode("qk_mask") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate(); + auto* qk_add_out = VarNode("qk_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("softmax", "X") + ->AsIntermediate(); + auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate(); + auto* qk_softmax_out = VarNode("qk_softmax_out") + ->assert_is_op_output("softmax", "Out") + ->AsIntermediate(); + + auto* v_mul_y = + VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate(); + auto* v_mul_out = VarNode("v_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* v_add_y = VarNode("v_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate(); + auto* v_add_out = VarNode("v_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate(); + auto* v_reshape2_out = VarNode("v_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* v_reshape2_xshape = VarNode("v_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate(); + auto* v_transpose2_out = VarNode("v_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* v_transpose2_xshape = + VarNode("v_transpose2_xshape") + 
->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate(); + auto* qkv_matmul_out = VarNode("qkv_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* qkv_transpose2 = + OpNode("qkv_transpose2", "transpose2")->AsIntermediate(); + auto* qkv_transpose2_out = VarNode("qkv_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* qkv_transpose2_xshape = + VarNode("qkv_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* qkv_reshape2 = OpNode("qkv_reshape2", "reshape2")->AsIntermediate(); + auto* qkv_reshape2_out = VarNode("qkv_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* qkv_mul_y = + VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate(); + auto* qkv_mul_out = VarNode("qkv_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_y = VarNode("qkv_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate(); + auto* qkv_add_out = VarNode("qkv_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate(); + auto* qkv_add_2_out = VarNode("qkv_add_2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate(); + auto* qkv_ln_2_out = VarNode("qkv_ln_2_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_2_var = VarNode("qkv_ln_2_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + auto* qkv_mul_3_y = + VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate(); + auto* qkv_mul_3_out = VarNode("qkv_mul_3_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_3_y = VarNode("qkv_add_3_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate(); + auto* qkv_add_3_out = VarNode("qkv_add_3_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate(); + auto* qkv_act_out = VarNode("qkv_act_out") + ->assert_is_op_output(act_type_, "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_mul_4_y = + VarNode("qkv_mul_4_y")->assert_is_op_input("mul", 
"Y")->AsInput(); + auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate(); + auto* qkv_mul_4_out = VarNode("qkv_mul_4_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_4_y = VarNode("qkv_add_4_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate(); + auto* qkv_add_4_out = VarNode("qkv_add_4_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate(); + auto* qkv_add_5_out = VarNode("qkv_add_5_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate(); + auto* qkv_ln_5_out = VarNode("qkv_ln_5_out") + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_5_var = VarNode("qkv_ln_5_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + // TODO(miaotianxiang): use LinksFrom/LinksTo() instead + *input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >> + *q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >> + *q_scale_out >> *qk_matmul; + *q_mul_y >> *q_mul; + *q_add_y >> *q_add; + *q_reshape2 >> *q_reshape2_xshape; + *q_transpose2 >> *q_transpose2_xshape; + + *input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >> + *k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul; + *k_mul_y >> *k_mul; + *k_add_y >> *k_add; + *k_reshape2 >> *k_reshape2_xshape; + *k_transpose2 >> *k_transpose2_xshape; + + *qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >> + *qk_softmax_out >> *qkv_matmul; + *qk_mask >> *qk_add; + + *input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >> + *v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul; + *v_mul_y >> *v_mul; + *v_add_y >> *v_add; + *v_reshape2 >> *v_reshape2_xshape; + *v_transpose2 >> *v_transpose2_xshape; + + *qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >> + *qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >> + *qkv_add >> *qkv_add_out >> *qkv_add_2; + *qkv_transpose2 >> *qkv_transpose2_xshape; + *qkv_reshape2 >> *qkv_reshape2_xshape; + *qkv_mul_y >> *qkv_mul; + *qkv_add_y >> *qkv_add; + + *input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out; + *qkv_ln_2_scale >> *qkv_ln_2; + *qkv_ln_2_bias >> *qkv_ln_2; + *qkv_ln_2 >> *qkv_ln_2_mean; + *qkv_ln_2 >> *qkv_ln_2_var; + + *qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >> + *qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >> + *qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_add_5; + *qkv_mul_3_y >> *qkv_mul_3; + *qkv_add_3_y >> *qkv_add_3; + *qkv_mul_4_y >> *qkv_mul_4; + *qkv_add_4_y >> *qkv_add_4; + + *qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out >> *qkv_ln_5 >> *qkv_ln_5_out; + *qkv_ln_5_scale >> *qkv_ln_5; + *qkv_ln_5_bias >> *qkv_ln_5; + *qkv_ln_5 >> *qkv_ln_5_mean; + *qkv_ln_5 >> *qkv_ln_5_var; + } + + void InsertNewNode(SSAGraph* graph, const 
key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("single_encoder"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name}); + op_desc.SetInput("FCWeight", + { + matched.at("q_mul_y")->arg()->name, + matched.at("k_mul_y")->arg()->name, + matched.at("v_mul_y")->arg()->name, + matched.at("qkv_mul_y")->arg()->name, + matched.at("qkv_mul_3_y")->arg()->name, + matched.at("qkv_mul_4_y")->arg()->name, + }); + op_desc.SetInput("FCBias", + { + matched.at("q_add_y")->arg()->name, + matched.at("k_add_y")->arg()->name, + matched.at("v_add_y")->arg()->name, + matched.at("qkv_add_y")->arg()->name, + matched.at("qkv_add_3_y")->arg()->name, + matched.at("qkv_add_4_y")->arg()->name, + }); + op_desc.SetInput("LNScale", + { + matched.at("qkv_ln_2_scale")->arg()->name, + matched.at("qkv_ln_5_scale")->arg()->name, + }); + op_desc.SetInput("LNBias", + { + matched.at("qkv_ln_2_bias")->arg()->name, + matched.at("qkv_ln_5_bias")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + // extra traits to distill + auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info(); + auto reshape_dim = reshape_op_info->GetAttr>("shape"); + op_desc.SetAttr("head_num", reshape_dim[2]); + op_desc.SetAttr("size_per_head", reshape_dim[3]); + op_desc.SetAttr("act_type", act_type_); + + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + auto* single_encoder_stmt = matched.at("q_mul")->stmt(); + fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); + single_encoder_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "qk_mask", + "k_mul_y", + "v_mul_y", + "qkv_mul_y", + "qkv_mul_3_y", + "qkv_mul_4_y", + "q_add_y", + "k_add_y", + "v_add_y", + "qkv_add_y", + "qkv_add_3_y", + "qkv_add_4_y", + "qkv_ln_2_scale", + "qkv_ln_2_bias", + "qkv_ln_5_scale", + "qkv_ln_5_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul")); + } + IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out")); + } + + private: + std::string act_type_; +}; + +class XPUMultiEncoderFuser { + public: + explicit XPUMultiEncoderFuser(const std::set& fc_int31_ids) + : fc_int31_ids_(fc_int31_ids) {} + + bool IsDirectPredecessorOf(Node* op1, Node* op2) { + for (auto* out : op1->outlinks) { + for (auto* in : op2->inlinks) { + if (out == in) return true; + } + } + return false; + } + + void operator()(SSAGraph* graph) { + std::vector all_encoders; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "single_encoder") { + all_encoders.push_back(node); + } + } + VLOG(3) << "Found " << all_encoders.size() << " single_encoder"; + if (all_encoders.size() == 0) { + return; + } + + // TODO(miaotianxiang): more verification + for (size_t i = 0; i < all_encoders.size() - 1; ++i) { + CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1])); + } + std::string mask_name; + for (auto* encoder : all_encoders) { + auto* op_info = encoder->stmt()->op_info(); + if (mask_name.empty()) { + 
mask_name = op_info->Input("Mask").front(); + } else { + // CHECK(mask_name == op_info->Input("Mask").front()); + } + } + + std::unordered_set to_remove; + Node* first_encoder = all_encoders[0]; + std::string in_name, out_name; + std::vector arg_names{ + "FCWeight", "FCBias", "LNScale", "LNBias"}; + std::unordered_map> arg_map; + for (size_t i = 0; i < all_encoders.size(); ++i) { + Node* cur_encoder = all_encoders[i]; + auto* op_info = cur_encoder->stmt()->op_info(); + for (auto arg_name : arg_names) { + auto real_names = op_info->Input(arg_name); + for (auto name : real_names) { + auto* arg_node = graph->RetrieveArgument(name); + DirectedLink(arg_node, first_encoder); + arg_map[arg_name].push_back(name); + } + } + + auto* cur_out = + graph->RetrieveArgument(op_info->Output("Outputs").front()); + if (i == 0) { + // first encoder + to_remove.insert(cur_out); + in_name = op_info->Input("Inputs").front(); + mask_name = op_info->Input("Mask").front(); + } else if (i == all_encoders.size() - 1) { + // last encoder + to_remove.insert(cur_encoder); + DirectedLink(first_encoder, cur_out); + out_name = op_info->Output("Outputs").front(); + } else { + to_remove.insert(cur_encoder); + to_remove.insert(cur_out); + } + } + GraphSafeRemoveNodes(graph, to_remove); + + auto* multi_encoder_stmt = first_encoder->stmt(); + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__multi_encoder"); + op_desc.SetInput("Input", {in_name}); + for (auto kv : arg_map) { + op_desc.SetInput(kv.first, kv.second); + } + op_desc.SetInput("Mask", {mask_name}); + op_desc.SetOutput("Output", {out_name}); + op_desc.SetAttr("xpu", 1); + auto* first_encoder_op_info = multi_encoder_stmt->op_info(); + op_desc.SetAttr("head_num", + first_encoder_op_info->GetAttr("head_num")); + op_desc.SetAttr("size_per_head", + first_encoder_op_info->GetAttr("size_per_head")); + op_desc.SetAttr("n_layers", all_encoders.size()); + op_desc.SetAttr( + "act_type", first_encoder_op_info->GetAttr("act_type")); + op_desc.SetAttr("precision", + (fc_int31_ids_.empty() ? 
"int16" : "int31")); + + auto* scope = multi_encoder_stmt->op()->scope(); + std::vector fc_weight_max(arg_map["FCWeight"].size()); + auto& fc_weight_names = arg_map["FCWeight"]; + for (size_t i = 0; i < fc_weight_names.size(); ++i) { + auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]); + auto weight_dims = weight_t->dims(); + int weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + // i ranges from 0 to 6*encoder_num, so we need to do i%6 to get relative + // position in the encoder + if (fc_int31_ids_.find(i % 6) != fc_int31_ids_.end()) { + // FCs in encoder use int31 + VLOG(3) << "Use FC-int31 in FC-" << i << ", " << i / 6 << "-" << i % 6; + std::unique_ptr weight_trans_fp32(new float[weight_len]); + paddle::lite::xpu::math::Transpose(weight_on_host, + weight_trans_fp32.get(), + weight_dims[0], + weight_dims[1]); + + memcpy(weight_on_host, + weight_trans_fp32.get(), + weight_len * sizeof(float)); + } else { + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + } + fc_weight_max[i] = max_f; + } + + std::string max_name = "encoder_max"; + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, first_encoder); + auto* max_filter_tensor = scope->NewTensor(max_name); + max_filter_tensor->Resize({static_cast(fc_weight_max.size())}); + memcpy(max_filter_tensor->mutable_data(), + &fc_weight_max[0], + sizeof(float) * fc_weight_max.size()); + op_desc.SetInput("FCWeightMax", {max_name}); + + auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type()); + multi_encoder_op->Attach(op_desc, scope); + multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places()); + auto kernels = + multi_encoder_op->CreateKernels(multi_encoder_op->valid_places()); + multi_encoder_stmt->SetOp(multi_encoder_op); + multi_encoder_stmt->SetKernels(std::move(kernels)); + + // remove dangling/useless cast + Node* stack = nullptr; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "stack") { + stack = node; + } + } + if (stack) { + std::unordered_set to_remove2; + Node* stack_out = stack->outlinks.front(); + // avoid modification while traversing + auto stack_out_outlinks = stack_out->outlinks; + for (Node* cast : stack_out_outlinks) { + if (cast->stmt()->op_info()->Type() != "cast") { + continue; + } + + Node* cast_out = cast->outlinks.front(); + if (cast_out->outlinks.size() == 0) { + // dangling cast + to_remove2.insert(cast); + to_remove2.insert(cast_out); + VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]"; + } else if (cast_out->outlinks.size() == 1) { + // useless cast + to_remove2.insert(cast); + to_remove2.insert(cast_out); + VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]"; + + auto* multi_encoder = cast_out->outlinks.front(); + DirectedLink(stack_out, multi_encoder); + 
UpdateInputs(multi_encoder->stmt()->op().get(), + cast_out->arg()->name, + stack_out->arg()->name); + auto update_op_info = *multi_encoder->stmt()->op_info(); + multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places()); + } + } + GraphSafeRemoveNodes(graph, to_remove2); + } + } + + private: + std::set fc_int31_ids_; +}; + +} // namespace fusion + +class XPUMultiEncoderFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + // TODO(miaotianxiang): backup graph, recover from failed match + std::vector act_types{"gelu", "relu"}; + + std::set fc_int31_ids; +#ifdef LITE_WITH_XPU + // TODO(miaotianxiang): core/mir/*_pass.cc are compiled anyway and need to + // access Context::_multi_encoder_precision, but this static member + // variable in class specialization defined in lite/core/context.cc + // is only compiled iff LITE_WITH_XPU==ON. To suppress linkage error, we use + // #ifdef here. Any better idea? + if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || + lite::Context::_multi_encoder_precision == "int31") { + fc_int31_ids = {0, 1, 2, 3, 4, 5}; + VLOG(3) << "Use int31 in XPUMultiEncoderOp, " + << "lite::Context<>::_multi_encoder_precision=" + << lite::Context::_multi_encoder_precision; + } else { + VLOG(3) << "Use int16 in XPUMultiEncoderOp, " + << "lite::Context<>::_multi_encoder_precision=" + << lite::Context::_multi_encoder_precision; + } +#endif + + for (auto& act_type : act_types) { + fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type); + single_encoder_fuser(graph.get()); + fusion::XPUMultiEncoderFuser multi_encoder_fuser(fc_int31_ids); + multi_encoder_fuser(graph.get()); + } + } +}; +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass, + paddle::lite::mir::XPUMultiEncoderFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("matmul"); diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..de2210a76ea0647cb02131a088ceb754afd0ef9c --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc @@ -0,0 +1,951 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
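+// This pass fuses a ResNet50 backbone in three stages: XPUResNetBlock0Fuser folds the
+// bottleneck block that uses a conv2d + batch_norm projection shortcut into a temporary
+// "resnet_block0" op, XPUResNetBlock1Fuser does the same for the identity-shortcut block
+// ("resnet_block1"), and XPUResNet50Fuser then collapses the whole chain into a single
+// __xpu__resnet50 op with batch_norm folded into the filters/bias and int16 weights.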
+ +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetBlock0Fuser : public FuseBase { + public: + XPUResNetBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + 
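+ // batch_norm's MeanOut/VarianceOut/SavedMean/SavedVariance outputs are not needed by the
+ // fused op; they are matched only so they can be marked AsIntermediate() and removed
+ // together with the rest of the pattern.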
auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* 
right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + *left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + matched.at("left_bn3_scale")->arg()->name, + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep 
these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "left_conv2_weight", + "left_conv3_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetBlock1Fuser : public FuseBase { + public: + XPUResNetBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add", "X") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + 
auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 
>> *right_conv3_out >> *right_bn3 >> + *right_bn3_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; + *right_bn2 >> *right_bn2_saved_var; + + *right_conv3_weight >> *right_conv3; + *right_bn3_scale >> *right_bn3; + *right_bn3_bias >> *right_bn3; + *right_bn3_mean >> *right_bn3; + *right_bn3_var >> *right_bn3; + *right_bn3 >> *right_bn3_mean_out; + *right_bn3 >> *right_bn3_var_out; + *right_bn3 >> *right_bn3_saved_mean; + *right_bn3 >> *right_bn3_saved_var; + + *input >> *add; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block1"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("right_conv1_weight")->arg()->name, + matched.at("right_conv2_weight")->arg()->name, + matched.at("right_conv3_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("right_bn1_scale")->arg()->name, + matched.at("right_bn2_scale")->arg()->name, + matched.at("right_bn3_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("right_bn1_bias")->arg()->name, + matched.at("right_bn2_bias")->arg()->name, + matched.at("right_bn3_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("right_bn1_mean")->arg()->name, + matched.at("right_bn2_mean")->arg()->name, + matched.at("right_bn3_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("right_bn1_variance")->arg()->name, + matched.at("right_bn2_variance")->arg()->name, + matched.at("right_bn3_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block1_stmt = matched.at("right_conv1")->stmt(); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
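+ // the raw BlockDesc and the generic "subgraph" op exist only to satisfy
+ // SubgraphOp::AttachImpl(); the resulting "resnet_block1" placeholder node is consumed
+ // (and removed) later by XPUResNet50Fuser.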
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNet50Fuser : public xpu::XPUFuseBase { + public: + XPUResNet50Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", 
"resnet_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate(); + auto* bottom_pool_out = VarNode("bottom_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> 
*resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet50"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + "resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + op_desc.SetInput("Filter", filter_name); + op_desc.SetInput("Bias", bias_name); + op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name}); + op_desc.SetAttr("xpu", 1); + + auto* resnet50_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet50_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = scope->FindMutableTensor(mean_name[i]); + auto* var_t = scope->FindMutableTensor(var_name[i]); + + int mean_len = mean_t->numel(); + int filter_len = filter_t->numel(); + int filter_stride = filter_len / mean_len; + + float* filter_on_host = 
filter_t->mutable_data(); + float* scale_on_host = scale_t->mutable_data(); + float* bias_on_host = bias_t->mutable_data(); + float* mean_on_host = mean_t->mutable_data(); + float* var_on_host = var_t->mutable_data(); + + // Perform preprocess + for (int i = 0; i < mean_len; ++i) { + scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f); + } + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; ++j) { + filter_on_host[i * filter_stride + j] *= scale_on_host[i]; + } + } + for (int i = 0; i < mean_len; ++i) { + bias_on_host[i] += -mean_on_host[i] * scale_on_host[i]; + } + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name[i] + "_max"; + max_filter_name.push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + op_desc.SetInput("MaxFilter", max_filter_name); + + auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type()); + resnet50_op->Attach(op_desc, scope); + resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places()); + auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places()); + resnet50_stmt->SetOp(resnet50_op); + resnet50_stmt->SetKernels(std::move(kernels)); + + IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv")); + for (auto* node : extra_input_vars) { + IR_NODE_LINK_TO(node, matched.at("top_conv")); + } + IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out")); + } +}; + +} // namespace fusion + +class XPUResNet50FusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + fusion::XPUResNetBlock0Fuser block0_fuser; + block0_fuser(graph.get()); + fusion::XPUResNetBlock1Fuser block1_fuser; + block1_fuser(graph.get()); + fusion::XPUResNet50Fuser resnet50_fuser; + resnet50_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__resnet_fuse_pass, + paddle::lite::mir::XPUResNet50FusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index a3b90f7d1040b4d878db784c44d578dc37581d42..68c07c0ffd0694aec0ff073082e1192213a0ef4a 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -24,16 +24,27 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; + bool has_int8 = false; + bool has_arm_float = false; + bool has_cuda = false; for (auto& place : graph->valid_places()) { - if (place.target == TARGET(kCUDA) || place.target == TARGET(kFPGA)) { - act_types.push_back("leaky_relu"); - break; + if (place.precision == 
PRECISION(kInt8)) { + has_int8 = true; } if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { - act_types.push_back("relu6"); - act_types.push_back("leaky_relu"); - break; + has_arm_float = true; } + if (place.target == TARGET(kCUDA)) { + has_cuda = true; + } + } + + if (!has_int8 && has_arm_float) { + act_types.push_back("relu6"); + act_types.push_back("leaky_relu"); + } + if (!has_int8 && has_cuda) { + act_types.push_back("leaky_relu"); } for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { for (auto act_type : act_types) { diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index f5a7837b53650e08f9632b499a4c2ab1faeaeedf..4393832931c95ca20e34ca3b3d2fb4501274b15f 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -26,7 +26,8 @@ namespace mir { void ConvBNFusePass::Apply(const std::unique_ptr& graph) { // initialze fuser params std::vector conv_has_bias_cases{true, false}; - std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; + std::vector conv_type_cases{ + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; // start fuse using params for (auto conv_has_bias : conv_has_bias_cases) { for (auto conv_type : conv_type_cases) { diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 0f5bb64e10dd61c3edf4ddd32569a2d365651cdf..6718356788d46e24752204c3586cd8447cbbfaaa 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -103,14 +103,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { std::string conv_weight_name = matched.at("conv_weight")->arg()->name; auto conv_weight_t = scope->FindVar(conv_weight_name)->GetMutable(); - CHECK_EQ(static_cast(bn_scale_t->data_size()), - static_cast(conv_weight_t->dims()[0])) - << "The BN bias's size should be equal to the size of the first " - << "dim size of the conv weights"; + auto groups = conv_op_desc->GetAttr("groups"); + bool depthwise = false; + if (conv_type_ == "conv2d_transpose") { + depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups); + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[1] * groups)) + << "The BN bias's size should be equal to the size of the first " + << "dim size of the conv weights"; + } else { + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[0])) + << "The BN bias's size should be equal to the size of the first " + << "dim size of the conv weights"; + } size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; - bool is_weight_quantization = - conv_op_desc->HasAttr("quantize_weight_bits") ? 
true : false; + bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits"); // comupte BN alpha and beta Tensor alpha_tensor, beta_tensor; @@ -153,12 +162,29 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // compute new conv_weight for int8 auto weight_scale = conv_op_desc->GetAttr>("weight_scale"); - for (unsigned int i = 0; i < h; ++i) { - weight_scale[i] *= fabsf(alpha_data[i]); - if (alpha_data[i] < 0.f) { - auto ptr_row = conv_weight_d + i * w; - for (unsigned int j = 0; j < w; ++j) { - ptr_row[j] *= -1; + if (conv_type_ == "conv2d_transpose" && !depthwise) { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (int j = 0; j < hw; ++j) { + ptr_row[j] *= -1; + } + } + } + } + } else { + for (int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + i * w; + for (int j = 0; j < w; ++j) { + ptr_row[j] *= -1; + } } } } @@ -176,9 +202,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } else { // compute new conv_weight auto conv_weight_d = conv_weight_t->mutable_data(); - for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels - for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels - conv_weight_d[i * w + j] *= alpha_data[i]; + if (conv_type_ == "conv2d_transpose" && !depthwise) { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (int i = 0; i < h; ++i) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (int j = 0; j < hw; ++j) { + ptr_row[j] *= alpha_data[i]; + } + } + } + } else { + for (int i = 0; i < h; ++i) { // n: conv2d output channels + for (int j = 0; j < w; ++j) { // w: conv2d input channels + conv_weight_d[i * w + j] *= alpha_data[i]; + } } } } diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index 1c2297710b7cf41dc1adb7cde30d9fcfb61c79f0..4de007bb17c9d393c6316c425e50188ed8aea222 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -22,20 +22,31 @@ namespace paddle { namespace lite { namespace mir { -void ElementwiseAddActivationFusePass::Apply( +void ElementwiseActivationFusePass::Apply( const std::unique_ptr& graph) { - fusion::ElementwiseAddActivationFuser fuser("relu"); - fuser(graph.get()); + // initialze fuser params + std::vector elt_types{ + "elementwise_add", "elementwise_sub", "elementwise_mul"}; + std::vector act_types{"relu", "abs", "tanh"}; + + // start fuse using params + for (auto elt_type : elt_types) { + for (auto act_type : act_types) { + fusion::ElementwiseActivationFuser fuser(elt_type, act_type); + fuser(graph.get()); + } + } } } // namespace mir } // namespace lite } // namespace paddle -REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, - paddle::lite::mir::ElementwiseAddActivationFusePass) +REGISTER_MIR_PASS(lite_elementwise_activation_fuse_pass, + 
paddle::lite::mir::ElementwiseActivationFusePass) .BindTargets({TARGET(kAny)}) .ExcludeTargets({TARGET(kXPU)}) .ExcludeTargets({TARGET(kBM)}) .ExcludeTargets({TARGET(kX86)}) - .BindKernel("fusion_elementwise_add_activation"); + .BindKernel("fusion_elementwise_add_activation") + .BindKernel("fusion_elementwise_sub_activation"); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h index 299b6b89a07912c43f4714c59895edf8a964d3e6..bca8bd802b278424ac40e1c80dca2d1f5125cb40 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h @@ -22,7 +22,7 @@ namespace paddle { namespace lite { namespace mir { -class ElementwiseAddActivationFusePass : public ProgramPass { +class ElementwiseActivationFusePass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; }; diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuser.cc b/lite/core/mir/fusion/elementwise_add_activation_fuser.cc index 3c6bf4768bfe8524de4bdbb488cebdf037e51f5e..28081748a78f3549a34324cbfde0d07b31f1ab6b 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuser.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuser.cc @@ -21,21 +21,21 @@ namespace lite { namespace mir { namespace fusion { -void ElementwiseAddActivationFuser::BuildPattern() { +void ElementwiseActivationFuser::BuildPattern() { // create input nodes. - auto* x = VarNode("x")->assert_is_op_input("elementwise_add", "X")->AsInput(); - auto* y = VarNode("y")->assert_is_op_input("elementwise_add", "Y")->AsInput(); + auto* x = VarNode("x")->assert_is_op_input(eltwise_type_, "X")->AsInput(); + auto* y = VarNode("y")->assert_is_op_input(eltwise_type_, "Y")->AsInput(); // create op nodes - auto* add = OpNode("add", "elementwise_add") - ->assert_is_op("elementwise_add") + auto* elt = OpNode("elt", eltwise_type_) + ->assert_is_op(eltwise_type_) ->AsIntermediate(); auto* act = OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); // create intermediate nodes - auto* add_out = VarNode("add_out") - ->assert_is_op_output("elementwise_add", "Out") + auto* elt_out = VarNode("add_out") + ->assert_is_op_output(eltwise_type_, "Out") ->assert_is_op_input(act_type_, "X") ->AsIntermediate(); @@ -44,21 +44,29 @@ void ElementwiseAddActivationFuser::BuildPattern() { VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); // create topology. 
- std::vector add_inputs{x, y}; - add_inputs >> *add >> *add_out; - *add_out >> *act >> *out; + std::vector elt_inputs{x, y}; + elt_inputs >> *elt >> *elt_out; + *elt_out >> *act >> *out; } -void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { +void ElementwiseActivationFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { auto op_desc = GenOpDesc(matched); - auto op = - LiteOpRegistry::Global().Create("fusion_elementwise_add_activation"); - auto old_op = matched.at("add")->stmt()->op(); + std::shared_ptr op; + if (eltwise_type_ == "elementwise_add") { + op = LiteOpRegistry::Global().Create("fusion_elementwise_add_activation"); + } else if (eltwise_type_ == "elementwise_sub") { + op = LiteOpRegistry::Global().Create("fusion_elementwise_sub_activation"); + } else if (eltwise_type_ == "elementwise_mul") { + op = LiteOpRegistry::Global().Create("fusion_elementwise_mul_activation"); + } else { + LOG(FATAL) << "not supported elementwise_type: " << eltwise_type_; + } + + auto old_op = matched.at("elt")->stmt()->op(); auto* scope = old_op->scope(); auto& valid_places = old_op->valid_places(); op->Attach(op_desc, scope); - auto* new_op_node = graph->GraphCreateInstructNode(op, valid_places); IR_NODE_LINK_TO(matched.at("x"), new_op_node); @@ -66,12 +74,20 @@ void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph, IR_NODE_LINK_TO(new_op_node, matched.at("output")); } -cpp::OpDesc ElementwiseAddActivationFuser::GenOpDesc( - const key2nodes_t& matched) { - auto* desc = matched.at("add")->stmt()->op_info(); +cpp::OpDesc ElementwiseActivationFuser::GenOpDesc(const key2nodes_t& matched) { + auto* desc = matched.at("elt")->stmt()->op_info(); cpp::OpDesc op_desc; - op_desc.SetType("fusion_elementwise_add_activation"); + if (eltwise_type_ == "elementwise_add") { + op_desc.SetType("fusion_elementwise_add_activation"); + } else if (eltwise_type_ == "elementwise_sub") { + op_desc.SetType("fusion_elementwise_sub_activation"); + } else if (eltwise_type_ == "elementwise_mul") { + op_desc.SetType("fusion_elementwise_mul_activation"); + } else { + LOG(FATAL) << "not supported elementwise_type: " << eltwise_type_; + } + op_desc.SetInput("X", {matched.at("x")->arg()->name}); op_desc.SetInput("Y", {matched.at("y")->arg()->name}); op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuser.h b/lite/core/mir/fusion/elementwise_add_activation_fuser.h index 47bb2fcf821c4813ced504f63ebc3151ec0f73f8..ac56e7a67526a02eeb78dc29cfc6c9127d1e4b81 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuser.h +++ b/lite/core/mir/fusion/elementwise_add_activation_fuser.h @@ -23,15 +23,23 @@ namespace lite { namespace mir { namespace fusion { -class ElementwiseAddActivationFuser : public FuseBase { +// Detect elementwise and activation ops, and then merge into +// fusion_eltsiwise_act op. +// Example: +// elementwise_add + relu fuse. 
+// fusion::ElementwiseActivationFuser fuser("elementwise_add", "relu"); +// fuser(graph.get()); +class ElementwiseActivationFuser : public FuseBase { public: - explicit ElementwiseAddActivationFuser(const std::string& act_type) - : act_type_(act_type) {} + explicit ElementwiseActivationFuser(const std::string& eltwise_type, + const std::string& act_type) + : eltwise_type_(eltwise_type), act_type_(act_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string eltwise_type_; std::string act_type_; }; diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index 46695be396596c2ce9b74bb771326171fc7b374b..a4df3a143a5ef3569e74d4401cf75ab5d8c789c7 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -23,7 +23,7 @@ namespace lite { namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { -#ifdef LITE_WITH_X86 +#ifdef LITE_WITH_X86 || LITE_WITH_FPGA fusion::FcFuser fuser(true); fuser(graph.get()); #endif @@ -38,7 +38,7 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kXPU), TARGET(kX86)}) .ExcludeTargets({TARGET(kBM)}) .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/interpolate_fuse_pass.cc b/lite/core/mir/fusion/interpolate_fuse_pass.cc index 51c9868cf3ed76ee6f02ac954f74c330e9f1a8e1..ab152c94561410f8febc5f5db7a1709bb114fb94 100644 --- a/lite/core/mir/fusion/interpolate_fuse_pass.cc +++ b/lite/core/mir/fusion/interpolate_fuse_pass.cc @@ -23,11 +23,15 @@ namespace lite { namespace mir { void InterpolateFusePass::Apply(const std::unique_ptr& graph) { - fusion::InterpolateFuser bilinear_interp_fuser("bilinear_interp"); - bilinear_interp_fuser(graph.get()); + std::vector Interpolate_type_cases{"bilinear_interp", + "nearest_interp"}; + for (auto type_ : Interpolate_type_cases) { + fusion::InterpolateFuser interp_fuser(type_); + interp_fuser(graph.get()); - fusion::InterpolateFuser nearest_interp_fuser("nearest_interp"); - nearest_interp_fuser(graph.get()); + fusion::InterpolateFuser2 interp_fuser2(type_); + interp_fuser2(graph.get()); + } } } // namespace mir diff --git a/lite/core/mir/fusion/interpolate_fuser.cc b/lite/core/mir/fusion/interpolate_fuser.cc index 458ef76cb4432dd54678824b1a179e554bcbbf78..ebbd63f8613fb6d62b580004cf7522683db08e38 100644 --- a/lite/core/mir/fusion/interpolate_fuser.cc +++ b/lite/core/mir/fusion/interpolate_fuser.cc @@ -22,6 +22,9 @@ namespace mir { namespace fusion { void InterpolateFuser::BuildPattern() { + // type1 fill_constant --> + // x --> shape --> slice --> cast --> elementwise_mul --> interpolate + // `--------------------------------------------------> auto* x = VarNode("x"); auto* shape = OpNode("shape", "shape")->AsIntermediate(); auto* shape_out = VarNode("shape_out")->AsIntermediate(); @@ -89,6 +92,64 @@ cpp::OpDesc InterpolateFuser::GenOpDesc(const key2nodes_t& matched) { return op_desc; } +void InterpolateFuser2::BuildPattern() { + // type2 x --> shape --> slice --> cast --> scale --> interpolate + // `----------------------------------------> + auto* x = VarNode("x"); + auto* shape = OpNode("shape", "shape")->AsIntermediate(); + auto* shape_out = VarNode("shape_out")->AsIntermediate(); + auto* slice = OpNode("slice", "slice") + 
->assert_op_attr_satisfied>( + "axes", + [](const std::vector& attr) { + return attr.size() == 1 && attr[0] == 0; + }) + ->assert_op_attr_satisfied>( + "starts", + [](const std::vector& attr) { + return attr.size() == 1 && attr[0] == 2; + }) + ->assert_op_attr_satisfied>( + "ends", + [](const std::vector& attr) { + return attr.size() == 1 && attr[0] == 4; + }) + ->AsIntermediate(); + auto* slice_out = VarNode("slice_out")->AsIntermediate(); + auto* cast = OpNode("cast", "cast")->AsIntermediate(); + auto* cast_out = VarNode("cast_out")->AsIntermediate(); + auto* scale = OpNode("scale", "scale")->AsIntermediate(); + auto* scale_out = VarNode("scale_out")->AsIntermediate(); + auto* interpolate = OpNode("interpolate", interp_type_)->AsIntermediate(); + auto* interpolate_out = VarNode("interpolate_out"); + + // create topology. + *x >> *shape >> *shape_out >> *slice >> *slice_out >> *cast >> *cast_out >> + *scale >> *scale_out >> *interpolate >> *interpolate_out; + *x >> *interpolate; +} + +void InterpolateFuser2::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + auto op_desc = GenOpDesc(matched); + auto interp_op = LiteOpRegistry::Global().Create(interp_type_); + auto interp_old = matched.at("interpolate")->stmt()->op(); + auto* scope = interp_old->scope(); + auto& valid_places = interp_old->valid_places(); + interp_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(interp_op, valid_places); + + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("interpolate_out")); +} + +cpp::OpDesc InterpolateFuser2::GenOpDesc(const key2nodes_t& matched) { + auto op_desc = *matched.at("interpolate")->stmt()->op_info(); + op_desc.SetInput("OutSize", {}); + return op_desc; +} + } // namespace fusion } // namespace mir } // namespace lite diff --git a/lite/core/mir/fusion/interpolate_fuser.h b/lite/core/mir/fusion/interpolate_fuser.h index 51f5655e76749ea4de6e1789f499862f2ac46437..96fa6b260190114d41fe6308217fef05de21bd44 100644 --- a/lite/core/mir/fusion/interpolate_fuser.h +++ b/lite/core/mir/fusion/interpolate_fuser.h @@ -36,6 +36,19 @@ class InterpolateFuser : public FuseBase { std::string interp_type_; }; +class InterpolateFuser2 : public FuseBase { + public: + explicit InterpolateFuser2(const std::string& interp_type) + : interp_type_(interp_type) {} + + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string interp_type_; +}; + } // namespace fusion } // namespace mir } // namespace lite diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index 2720404fb03cddaf00c9a25d8287b14d69ca86e8..804b79ad7420de47723658aba898dd6ea3e6715f 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -58,11 +58,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { fuser(graph.get()); } - // delete quant_dequant_node - for (auto op_type : {"pool2d", "elementwise_add"}) { - fusion::DeleteQuantDequantOpFuser fuser(op_type); - fuser(graph.get()); - } + // process quant_dequant_node + fusion::DeleteQuantDequantOpFuser dqd_fuser; + dqd_fuser(graph.get()); } } // namespace mir diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index 754bfe142e59d066b936c9337d59c56fbf55eba5..c0d20f51c2d560f278f00ac27a0ec0edefe22d78 100644 
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, auto* output_scale_node = matched.at("output_scale_node"); auto* output_act_node = matched.at("output_act_node"); - // obtain values, save values and relink node + // obtain scale, save attrs and relink node int bit_length = quant_node->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); auto* scope = quant_node->stmt()->op()->scope(); @@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, ->GetMutable(); float scale_value = scale_tensor->data()[0] / range; + auto in_act_name = input_act_node->arg()->name; + auto out_act_name = output_act_node->arg()->name; auto outlinks = output_act_node->outlinks; for (auto* quantized_node : outlinks) { - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); + // save input scale in quantized op by input argname + index + auto op_desc = *quantized_node->stmt()->mutable_op_info(); + std::string argname; + int index; + op_desc.GetInputArgname(out_act_name, &argname); + op_desc.GetInputIndex(out_act_name, &index); + op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetAttr("bit_length", bit_length); + op_desc.UpdateAllInputs(out_act_name, in_act_name); + quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); IR_NODE_LINK_TO(input_act_node, quantized_node) } @@ -174,22 +185,19 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); int bit_length = quantized_op->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); - float input_scale = 0; - if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) { - input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); - } + float max_range = dequant_op->stmt()->op_info()->GetAttr("max_range"); float whole_weight_scale = static_cast(range * range) / max_range / range; - // max_range = range * range / max(abs(weight)) - // weight_scale = range * range / (range * range / max(abs(weight))) / range - // = max(abs(weight)) / range + // As: max_range = range * range / max(abs(weight)) + // So: whole_weight_scale + // = range * range / (range * range / max(abs(weight))) / range + // = max(abs(weight)) / range // set op desc cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); @@ -205,7 +213,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should // be Cout. weight_scale_size = quantized_weight_t->dims()[0]; - } else if (quantized_op_type_ == "mul") { + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); // Fc weight: Cin * Cout, the weight_scale_size should be Cout. 
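// A worked instance of the max_range recovery above, using assumed values that
// are not part of this patch: bit_length = 8 gives range = 127, so
// range * range = 16129. If max(abs(weight)) were 0.5, the fake dequant op
// would carry max_range = 16129 / 0.5 = 32258, and the fuser recovers
//   whole_weight_scale = 16129 / 32258 / 127 = 0.5 / 127 ≈ 0.003937,
// i.e. exactly max(abs(weight)) / range, matching the derivation comment above.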
@@ -217,11 +225,8 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, #ifndef LITE_WITH_FPGA op_desc.SetAttr("enable_int8", true); -#endif - if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) { - op_desc.SetAttr("input_scale", input_scale); - } +#endif op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. @@ -284,6 +289,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() { ->assert_is_op_output(quantized_op_type_) ->assert_is_op_input(dequant_op_type, "X") ->AsIntermediate(); + // The scale var_node of input activation is deleted in DeleteQuantOpFuser auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale") ->assert_is_op_input(dequant_op_type) ->AsIntermediate(); @@ -312,11 +318,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); - float input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); std::vector weight_scale; std::vector quant_bits = @@ -327,17 +331,21 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto channel_scale_tensor = scope->FindVar(channel_scale_name)->GetMutable(); auto* channel_scale_data = channel_scale_tensor->data(); - for (int i = 0; i < channel_scale_tensor->data_size(); i++) { + for (size_t i = 0; i < channel_scale_tensor->data_size(); i++) { weight_scale.push_back(channel_scale_data[i] / range); } // set op desc cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); - op_desc.SetInput("Input", {quantized_op_input->arg()->name}); - op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); - + if (quantized_op_type_ == "conv2d" || + quantized_op_type_ == "depthwise_conv2d") { + op_desc.SetInput("Input", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { + op_desc.SetInput("X", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); + } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("input_scale", input_scale); op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. 
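// A small sketch of the per-channel computation above, with assumed values
// that are not part of this patch: quant_bits gives range = 127 and the
// channel scale tensor holds {12.7f, 25.4f}; the loop then produces
//   weight_scale = {12.7f / 127, 25.4f / 127} = {0.1f, 0.2f},
// one entry per output channel (Cout). Note the fused op keeps the
// "Input"/"Output" parameters for conv2d/depthwise_conv2d, while for
// mul/matmul the fuser sets "X"/"Out" instead.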
@@ -372,167 +380,65 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void DeleteQuantDequantOpFuser::BuildPattern() { std::string quant_dequant_op_type = "fake_quantize_dequantize_moving_average_abs_max"; - if (quantized_op_type_ == "pool2d") { - auto* input_scale_node = - VarNode("input_scale_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_node = VarNode("input_act_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_node = - OpNode("quant_dequant_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_node = - VarNode("output_scale_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_node = - VarNode("output_act_node") - ->assert_is_op_output(quant_dequant_op_type, "Out"); - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - - quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); - output_scale_node->LinksFrom({quant_dequant_node}); - output_act_node->LinksFrom({quant_dequant_node}); - quantized_node->LinksFrom({output_act_node}); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = - VarNode("input_scale_left_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_left_node = - VarNode("input_act_left_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_left_node = - OpNode("quant_dequant_left_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_left_node = - VarNode("output_scale_left_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_left_node = - VarNode("output_act_left_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "X"); - quant_dequant_left_node->LinksFrom( - {input_scale_left_node, input_act_left_node}); - output_scale_left_node->LinksFrom({quant_dequant_left_node}); - output_act_left_node->LinksFrom({quant_dequant_left_node}); - - auto* input_scale_right_node = - VarNode("input_scale_right_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_right_node = - VarNode("input_act_right_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_right_node = - OpNode("quant_dequant_right_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_right_node = - VarNode("output_scale_right_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_right_node = - VarNode("output_act_right_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "Y"); - quant_dequant_right_node->LinksFrom( - {input_scale_right_node, input_act_right_node}); - output_scale_right_node->LinksFrom({quant_dequant_right_node}); - output_act_right_node->LinksFrom({quant_dequant_right_node}); - - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - quantized_node->LinksFrom({output_act_left_node, output_act_right_node}); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; - } - VLOG(4) << "DeleteQuantDequantOpFuser BuildPattern op_type:" - << quantized_op_type_; + auto* input_scale_node = + VarNode("input_scale_node") + ->assert_is_op_input(quant_dequant_op_type, "InScale"); + auto* input_act_node = + 
VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X"); + auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type) + ->assert_is_op(quant_dequant_op_type); + auto* output_scale_node = + VarNode("output_scale_node") + ->assert_is_op_output(quant_dequant_op_type, "OutScale"); + auto* output_act_node = + VarNode("output_act_node") + ->assert_is_op_output(quant_dequant_op_type, "Out"); + + quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + output_scale_node->LinksFrom({quant_dequant_node}); + output_act_node->LinksFrom({quant_dequant_node}); } void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - if (quantized_op_type_ == "pool2d") { - auto* input_scale_node = matched.at("input_scale_node"); - auto* input_act_node = matched.at("input_act_node"); - auto* quant_dequant_node = matched.at("quant_dequant_node"); - auto* output_scale_node = matched.at("output_scale_node"); - auto* output_act_node = matched.at("output_act_node"); - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_node->stmt()->op()->scope(); - auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) - ->GetMutable(); - float scale_value = scale_tensor->data()[0] / range; + auto* input_scale_node = matched.at("input_scale_node"); + auto* input_act_node = matched.at("input_act_node"); + auto* quant_dequant_node = matched.at("quant_dequant_node"); + auto* output_scale_node = matched.at("output_scale_node"); + auto* output_act_node = matched.at("output_act_node"); + auto input_act_name = input_act_node->arg()->name; + auto output_act_name = output_act_node->arg()->name; - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); - op_desc->SetInput("X", {input_act_node->arg()->name}); - IR_NODE_LINK_TO(input_act_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set nodes2rm = {input_scale_node, - quant_dequant_node, - output_scale_node, - output_act_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = matched.at("input_scale_left_node"); - auto* input_act_left_node = matched.at("input_act_left_node"); - auto* quant_dequant_left_node = matched.at("quant_dequant_left_node"); - auto* output_scale_left_node = matched.at("output_scale_left_node"); - auto* output_act_left_node = matched.at("output_act_left_node"); - - auto* input_scale_right_node = matched.at("input_scale_right_node"); - auto* input_act_right_node = matched.at("input_act_right_node"); - auto* quant_dequant_right_node = matched.at("quant_dequant_right_node"); - auto* output_scale_right_node = matched.at("output_scale_right_node"); - auto* output_act_right_node = matched.at("output_act_right_node"); - - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_left_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_left_node->stmt()->op()->scope(); - auto* left_scale_tensor = - 
scope->FindVar(output_scale_left_node->arg()->name) - ->GetMutable(); - float left_scale_value = left_scale_tensor->data()[0] / range; - auto* right_scale_tensor = - scope->FindVar(output_scale_right_node->arg()->name) - ->GetMutable(); - float right_scale_value = right_scale_tensor->data()[0] / range; + // Get scale value from scale var node + int bit_length = + quant_dequant_node->stmt()->op_info()->GetAttr("bit_length"); + int range = ((1 << (bit_length - 1)) - 1); + auto* scope = quant_dequant_node->stmt()->op()->scope(); + auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) + ->GetMutable(); + float scale_value = scale_tensor->data()[0] / range; - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("x_input_scale", left_scale_value); - op_desc->SetAttr("y_input_scale", right_scale_value); - op_desc->SetInput("X", {input_act_left_node->arg()->name}); - op_desc->SetInput("Y", {input_act_right_node->arg()->name}); - IR_NODE_LINK_TO(input_act_left_node, quantized_node) - IR_NODE_LINK_TO(input_act_right_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set nodes2rm = {input_scale_left_node, - quant_dequant_left_node, - output_scale_left_node, - output_act_left_node, - input_scale_right_node, - quant_dequant_right_node, - output_scale_right_node, - output_act_right_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; + auto quantized_nodes = output_act_node->outlinks; + for (auto* quantized_node : quantized_nodes) { + // Save quantization info in op_info attr + auto op_info = *quantized_node->stmt()->op_info(); + std::string argname; + int index; + op_info.GetInputArgname(output_act_name, &argname); + op_info.GetInputIndex(output_act_name, &index); + op_info.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_info.SetAttr("input_scale", scale_value); // Save it for now + op_info.SetAttr("bit_length", bit_length); + + op_info.UpdateAllInputs(output_act_name, input_act_name); + quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); + IR_NODE_LINK_TO(input_act_node, quantized_node); } + // delete nodes and edges + std::unordered_set nodes2rm = { + input_scale_node, quant_dequant_node, output_scale_node, output_act_node}; + GraphSafeRemoveNodes(graph, nodes2rm); } cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index c21df350f96143a09b3229776bf5c013b1988559..d1f6e33bb864a4278762bba726ba5f0aef5b7b72 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -100,24 +100,16 @@ class ChannelWiseDequantOpFuser : public FuseBase { }; /* The pattern like "fake_quantize_dequantize_moving_average_abs_max + - * pooled/elementwise_add" can be deteted by this fuser. The fuser - * extract the input_scale form fake_quant_dequant_op and save into - * the quantized_op. Besides, the fuser delete fake_quant_dequant_op in - * the graph. + * quantized_op" can be deteted by this fuser. The fuser modifies the input + * scale for the quantized_op and deletes the fake_quant_dequant_op. 
*/ - class DeleteQuantDequantOpFuser : public FuseBase { public: - explicit DeleteQuantDequantOpFuser(const std::string& quantized_op_type) - : quantized_op_type_(quantized_op_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - - private: - std::string quantized_op_type_{}; }; // dynamic quantdequant op fuser class DynamicQuantDequantOpFuser : public FuseBase { diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.cc b/lite/core/mir/fusion/scale_activation_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ad1f4994f6d5183d3b5c925bb222cb95ea064e8 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuse_pass.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/scale_activation_fuse_pass.h" +#include +#include +#include "lite/core/mir/fusion/scale_activation_fuser.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ScaleActivationFusePass::Apply(const std::unique_ptr& graph) { + for (auto act_type : {"relu", "relu6", "leaky_relu"}) { + fusion::ScaleActivationFuser fuser(act_type); + fuser(graph.get()); + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_scale_activation_fuse_pass, + paddle::lite::mir::ScaleActivationFusePass) + .BindTargets({TARGET(kARM)}) + .BindKernel("scale"); diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.h b/lite/core/mir/fusion/scale_activation_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2118a0b6f396ff12855009a975059c95ee6111a8 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ScaleActivationFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/scale_activation_fuser.cc b/lite/core/mir/fusion/scale_activation_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f18099da8bc97d9dab8f9c31fd6c23d42d67d81 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuser.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/scale_activation_fuser.h" +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void ScaleActivationFuser::BuildPattern() { + // create input nodes. + auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput(); + + // create op nodes + auto* scale = + OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate(); + auto* act = + OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); + + // create intermediate nodes + auto* scale_out = VarNode("scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + + // create output node + auto* out = + VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); + // create topology. 
+ *x >> *scale >> *scale_out; + *scale_out >> *act >> *out; +} + +void ScaleActivationFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + auto op_desc = GenOpDesc(matched); + auto scale_op = LiteOpRegistry::Global().Create("scale"); + auto scale = matched.at("scale")->stmt()->op(); + auto* scope = scale->scope(); + auto& valid_places = scale->valid_places(); + scale_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places); + + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("output")); +} + +cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) { + cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info(); + op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); + cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info(); + + op_desc.SetAttr("activation_type", act_type_); + if (act_type_ == "relu") { + op_desc.SetAttr("fuse_relu", true); + } else if (act_type_ == "relu6") { + float alpha = act_op_desc.GetAttr("threshold"); + op_desc.SetAttr("alpha", alpha); + } else if (act_type_ == "leaky_relu") { + float alpha = act_op_desc.GetAttr("alpha"); + op_desc.SetAttr("alpha", alpha); + } + return op_desc; +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/scale_activation_fuser.h b/lite/core/mir/fusion/scale_activation_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..9fa9b9d2b5ebc5091b41a2ca244689797c97ccb6 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuser.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
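// A sketch of the attributes the fused scale op ends up with, assuming a
// scale op followed by relu6 whose threshold is 6.0f (hypothetical values,
// not taken from this patch): the copied scale op_desc keeps its original
// scale/bias attributes and additionally gets
//   activation_type = "relu6"
//   alpha           = 6.0f   // copied from the relu6 "threshold" attr
// For relu the fuser sets fuse_relu = true instead, and for leaky_relu the
// alpha attr is copied from the activation's own "alpha".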
+ +#pragma once + +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class ScaleActivationFuser : public FuseBase { + public: + explicit ScaleActivationFuser(const std::string& act_type) { + act_type_ = act_type; + } + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string act_type_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 76c97d2da6ed9e7c6fc1f1889d80095278b68ec0..d7486c0933dbbe74115bd6358962817b2b946c12 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -14,6 +14,7 @@ #include "lite/core/mir/generate_program_pass.h" #include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -25,10 +26,37 @@ namespace mir { void GenerateProgramPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "final program \n" << Visualize(graph.get()); - for (auto& item : graph->StmtTopologicalOrder()) { + std::vector nodes_in_order; +#ifdef LITE_WITH_CUDA + const std::string depend_pass = "multi_stream_analysis_pass"; + const std::string attr_name = "nodes_in_order"; + mir::Pass* pass = mir::PassManager::Global().LookUp(depend_pass); + if (pass->HasAttr(attr_name)) { + nodes_in_order = pass->GetAttr>(attr_name); + } +#endif + if (nodes_in_order.empty()) { + nodes_in_order = graph->StmtTopologicalOrder(); + } + + for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; +#ifdef LITE_WITH_CUDA + if (stmt.kernels().front()->target() == TargetType::kCUDA) { + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetNeedSync(stmt.need_sync_); + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetSyncStreams(stmt.sync_streams_); + } +#endif insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 3a27360f94d7d828e1c19214d621f1dfe4e048ca..55b7a004567ec5a5298e084839d6dcf5a8591882 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "lite/core/mir/pass_registry.h" #include "lite/utils/string.h" @@ -25,59 +26,130 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - void GraphVisualizePass::Apply(const std::unique_ptr& graph) { - Visualize(graph.get()); + VLOG(5) << "\n" << Visualize(graph.get()); } std::string Visualize(mir::SSAGraph* graph) { - inference::analysis::Dot dot; - - int id = 0; - std::set exists_args; - for (auto& node : graph->mutable_nodes()) { - std::string key; - if (node.IsArg()) { - key = node.AsArg().name; - } else { - key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++); + std::ostringstream os; + Dot dot; + auto string_trunc = [](const std::string& str) -> std::string { + const int max_disp_size = 100; + if (str.length() > max_disp_size) + return str.substr(0, max_disp_size) + "..."; + return str; + }; + auto attr_repr = [&](const OpInfo* op_info, + const std::string& attr_name) -> std::string { + std::ostringstream os; + using AttrType = cpp::OpDesc::AttrType; + auto attr_type = op_info->GetAttrType(attr_name); + 
switch (attr_type) { + case AttrType::INT: + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::FLOAT: + os << ":float:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::BOOLEAN: + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::STRING: + os << ":string: \"" + << string_trunc(op_info->GetAttr(attr_name)) << "\""; + break; + case AttrType::FLOATS: { + auto vals = op_info->GetAttr>(attr_name); + os << ":floats: {" + Join(vals, ",") << "}"; + } break; + case AttrType::INTS: { + auto vals = op_info->GetAttr>(attr_name); + os << ":ints: {" + Join(vals, ",") + "}"; + } break; + case AttrType::STRINGS: { + auto vals = op_info->GetAttr>(attr_name); + os << ":strings: {" + string_trunc(Join(vals, ",")) << "}"; + } break; + default: + os << ":Unknow type(" << static_cast(attr_type) << ")"; + break; } - if (node.IsStmt()) { - dot.AddNode(key, - {Dot::Attr("shape", "box"), - Dot::Attr("style", "filled"), - Dot::Attr("color", "black"), - Dot::Attr("fillcolor", "yellow")}); - for (auto& x : node.inlinks) { - auto name = x->AsArg().name; - if (!exists_args.count(name)) { - dot.AddNode(name, {}); + return os.str(); + }; + int op_idx = 0; + std::set exists_var_names; + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + auto op_info = node->AsStmt().op_info(); + auto op_type = op_info->Type(); + std::string op_name; + if (node->AsStmt().need_sync_) { + std::ostringstream oss; + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + oss << std::to_string(node->AsStmt().sync_streams_[i]); + if (i != node->AsStmt().sync_streams_.size() - 1) { + oss << ","; } - dot.AddEdge(name, key, {}); - exists_args.insert(name); } - for (auto& x : node.outlinks) { - auto name = x->AsArg().name; - if (!exists_args.count(name)) { - dot.AddNode(name, {}); - } - dot.AddEdge(key, name, {}); - exists_args.insert(name); + op_name = string_format("%s%d, stream=%d, sync_streams={%s}", + op_type.c_str(), + op_idx++, + node->AsStmt().stream_id_, + oss.str().c_str()); + } else { + op_name = string_format("%s%d", op_type.c_str(), op_idx++); + } + // Add its input&output variables as the Dot nodes + dot.AddNode(op_name, + {Dot::Attr("shape", "box"), + Dot::Attr("style", "filled"), + Dot::Attr("color", "black"), + Dot::Attr("fillcolor", "yellow")}); + for (auto& x : node->inlinks) { + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; } + if (!exists_var_names.count(var_name)) { + dot.AddNode(var_name, {}); + exists_var_names.insert(var_name); + } + dot.AddEdge(var_name, op_name, {}); + } + for (auto& x : node->outlinks) { + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } + if (!exists_var_names.count(var_name)) { + dot.AddNode(var_name, {}); + exists_var_names.insert(var_name); + } + dot.AddEdge(op_name, var_name, {}); + } + // Output its all of attributes(name and values) + os << "* " << op_name << "\n"; + const auto& attr_names = op_info->AttrNames(); + for (auto& attr_name : attr_names) { + os << " - " << attr_name << attr_repr(op_info, attr_name) << "\n"; } } - - auto res = dot.Build(); - // If we use VLOG here, we can not type all graph out. - // So we change VLOG to std::cout. 
- std::cout << "dot:\n" << res << std::endl; - return res; + os << dot.Build(); + return os.str(); } } // namespace mir } // namespace lite } // namespace paddle -REGISTER_MIR_PASS(graph_visualze, paddle::lite::mir::GraphVisualizePass) +REGISTER_MIR_PASS(graph_visualize_pass, paddle::lite::mir::GraphVisualizePass) .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 6256a49a99b9097664c192d40502daf506437a31..12b4eab0a9582af6d2d4abd3941e75b99a3e39a6 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -39,52 +39,109 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( auto is_host = [](TargetType x) -> bool { return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); }; - // The vars which inputs or outputs are invalid op will not be reused. - auto valid_var = [&](Node* node) -> bool { - std::set invalid_op = {"while", - "conditional_block", - "conditional_block_infer", - "merge_lod_tensor_infer", - "merge_lod_tensor", - "equal", - "lod_reset", - "concat", - "yolo_box", - "subgraph", - "feed", - "fetch"}; - for (auto* tmp : node->inlinks) { - CHECK(tmp->IsStmt()); - std::string op_type = tmp->AsStmt().op_info()->Type(); - if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != - invalid_op.end()) { - return false; + + // The all of input and output variables of the Ops will not be reused. + std::unordered_set invalid_op_nodes = {"while", + "conditional_block", + "conditional_block_infer", + "merge_lod_tensor_infer", + "merge_lod_tensor", + "equal", + "lod_reset", + "concat", + "yolo_box", + "subgraph", + "feed", + "fetch"}; + + auto insert_invalid_op_nodes_for_specific_target = [&]( + std::unordered_set op_node_set, TargetType specific_target) { + std::unordered_set invalid_op_nodes_opencl = {"layout", "fc"}; + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + TargetType op_target_type = op_node->AsStmt().place().target; + if (op_target_type == specific_target && + specific_target == TARGET(kOpenCL)) { + invalid_op_nodes.insert(invalid_op_nodes_opencl.begin(), + invalid_op_nodes_opencl.end()); + break; } + // else if // you can add more targets } - for (auto* tmp : node->outlinks) { - CHECK(tmp->IsStmt()); - std::string op_type = tmp->AsStmt().op_info()->Type(); - if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != - invalid_op.end()) { - return false; + }; + + VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size(); + insert_invalid_op_nodes_for_specific_target(invalid_op_nodes, + TARGET(kOpenCL)); + VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size(); + + // Collect the invalid input and output variables that will not be reused. 
+ std::unordered_set invalid_var_names; + for (auto& op_node : graph->StmtTopologicalOrder()) { + // variables of invalid_op_nodes wil not be reused + if (!op_node->IsStmt()) continue; + auto op_info = op_node->AsStmt().op_info(); + auto op_type = op_info->Type(); + auto invalid_op_node = invalid_op_nodes.find(op_type); + if (invalid_op_node != invalid_op_nodes.end()) { + for (auto in_var_node : op_node->inlinks) { + CHECK(in_var_node->IsArg()); + invalid_var_names.insert(in_var_node->AsArg().name); } + for (auto out_var_node : op_node->outlinks) { + CHECK(out_var_node->IsArg()); + invalid_var_names.insert(out_var_node->AsArg().name); + } + continue; } - return true; - }; + // The specified input and output variables of the Ops whose 'inplace' attr + // is true will not be reused, such as reshape/reshape2's X and Out + // variables + std::unordered_map, + std::unordered_set>> + inplace_op_nodes = {{"reshape", {{"X"}, {"Out"}}}, + {"reshape2", {{"X"}, {"Out"}}}}; + auto inplace_op_node = inplace_op_nodes.find(op_type); + if (inplace_op_node != inplace_op_nodes.end()) { + bool inplace = false; + if (op_info->HasAttr("inplace")) { + inplace = op_info->GetAttr("inplace"); + } + if (inplace) { + for (auto& in_param_name : inplace_op_node->second.first) { + const auto& in_arg_names = op_info->Input(in_param_name); + invalid_var_names.insert(in_arg_names.begin(), in_arg_names.end()); + } + for (auto& out_param_name : inplace_op_node->second.second) { + const auto& out_arg_names = op_info->Output(out_param_name); + invalid_var_names.insert(out_arg_names.begin(), out_arg_names.end()); + } + } + } + } + + // non-tensor(like tensor_array) variables will not be reused + for (auto& node : graph->nodes()) { + if (node.IsArg() && (node.arg()->type != nullptr) && + !node.arg()->type->IsTensor()) { + invalid_var_names.insert(node.arg()->name); + } + } for (auto& op_node : graph->StmtTopologicalOrder()) { if (op_node->IsStmt()) { - auto inputs = op_node->inlinks; - auto outputs = op_node->outlinks; - std::vector requires(inputs.begin(), inputs.end()); - requires.insert(requires.end(), outputs.begin(), outputs.end()); - for (Node* node : requires) { - CHECK(node->IsArg()); - auto& arg = node->AsArg(); + std::vector var_nodes(op_node->inlinks.begin(), + op_node->inlinks.end()); + var_nodes.insert( + var_nodes.end(), op_node->outlinks.begin(), op_node->outlinks.end()); + for (auto* var_node : var_nodes) { + CHECK(var_node->IsArg()); + auto& arg = var_node->AsArg(); if (arg.is_weight || arg.is_persist) continue; - if (!valid_var(node)) continue; std::string var_name = arg.name; - TargetType target_type = node->AsArg().type->target(); + if (invalid_var_names.count(var_name)) continue; + TargetType target_type = arg.type->target(); if (is_host(target_type)) target_type = TARGET(kHost); if (!(*lifecycles)[TargetToStr(target_type)].count(var_name)) { @@ -181,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); input_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } @@ -205,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); out_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; 
node_append_idx++; } } @@ -255,5 +312,9 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}) - .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) + .ExcludeTargets({TARGET(kNPU), + TARGET(kXPU), + TARGET(kBM), + TARGET(kRKNPU), + TARGET(kAPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba48d5d4ead5ea922ded0bff3a87c2c127595790 --- /dev/null +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -0,0 +1,588 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/mlu_postprocess_pass.h" +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type) { + // create the arg node + auto* cast_arg = graph->NewArgumentNode(cast_arg_name); + cast_arg->AsArg().type = cast_type; + inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + + // create the stmt node + auto* cast_inst = graph->NewInstructNode(); + // create op + auto cast_op = LiteOpRegistry::Global().Create(op_type); + CHECK(cast_op) << "create op [" << op_type << "] failed"; + cpp::OpDesc op_desc; + op_desc.SetType(op_type); + if (op_type == "cast") { + op_desc.SetAttr("in_dtype", 5); // FP32 + op_desc.SetAttr("out_dtype", 4); // FP16 + op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "layout") { + // NCHW -> NHWC + op_desc.SetInput("Input", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "io_copy") { + op_desc.SetInput("Input", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else { + CHECK(0) << "Unsupport cast type"; + } + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + // create kernels + auto kernels = cast_op->CreateKernels(graph->valid_places()); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + if (op_type == "cast") { + const Type* in_arg_ty = kernel->GetInputDeclType("X"); + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else if (op_type == "layout") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*out_arg_ty, *cast_type) && + // for first conv 
+ PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else if (op_type == "io_copy") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && + TargetCompatibleTo(*out_arg_ty, *cast_type)) { + is_found = true; + } + } else { + CHECK(0) << "Unsupport cast type"; + } + if (is_found) { + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); + auto& stmt = cast_inst->AsStmt(); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } + break; + } + } + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " + << cur_node->AsArg().name << "->" << op_type; + // modify links + DirectedLink(cur_node, cast_inst); + DirectedLink(cast_inst, cast_arg); + return cast_arg; +} + +Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type) { + // create the arg node + auto* cast_arg = graph->NewArgumentNode(cast_arg_name); + cast_arg->AsArg().type = cast_type; + auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + // for CastAfter manully set the tensor's type + var->GetMutable(); + + // create the stmt node + auto* cast_inst = graph->NewInstructNode(); + // create op + auto cast_op = LiteOpRegistry::Global().Create(op_type); + CHECK(cast_op) << "create op [" << op_type << "] failed"; + cpp::OpDesc op_desc; + op_desc.SetType(op_type); + if (op_type == "cast") { + op_desc.SetAttr("in_dtype", 4); // FP32 + op_desc.SetAttr("out_dtype", 5); // FP16 + op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else if (op_type == "layout") { + // NHWC -> NCHW + op_desc.SetInput("Input", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else if (op_type == "io_copy") { + op_desc.SetInput("Input", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else { + CHECK(0) << "Unsupport cast type"; + } + + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + + // create kernels + auto kernels = cast_op->CreateKernels(graph->valid_places()); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + if (op_type == "cast") { + const Type* in_arg_ty = kernel->GetInputDeclType("X"); + if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { + is_found = true; + } + } else if (op_type == "layout") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cast_type) && + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else if (op_type == "io_copy") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TargetCompatibleTo(*in_arg_ty, *cast_type) && + TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else { + CHECK(0) << "Unsupport cast type"; + } + if (is_found) { + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(op_type, 
std::move(selected_kernels), cast_op); + auto& stmt = cast_inst->AsStmt(); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } + break; + } + } + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " + << cur_node->AsArg().name << "->" << op_type; + // modify links + DirectedLink(cast_arg, cast_inst); + DirectedLink(cast_inst, cur_node); + return cast_arg; +} + +void MLUPostprocessPass::InsertBefore(SSAGraph* graph, + Node* head_node, + Node* inst_node, + const Type* inst_type) { + const auto* head_type = head_node->AsArg().type; + + // break original link + RemoveDirectedLink(head_node, inst_node); + + auto* cur_node = head_node; + const auto name_prefix = + head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + bool is_first_conv_head = + std::find(first_conv_nodes_.begin(), + first_conv_nodes_.end(), + head_node->AsArg().name) != first_conv_nodes_.end(); + + // precision cast node + if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { + cur_node = InsertCastBefore( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), inst_type->precision(), head_type->layout())); + } + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore( + "layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = InsertCastBefore( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(cur_node, inst_node); + + // reset opdesc and update kernel information + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), + head_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateInputTo( + sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name); + } + + // recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph) { + CHECK(inst_node->IsStmt()); + constexpr auto subgraph_target = TARGET(kMLU); + constexpr auto subgraph_layout = DATALAYOUT(kNHWC); + + // get subgraph's valid precision + const auto& places = graph->valid_places(); + std::set prec_set; + for (const auto& place : places) { + if (place.target == TARGET(kMLU)) { + prec_set.insert(place.precision); + } + } + + // get subgraph op's type info + size_t kernel_size = inst_node->AsStmt().kernels().size(); + CHECK_GT(kernel_size, 0u); + VLOG(4) << "subgraph kernel size: " << kernel_size; + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = inst_node->AsStmt().kernels()[i].get(); + VLOG(4) << i << "th kernel: " << TargetToStr(kernel->target()) << ", " + << PrecisionToStr(kernel->precision()) << ", " + << DataLayoutToStr(kernel->layout()); + } + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = 
inst_node->AsStmt().kernels()[i].get(); + CHECK(kernel->target() == subgraph_target); + CHECK(kernel->layout() == subgraph_layout); + if (prec_set.count(kernel->precision()) == 1) { + const auto subgraph_precision = kernel->precision(); + CHECK(subgraph_precision == PRECISION(kFloat) || + subgraph_precision == PRECISION(kFP16)) + << "Mlu node has unsupport precision"; + VLOG(4) << "picked kernel precision: " + << PrecisionToStr(subgraph_precision); + *arg_type = LiteType::GetTensorTy( + subgraph_target, subgraph_precision, subgraph_layout); + break; + } + } +} + +bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) { + CHECK(node->IsArg()); + + // some op, for example batch_norm, has output nodes useless + if (node->outlinks.size() == 0) { + return false; + } + + // check if node is weight or persistent + bool is_persist = node->AsArg().is_weight || node->AsArg().is_persist; + if (is_persist) { + VLOG(4) << "Persistent arg name: " << node->AsArg().name + << " is_weight: " << node->AsArg().is_weight + << " is_persist: " << node->AsArg().is_persist; + return false; + } + + const auto target = node->AsArg().type->target(); + const auto precision = node->AsArg().type->precision(); + const auto layout = node->AsArg().type->layout(); + VLOG(4) << "arg name: " << node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + + // do not insert nodes if previous node is on mlu already + if (target == inst_type->target()) { + CHECK(layout == inst_type->layout()) << "Mlu node has wrong layout"; + return false; + } + + return true; +} + +void MLUPostprocessPass::InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* inst_type) { + const auto* tail_type = tail_node->AsArg().type; + + // break original link + RemoveDirectedLink(inst_node, tail_node); + + auto* cur_node = tail_node; + const auto name_prefix = + tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + + // precision cast node + if (tail_type->precision() != inst_type->precision()) { + cur_node = InsertCastAfter( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), inst_type->precision(), tail_type->layout())); + } + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter( + "layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = InsertCastAfter( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(inst_node, cur_node); + + // reset opdesc and update kernel information + UpdateOutputTo(inst_node->AsStmt().op()->mutable_op_info(), + tail_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateOutputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + /* graph like this + * subgraph_op_0 + * / \ + * / \ + * subgraph_op_1 host_op + */ + UpdateInputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + } + + // 
recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { + auto original_selected_kernel = + std::move(inst_node->AsStmt().kernels().front()); + auto updated_op_info = *inst_node->AsStmt().mutable_op_info(); + + inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places()); + inst_node->AsStmt().kernels().clear(); + inst_node->AsStmt().kernels().emplace_back( + std::move(original_selected_kernel)); + for (auto& kernel : inst_node->AsStmt().kernels()) { + VLOG(4) << "kernel info: " << kernel->name(); + inst_node->AsStmt().op()->AttachKernel(kernel.get()); + } +} + +bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { + auto* block_desc = + static_cast(inst->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { + auto op_desc = block_desc->GetOp(op_idx); + CHECK(op_desc); + if (op_desc->Type() == "conv2d") { + for (auto& names : op_desc->inputs()) { + if (std::find(names.second.begin(), + names.second.end(), + arg_node->AsArg().name) != names.second.end()) { + return true; + } + } + } + } + return false; +} + +bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) { + CHECK(arg_node->IsArg()); + for (auto& inst : arg_node->outlinks) { + if (inst->AsStmt().op_type() == "subgraph") { + return IsFirstConvInSubgraph(arg_node, inst); + } + } + return false; +} + +void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + if (IsFirstConvNode(out)) { + first_conv_nodes_.insert(out->AsArg().name); + // modify first conv nodes' type + const auto* old_type = out->AsArg().type; + out->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + paddle::lite_api::PrecisionType::kInt8, + old_type->layout(), + old_type->device()); + } + } + } + } +} + +void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + bool change = true; + for (auto& inst : out->outlinks) { + if (inst->AsStmt().op_type() != "subgraph") { + change = false; + break; + } + } + if (change) { + const auto* old_type = out->AsArg().type; + out->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + old_type->precision(), + paddle::lite_api::DataLayoutType::kNHWC, + old_type->device()); + } + } + } + if (node.AsStmt().op_type() == "fetch") { + for (auto& inp : node.inlinks) { + bool change = true; + for (auto& inst : inp->inlinks) { + if (inst->AsStmt().op_type() != "subgraph") { + change = false; + break; + } + } + if (change) { + const auto* old_type = inp->AsArg().type; + inp->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + old_type->precision(), + paddle::lite_api::DataLayoutType::kNHWC, + old_type->device()); + } + } + } + } +} + +void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { +// currently for non-persistent input and output args, mlu subgraph op +// only support float16/float32 data type + +// in two situations as folllows: +// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; +// arg_in and arg_out are assumed to be NHWC which user should be aware of. 
+// Thus here we change these args' layout to NHWC +#ifdef LITE_WITH_MLU + if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyLayout(graph.get()); + } + + if (lite::DeviceInfo::Global().UseFirstConv()) { + GatherAndModifyFirstConvNodes(graph.get()); + } +#endif + + // insert io_copy, layout and precision cast of subgraph's inputs and outputs + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + const Type* subgraph_arg_type = nullptr; + GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get()); + + auto links_tmp = node.inlinks; + for (auto p_in : links_tmp) { + if (NeedInsert(p_in, subgraph_arg_type)) { + InsertBefore(graph.get(), p_in, &node, subgraph_arg_type); + } + } + links_tmp.assign(node.outlinks.begin(), node.outlinks.end()); + for (auto p_out : links_tmp) { + if (NeedInsert(p_out, subgraph_arg_type)) { + InsertAfter(graph.get(), p_out, &node, subgraph_arg_type); + } + } + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(mlu_postprocess_pass, paddle::lite::mir::MLUPostprocessPass) + .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..688dd06fb5fbec0c8e1c53acfe4215456ddb4192 --- /dev/null +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
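// A sketch of the node chain this pass builds in front of one subgraph input,
// assuming a host fp32/NCHW argument feeding an fp16/NHWC MLU subgraph
// (variable names abbreviated, values hypothetical):
//   x --cast(fp32->fp16)--> x_<ptr>/trans_cast
//     --layout(NCHW->NHWC)--> x_<ptr>/trans_layout
//     --io_copy(host->MLU)--> x_<ptr>/trans_io_copy --> subgraph
// The precision cast is skipped for an input feeding the first conv, since
// that argument is re-typed to int8 by GatherAndModifyFirstConvNodes.
// InsertAfter builds the mirror chain on each subgraph output: io_copy off
// the device, then a layout cast back, then a precision cast back.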
+ +#pragma once + +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +static void UpdateInputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_inputs()) { + for (auto& input : item.second) { + if (input == from) { + input = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto input_names = + desc->GetAttr>("input_data_names"); + for (size_t i = 0; i < input_names.size(); ++i) { + if (input_names[i] == from) { + input_names[i] = to; + } + } + desc->SetAttr>("input_data_names", input_names); +} + +static void UpdateOutputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_outputs()) { + for (auto& output : item.second) { + if (output == from) { + output = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto output_names = + desc->GetAttr>("output_data_names"); + for (size_t i = 0; i < output_names.size(); ++i) { + if (output_names[i] == from) { + output_names[i] = to; + } + } + desc->SetAttr>("output_data_names", output_names); +} + +/* + * The pass changes the node's target to mlu which follows a mlu subgraph op + * */ +class MLUPostprocessPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + void GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph); + + void ModifyLayout(SSAGraph* graph); + + bool NeedInsert(Node* node, const Type* inst_type); + + void InsertBefore(SSAGraph* graph, + Node* head_node, + Node* inst_node, + const Type* type); + + void InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* type); + + Node* InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + Node* InsertCastAfter(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + void RecreateOp(Node* inst_node, SSAGraph* graph); + + void GatherAndModifyFirstConvNodes(SSAGraph* graph); + + bool IsFirstConvNode(Node* arg_node); + + bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + + private: + std::set first_conv_nodes_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/multi_stream_analysis_pass.cc b/lite/core/mir/multi_stream_analysis_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..46454a1fc357c7d96162a58a43a6c34bc890bc69 --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.cc @@ -0,0 +1,313 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
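// ---------------------------------------------------------------------------
// [Editor's note] A rough walk-through of the scheduling below, on a toy graph
// with max_stream_ = 2 and two feed ops (all numbers are illustrative):
//   * Init(): feed0 and its output var are pinned to lane 0 and feed1 to
//     lane 1 (lanes are handed out round-robin over the feed ops); every
//     io_copy_once op goes to stream 0; all remaining ops wait in wait_que_
//     (CUDA kernels) or wait_que_cpu_ (everything else).
//   * Apply(): repeatedly launches any waiting op whose inputs are all
//     accessible (IsPrepared/CheckAccess). Launch() picks the least-loaded
//     lane among the lanes of the op's non-weight inputs via SelectStreamId();
//     if those inputs live on more than one lane, the op gets
//     need_sync_ = true and the other lanes are recorded in sync_streams_.
//   * Finally the launch order is exported through the pass attribute
//     "nodes_in_order" for use by later passes.
// ---------------------------------------------------------------------------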
+ +#include "lite/core/mir/multi_stream_analysis_pass.h" + +#include +#include +#include +#include + +#include "lite/core/device_info.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace mir { + +void MultiStreamAnalysisPass::CleanUp() { + exec_ops_.clear(); + wait_que_.clear(); + wait_que_cpu_.clear(); + std::queue empty_queue; + while (!exec_que_.empty()) { + exec_que_.pop(); + } + ops_in_streams_.clear(); + resources_.clear(); + map_arg_to_lane_.clear(); + op_types_set_.clear(); + io_copy_once_num_ = 0; +} + +void MultiStreamAnalysisPass::Init(SSAGraph* graph) { + // If not cleaned, the clone will overlay the previous state + CleanUp(); + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (op_node->IsStmt()) { + // Set all outputs of op to inaccessible state. + auto outputs = op_node->outlinks; + for (Node* node : outputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (!resources_.count(arg.name)) { + resources_[arg.name] = false; + } + } + // Set the weight input of op to be accessible. + auto inputs = op_node->inlinks; + for (Node* node : inputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (arg.is_weight || arg.is_persist) { + resources_[arg.name] = true; + } + } + + // feed and io_copy_once op has no dependencies and can be launched + // directly. Other ops are put into the waiting queue. + if (op_node->AsStmt().op_type() == "feed" || + op_node->AsStmt().op_type() == "io_copy_once") { + exec_que_.push(op_node); + } else { + auto tgt = op_node->AsStmt().kernels().front()->target(); + if (tgt == TargetType::kCUDA) { + wait_que_.push_back(op_node); + } else { + wait_que_cpu_.push_back(op_node); + } + } + op_types_set_.insert(op_node->AsStmt().op_type()); + } + } + + // Set the stream id according to the number of feed ops, and set the output + // of the feed op to be accessible. + int lane = 0; + auto nodes = graph->inputs(); + ops_in_streams_.resize(max_stream_); + + for (auto& node : nodes) { + std::string::size_type idx = node->AsArg().name.find("feed"); + if (idx != std::string::npos) { + for (auto& feed_ops : node->outlinks) { + if (feed_ops->AsStmt().op_type() == "feed") { + // feed op doesn't need to wait sync. 
+ feed_ops->AsStmt().need_sync_ = false; + CHECK_EQ(static_cast(feed_ops->outlinks.size()), 1) + << "feed op must have one output."; + for (auto& var : feed_ops->outlinks) { + var->AsArg().lane = lane; + map_arg_to_lane_[var->AsArg().name] = lane; + resources_[var->AsArg().name] = true; + } + feed_ops->AsStmt().stream_id_ = lane; + ops_in_streams_[lane].push_back(feed_ops); + ++lane; + if (lane >= max_stream_) { + lane = 0; + } + } + } + } + // set all io_copy_once op in the first stream + for (auto& io_copy_once_ops : node->outlinks) { + if (io_copy_once_ops->AsStmt().op_type() == "io_copy_once") { + ops_in_streams_[0].push_back(io_copy_once_ops); + io_copy_once_ops->AsStmt().stream_id_ = 0; + io_copy_once_ops->AsStmt().need_sync_ = false; + ++io_copy_once_num_; + } + } + } +} + +bool MultiStreamAnalysisPass::CheckOpSupport() { + std::unordered_set invalid_op = { + "while", "conditional_block", "conditional_block_infer", "graph_op"}; + for (auto& op_type : op_types_set_) { + if (invalid_op.count(op_type)) { + LOG(INFO) << "multi_stream_analysis_pass don't support " << op_type + << ", just return."; + return false; + } + } + return true; +} + +bool MultiStreamAnalysisPass::IsPrepared(Node* stmt_node) { + // feed op are prepared when init. + std::string op_name = stmt_node->AsStmt().op_type(); + if (op_name == "feed") { + return true; + } + + // Check is op's input are all accessible. + std::vector args; + for (auto* ins : stmt_node->inlinks) { + args.push_back(ins->AsArg().name); + } + return CheckAccess(args); +} + +bool MultiStreamAnalysisPass::CheckAccess( + const std::vector& args) { + if (args.size() == 0) { + return true; + } + for (auto& name : args) { + if (resources_[name]) { + continue; + } else { + return false; + } + } + return true; +} + +int MultiStreamAnalysisPass::SelectStreamId(const std::vector& lanes) { + if (lanes.size() == 0) { + return 0; + } + + int res = lanes[0]; + int exclude_io_copy_once_num = ops_in_streams_[0].size() - io_copy_once_num_; + int min_num = lanes[0] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[0]].size(); + for (size_t i = 1; i < lanes.size(); ++i) { + int ith_num = lanes[i] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[i]].size(); + if (ith_num < min_num) { + res = lanes[i]; + min_num = ith_num; + } + } + + return res; +} + +void MultiStreamAnalysisPass::Launch(Node* stmt_node) { + // record ops launch order. + exec_que_.push(stmt_node); + std::vector lanes; + for (auto& in_arg : stmt_node->inlinks) { + // Weight parameter does not involve stream id, so just skip it. + if (in_arg->AsArg().is_weight || in_arg->AsArg().is_persist) { + continue; + } + + if (std::find(lanes.begin(), lanes.end(), in_arg->AsArg().lane) == + lanes.end()) { + lanes.push_back(in_arg->AsArg().lane); + } + } + + int stream_id = SelectStreamId(lanes); + + // If all inputs of the op are on multiple streams, they need to be + // synchronized + if (lanes.size() > 1) { + for (size_t i = 0; i < lanes.size(); ++i) { + if (lanes[i] != stream_id) { + stmt_node->AsStmt().sync_streams_.push_back(lanes[i]); + } + } + stmt_node->AsStmt().need_sync_ = true; + } + // io_copy are nodes inserted across devices and need to be synced. + if (stmt_node->AsStmt().op_type() == "io_copy") { + stmt_node->AsStmt().need_sync_ = true; + } + stmt_node->AsStmt().stream_id_ = stream_id; + + // set output lane and set the output of op to be accessible. 
+ for (auto& out_arg : stmt_node->outlinks) { + out_arg->AsArg().lane = stream_id; + resources_[out_arg->AsArg().name] = true; + } + ops_in_streams_[stream_id].push_back(stmt_node); +} + +void MultiStreamAnalysisPass::Apply(const std::unique_ptr& graph) { +#ifdef LITE_WITH_CUDA + typename Env::Devs& devs = + Env::Global(); + int dev_id = TargetWrapper::GetCurDevice(); + max_stream_ = devs[dev_id].max_stream(); +#else + LOG(FATAL) << "Please re-compile by setting the cmake flag LITE_WITH_CUDA=ON"; +#endif + + // Find the correct startup sequence for op. + Init(graph.get()); + bool is_valid = CheckOpSupport(); + if (!is_valid) { + return; + } + size_t prev_size; + + while (!(this->wait_que_.empty() && this->wait_que_cpu_.empty())) { + prev_size = this->wait_que_.size() + this->wait_que_cpu_.size(); + // launch the acessible cuda kernel and remove it from wait que. + for (auto it = this->wait_que_.begin(); it != this->wait_que_.end();) { + if (IsPrepared(*it)) { + Launch(*it); + it = wait_que_.erase(it); + } else { + ++it; + } + } + // launch the accessible cpu kernel and remove it from wait que. + for (auto cpu_it = this->wait_que_cpu_.begin(); + cpu_it != this->wait_que_cpu_.end();) { + if (IsPrepared(*cpu_it)) { + Launch(*cpu_it); + cpu_it = wait_que_cpu_.erase(cpu_it); + } else { + ++cpu_it; + } + } + + if (this->wait_que_.size() + this->wait_que_cpu_.size() == prev_size) { + LOG(FATAL) << "network topo error!"; + } + } + + // Get exec ops order. + while (!exec_que_.empty()) { + auto* node = exec_que_.front(); + exec_ops_.push_back(node); + VLOG(4) << node->AsStmt().op_type() + << " stream: " << node->AsStmt().stream_id_ + << ", sync: " << node->AsStmt().need_sync_; + if (node->AsStmt().need_sync_) { + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + VLOG(4) << " " << node->AsStmt().sync_streams_[i]; + } + } + exec_que_.pop(); + } + + // Set attribute parameters, for passing parameters between passes + const std::string attr_name{"nodes_in_order"}; + SetAttr>(attr_name, &exec_ops_); + + LOG(INFO) << "stream " << 0 << " has " + << ops_in_streams_[0].size() - io_copy_once_num_ + << " ops. (exclude io_copy_once)."; + for (size_t i = 1; i < ops_in_streams_.size(); ++i) { + LOG(INFO) << "stream " << i << " has " << ops_in_streams_[i].size() + << " ops."; + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(multi_stream_analysis_pass, + paddle::lite::mir::MultiStreamAnalysisPass) + .BindTargets({TARGET(kCUDA)}); diff --git a/lite/core/mir/multi_stream_analysis_pass.h b/lite/core/mir/multi_stream_analysis_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..37a7feca3a1200ad7ff26ef8fc0317deee9d174e --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.h @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
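// ---------------------------------------------------------------------------
// [Editor's note] A small worked example of the stream selection rule in
// SelectStreamId() above (the numbers are made up): suppose an op's non-weight
// inputs live on lanes {0, 1}, stream 0 currently holds 5 ops of which 2 are
// io_copy_once (so its effective load is 5 - 2 = 3), and stream 1 holds 4 ops.
// io_copy_once ops are excluded from stream 0's count, so 3 < 4 and the op is
// assigned to stream 0. The chosen stream_id_ is later consumed by
// runtime_context_assign_pass.cc (further down in this patch), which passes it
// to ContextScheduler::Global().NewContext(target, stream_id).
// ---------------------------------------------------------------------------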
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lite/core/kernel.h" +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * MultiStreamAnalysisPass will find the correct launch sequence for all ops. + * Ideally, the order should be multiple asynchronous ops and a small number of + * synchronous ops. + */ +class MultiStreamAnalysisPass : public StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + // Init resource list. Set all ops except feed to inaccessible state and set + // stream id according to the numer of inputs. + void Init(SSAGraph* graph); + + // Clean state information of all member variables. + void CleanUp(); + + // After launching, unlock the output resources of op. + void Launch(Node* stmt_node); + + // If all inputs of an op are accessible, the op is considered to be in the + // prepared state + bool IsPrepared(Node* stmt_node); + + // Determine if all inputs of op are accessible. + bool CheckAccess(const std::vector& args); + + // The logic of selecting a stream: + // 1. Make the number of ops on each stream as close as possible. + // 2. The selected stream must be one of the streams contained in the input + // arg + int SelectStreamId(const std::vector& lanes); + + // Check if the model's ops are all supported. If you encounter unsupported + // ops, exit + bool CheckOpSupport(); + + private: + std::list wait_que_; + std::list wait_que_cpu_; + std::queue exec_que_; + std::vector exec_ops_; + std::vector> ops_in_streams_; + std::unordered_map resources_; + std::unordered_map map_arg_to_lane_; + int max_stream_; + int io_copy_once_num_; + std::unordered_set op_types_set_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index e7c44d2be689a9d890158c097e198314413d1ba3..ae7b112d9157de3f53c409dfc89bf1273531e05f 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -80,12 +80,18 @@ class Node { // Description. std::string desc; + + // for cuda multi stream + bool need_sync_{false}; + int stream_id_{0}; + // streams which need to be sync. exclude stream_id_ + std::vector sync_streams_{}; }; struct Arg { std::string name; int id{0}; - const Type* type{}; + const Type* type{nullptr}; // Weight is a special kind of argument, it is marked as weight explicitly // so that some weight related optimization can take place. bool is_weight{false}; @@ -93,6 +99,7 @@ class Node { // if the need more than one tool operator(eg. io_copy layout calib), the // argument between them should be persist to make sure it's only run once bool is_persist{false}; + int lane{-1}; }; Arg& AsArg(const std::string& name, int id); diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h index 4e8c8be292bbd5e7f46664378634d4f1aeed2965..64f2db82c0b1b0b863c1aa61b3b2affea5f85d89 100644 --- a/lite/core/mir/pass.h +++ b/lite/core/mir/pass.h @@ -17,9 +17,11 @@ #include #include #include +#include #include "lite/core/mir/node.h" #include "lite/core/mir/ssa_graph.h" +#include "lite/utils/varient.h" namespace paddle { namespace lite { @@ -121,6 +123,27 @@ class Pass { virtual ~Pass() = default; + bool HasAttr(const std::string& attr_name) const { + return pass_attrs_.count(attr_name) > 0; + } + + // Set a pointer to the attribute. Specific pass itself takes ownership of the + // attribute. 
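  // [Editor's note] Illustrative usage of the attribute mechanism, assuming
  // the consumer can obtain the producing pass instance (e.g. from the pass
  // manager; that lookup is not shown here):
  //
  //   // producer (see multi_stream_analysis_pass.cc above):
  //   SetAttr<std::vector<Node*>>("nodes_in_order", &exec_ops_);
  //
  //   // consumer (hypothetical later pass):
  //   if (producer_pass->HasAttr("nodes_in_order")) {
  //     const auto& order =
  //         producer_pass->GetAttr<std::vector<Node*>>("nodes_in_order");
  //   }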
+ template + void SetAttr(const std::string& attr_name, const AttrType* attr) { + VLOG(4) << "Setting the attribute " << attr_name << " for the pass " + << name_; + pass_attrs_[attr_name].set(*attr); + } + + // Get a reference to the attribute previously set. + template + const AttrType& GetAttr(const std::string& attr_name) const { + CHECK(pass_attrs_.count(attr_name)) + << attr_name << " attr not register for pass " << name_; + return pass_attrs_.at(attr_name).get(); + } + private: const Kind kind_; std::string name_; @@ -128,6 +151,8 @@ class Pass { std::set bound_targets_; std::set excluded_targets_; std::unordered_map> bound_kernels_; + std::unordered_map>> + pass_attrs_; }; // Different kinds. diff --git a/lite/core/mir/pass_registry.h b/lite/core/mir/pass_registry.h index 849f80aea2191b72ac423c7125a4e69cb6927be5..170de1cd31ffd31662eb98898ad795993a36289e 100644 --- a/lite/core/mir/pass_registry.h +++ b/lite/core/mir/pass_registry.h @@ -59,6 +59,9 @@ class PassRegistry { } // namespace lite } // namespace paddle +// some platform-independent defintion +#include "lite/utils/macros.h" + #define REGISTER_MIR_PASS(name__, class__) \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ new class__); \ @@ -66,4 +69,4 @@ class PassRegistry { return mir_pass_registry##name__.Touch(); \ } \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ - __attribute__((unused)) = mir_pass_registry##name__ + UNUSED = mir_pass_registry##name__ diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc index b625919cbfb6d26ecbbd1bad36772aff86bee087..aaebf852b2ec519515e59655a57600f59ec6a2c3 100644 --- a/lite/core/mir/pattern_matcher.cc +++ b/lite/core/mir/pattern_matcher.cc @@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector *subgraphs) { } std::string PMPattern::DotString() const { - using inference::analysis::Dot; Dot dot; int id = 0; // Create Nodes diff --git a/lite/core/mir/pattern_matcher.h b/lite/core/mir/pattern_matcher.h index 90c4359c6d3ade98cf60b5c23411e2026cdeccc9..0cbfbd986ce743985fde64b8e71b9b0e2b135b9e 100644 --- a/lite/core/mir/pattern_matcher.h +++ b/lite/core/mir/pattern_matcher.h @@ -162,6 +162,12 @@ struct PMNode { attr_name, [=](const T& src) { return src == attr; }); } + PMNode* assert_node_satisfied( + const std::function& condition) { + asserts_.push_back(condition); + return this; + } + private: PMNode(PMPattern* pattern, const std::string& name = "", diff --git a/lite/core/mir/pattern_matcher_high_api.h b/lite/core/mir/pattern_matcher_high_api.h index e62a4fc7494d750b2b5331c4b54b787df239ceee..3ac8e331aacb28044fca7f328319de37b27950bf 100644 --- a/lite/core/mir/pattern_matcher_high_api.h +++ b/lite/core/mir/pattern_matcher_high_api.h @@ -64,7 +64,6 @@ class FuseBase { protected: virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0; - private: void PerformPatternMatcher(SSAGraph* graph); // Delete nodes that are marked as Intermediate diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..187e6b634fcf9d38cb32b7ca936ac8039c1717cf --- /dev/null +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/quantized_op_attributes_inference_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void QuantizedOpAttributesInferencePass::Apply( + const std::unique_ptr& graph) { + // Only for fully quantized model which is only supported by MTK and RK NPU. + // Replace the output_scale with the input_scale of the adjacent quantized + // ops, and fix the missing of the attribute 'enable_int8'. + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + auto& inst = op_node->AsStmt(); + auto op_info = inst.op_info(); + auto op_type = op_info->Type(); + if (!op_info->HasAttr("input_scale")) continue; + bool found = false; + float output_scale; + for (auto out_var_node : op_node->outlinks) { + CHECK(out_var_node->IsArg()); + for (auto out_op_node : out_var_node->outlinks) { + CHECK(out_op_node->IsStmt()); + auto& out_inst = out_op_node->AsStmt(); + auto out_op_info = out_inst.op_info(); + if (!out_op_info->HasAttr("input_scale")) continue; + auto input_scale = out_op_info->GetAttr("input_scale"); + if (!found) { + found = true; + output_scale = input_scale; + } else { + CHECK_EQ(output_scale, input_scale); + } + } + } + if (found) { + inst.mutable_op_info()->SetAttr("output_scale", output_scale); + } else if (op_info->HasAttr("output_scale")) { + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = op_info->GetAttr("output_scale"); + inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); + } + if (op_info->HasAttr("output_scale")) { + inst.mutable_op_info()->SetAttr("enable_int8", true); + } + } + VLOG(5) << "\n" << Visualize(graph.get()); +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, + paddle::lite::mir::QuantizedOpAttributesInferencePass) + .BindTargets({TARGET(kAPU), TARGET(kRKNPU)}); diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.h b/lite/core/mir/quantized_op_attributes_inference_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2b475e0b3d662a9837b7766efb4ccc8f87037b7a --- /dev/null +++ b/lite/core/mir/quantized_op_attributes_inference_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
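// ---------------------------------------------------------------------------
// [Editor's note] A worked example of the attribute fix-up done by the pass
// above (numbers are illustrative): if none of the downstream ops carries an
// "input_scale" but the current op has bit_length = 8 and an "output_scale"
// of 127.0, then range = (1 << 7) - 1 = 127 and the attribute is rewritten to
// 127.0 / 127 = 1.0. Whenever an "output_scale" ends up set, "enable_int8" is
// also set to true on the op.
// ---------------------------------------------------------------------------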
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +class QuantizedOpAttributesInferencePass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 97c4819eaf6734ba9b374444166d17cb15e8ae65..5b6f968484b7b49838a004c3edfd00ff9b7e5e5e 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -24,11 +24,32 @@ class RuntimeContextAssignPass : public StmtPass { RuntimeContextAssignPass() {} void Apply(const std::unique_ptr& graph) override { +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; auto& inst = node.AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); + +#ifdef LITE_WITH_OPENCL + if (inst.picked_kernel().target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx) + .As() + .CopySharedTo(&ctx->As()); + inst.picked_kernel().SetContext(std::move(ctx)); + } else { + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target())); + } +#else + int stream_id = inst.stream_id_; + + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), stream_id)); +#endif } } }; diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 0d4c642877f7beccfe37ebb92a5f6e7e508d37b0..c8813edfb3aed9531bdbb4e80e44bc26bcf55ba7 100755 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -64,6 +64,26 @@ std::map> SSAGraph::BuildOperationAdjList() { return adj_list; } +std::map> SSAGraph::BuildNodeAdjList() { + std::map> adj_list; + + for (auto &n : mutable_nodes()) { + if (adj_list.find(&n) == adj_list.end()) { + adj_list[&n] = std::set(); + } + std::vector nodes; + for (auto &var : n.inlinks) { + nodes.push_back(var); + } + std::sort(nodes.begin(), + nodes.end(), + [](mir::Node *node1, mir::Node *node2) { return node1 > node2; }); + adj_list[&n].insert(std::make_move_iterator(nodes.begin()), + std::make_move_iterator(nodes.end())); + } + return adj_list; +} + void SSAGraph::SortHelper( const std::map> &adj_list, mir::Node *node, @@ -98,6 +118,24 @@ std::vector SSAGraph::StmtTopologicalOrder() { return res; } +std::vector SSAGraph::NodeTopologicalOrder() { + CheckBidirectionalConnection(); + + std::stack stack; + std::set visited; + std::vector res; + + auto adj_list = BuildNodeAdjList(); + + for (auto adj : adj_list) { + if (visited.find(adj.first) == visited.end()) { + SortHelper(adj_list, adj.first, &visited, &res); + } + } + + return res; +} + Node *SSAGraph::GraphCreateInstructNode( const std::shared_ptr &op, const std::vector &valid_places) { node_storage_.emplace_back(); @@ -140,12 +178,21 @@ void SSAGraph::Build(const Program &program, arg_node->AsArg(name, node_storage_.size() - 1); arg_update_node_map_[name] = arg_node; } - /* - if (var_types.count(name) && !arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + + if (var_types.count(name)) { + if 
(!arg_node->arg()->type) { + arg_node->arg()->type = LiteType::GetTensorTy( + TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + } + // Store the original data type of the output tensors for + // type_precision_cast_pass, to keep the consistency between the + // output types of original graph and optimized graph's + if (op->op_info()->Type() == "fetch") { + op->mutable_op_info()->SetAttr( + "data_type", static_cast(var_types[name])); + } } - */ + if (is_weights(name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(arg_node, op_node); @@ -208,9 +255,10 @@ std::vector SSAGraph::outputs() { } mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { - auto it = arguments_.find(arg); - if (it != arguments_.end()) { - return it->second; + for (auto &node : node_storage_) { + if (node.IsArg() && node.arg()->name == arg) { + return &node; + } } return nullptr; } diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h index b5b9fb1cb28a35f37d51e4e63eb7512354d0547b..e2967cf96a6b00ccc225ce05b043cb94f161b1d6 100644 --- a/lite/core/mir/ssa_graph.h +++ b/lite/core/mir/ssa_graph.h @@ -42,6 +42,8 @@ class SSAGraph : GraphBase { std::vector StmtTopologicalOrder(); + std::vector NodeTopologicalOrder(); + // The inputs of the graph. std::vector inputs(); @@ -86,6 +88,9 @@ class SSAGraph : GraphBase { // Build operator inlink edge table. std::map> BuildOperationAdjList(); + // Build node inlink edge table. + std::map> BuildNodeAdjList(); + void SortHelper(const std::map> &adj_list, mir::Node *node, std::set *visited, diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index f655b298bf2d800f4adf142ad14b8ac05ca00482..dd6e8fff13242d94a8f37bc6f7d23ad7bd306272 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -58,7 +58,7 @@ class StaticKernelPickPass : public mir::StmtPass { const std::unordered_map& out_types, const std::vector& in_names, const std::vector& out_names) { - CHECK_GT(places.size(), 0) << "valid_places is empty."; + CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; float final_score{-1.}; Place winner_place{places[0]}; const int kMax = @@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass { } VLOG(4) << "[score(final)]:" << final_score; - VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + VLOG(2) << "-------- pick summary for " << instruct.op_type() + << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); - VLOG(4) << " ===> kernel.place():" + VLOG(2) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); @@ -163,6 +164,11 @@ class StaticKernelPickPass : public mir::StmtPass { // might have different data layout. // TODO(Superjomn) reconsider the idea of taking the data layout as a kernel // specification. 
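  // Note: the branch below pins the score of any FPGA kernel to a fixed value
  // (1000), overriding the weighted score computed above, so kernel picking on
  // FPGA does not depend on the factors considered earlier.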
+ + if (kernel.place().target == TARGET(kFPGA)) { + final_score = 1000; + } + return final_score; } diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt index f8aa09676c2d1e6d4df6fafbaf6a54bc69491acc..a009f1c6d49f373b8c99ee4814e7f1f62b64018f 100644 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ b/lite/core/mir/subgraph/CMakeLists.txt @@ -12,8 +12,10 @@ if (WITH_TESTING AND NOT LITE_WITH_CUDA) add_dependencies(test_subgraph_detector extern_lite_download_mobilenet_v1_tar_gz extern_lite_download_mobilenet_v2_relu_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_subgraph_detector PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + if(NOT WIN32) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(test_subgraph_detector PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() lite_cc_test(test_subgraph_pass SRCS subgraph_pass_test.cc DEPS mir_passes paddle_api_full paddle_api_light gflags @@ -22,8 +24,10 @@ if (WITH_TESTING AND NOT LITE_WITH_CUDA) add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v1_tar_gz extern_lite_download_mobilenet_v2_relu_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + if(NOT WIN32) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() endif() set(mir_subgraphs subgraph_pass CACHE INTERNAL "mir_subgraphs") diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 6d48b053a1a4140252d35e85d2351644d3c216e9..6bab454c42a68a7513aa01ff06cc2be6c970e199 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -22,15 +22,16 @@ #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pattern_matcher.h" #include "lite/operators/subgraph_op.h" +#include "lite/utils/env.h" +#include "lite/utils/io.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - std::string SubgraphVisualizer::operator()() { - inference::analysis::Dot dot; + Dot dot; const std::vector subgraph_colors{ "red", "green", "cyan", "bisque3", "coral", "darkseagreen1", "goldenrod1", "darkorchid", @@ -46,8 +47,8 @@ std::string SubgraphVisualizer::operator()() { "turquoise4", "snow3", "sienna4", "salmon2", }; std::unordered_map subgraph_indices; - for (int i = 0; i < subgraphs_.size(); i++) { - for (int j = 0; j < subgraphs_[i].size(); j++) { + for (size_t i = 0; i < subgraphs_.size(); i++) { + for (size_t j = 0; j < subgraphs_[i].size(); j++) { subgraph_indices[subgraphs_[i][j]] = i; } } @@ -63,11 +64,11 @@ std::string SubgraphVisualizer::operator()() { } else { exists_ops[op_type]++; } - auto op_name = op_type + std::to_string(exists_ops[op_type]); + auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]); std::string op_color = "white"; if (subgraph_indices.count(node)) { auto subgraph_idx = subgraph_indices[node]; - op_name += "_subgraph_" + std::to_string(subgraph_idx); + op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx); op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; } dot.AddNode(op_name, @@ -209,8 +210,82 @@ void SubgraphDetector::FlexibleDFS( } } +std::unordered_set 
SubgraphDetector::GetExcludedNodesFromConfigFile() { + // get exclude nodes from config file + std::unordered_set excluded_nodes; + std::string config_file_path = + GetStringFromEnv(SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE); + if (!IsFileExists(config_file_path)) { + return excluded_nodes; + } + std::vector lines = ReadLines(config_file_path); + + for (std::string line : lines) { + if (line.empty()) continue; + std::vector node_info = Split(line, ":"); + std::string op_type = node_info.at(0); + std::vector in_vars_name; + if (node_info.size() > 1) { + in_vars_name = Split(node_info.at(1), ","); + } + std::vector out_vars_name; + if (node_info.size() > 2) { + out_vars_name = Split(node_info.at(2), ","); + } + + for (auto &node : graph_->mutable_nodes()) { + if (node.IsArg()) continue; + auto stmt = node.stmt(); + if (op_type != stmt->op_type()) continue; + auto in_nodes = node.inlinks; + auto out_nodes = node.outlinks; + if (in_vars_name.size() > in_nodes.size() || + out_vars_name.size() > out_nodes.size()) { + continue; + } + + bool matched = true; + + for (auto in_var_name : in_vars_name) { + bool find_var = false; + for (auto *in_node : in_nodes) { + if (in_node->arg()->name == in_var_name) { + find_var = true; + break; + } + } + if (!find_var) { + matched = false; + break; + } + } + + for (auto out_var_name : out_vars_name) { + bool find_var = false; + for (auto *out_node : out_nodes) { + if (out_node->arg()->name == out_var_name) { + find_var = true; + break; + } + } + if (!find_var) { + matched = false; + break; + } + } + + if (matched) { + excluded_nodes.insert(&node); + } + } + } + + return excluded_nodes; +} + void SubgraphDetector::InitNodes(node_map_t *nodes) { // Initialize and mark the subgraph detector nodes based on teller. + std::unordered_set excluded_nodes = GetExcludedNodesFromConfigFile(); for (auto &it : *nodes) { for (auto &in_node : it.first->inlinks) { it.second->inlinks.push_back((*nodes)[in_node]); @@ -218,7 +293,7 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { for (auto &out_node : it.first->outlinks) { it.second->outlinks.push_back((*nodes)[out_node]); } - if (teller_(it.first)) { + if (teller_(it.first) && excluded_nodes.count(it.first) == 0) { it.second->marked = true; if (it.first->IsStmt()) { // If a function is inside the subgraph, mark all the output variables @@ -237,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { std::vector> SubgraphDetector::ExtractSubgraphs( node_map_t *nodes) { - for (auto &it : *nodes) { - node_dat_t *node = it.second; + for (auto &ordered_node : graph_->NodeTopologicalOrder()) { + // different orders when traversing nodes in graph may lead to + // different subgraph division, which may generate different result + // with device such as MLU. These different results are all "right" + // but a little confusing. Thus the topological order is used instead + // of the address of the node in graph. 
+ CHECK(nodes->find(ordered_node) != nodes->end()); + node_dat_t *node = (*nodes)[ordered_node]; if (!node->marked) { continue; } @@ -331,7 +412,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops an Vars of the target + // Create a new sub block desc for storing all of Ops and Vars of the target // subgraph and sub_block_idx is set as a attribute of subgraph op, // sub_block_idx < 0 means it's a new subgraph op int sub_block_idx = -(subgraph_idx + 1); @@ -341,9 +422,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &op_node : subgraph_nodes) { auto sub_block_op_desc = sub_block_desc->AddOp(); *sub_block_op_desc = *op_node->AsStmt().op_info(); - sub_block_op_desc->SetAttr( - kKernelTypeAttr, - op_node->AsStmt().picked_kernel().SerializedKernelType()); } subgraph_op_desc.SetAttr("sub_block", sub_block_idx); @@ -375,6 +453,37 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, subgraph_op_desc.SetAttr>("output_data_names", output_var_names); + // Set input/output scale values of input/output var nodes for + // type_precision_cast_pass. + std::vector input_data_scales; + std::vector output_data_scales; + for (auto &var_node : input_var_nodes) { + auto any_op_node = var_node->outlinks.front(); + CHECK(any_op_node->IsStmt()); + auto &any_inst = any_op_node->AsStmt(); + if (any_inst.op_info()->HasAttr("input_scale")) { + input_data_scales.push_back( + any_inst.op_info()->GetAttr("input_scale")); + } + } + for (auto &var_node : output_var_nodes) { + auto any_op_node = var_node->inlinks.front(); + CHECK(any_op_node->IsStmt()); + auto &any_inst = any_op_node->AsStmt(); + if (any_inst.op_info()->HasAttr("output_scale")) { + output_data_scales.push_back( + any_inst.op_info()->GetAttr("output_scale")); + } + } + if (input_data_scales.size() > 0) { + subgraph_op_desc.SetAttr>("input_data_scales", + input_data_scales); + } + if (output_data_scales.size() > 0) { + subgraph_op_desc.SetAttr>("output_data_scales", + output_data_scales); + } + // Set all of the inputs and outputs to the target subgraph op // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() for (auto &var_node : weight_var_nodes) { @@ -413,12 +522,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, IR_OP_VAR_LINK(subgraph_op_node, var_node); } - // Create and assign the context to the picked kernel of the new subgraph - // node - auto &inst = subgraph_op_node->AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); - // Remove subgraph nodes and unused var nodes auto nodes2rm = GetNodes2RM(subgraph_nodes, {input_var_nodes, @@ -435,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, std::vector> subgraphs = SubgraphDetector(graph, teller)(); SubgraphVisualizer(graph, subgraphs)(); - for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { + for (size_t subgraph_idx = 0; subgraph_idx < subgraphs.size(); + subgraph_idx++) { if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); } @@ -474,13 +578,14 @@ void ExtractInputsOutputs(const std::vector &op_nodes, unused_var_nodes->insert(var_node); continue; } - // Var can have more than one next op node, So, if any one in the - // op_nodes then continue - bool next_op_in_nodes = false; + // Var can have more than one next op node, So, if all next nodes are in + // 
op_nodes then it should be put into local_var_nodes + bool next_op_in_nodes = true; for (auto &next_op_node : var_node->outlinks) { - if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) != + if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) == op_nodes.end()) { - next_op_in_nodes = true; + next_op_in_nodes = false; + break; } } if (next_op_in_nodes) { diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h index b6873655e976a785383269972221f001196431f8..567f2446a2af31c739b049005d2960ffbc802ef9 100644 --- a/lite/core/mir/subgraph/subgraph_detector.h +++ b/lite/core/mir/subgraph/subgraph_detector.h @@ -63,6 +63,7 @@ class SubgraphDetector { node_dat_t* UnionFindAncestor(); void UnionFindCombine(node_dat_t* candidate); }; + SubgraphDetector(SSAGraph* graph, const SubgraphTeller& teller) : graph_(graph), teller_(teller) {} std::vector> operator()(); @@ -71,7 +72,11 @@ class SubgraphDetector { bool reverse, const std::function& enter, const std::function& leave); + + std::unordered_set GetExcludedNodesFromConfigFile(); + void InitNodes(node_map_t* nodes); + std::vector> ExtractSubgraphs(node_map_t* nodes); protected: diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 3b0d7c5cd5c8a0d0901750148359f430b6d49894..f52c0332fa3cfce904d2b7c8bf010bc3d3ac6ac9 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -36,10 +36,10 @@ std::vector AddFCDesc( const std::shared_ptr& scope, const std::vector& input_var_names, const std::vector& wshape) { - CHECK_EQ(input_var_names.size(), 1); - CHECK_EQ(wshape.size(), 2); + CHECK_EQ(input_var_names.size(), 1u); + CHECK_EQ(wshape.size(), 2u); static int id = 0; - std::string prefix = "fc_" + std::to_string(id); + std::string prefix = "fc_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* wgt = block_desc->AddVar(); @@ -76,7 +76,7 @@ std::vector AddElementwiseAddDesc( const std::vector& input_Y_names) { // CHECK_EQ(input_var_names.size(), 2); static int id = 0; - std::string prefix = "elementwise_add_" + std::to_string(id); + std::string prefix = "elementwise_add_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -100,7 +100,7 @@ std::vector AddFeedDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "feed_" + std::to_string(id); + std::string prefix = "feed_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -123,7 +123,7 @@ std::vector AddFetchDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "fetch_" + std::to_string(id); + std::string prefix = "fetch_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) { }; std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); - ASSERT_EQ(subgraphs.size(), 1); - ASSERT_EQ(graph->nodes().size(), 9); + ASSERT_EQ(subgraphs.size(), 1u); + ASSERT_EQ(graph->nodes().size(), 9u); mir::SubgraphVisualizer(graph.get(), subgraphs)(); } @@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, 
#endif }); @@ -220,8 +220,8 @@ TEST(Subgraph, detect_custom_model) { }; std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); - ASSERT_EQ(subgraphs.size(), 1); mir::SubgraphVisualizer(graph.get(), subgraphs)(); + ASSERT_EQ(subgraphs.size(), 1u); } } // namespace lite diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index 5e2cecd277820ab39b5a25db6159591157982d01..663b69d38843555095957f30d652ba8ef6216a0e 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -20,6 +20,7 @@ #include #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/subgraph/subgraph_detector.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -39,7 +40,24 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void APUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) \ + supported_lists.insert(#op_type); \ + LOG(INFO) << #op_type +#include "lite/kernels/apu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { + if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" @@ -67,13 +85,47 @@ void BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void RKNPUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + +void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/mlu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + } // namespace mir } // namespace lite } // namespace paddle REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) .BindTargets({TARGET(kNPU)}); +REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass) + .BindTargets({TARGET(kAPU)}); REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass) + .BindTargets({TARGET(kRKNPU)}); +REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) + .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h 
b/lite/core/mir/subgraph/subgraph_pass.h index 1ba0f2ab4aa52c384f4175de0eb34475b34fb94c..8c2b501a62356c91e93f3c4ca91f70879d3c9229 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class APUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class XPUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; @@ -37,6 +42,16 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class RKNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +class MLUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + } // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 247795a86ce2cbe962b161311f7845622ee3983e..8fd3751f9ca1585af6b8b00f23acd6bacf5b7a51 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -15,11 +15,9 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model"); @@ -27,6 +25,7 @@ DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model"); DEFINE_string(input_tensor_shape, "1,3,224,224", "shape of input tensors"); DEFINE_string(input_tensor_type, "float32", "data type of input tensors"); DEFINE_string(output_tensor_type, "float32", "data type of output tensors"); +DEFINE_string(subgraph_model_cache_dir, "", "dir of subgraph model cache"); namespace paddle { namespace lite { @@ -34,43 +33,17 @@ namespace lite { // The helper functions for loading and running model from command line and // verifying output data std::vector TypeParsing(std::string text) { - std::vector types; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string type = text.substr(0, index); - VLOG(3) << type; - types.push_back(type); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); - } - } - return types; + return Split(text, ":"); } std::vector> ShapeParsing(std::string text) { std::vector> shapes; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string slice = text.substr(0, index); - std::vector shape; - while (!slice.empty()) { - size_t index = slice.find_first_of(","); - int d = atoi(slice.substr(0, index).c_str()); - VLOG(3) << d; - shape.push_back(d); - if (index == std::string::npos) { - break; - } else { - slice = slice.substr(index + 1); - } - } - shapes.push_back(shape); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); + std::vector shape_strings = Split(text, ":"); + shapes.resize(shape_strings.size()); + for (size_t i = 0; i < shape_strings.size(); i++) { + std::vector shape_nums = Split(shape_strings[i], ","); + for (auto shape_num : shape_nums) { + shapes[i].push_back(atoi(shape_num.c_str())); } } return shapes; @@ -94,7 +67,7 
@@ void FillInputTensors( for (int j = 0; j < input_tensor_size; j++) { \ input_tensor_data[j] = static_cast(value); \ } - for (int i = 0; i < input_tensor_shape.size(); i++) { + for (size_t i = 0; i < input_tensor_shape.size(); i++) { auto input_tensor = predictor->GetInput(i); input_tensor->Resize(input_tensor_shape[i]); auto input_tensor_size = ShapeProduction(input_tensor->shape()); @@ -123,7 +96,7 @@ void CheckOutputTensors( << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ EXPECT_LT(rel_diff, 0.1); \ } - for (int i = 0; i < output_tensor_type.size(); i++) { + for (size_t i = 0; i < output_tensor_type.size(); i++) { auto tar_output_tensor = tar_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); @@ -160,6 +133,7 @@ std::shared_ptr TestModel( mobile_config.set_model_from_file(optimized_model_dir + ".nb"); mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); mobile_config.set_threads(1); + mobile_config.set_subgraph_model_cache_dir(FLAGS_subgraph_model_cache_dir); predictor = lite_api::CreatePaddlePredictor(mobile_config); FillInputTensors(predictor, input_tensor_shape, input_tensor_type, 1); // Run optimized model @@ -167,6 +141,7 @@ std::shared_ptr TestModel( predictor->Run(); } for (int i = 0; i < FLAGS_repeats; i++) { + FillInputTensors(predictor, input_tensor_shape, input_tensor_type, i); auto start = GetCurrentUS(); predictor->Run(); LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; @@ -208,7 +183,7 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif auto tar_predictor = TestModel(FLAGS_model_dir, diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index b3b7a858f68367ac789f390c6bd3bd94873f77d5..1133e5ba8203ec9fea177844a6311c993f6b8ff7 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -20,6 +20,8 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" +#include "lite/operators/subgraph_op.h" #include "lite/utils/string.h" namespace paddle { @@ -39,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; - VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc - << " inlinks.size():" << inlinks.size(); + VLOG(4) << "============== node->AsStmt().op_type():" + << node->AsStmt().op_type() << " inlinks.size():" << inlinks.size() + << " ================"; for (auto* in : inlinks) { ComplementInputs(graph.get(), node, in); } @@ -66,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name)); auto decl_arg_type = inst.picked_kernel().GetInputDeclType(inst_in_tensor_name); + CHECK(in->AsArg().type); - VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name + VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name << "\n in->AsArg().name:" << in->AsArg().name << "\n *in->AsArg().type:" << *in->AsArg().type << "\n *decl_arg_type:" << *decl_arg_type << "\n 
inst.op()->DebugString():" << inst.op()->DebugString(); + // TODO(ysh329): conflict if tensor with kARM target but kImageDefault(OpenCL + // layout). + // not a good judge, but don't find the source of this issue from + // static_pick_kernel_pass + // to this pass. + auto* in_arg_type = const_cast(in->AsArg().type); + if (in_arg_type->target() == TARGET(kARM) && + in_arg_type->layout() == DATALAYOUT(kImageDefault)) { + return; + } + if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name << " for kernel " << inst.op()->DebugString() << " " @@ -170,9 +185,8 @@ void TypeLayoutTransformPass::AddLayoutInst( DirectedLink(layout_output_arg, inst_node); // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - layout_output_name); + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, layout_output_name); auto original_selected_kernel = std::move(inst_node->AsStmt().kernels().front()); auto update_op_info = *inst_node->AsStmt().op_info(); @@ -204,6 +218,30 @@ void TypeLayoutTransformPass::SetValidPlaces( valid_places_ = valid_places; } +void OpenCLTypeLayoutTransformPass::Apply( + const std::unique_ptr& graph) { + // Start from inputs of the graph, those should have place set. + VLOG(4) << "\n" << Visualize(graph.get()); + std::list nodes; + for (auto& node : graph->StmtTopologicalOrder()) { + nodes.push_back(node); + } + + VLOG(4) << "nodes.size():" << nodes.size(); + for (auto& node : nodes) { + VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); + if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; + VLOG(1) << "node->AsStmt().op_type():" << node->AsStmt().op_type(); + if (node->AsStmt().op_type() == "layout" || + node->AsStmt().op_type() == "io_copy") { + auto new_op = node->AsStmt().mutable_op_info(); + int process_type = 1; + new_op->SetAttr("process_type", process_type); + } + } + VLOG(4) << "\n" << Visualize(graph.get()); +} + } // namespace mir } // namespace lite } // namespace paddle @@ -213,3 +251,9 @@ REGISTER_MIR_PASS(type_layout_cast_pass, .BindTargets({TARGET(kAny)}) .BindKernel("layout_once") .BindKernel("layout"); + +REGISTER_MIR_PASS(type_layout_cast_preprocess_pass, + paddle::lite::mir::OpenCLTypeLayoutTransformPass) + .BindTargets({TARGET(kAny)}) + .BindKernel("layout_once") + .BindKernel("layout"); diff --git a/lite/core/mir/type_layout_cast_pass.h b/lite/core/mir/type_layout_cast_pass.h index bf36214e1dce33352468155a6817adda9039727a..4a3e4c02d1053e84dd39bee14a0e01260f0626e4 100644 --- a/lite/core/mir/type_layout_cast_pass.h +++ b/lite/core/mir/type_layout_cast_pass.h @@ -24,18 +24,6 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - class TypeLayoutTransformPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; @@ -57,6 +45,15 @@ class TypeLayoutTransformPass : public ProgramPass { std::vector valid_places_; }; +// add preprocess and postprocess attribute for layout op +class OpenCLTypeLayoutTransformPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + std::vector valid_places_; +}; + } // namespace mir } // namespace lite } // namespace paddle diff --git 
a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 2f177383fc2b3a035313c0654c961c0b21a7f197..121e64dc188eeb638becec3506b514bc24dad16d 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -20,11 +20,116 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/operators/subgraph_op.h" namespace paddle { namespace lite { namespace mir { +// For the subgraph op, we also need to update the attr 'input_data_names' and +// the input variables names of the Ops in the subblock. +void UpdateInputsForSubgraph(OpLite* op, + const std::string& from, + const std::string& to) { + auto* op_desc = op->mutable_op_info(); + auto input_data_names = + op_desc->GetAttr>("input_data_names"); + std::replace(input_data_names.begin(), input_data_names.end(), from, to); + op_desc->SetAttr("input_data_names", input_data_names); + auto* subblock_desc = static_cast(op)->GetSubBlock(); + CHECK(subblock_desc); + for (size_t i = 0; i < subblock_desc->OpsSize(); i++) { + auto* subblock_op_desc = subblock_desc->GetOp(i); + for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) { + for (auto& subblock_var_name : subblock_op_input.second) { + if (subblock_var_name == from) { + subblock_var_name = to; + } + } + } + } +} + +// Update the input variable names from 'from' to 'to' for the target Op +void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) { + auto* op_desc = op->mutable_op_info(); + auto op_type = op_desc->Type(); + for (auto& op_input : *op_desc->mutable_inputs()) { + for (auto& var_name : op_input.second) { + if (var_name == from) { + var_name = to; + } + } + } + if (op_type == "subgraph") { + UpdateInputsForSubgraph(op, from, to); + } +} + +// Infer the scale value for the new calib op from the subgraph op +static bool InferScaleFromSubgraph(std::string var_name, + const OpInfo* op_info, + float* scale, + bool reverse = false) { + std::string attr_name = reverse ? "output_data_names" : "input_data_names"; + if (!op_info->HasAttr(attr_name)) return false; + auto input_or_output_names = + op_info->GetAttr>(attr_name); + attr_name = reverse ? "output_data_scales" : "input_data_scales"; + if (!op_info->HasAttr(attr_name)) return false; + auto input_or_output_scales = op_info->GetAttr>(attr_name); + auto size = input_or_output_names.size(); + CHECK(size == input_or_output_scales.size()); + for (size_t i = 0; i < size; i++) { + if (input_or_output_names[i] == var_name) { + *scale = input_or_output_scales[i]; + return true; + } + } + return false; +} + +// Infer the scale value for the new calib op from the input_scale of the +// current op and output_scale of the previous op. +// case 1: prev_op->var_node->op_node(int8->any op, with input_scale). +// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with +// input_data_scales). +// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, +// without input_scale). +// case 4: prev_op(any->int8, subgraph_op, with +// output_data_scales)->var_node->op_node(fp32->any, without input_scale). 
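+// Illustrative example (hypothetical attribute values): a subgraph op with
+//   input_data_names  = {"x", "y"}
+//   input_data_scales = {0.5, 0.25}
+// yields scale = 0.25 for var_name == "y" (cases 1/2); in cases 3/4 the scale
+// is instead taken from the previous op's "output_scale" or
+// "output_data_scales" attribute.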
+static bool InferScale(Node* var_node, Node* op_node, float* scale) { + bool found = false; + auto& inst = op_node->AsStmt(); + auto op_info = inst.op_info(); + auto op_type = op_info->Type(); + auto var_name = var_node->AsArg().name; + if (op_type == "subgraph") { + found = InferScaleFromSubgraph(var_name, op_info, scale, false); + } else { + if (op_info->HasAttr("input_scale")) { + *scale = op_info->GetAttr("input_scale"); + found = true; + } else { + // Obtain the output_scale from one of its previous Ops + auto prev_op_node = var_node->inlinks.front(); + CHECK(prev_op_node->IsStmt()); + auto& prev_inst = prev_op_node->AsStmt(); + auto prev_op_info = prev_inst.op_info(); + auto prev_op_type = prev_op_info->Type(); + if (prev_op_type == "subgraph") { + found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true); + } else { + if (prev_op_info->HasAttr("output_scale")) { + *scale = prev_op_info->GetAttr("output_scale"); + found = true; + } + } + } + } + return found; +} + void PrecisionCastPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. std::list nodes; @@ -32,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr& graph) { nodes.push_back(node); } + // record the copied node. + std::unordered_map cast_nodes; + for (auto& node : nodes) { if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); + ComplementInputs(graph.get(), node, in, &cast_nodes); } } } -void PrecisionCastPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { +void PrecisionCastPass::ComplementInputs( + SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes) { // If this input is out of date. 
if (inst_node->inlinks.end() == std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) @@ -59,6 +169,14 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); CHECK(in->AsArg().type); VLOG(4) << inst.picked_kernel().name(); + if (inst.op_info()->Type() == "fetch") { + if (inst.op_info()->HasAttr("data_type")) { + auto data_type = + static_cast(inst.op_info()->GetAttr("data_type")); + decl_arg_type = LiteType::GetTensorTy( + decl_arg_type->target(), data_type, decl_arg_type->layout()); + } + } // if (!in->AsArg().is_weight && !PrecisionCompatibleTo(*in->AsArg().type, // *decl_arg_type)) { if (!PrecisionCompatibleTo(*in->AsArg().type, *decl_arg_type)) { @@ -71,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, in, graph, inst_node, + cast_nodes, graph->valid_places()); } } -void PrecisionCastPass::AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { +void PrecisionCastPass::AddCastInst( + const Type& from, + const Type& to, + Node* in, + SSAGraph* graph, + Node* inst_node, + std::unordered_map* cast_nodes, + const std::vector& valid_places) { CHECK(!valid_places.empty()) << "valid_place should be set"; // var -> new_transform_op -> new_var -> inst @@ -88,67 +209,82 @@ void PrecisionCastPass::AddCastInst(const Type& from, CHECK(in->IsArg()); // auto node_id = [&] { return graph->nodes().size(); }; auto cast_op_output_name = in->AsArg().name + "/precision_trans"; - // in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); - auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); - cast_op_output_arg->AsArg().type = - LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); - auto* cast_inst = graph->NewInstructNode(); - - // create Op and kernels. - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string cast_type = in_persist ? "calib_once" : "calib"; - cast_op_output_arg->AsArg().is_persist = in_persist; - auto cast_op = LiteOpRegistry::Global().Create(cast_type); - CHECK(cast_op) << "create op [" << cast_op << "] failed"; - - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); - - // Create Calib Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(cast_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {cast_op_output_name}); - if (inst_node->AsStmt().op_info()->HasAttr("input_scale")) { - op_desc.SetAttr( - "scale", inst_node->AsStmt().op_info()->GetAttr("input_scale")); - } - cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = cast_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->precision() == to.precision()) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); - break; + // in->AsArg().name + "/precision_trans/" + + // paddle::lite::to_string(node_id()); + if (cast_nodes->count(in->AsArg().name)) { + // Remove the old link + RemoveDirectedLink(in, inst_node); + // Update the original instruction OpDesc. 
+ // Update its input to the cast_op_output_name + // Add new link, newarg->inst + DirectedLink(cast_nodes->at(in->AsArg().name), + inst_node); // [io_copy kernel]'s output -> [current kernel] + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } else { + auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); + cast_op_output_arg->AsArg().type = + LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); + auto* cast_inst = graph->NewInstructNode(); + + // create Op and kernels. + bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; + std::string cast_type = in_persist ? "calib_once" : "calib"; + cast_op_output_arg->AsArg().is_persist = in_persist; + auto cast_op = LiteOpRegistry::Global().Create(cast_type); + CHECK(cast_op) << "create op [" << cast_op << "] failed"; + + // Create the new var manually. + inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); + + // Create Calib Instruction. + cpp::OpDesc op_desc; + op_desc.SetType(cast_type); + op_desc.SetInput("Input", {in->AsArg().name}); + op_desc.SetOutput("Out", {cast_op_output_name}); + float scale; + if (InferScale(in, inst_node, &scale)) { + op_desc.SetAttr("scale", scale); } - } - CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" - << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto kernels = cast_op->CreateKernels(valid_places); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->precision() == to.precision()) { + is_found = true; + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); + (*cast_nodes)[in->AsArg().name] = cast_op_output_arg; + break; + } + } + + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" + << in->AsArg().name << "->" << to << ":" + << inst_node->AsStmt().op_info()->Type(); - // Remove the old link - RemoveDirectedLink(in, inst_node); + // Remove the old link + RemoveDirectedLink(in, inst_node); - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name + // Update the original instruction OpDesc. 
+ // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, cast_inst); - DirectedLink(cast_inst, cast_op_output_arg); - DirectedLink(cast_op_output_arg, inst_node); + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, cast_inst); + DirectedLink(cast_inst, cast_op_output_arg); + DirectedLink(cast_op_output_arg, inst_node); - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - cast_op_output_name); + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } // recreate the op auto original_selected_kernel = @@ -178,5 +314,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector& valid_places) { REGISTER_MIR_PASS(type_precision_cast_pass, paddle::lite::mir::PrecisionCastPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kOpenCL)}) .BindKernel("calib_once") .BindKernel("calib"); diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h index 3f55e52ef9fed1f0b456533141654d1dcadb16f7..d8d6af5fcd06c187029c7c16a74efade0d4bd5ca 100644 --- a/lite/core/mir/type_precision_cast_pass.h +++ b/lite/core/mir/type_precision_cast_pass.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/op_registry.h" @@ -24,17 +25,7 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} +void UpdateInputs(OpLite* op, const std::string& from, const std::string& to); /* * The pass complement the necessary instruction to make data @@ -44,13 +35,17 @@ class PrecisionCastPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + void ComplementInputs(SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes); void AddCastInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* cast_nodes, const std::vector& valid_places); void SetValidPlaces(const std::vector& valid_places); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 85c22db45c6d3f8d6e00daf9cc74643ad308ba73..ed16211de4b54de0c5f023b34cf7fab5836a2558 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -21,6 +21,7 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" #include "lite/utils/string.h" namespace paddle { @@ -180,7 +181,7 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "picked, opencl found"; is_found = true; } else if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { + TargetCompatibleTo(*out_arg_ty, to)) { VLOG(4) << "picked"; is_found = true; } @@ -241,9 +242,8 @@ void TypeTargetTransformPass::UpdateInstNode(Node* in, Node* inst_node, std::string io_copy_output_name) { // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - io_copy_output_name); + UpdateInputs( + 
inst_node->AsStmt().op().get(), in->AsArg().name, io_copy_output_name); auto original_selected_kernel = std::move(inst_node->AsStmt().kernels().front()); auto update_op_info = *inst_node->AsStmt().op_info(); diff --git a/lite/core/mir/type_target_cast_pass.h b/lite/core/mir/type_target_cast_pass.h index e9a275882f7c2cb813c1c0b8add5cc4ca89b0c8b..3561a0a7dd22709648450a4b8f3c8f3f11448b38 100644 --- a/lite/core/mir/type_target_cast_pass.h +++ b/lite/core/mir/type_target_cast_pass.h @@ -25,18 +25,6 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - /* * IoComplementPass complement the necessary instruction to make data * transferring or transformation between different places. diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc index c7889a54903f2a1d194fb3eade0bd92670b36699..2bb247871b9500129eeea855677a907cb4fd88b9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.cc +++ b/lite/core/mir/weight_quantization_preprocess_pass.cc @@ -22,9 +22,29 @@ namespace paddle { namespace lite { namespace mir { +bool IsAbsMaxQuantizedOp(const OpInfo& op_info) { + bool result = false; + if (op_info.HasAttr("quantization_type") && + op_info.GetAttr("quantization_type") == + "post_weight_abs_max") { + result = true; + } else if (!op_info.HasAttr("quantization_type") && + op_info.HasAttr("quantize_weight_bits")) { // Support older models; + // keep this for now + result = true; + } + return result; +} + +/* + * For abs_max method in WeightQuantization, this pass obtains the scale value + * of conv2d, depthwise_conv2d and mul, expands the scale list, and saves the + * list in the quantized ops.
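+ * For example (hypothetical numbers): a conv2d weight with 4 output channels
+ * and a recorded scale attribute {0.1} is expanded to the per-channel list
+ * {0.1, 0.1, 0.1, 0.1}.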
+*/ void WeightQuantizationPreprocessPass::Apply( const std::unique_ptr& graph) { - std::vector weight_quantized_op = {"conv2d", "depthwise_conv2d"}; + std::vector weight_quantized_op = { + "conv2d", "depthwise_conv2d", "mul"}; for (auto& node : graph->StmtTopologicalOrder()) { if (node->IsStmt() && std::find(weight_quantized_op.begin(), @@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply( node->AsStmt().op_type()) != weight_quantized_op.end()) { auto* scope = node->stmt()->op()->scope(); auto* op_desc = node->stmt()->mutable_op_info(); - if (op_desc->HasAttr("quantize_weight_bits")) { + if (IsAbsMaxQuantizedOp(*op_desc)) { for (auto& input_name : op_desc->input_vars()) { std::string scale_name = input_name + "_quant_scale"; if (op_desc->HasAttr(scale_name)) { - VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name; + VLOG(0) << " WeightQuantizationPreprocessPass op:" + << op_desc->Type() << " input_name:" << input_name; auto input_tensor = scope->FindVar(input_name)->GetMutable(); - int weight_out_channel = static_cast(input_tensor->dims()[0]); + int weight_out_channel; + if (op_desc->Type() == "mul") { + weight_out_channel = static_cast(input_tensor->dims()[1]); + } else { + weight_out_channel = static_cast(input_tensor->dims()[0]); + } auto input_scale = op_desc->GetAttr>(scale_name); // scale length is equal to weight out channel std::vector scale_list(weight_out_channel, input_scale[0]); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h index 76a35c6b443c692ec08688abd4c10680be62b8af..e7c9f03eef78bdafea204d30c78cf0d044bb15e9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.h +++ b/lite/core/mir/weight_quantization_preprocess_pass.h @@ -25,8 +25,9 @@ namespace mir { * If the model is quantized by WeightQuantization in PostTrainingQuantization, * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is * int, and the scale is save in the quantized ops. - * WeightQuantizationPreprocessPass obtains the scale value, expands the - * scale value to a list, and save the list in the quantized ops. + * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass + * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the + * scale list, and save the list in the quantized ops. */ class WeightQuantizationPreprocessPass : public ProgramPass { public: diff --git a/lite/core/mir/xpu_pattern_matcher.cc b/lite/core/mir/xpu_pattern_matcher.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f268e7af8a55d22163d52c7f8824406f58bd17b --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "lite/core/mir/dot.h" +#include "lite/core/mir/xpu_pattern_matcher.h" +#include "lite/core/op_lite.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUPatternMatcher::operator()(SSAGraph *graph, + XPUPatternMatcher::handle_t handler) { + if (!MarkPMNodesInGraph(graph)) { + return; + } + + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + LOG(INFO) << "detected " << subgraphs.size() << " subgraph"; + int id = 0; + for (auto &g : subgraphs) { + VLOG(3) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) { + VLOG(3) << "mark pmnodes in graph"; + if (graph->nodes().empty()) return false; + for (auto &node : graph->mutable_nodes()) { + for (const auto &pmnode : pattern_.nodes()) { + if (pmnode->Tell(&node)) { + pmnodes2nodes_[pmnode.get()].insert(&node); + } + } + } + // Check to early stop if some PMNode can't find matched Node. + for (auto &pmnode : pattern_.nodes()) { + if (!pmnodes2nodes_.count(pmnode.get())) { + VLOG(4) << pmnode->name() << " can't find matched Node, early stop"; + // return false; + } + } + VLOG(3) << pmnodes2nodes_.size() << " nodes marked"; + + return !pmnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void XPUPatternMatcher::ValidateByNodeRole( + std::vector *subgraphs) { + subgraphs->erase( + std::remove_if(subgraphs->begin(), + subgraphs->end(), + [](const XPUPatternMatcher::subgraph_t &subgraph) -> bool { + // Collect the inlinks and outlinks. + std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + for (auto &item : subgraph) { + if (item.first->IsIntermediate()) { + for (auto *x : item.second->outlinks) { + if (!ios.count(x)) { + return true; + } + } + } + } + return false; + }), + subgraphs->end()); + + for (auto &subgraph : *subgraphs) { + std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + extra_input_vars_.emplace_back(); + for (auto &item : subgraph) { + for (auto *x : item.second->inlinks) { + if (x->IsArg() && ios.count(x) == 0) { + // extra weight var + extra_input_vars_.back().push_back(x); + } + } + } + } +} + +struct HitGroup { + std::unordered_map roles; + + bool Match(Node *node, PMNode *pat) { + if (nodes_.count(node)) { + if (roles.count(pat) && roles[pat] == node) return true; + return false; + } else { + if (roles.count(pat) && roles[pat] != node) return false; + return true; + } + } + + void Register(Node *node, PMNode *pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; +}; + +// Tell whether Node a links to b. +bool IsNodesLink(Node *a, Node *b) { + for (auto *node : a->outlinks) { + if (b == node) { + return true; + } + } + return false; +} + +std::vector XPUPatternMatcher::DetectPatterns() { + // Init empty subgraphs. + std::vector result; + std::vector init_groups; + std::array, 2> bi_records; + auto *first_pnode = pattern_.edges().empty() ? 
pattern().nodes().front().get() + : pattern_.edges().front().first; + if (!pmnodes2nodes_.count(first_pnode)) return result; + for (auto *node : pmnodes2nodes_[first_pnode]) { + HitGroup group; + group.roles[first_pnode] = node; + init_groups.emplace_back(group); + } + + int step = 0; + bi_records[0] = std::move(init_groups); + + // Extend a PMNode to subgraphs by deducing the connection relations defined + // in edges of PMNodes. + for (const auto &edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + // TODO(Superjomn) Fix bug here, the groups might be duplicate here. + // Each role has two PMNodes, which indicates two roles. + // Detect two Nodes that can match these two roles and they are connected. + auto &pre_groups = bi_records[step % 2]; + auto &cur_groups = bi_records[1 - (step++ % 2)]; + cur_groups.clear(); + if (pre_groups.empty()) break; + // source -> target + for (Node *source : pmnodes2nodes_[edge.first]) { + for (Node *target : pmnodes2nodes_[edge.second]) { + // TODO(Superjomn) add some prune strategies. + for (const auto &group : pre_groups) { + if (IsNodesLink(source, target)) { + HitGroup new_group = group; + bool flag = new_group.Match(source, edge.first) && + new_group.Match(target, edge.second); + if (flag) { + new_group.Register(source, edge.first); + new_group.Register(target, edge.second); + cur_groups.push_back(new_group); + // TODO(Superjomn) need to unique + } + } + } + } + } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + } + + for (auto &group : bi_records[step % 2]) { + XPUPatternMatcher::subgraph_t subgraph; + for (auto &role : group.roles) { + subgraph.emplace(role.first, role.second); + } + result.emplace_back(subgraph); + } + return result; +} + +struct GraphItemLessThan { + bool operator()(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } + } +}; + +// TODO(Superjomn) enhance the function as it marks unique unique as duplicates +// see https://github.com/PaddlePaddle/Paddle/issues/13550 +void XPUPatternMatcher::UniquePatterns( + std::vector *subgraphs) { + if (subgraphs->empty()) return; + std::vector result; + + std::unordered_set set; + std::hash hasher; + for (auto &g : *subgraphs) { + // Sort the items in the sub-graph, and transform to a string key. 
+ std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); + STL::stringstream ss; + for (auto &item : sorted_keys) { + ss << reinterpret_cast(item.first) << ":" + << reinterpret_cast(item.second); + } + auto key = hasher(ss.str()); + if (!set.count(key)) { + result.emplace_back(g); + set.insert(key); + } + } + *subgraphs = result; +} + +void XPUPatternMatcher::RemoveOverlappedMatch( + std::vector *subgraphs) { + std::vector result; + std::unordered_set node_set; + + for (const auto &subgraph : *subgraphs) { + bool valid = true; + for (auto &item : subgraph) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { + valid = false; + break; + } + } + if (valid) { + for (auto &item : subgraph) { + node_set.insert(item.second); + } + result.push_back(subgraph); + } + } + *subgraphs = result; +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher.h b/lite/core/mir/xpu_pattern_matcher.h new file mode 100644 index 0000000000000000000000000000000000000000..4ac03718f32a859ff6888e63e57fd4098e435e06 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/mir/pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +/* + * PatternMatcher helps to detect the specific patterns in the graph. + * Input a pattern, output a list of the matched subgraphs/nodes. + * This helper can be used to support fusion (e.g. conv + batchnorm => batchnorm). + * + * The algorithm has three phases: + * 1. Mark the nodes that match the defined PMNodes in a PMPattern, + * 2. Extend a PMNode to subgraphs by deducing the connection relation defined + * in PMPattern (the edges), + * 3. Get the filtered subgraphs and treat them with a pre-defined handler. + * + * Usage: + * // Create a matcher + * PatternMatcher matcher; + * // Define the matcher's pattern, by adding PMNodes and defining the edges. + * auto* node0 = matcher.mutable_pattern().AddNode(...) + * auto* node1 = matcher.mutable_pattern().AddNode(...) + * node0->teller = some lambda. + * node1->teller = some lambda. + * matcher.mutable_pattern().AddEdge(node0, node1); + * // Create a handler to define the behavior of treating the filtered + * // subgraphs that comply with the patterns. + * PatternMatcher::handle_t handler = some lambda + * // Execute the matcher. + * matcher(&graph, handler); + */ +struct XPUPatternMatcher { + using subgraph_t = std::unordered_map; + + // Operate on the detected pattern. + using handle_t = + std::function; + + void operator()(SSAGraph* graph, handle_t handler); + + const PMPattern& pattern() const { return pattern_; } + PMPattern* mutable_pattern() { return &pattern_; } + + // Mark the nodes that fit the pattern. 
+ bool MarkPMNodesInGraph(SSAGraph* graph); + + // Detect all the patterns and output the hit records. + std::vector DetectPatterns(); + + // Remove duplicate patterns. + void UniquePatterns(std::vector* subgraphs); + + // Remove overlapped matched subgraphs; when overlapped, keep the previous one. + // The intermediate PMNodes will be removed, so they can't be shared by multiple + // patterns. + void RemoveOverlappedMatch(std::vector* subgraphs); + + // Validate whether the intermediate nodes are linked by external nodes. + void ValidateByNodeRole(std::vector* subgraphs); + + using hit_rcd_t = + std::pair; + PMPattern pattern_; + std::unordered_map> pmnodes2nodes_; + std::vector> extra_input_vars_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.cc b/lite/core/mir/xpu_pattern_matcher_high_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ffc496d1593d15f02d82e824c06443e7b3e01c9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) { + VLOG(4) << "\n" << matcher_.pattern().DotString(); + // Get subgraphs and record the mir::Node pointers for each PMNode. + auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) { + // get all the registered nodes. 
+ key2nodes_.emplace_back(); + for (auto &item : nodes_) { + key2nodes_.back()[item.first] = subgraph.at(item.second); + } + }; + + matcher_(graph, handler); +} + +void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) { + std::set keys; + for (auto &node : nodes_) { + if (node.second->IsIntermediate()) { + keys.insert(node.first); + } + } + + VLOG(4) << "keys: " << key2nodes_.size(); + std::unordered_set nodes2rm; + for (auto &matched : key2nodes_) { + for (const auto &key : keys) { + nodes2rm.insert(matched.at(key)); + } + } + + VLOG(3) << "clean nodes " << nodes2rm.size(); + GraphSafeRemoveNodes(graph, nodes2rm); +} + +PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) { + auto it = nodes_.find(key); + if (it != nodes_.end()) { + return it->second; + } + nodes_.emplace(key, + matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key))); + it = nodes_.find(key); + return it->second; +} + +PMNode *XPUFuseBase::OpNode(const std::string &key, + const std::string &op_type) { + GetOrCreateNode(key)->set_op_type(op_type); + GetOrCreateNode(key)->AsOp(op_type); + return GetOrCreateNode(key); +} + +PMNode *XPUFuseBase::VarNode(const std::string &key) { + GetOrCreateNode(key)->AsVar(); + return GetOrCreateNode(key); +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.h b/lite/core/mir/xpu_pattern_matcher_high_api.h new file mode 100644 index 0000000000000000000000000000000000000000..3302bcb6137f16afcf82269af91c8a13558da2b9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.h @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/core/mir/xpu_pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +class XPUFuseBase { + public: + using key2nodes_t = std::map; + + virtual ~XPUFuseBase() = default; + + void operator()(SSAGraph* graph) { + BuildPattern(); + PerformPatternMatcher(graph); + + for (size_t i = 0; i < key2nodes_.size(); ++i) { + InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]); + } + + DeleteInterNodes(graph); + } + + // Build a PMPattern using PMNode. + virtual void BuildPattern() = 0; + + // Generate an operator desc with a matched subgraph. 
+ virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) { + return cpp::OpDesc(); + } + + PMNode* OpNode(const std::string& key) { + return GetOrCreateNode(key)->assert_is_op(); + } + + PMNode* OpNode(const std::string& key, const std::string& op_type); + + PMNode* VarNode(const std::string& key); + + protected: + virtual void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) = 0; + + void PerformPatternMatcher(SSAGraph* graph); + + // Delete nodes that are marked as Intermediate + void DeleteInterNodes(SSAGraph* graph); + + PMNode* GetOrCreateNode(const std::string& key); + + protected: + XPUPatternMatcher matcher_; + std::map nodes_; + std::vector key2nodes_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 0936a44a66e4777633b84dadf0a1dc049213faab..537636065d6aeea67fd7c8c71fb00b183720fecc 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -22,6 +22,62 @@ namespace paddle { namespace lite { +bool OpLite::InferShape() { + // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_, + // InferShapeWithCache will be applied. + if (op_param_ && op_param_->input_tensor_ptrs() && + op_param_->output_tensor_ptrs()) { + return this->InferShapeWithCache(); + } else { + return this->InferShapeImpl(); + } +} +bool OpLite::InferShapeWithCache() { + // 1. Get vector of current input tensors + auto *current_inputs = op_param_->input_tensor_ptrs(); + // 2. Check whether current input shapes and lods match the cached ones + bool use_cache = true; + if (last_input_shapes.size() == current_inputs->size()) { + for (size_t i = 0; i < current_inputs->size(); i++) { + if (last_input_shapes[i] != current_inputs->at(i)->dims() || + last_input_lods[i] != current_inputs->at(i)->lod()) { + use_cache = false; + break; + } + } + } else { + use_cache = false; + } + + // 3. infer shapes of output tensors + if (use_cache) { + // if the current input shapes and lods match the cached ones, + // the previous output shapes and lods are reused. + auto *current_outputs = op_param_->output_tensor_ptrs(); + for (size_t i = 0; i < current_outputs->size(); i++) { + current_outputs->at(i)->Resize(last_output_shapes[i]); + current_outputs->at(i)->set_lod(last_output_lods[i]); + } + } else { + // otherwise, the inputs have changed and InferShapeImpl will apply. 
+ this->InferShapeImpl(); + auto *current_outputs = op_param_->output_tensor_ptrs(); + last_output_shapes.clear(); + last_output_lods.clear(); + for (size_t i = 0; i < current_outputs->size(); i++) { + last_output_shapes.push_back(current_outputs->at(i)->dims()); + last_output_lods.push_back(current_outputs->at(i)->lod()); + } + last_input_shapes.clear(); + last_input_lods.clear(); + for (size_t i = 0; i < current_inputs->size(); i++) { + last_input_shapes.push_back(current_inputs->at(i)->dims()); + last_input_lods.push_back(current_inputs->at(i)->lod()); + } + } + return true; +} + std::vector> OpLite::CreateKernels( const std::vector &places, const std::string &kernel_type) { std::vector> kernels; @@ -47,18 +103,19 @@ std::vector> OpLite::CreateKernels( return kernels; } - std::set place_set; - for (auto place : places) { - place_set.insert(place); - // Pick kernels those support any Precision and any DataLayout - place.precision = PRECISION(kAny); - place_set.insert(place); - place.layout = DATALAYOUT(kAny); - place_set.insert(place); + std::set expanded_places(places.begin(), places.end()); + for (auto &place : places) { + // Pick kernels those support any Precision and any DataLayout, For example: + // kARM,kFloat,kNCHW -> kARM,kFloat,kAny; kARM,kAny,kNCHW; kARM,kAny,kAny + expanded_places.insert( + Place(place.target, place.precision, DATALAYOUT(kAny))); + expanded_places.insert(Place(place.target, PRECISION(kAny), place.layout)); + expanded_places.insert( + Place(place.target, PRECISION(kAny), DATALAYOUT(kAny))); } std::set targets; - for (auto place : place_set) { + for (auto place : expanded_places) { pick_kernel(place); targets.insert(place.target); } @@ -101,5 +158,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope, return var->GetMutable(); } +void OpLite::AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var) { + bool is_have_input = + op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0; + CHECK(is_dispensable || is_have_input); + if (is_have_input) { + std::string input_var_name = op_desc.Input(input_name).front(); + *input_var = scope->FindVar(input_var_name)->GetMutable(); + } +} + +void OpLite::AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var) { + bool is_have_output = + op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0; + CHECK(is_dispensable || is_have_output); + if (is_have_output) { + std::string output_var_name = op_desc.Output(output_name).front(); + *output_var = scope->FindVar(output_var_name)->GetMutable(); + } +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 5dec9ed7aace837e3eb085a55d7b9b5382f7dea3..301065d5b6bb5c4f41b19d9a9034985ca2f74d89 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,7 @@ #include "lite/core/kernel.h" #include "lite/core/scope.h" #include "lite/model_parser/cpp/op_desc.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -64,16 +66,25 @@ class OpLite : public Registry { // Check the shape. virtual bool CheckShape() const { return true; } // Inference the outputs' shape. - virtual bool InferShape() const { return true; } + virtual bool InferShapeImpl() const { return true; } + virtual bool InferShape(); // Run this operator. 
virtual bool Run(); // Indicate whether the Op runs only once or not virtual bool run_once() const { return false; } std::string Type() { return op_type_; } +#ifdef LITE_WITH_PROFILE + virtual void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {} +#endif // Link the external execution environ to internal context. bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); + template + inline void AttachParam(T *param) { + op_param_ = static_cast(param); + } + const OpInfo *op_info() const { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); } @@ -102,6 +113,20 @@ class OpLite : public Registry { return kernel_.get(); } + // Attach input variable from scope by op_desc and input name + void AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var); + + // Attach output variable from scope by op_desc and output name + void AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var); + virtual ~OpLite() = default; protected: @@ -150,6 +175,19 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; + // todo: it's prefered to combine last_input_shapes and + // last_input_lods into a single hash value to decrease + // memory usage. + std::vector last_input_shapes{}; + std::vector>> last_input_lods{}; + std::vector last_output_shapes{}; + std::vector>> last_output_lods{}; + mutable operators::ParamBase *op_param_{nullptr}; + + private: + // Infer Shape according to memory, if current input shapes are consistent + // with that of previous inputs, output shapes of last time will be reused. 
+ bool InferShapeWithCache(); }; /* @@ -212,6 +250,32 @@ class OpInfo : public cpp::OpDesc { return false; } + // For the input variable name, find the index of the corresponding + // input argname + bool GetInputIndex(const std::string &value_name, int *out) const { + for (auto &item : inputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + + // For the output variable name, find the index of the corresponding + // output argname + bool GetOutputIndex(const std::string &value_name, int *out) const { + for (auto &item : outputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + void UpdateAllInputs(const std::string &from, const std::string &to) { for (auto &item : inputs_) { for (auto &var : item.second) { diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index b49670eefb8b2c6aae30cb041de4d055a2b9964c..29c853c70caa80add9d47182da228a36f031cb42 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -19,6 +19,10 @@ namespace paddle { namespace lite { +const std::map &GetOp2PathDict() { + return OpKernelInfoCollector::Global().GetOp2PathDict(); +} + std::list> KernelRegistry::Create( const std::string &op_type, TargetType target, @@ -94,6 +98,9 @@ std::list> KernelRegistry::Create( case TARGET(kNPU): { CREATE_KERNEL(kNPU); } break; + case TARGET(kAPU): { + CREATE_KERNEL(kAPU); + } break; case TARGET(kXPU): { CREATE_KERNEL(kXPU); } break; @@ -103,6 +110,12 @@ std::list> KernelRegistry::Create( case TARGET(kBM): { CREATE_KERNEL(kBM); } break; + case TARGET(kMLU): { + CREATE_KERNEL(kMLU); + } break; + case TARGET(kRKNPU): { + CREATE_KERNEL(kRKNPU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -135,14 +148,39 @@ KernelRegistry::KernelRegistry() INIT_FOR(kCUDA, kInt64, kNCHW); INIT_FOR(kCUDA, kInt64, kNHWC); - INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kMLU, kFloat, kNHWC); + INIT_FOR(kMLU, kFloat, kNCHW); + INIT_FOR(kMLU, kFP16, kNHWC); + INIT_FOR(kMLU, kFP16, kNCHW); + INIT_FOR(kMLU, kInt8, kNHWC); + INIT_FOR(kMLU, kInt8, kNCHW); + INIT_FOR(kMLU, kInt16, kNHWC); + INIT_FOR(kMLU, kInt16, kNCHW); + INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); INIT_FOR(kHost, kAny, kNHWC); INIT_FOR(kHost, kAny, kAny); + INIT_FOR(kHost, kBool, kNCHW); + INIT_FOR(kHost, kBool, kNHWC); + INIT_FOR(kHost, kBool, kAny); + INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kHost, kFloat, kNHWC); + INIT_FOR(kHost, kFloat, kAny); + INIT_FOR(kHost, kFP16, kNCHW); + INIT_FOR(kHost, kFP16, kNHWC); + INIT_FOR(kHost, kFP16, kAny); + INIT_FOR(kHost, kInt8, kNCHW); + INIT_FOR(kHost, kInt8, kNHWC); + INIT_FOR(kHost, kInt8, kAny); + INIT_FOR(kHost, kInt16, kNCHW); + INIT_FOR(kHost, kInt16, kNHWC); + INIT_FOR(kHost, kInt16, kAny); + INIT_FOR(kHost, kInt32, kNCHW); + INIT_FOR(kHost, kInt32, kNHWC); + INIT_FOR(kHost, kInt32, kAny); + INIT_FOR(kHost, kInt64, kNCHW); + INIT_FOR(kHost, kInt64, kNHWC); + INIT_FOR(kHost, kInt64, kAny); INIT_FOR(kX86, kFloat, kNCHW); INIT_FOR(kX86, kAny, kNCHW); @@ -150,10 +188,13 @@ KernelRegistry::KernelRegistry() INIT_FOR(kX86, kInt64, kNCHW); INIT_FOR(kARM, kFloat, kNCHW); + INIT_FOR(kARM, kFloat, kNHWC); INIT_FOR(kARM, kInt8, kNCHW); + 
INIT_FOR(kARM, kInt8, kNHWC); INIT_FOR(kARM, kAny, kNCHW); INIT_FOR(kARM, kAny, kAny); INIT_FOR(kARM, kInt32, kNCHW); + INIT_FOR(kARM, kInt64, kNCHW); INIT_FOR(kOpenCL, kFloat, kNCHW); INIT_FOR(kOpenCL, kFloat, kNHWC); @@ -175,10 +216,14 @@ KernelRegistry::KernelRegistry() INIT_FOR(kOpenCL, kAny, kImageNW); INIT_FOR(kNPU, kFloat, kNCHW); + INIT_FOR(kNPU, kFloat, kNHWC); INIT_FOR(kNPU, kInt8, kNCHW); + INIT_FOR(kNPU, kInt8, kNHWC); INIT_FOR(kNPU, kAny, kNCHW); + INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kAny); + INIT_FOR(kAPU, kInt8, kNCHW); INIT_FOR(kXPU, kFloat, kNCHW); INIT_FOR(kXPU, kInt8, kNCHW); INIT_FOR(kXPU, kAny, kNCHW); @@ -194,6 +239,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kBM, kInt8, kNCHW); INIT_FOR(kBM, kAny, kNCHW); INIT_FOR(kBM, kAny, kAny); + + INIT_FOR(kRKNPU, kFloat, kNCHW); + INIT_FOR(kRKNPU, kInt8, kNCHW); + INIT_FOR(kRKNPU, kAny, kNCHW); + INIT_FOR(kRKNPU, kAny, kAny); #undef INIT_FOR } diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index a49682eea68240bfa178eb3d3351b8c7fb41048d..5b58fd2bb9ee88fcdd4eba7289870b839aa88552 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -72,6 +72,8 @@ class OpKernelInfoCollector { namespace paddle { namespace lite { +const std::map &GetOp2PathDict(); + using KernelFunc = std::function; using KernelFuncCreator = std::function()>; class LiteOpRegistry final : public Factory> { @@ -109,18 +111,23 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // + KernelRegistryForTarget *, // @@ -133,9 +140,13 @@ class KernelRegistry final { KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // @@ -145,6 +156,9 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -220,6 +234,9 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -240,6 +257,19 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // @@ -263,7 +293,32 @@ class KernelRegistry final { DATALAYOUT(kAny)> *, // KernelRegistryForTarget * // + DATALAYOUT(kAny)> *, // + + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget * // >; KernelRegistry(); @@ -399,32 +454,31 @@ class KernelRegistor : public lite::Registor { #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) -#define REGISTER_LITE_KERNEL( \ - op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ - int touch_##op_type__##target__##precision__##layout__##alias__() { \ - OpKernelInfoCollector::Global().AddKernel2path( \ - #op_type__ 
"," #target__ "," #precision__ "," #layout__ "," #alias__, \ - __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ - return 0; \ - } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - __attribute__((unused)) = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) +#define REGISTER_LITE_KERNEL( \ + op_type__, target__, precision__, layout__, KernelClass, alias__) \ + static paddle::lite::KernelRegistor \ + LITE_KERNEL_REGISTER_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__)(#op_type__, \ + #alias__); \ + static KernelClass LITE_KERNEL_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__); \ + int touch_##op_type__##target__##precision__##layout__##alias__() { \ + OpKernelInfoCollector::Global().AddKernel2path( \ + #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ + __FILE__); \ + LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ + .Touch(); \ + return 0; \ + } \ + static bool LITE_KERNEL_PARAM_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__) UNUSED = \ + paddle::lite::ParamTypeRegistry::NewInstance( \ + #op_type__ "/" #alias__) #define LITE_KERNEL_INSTANCE( \ op_type__, target__, precision__, layout__, alias__) \ diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index bebafb88a8bcacbdd639d523831c0a61031191e3..5015b633e7b028ffe98a5c0a156c471271e16b0f 100755 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -53,7 +53,7 @@ class Optimizer { SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); - if (passes.empty()) { + if (passes.empty() || passes.size() == 1) { std::vector passes_local{ {"lite_quant_dequant_fuse_pass", // "weight_quantization_preprocess_pass", // @@ -71,10 +71,27 @@ class Optimizer { "identity_scale_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", // "lite_sequence_pool_concat_fuse_pass", // + "lite_scale_activation_fuse_pass", // #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ (defined LITE_WITH_ARM) - "lite_elementwise_add_activation_fuse_pass", // + "lite_elementwise_activation_fuse_pass", // #endif + "identity_dropout_eliminate_pass", + "__xpu__resnet_fuse_pass", + "__xpu__multi_encoder_fuse_pass", + "__xpu__embedding_with_eltwise_add_fuse_pass", + "__xpu__fc_fuse_pass", + "quantized_op_attributes_inference_pass", // Only for fully + // quantized model, infer + // the output scale and + // fix the attribute + // 'enable_int8' for all + // of the quantized ops. 
+ "npu_subgraph_pass", + "xpu_subgraph_pass", + "bm_subgraph_pass", + "apu_subgraph_pass", + "rknpu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's "kernel_place_correct_pass", @@ -107,13 +124,33 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", + "mlu_subgraph_pass", + "runtime_context_assign_pass", "argument_type_display_pass", #ifndef LITE_WITH_FPGA "memory_optimize_pass", #endif - "npu_subgraph_pass", - "xpu_subgraph_pass"}}; + + "mlu_postprocess_pass"}}; + + if (passes.size() == 1) { + // multi_stream_analysis_pass must be in the front of + // runtime_context_assign_pass + const std::string msa_pass{"multi_stream_analysis_pass"}; + const std::string depend_pass{"runtime_context_assign_pass"}; + if (passes[0] == msa_pass) { + auto iter = + std::find(passes_local.begin(), passes_local.end(), depend_pass); + if (iter != passes_local.end()) { + passes_local.insert(iter, msa_pass); + } else { + CHECK(false) << "Not find " << depend_pass; + } + } else { + passes_local.push_back(passes[0]); + } + } RunPasses(passes_local); } else { RunPasses(passes); diff --git a/lite/core/profile/basic_profiler.cc b/lite/core/profile/basic_profiler.cc index a947bfa295658d720a448f2376dfe26c507c3da2..393c266f5a9cfe0eb7e915c72370b306a614c0e6 100644 --- a/lite/core/profile/basic_profiler.cc +++ b/lite/core/profile/basic_profiler.cc @@ -137,13 +137,13 @@ std::string BasicTimer::basic_repr() const { // clang-format off ss << GetCustomInfo("op_type") << "\t" << key() << "\t" - << kernel_timer_info.ave() / time_unit_factor << "\t" - << kernel_timer_info.min() / time_unit_factor << "\t" - << kernel_timer_info.max() / time_unit_factor << "\t" - << inst_timer_info.ave() / time_unit_factor << "\t" - << inst_timer_info.min() / time_unit_factor << "\t" - << inst_timer_info.max() / time_unit_factor << "\t" - << inst_timer_info.count() << "\t" + << kernel_timer_info.Ave() / time_unit_factor << "\t" + << kernel_timer_info.Min() / time_unit_factor << "\t" + << kernel_timer_info.Max() / time_unit_factor << "\t" + << inst_timer_info.Ave() / time_unit_factor << "\t" + << inst_timer_info.Min() / time_unit_factor << "\t" + << inst_timer_info.Max() / time_unit_factor << "\t" + << inst_timer_info.Count() << "\t" << GetCustomInfo("op_info"); // clang-format on return ss.str(); @@ -195,13 +195,13 @@ std::string BasicProfiler::summary_repr() const { auto& op_timer = iter.second; // clang-format off ss << iter.first << "\t" - << op_timer.ave() / time_unit_factor << "\t" - << op_timer.min() / time_unit_factor << "\t" - << op_timer.max() / time_unit_factor << "\t" - << op_timer.total() / time_unit_factor << "\t" + << op_timer.Ave() / time_unit_factor << "\t" + << op_timer.Min() / time_unit_factor << "\t" + << op_timer.Max() / time_unit_factor << "\t" + << op_timer.Total() / time_unit_factor << "\t" << total / time_unit_factor << "\t" - << (op_timer.total() * 1. / total * 100) << "%\t" - << op_timer.count() << "\t" + << (op_timer.Total() * 1. 
/ total * 100) << "%\t" + << op_timer.Count() << "\t" << "\n"; // clang-format on } diff --git a/lite/core/profile/basic_profiler.h b/lite/core/profile/basic_profiler.h index 660650655e6fb5035e897f939aac621a784389b0..449e1cfb39e9bc3f94cea7c28b1634afb3063a5e 100644 --- a/lite/core/profile/basic_profiler.h +++ b/lite/core/profile/basic_profiler.h @@ -39,15 +39,15 @@ namespace profile { struct TimerInfo { uint64_t total_{0}; uint64_t count_{0}; - uint64_t max_{std::numeric_limits::min()}; - uint64_t min_{std::numeric_limits::max()}; + uint64_t max_{(std::numeric_limits::min)()}; + uint64_t min_{(std::numeric_limits::max)()}; uint64_t timer_{0}; - double ave() const { return total_ * 1. / count_; } - double max() const { return max_; } - double min() const { return min_; } - uint64_t total() const { return total_; } - uint64_t count() const { return count_; } + double Ave() const { return total_ * 1. / count_; } + double Max() const { return max_; } + double Min() const { return min_; } + uint64_t Total() const { return total_; } + uint64_t Count() const { return count_; } }; /* Base class of all the profile records */ diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index d9111e5c46c9217b181e5a3e5a8c7981f46250df..1176608b4c4121e9e03b2b0168e80e2a0d6bc98c 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -18,22 +18,33 @@ * of each kernel. */ #pragma once +#include #include #include #include "lite/core/program.h" +#ifdef LITE_WITH_X86 +#include "lite/fluid/float16.h" +#endif + +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/kernels/opencl/image_helper.h" +#endif namespace paddle { namespace lite { namespace profile { template -static void write_tensorfile(const Tensor* tensor, const std::string& locate) { +static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { if (locate.find('/') != std::string::npos) { - return; + return false; } FILE* fp = fopen(locate.c_str(), "w"); if (fp == nullptr) { LOG(ERROR) << "file open field " << locate; + return false; } else { const dtype* data = tensor->data(); for (int i = 0; i < tensor->numel(); ++i) { @@ -41,63 +52,260 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { } } fclose(fp); + return true; +} + +static bool write_precision_summary_tofile(const std::string& string, + const std::string& log_dir = "") { + if (log_dir == "") { + LOG(INFO) << "The `log_dir` of precision summary file is not set. 
log_dir:" + << log_dir; + return false; + } + FILE* fp = fopen(log_dir.c_str(), "a"); + if (fp == nullptr) { + LOG(INFO) << "Open precision summary file:" << log_dir << "failed."; + return false; + } else { + fprintf(fp, "%s\n", string.c_str()); + } + fclose(fp); + return true; } class PrecisionProfiler { public: - explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} - ~PrecisionProfiler() { - LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() - << " on Target " << TargetToStr(inst_->kernel()->target()) << " " - << PrecisionToStr(inst_->kernel()->precision()); - auto tensor_mean = [](const Tensor* in, - PrecisionType ptype, - std::string name = "inst") -> double { - if (!in->data()) { - return -99999; - } - double sum = 0.; - switch (ptype) { + // TODO(ysh329): need to remove `explicit PrecisionProfiler` + // keep this method only for arm/math/conditional + explicit PrecisionProfiler(const Instruction* inst) { + std::string inst_precison_str = GetInstPrecision(inst); + } + + PrecisionProfiler() {} + + std::string GetSummaryHeader() { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + ss << "\n\n========================================= " + << "Detailed Precision Profiler Summary " + << "=========================================" << std::endl; + ss << setw(45) << left << "operator:(kernel_info)" + << " " << setw(70) << left << "output_tensor_name:(tensor_info)" + << " " << setw(15) << left << "dims" + << " " << setw(15) << left << "mean" + << " " << setw(15) << left << "std_deviation" + << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + + // write to file with path: `log_dir` + if (log_dir_ != "") { + FILE* fp = fopen(log_dir_.c_str(), "a"); + std::string header_str{ss.str()}; + fprintf(fp, "%s\n", header_str.c_str()); + fclose(fp); + } + return ss.str(); + } + + template + double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; + } + + template + double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); + } + + template + double compute_average_grow_rate(const T* in, const size_t length) { + const double eps = 1e-5; + double ave_grow_rate = 0.0f; + for (size_t i = 1; i < length; ++i) { + ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps); + } + ave_grow_rate /= length; + return ave_grow_rate; + } + + // check if output tensor unused + bool is_unused(const Tensor* in) { + if (!in->data()) { + return true; + } + return false; + } + + void compute_tensor_precision_info(const Tensor* in, + TargetType target_type, + PrecisionType precision_type, + DataLayoutType layout_type, + double* mean, + double* std_dev, + double* ave_grow_rate, + std::string name = "inst", + bool write_result_to_file = false) { + std::string unsupported_error_log = + "Unsupported precision profile for kernel registered on" + + TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" + + DataLayoutToStr(layout_type); + + if (target_type == TARGET(kARM) || target_type == TARGET(kHost) || + target_type == TARGET(kX86)) { + switch (precision_type) { case PRECISION(kFloat): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < 
in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kAny): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt8): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt32): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + case PRECISION(kInt64): { + auto ptr = in->data(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + return; } default: - LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); - return 0.; + *mean = -333333333333; + *std_dev = -33333333333; + *ave_grow_rate = -33333333333; + LOG(ERROR) << unsupported_error_log; + return; } - }; - if (inst_->op()->op_info()->Type() != "fetch") { - auto op = const_cast(inst_->op()); - auto kernel = inst_->kernel(); +#ifdef LITE_WITH_OPENCL + } else if (target_type == TARGET(kOpenCL)) { + CLRuntime::Global()->command_queue().finish(); + switch (layout_type) { + case DATALAYOUT(kImageDefault): { + paddle::lite::CLImageConverterDefault default_convertor; + auto image_shape = default_convertor.InitImageDimInfoWith(in->dims()); + size_t im_w = image_shape[0]; + size_t im_h = image_shape[1]; + VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " " + << im_h; + std::vector in_data_v(im_w * im_h * 4); + std::vector real_out_v(in->numel()); + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + TargetWrapperCL::ImgcpySync(in_data_v.data(), + in->data(), + im_w, + im_h, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + default_convertor.ImageToNCHW( + in_data_v.data(), real_out_v.data(), image_shape, in->dims()); + CHECK(real_out_v.size() == in->numel()); + *mean = compute_mean(real_out_v.data(), real_out_v.size()); + *std_dev = compute_standard_deviation( + real_out_v.data(), in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(real_out_v.data(), + real_out_v.size()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + case DATALAYOUT(kNCHW): { + std::vector in_data_v(in->numel(), 0); + TargetWrapperCL::MemcpySync(in_data_v.data(), + in->data(), + in->numel() * sizeof(float), + IoDirection::DtoH); + VLOG(1) << name << ":" << in->numel(); + *mean = 
compute_mean(in_data_v.data(), in->numel()); + *std_dev = compute_standard_deviation( + in_data_v.data(), in->numel(), true, *mean); + *ave_grow_rate = + compute_average_grow_rate(in_data_v.data(), in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + default: + *mean = -222222222222; + *std_dev = -22222222222; + *ave_grow_rate = -22222222222; + LOG(ERROR) << unsupported_error_log; + return; + } +#endif + } else { + *mean = -111111111111; + *std_dev = -11111111111; + *ave_grow_rate = -11111111111; + LOG(ERROR) << unsupported_error_log; + return; + } + } + + std::string GetInstPrecision(const Instruction* inst = nullptr) { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + bool write_result_to_file = false; + + VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr() + << " registered on " << TargetToStr(inst->kernel()->target()) << "/" + << PrecisionToStr(inst->kernel()->precision()) << "/" + << DataLayoutToStr(inst->kernel()->layout()); + + std::string kernel_repr = inst->op()->op_info()->Repr(); + std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" + + PrecisionToStr(inst->kernel()->precision()) + + "/" + DataLayoutToStr(inst->kernel()->layout()); + std::string op_name = inst->op()->op_info()->Type(); + + if (inst->op()->op_info()->Type() != "fetch") { + auto op = const_cast(inst->op()); + auto kernel = inst->kernel(); auto op_scope = op->scope(); auto out_names = op->op_info()->output_names(); for (auto& out_name : out_names) { @@ -106,32 +314,94 @@ class PrecisionProfiler { auto type = kernel->GetOutputDeclType(out_arg_name); if (type->IsTensor()) { - auto tout = op_scope->FindVar(out_name)->GetMutable(); - double mean = tensor_mean(tout, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean << " shape:" << tout->dims(); + const Tensor* tout = + op_scope->FindVar(out_name)->GetMutable(); + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } else if (type->IsTensorList()) { - auto tout = + auto touts = op_scope->FindVar(out_name)->GetMutable>(); - for (auto& t : *tout) { - double mean = tensor_mean(&t, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean; + for (auto t : *touts) { + const Tensor* tout = &t; + double mean = -999999; + double std_dev = -100000; + 
double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } } } } + write_precision_summary_tofile(ss.str(), log_dir_); + return ss.str(); } private: - const Instruction* inst_{nullptr}; + std::string log_dir_{"/storage/emulated/0/precision.log"}; }; } // namespace profile } // namespace lite } // namespace paddle +// TODO(ysh329): need to remove. +// keep this method only for arm/math/conditional_block_compute #define LITE_PRECISION_PROFILE(inst) \ { auto a = paddle::lite::profile::PrecisionProfiler(&inst); } diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index f4d0e3c0afbe1f9df4e381a502e1800a3d58ba68..3c50585ef2c9ed42b08232db0d9b9e59988d665a 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/core/profile/profiler.h" +#include #include #include #include @@ -64,22 +65,34 @@ int Profiler::NewTimer(const OpCharacter& ch) { return units_.size() - 1; } +OpCharacter* Profiler::GetOpCharacter(const size_t index) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + return &units_[index].Character(); +} + void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; units_[index].Timer(type)->Start(ctx); } -float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { +void Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - return units_[index].Timer(type)->Stop(ctx); + units_[index].Timer(type)->Stop(ctx); +#ifdef LITE_WITH_OPENCL + units_[index].Timer(type)->CLStop(units_[index].character.op_type, + units_[index].character.io_duration, + units_[index].character.cl_event); +#endif } std::string Profiler::Summary(Type type, bool concise, size_t w) { using std::setw; using std::left; using std::fixed; + using std::setprecision; STL::stringstream ss; std::string title; // Title. 
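The precision profiler introduced above reduces every output tensor to three scalars: a mean, a standard deviation computed against that mean, and an average element-to-element growth rate (with a small eps to avoid division by zero). A minimal standalone sketch of the same arithmetic over a plain float buffer, outside the Tensor/kernel machinery, might look like this (illustrative only, not the framework code):

```cpp
// Standalone sketch of the three per-tensor statistics reported by the
// precision profiler: mean, standard deviation, average growth rate.
#include <cmath>
#include <cstdio>
#include <vector>

double Mean(const float* in, size_t n) {
  double sum = 0.;
  for (size_t i = 0; i < n; ++i) sum += in[i];
  return sum / n;
}

double StdDev(const float* in, size_t n, double mean) {
  double var = 0.;
  for (size_t i = 0; i < n; ++i) var += (in[i] - mean) * (in[i] - mean);
  return std::sqrt(var / n);
}

// Average relative change between neighbouring elements; eps avoids division
// by zero, mirroring the 1e-5 used in compute_average_grow_rate().
double AvgGrowRate(const float* in, size_t n, double eps = 1e-5) {
  double rate = 0.;
  for (size_t i = 1; i < n; ++i) rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
  return rate / n;
}

int main() {
  std::vector<float> out{0.1f, 0.2f, 0.4f, 0.8f};
  double mean = Mean(out.data(), out.size());
  std::printf("mean=%f std=%f grow=%f\n", mean,
              StdDev(out.data(), out.size(), mean),
              AvgGrowRate(out.data(), out.size()));
  return 0;
}
```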
@@ -94,13 +107,36 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } - ss << setw(25) << left << "Operator Type" - << " " << setw(40) << left << "Kernel Name" - << " " << setw(12) << left << "Remark" - << " " << setw(12) << left << "Avg (ms)" - << " " << setw(12) << left << "Min (ms)" - << " " << setw(12) << left << "Max (ms)" - << " " << setw(12) << left << "Last (ms)" << std::endl; + ss << setw(20) << left << "OperatorType" + << " " << setw(30) << left << "KerneAttr"; + if (!concise) { + ss << " " << setw(24) << left << "KernelName"; + } + ss << " " << setw(16) << left << "Remark"; + if (!concise) { + ss << " " << setw(15) << left << "InDim" + << " " << setw(15) << left << "FilterDim" + << " " << setw(15) << left << "OutDim"; + } + ss << " " << setw(7) << left << "Avg(ms)" + << " " << setw(7) << left << "Min(ms)" + << " " << setw(7) << left << "Max(ms)"; + if (!concise) { + ss << " " << setw(7) << left << "Last(ms)"; + } + ss << " " << setw(7) << left << "Avg(%)"; + if (!concise) { + ss << " " << setw(7) << left << "GOPs" + << " " << setw(7) << left << "GOPS"; + } +#ifdef LITE_WITH_OPENCL + ss << " " << setw(9) << left << "clAvg(ms)" + << " " << setw(9) << left << "clMin(ms)" + << " " << setw(9) << left << "clMax(ms)" + << " " << setw(9) << left << "clAvg(%)"; +#endif + ss << std::endl; + // Profile information. if (concise) { std::map summary(op_comp); @@ -110,37 +146,126 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { ch->second.avg += unit.Timer(type)->LapTimes().Avg(w); ch->second.min += unit.Timer(type)->LapTimes().Min(w); ch->second.max += unit.Timer(type)->LapTimes().Max(w); +#ifdef LITE_WITH_OPENCL + ch->second.cl_avg += unit.Timer(type)->CLLapTimes().Avg(w); + ch->second.cl_min += unit.Timer(type)->CLLapTimes().Min(w); + ch->second.cl_max += unit.Timer(type)->CLLapTimes().Max(w); +#endif } else { - TimeInfo info({unit.Timer(type)->LapTimes().Avg(w), - unit.Timer(type)->LapTimes().Min(w), - unit.Timer(type)->LapTimes().Max(w)}); + TimeInfo info; + info.avg = unit.Timer(type)->LapTimes().Avg(w); + info.min = unit.Timer(type)->LapTimes().Min(w); + info.max = unit.Timer(type)->LapTimes().Max(w); +#ifdef LITE_WITH_OPENCL + info.cl_avg = unit.Timer(type)->CLLapTimes().Avg(w); + info.cl_min = unit.Timer(type)->CLLapTimes().Min(w); + info.cl_max = unit.Timer(type)->CLLapTimes().Max(w); +#endif summary.insert({unit.Character(), info}); } } + + // compute total time + float total = 0.0; + for (const auto& item : summary) { + total += item.second.avg; + } +#ifdef LITE_WITH_OPENCL + float cl_total = 0.0; + for (const auto& item : summary) { + cl_total += item.second.cl_avg; + } +#endif + for (const auto& item : summary) { + float percent = 0; + if (total > 0) { + percent = 100 * (item.second.avg / total); + } // clang-format off - ss << setw(25) << left << fixed << item.first.op_type \ - << " " << setw(40) << left << fixed << item.first.kernel_name \ - << " " << setw(12) << left << fixed << item.first.remark \ - << " " << setw(12) << left << fixed << item.second.avg \ - << " " << setw(12) << left << fixed << item.second.min \ - << " " << setw(12) << left << fixed << item.second.max \ - << " " << std::endl; + ss << setw(20) << left << fixed << item.first.op_type + << " " << setw(30) << left << fixed << item.first.kernel_attr + << " " << setw(16) << left << fixed << item.first.remark + << " " << setw(7) << left << fixed << setprecision(3) + << item.second.avg + 
<< " " << setw(7) << left << fixed << setprecision(3) + << item.second.min + << " " << setw(7) << left << fixed << setprecision(3) + << item.second.max + << " " << setprecision(2) << percent << "% "; +#ifdef LITE_WITH_OPENCL + float cl_percent = 0; + if (cl_total > 0) { + cl_percent = 100 * (item.second.cl_avg / cl_total); + } + ss << " " << setw(9) << left << fixed << setprecision(3) + << item.second.cl_avg + << " " << setw(9) << left << fixed << setprecision(3) + << item.second.cl_min + << " " << setw(9) << left << fixed << setprecision(3) + << item.second.cl_max + << " " << left << fixed <LapTimes(); + total += times.Avg(w); + } +#ifdef LITE_WITH_OPENCL + float cl_total = 0.0; + for (auto& unit : units_) { + const auto& cl_times = unit.Timer(type)->CLLapTimes(); + cl_total += cl_times.Avg(w); + } +#endif for (auto& unit : units_) { const auto& times = unit.Timer(type)->LapTimes(); + float run = times.Avg(w); + float percent = 0; + if (total > 0) { + percent = 100 * (run / total); + } + +#ifdef LITE_WITH_OPENCL + const auto& cl_times = unit.Timer(type)->CLLapTimes(); + float cl_run = cl_times.Avg(w); + float cl_percent = 0; + if (cl_total > 0) { + cl_percent = 100 * (cl_run / cl_total); + } +#endif + // clang-format off - ss << setw(25) << left << fixed << unit.Character().op_type \ - << " " << setw(40) << left << fixed << unit.Character().kernel_name \ - << " " << setw(12) << left << fixed << unit.Character().remark \ - << " " << setw(12) << left << fixed << times.Avg(w) \ - << " " << setw(12) << left << fixed << times.Min(w) \ - << " " << setw(12) << left << fixed << times.Max(w) \ - << " " << setw(12) << left << fixed << times.Last(w) \ - << std::endl; - // clang-format on + ss << setw(20) << left << fixed << unit.Character().op_type + << " " << setw(30) << left << fixed << unit.Character().kernel_attr + << " " << setw(24) << left << fixed + << unit.Character().kernel_func_name + << " " << setw(16) << left << fixed << unit.Character().remark + << " " << setw(15) << left << fixed << unit.Character().input_shape + << " " << setw(15) << left << fixed << unit.Character().filter_shape + << " " << setw(15) << left << fixed << unit.Character().output_shape + << " " << setw(7) << left << fixed << setprecision(3) << times.Avg(w) + << " " << setw(7) << left << fixed << setprecision(3) << times.Min(w) + << " " << setw(7) << left << fixed << setprecision(3) << times.Max(w) + << " " << setw(7) << left << fixed << setprecision(3) << times.Last(w) + << " " << left << setprecision(2) << percent << "% " + << " " << setw(7) << left << fixed << setprecision(2) + << 1e-9f * unit.Character().macs + << " " << setw(7) << left << fixed << setprecision(2) + << 1e-6f * unit.Character().macs / times.Avg(w); +// clang-format on +#ifdef LITE_WITH_OPENCL + ss << " " << setw(9) << left << fixed << setprecision(3) + << cl_times.Avg(w) << " " << setw(9) << left << fixed + << setprecision(3) << cl_times.Min(w) << " " << setw(9) << left + << fixed << setprecision(3) << cl_times.Max(w) << " " << left + << setprecision(2) << cl_percent << "% "; +#endif + ss << std::endl; } } return ss.str(); diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h index 3933e5ba01ebcb20420494a955cbc0e202879f76..ff77ef39c3f5e7284644ec7f79f57a2ffd29a3c8 100644 --- a/lite/core/profile/profiler.h +++ b/lite/core/profile/profiler.h @@ -18,6 +18,7 @@ #include #include #include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" namespace paddle { namespace lite { @@ -35,25 +36,61 @@ struct TimeInfo { float avg; float 
min; float max; +#ifdef LITE_WITH_OPENCL + float cl_avg; + float cl_min; + float cl_max; +#endif }; struct OpCharacter { TargetType target; + void* op_lite{nullptr}; std::string op_type{std::string("N/A")}; std::string kernel_name{std::string("N/A")}; + std::string kernel_attr{std::string("N/A")}; + std::string kernel_func_name{std::string("N/A")}; std::string remark{std::string("N/A")}; + + std::string input_shape{"N/A"}; + std::string output_shape{"N/A"}; + std::string filter_shape{"N/A"}; + + float macs{0}; + float macs_ps{0}; + + float io_duration{0}; + +#ifdef LITE_WITH_OPENCL + cl::Event cl_event{}; +#else + void* cl_event{nullptr}; +#endif + + std::string DimToStr(const paddle::lite::DDimLite& dim) { + if (!dim.size()) return "NotImpl"; + std::string dim_str{""}; + for (size_t i = 0; i < dim.size(); ++i) { + dim_str += std::to_string(dim[i]); + if (i != dim.size() - 1) { + dim_str += "x"; + } + } + return dim_str; + } }; class StatisUnit final { public: explicit StatisUnit(const OpCharacter& ch); lite::profile::Timer* Timer(Type type); - const OpCharacter& Character() const { return character; } + OpCharacter& Character() { return character; } + + OpCharacter character; protected: std::unique_ptr create_t; std::unique_ptr dispatch_t; - OpCharacter character; }; class Profiler final { @@ -62,8 +99,9 @@ class Profiler final { explicit Profiler(const std::string& name) : name_(name) {} int NewTimer(const OpCharacter& ch); void StartTiming(Type type, const int index, KernelContext* ctx); - float StopTiming(Type type, const int index, KernelContext* ctx); + void StopTiming(Type type, const int index, KernelContext* ctx); std::string Summary(Type type, bool concise = true, size_t warm_up = 10); + OpCharacter* GetOpCharacter(const size_t index); private: std::string name_{std::string("N/A")}; diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h index e9bb16bd27d5ec6fd21814c35db52b2467a12b51..ddb8a25899da95c353aeb6a98ff1ca44a63244c1 100644 --- a/lite/core/profile/timer.h +++ b/lite/core/profile/timer.h @@ -15,6 +15,7 @@ #pragma once #include #include // NOLINT +#include #include #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/cuda_utils.h" @@ -87,6 +88,22 @@ class Timer { this->laps_t_.Add(elapse_ms); return elapse_ms; } + +#ifdef LITE_WITH_OPENCL + float CLStop(const std::string& op_type, float io_duration, cl::Event event) { + float cl_kernel_elapse_ms = 0.0; + if (op_type != "io_copy") { + cl_kernel_elapse_ms = + CLRuntime::Global()->CLRuntime::GetCommandTime(event); + } else { + cl_kernel_elapse_ms = io_duration; + } + this->cl_laps_t_.Add(cl_kernel_elapse_ms); + return cl_kernel_elapse_ms; + } + const TimeList& CLLapTimes() const { return cl_laps_t_; } +#endif + virtual void Start(KernelContext* ctx) { return Start(); } virtual float Stop(KernelContext* ctx) { return Stop(); } float AvgLapTimeMs() const { return laps_t_.Avg(); } @@ -94,6 +111,9 @@ class Timer { protected: TimeList laps_t_; +#ifdef LITE_WITH_OPENCL + TimeList cl_laps_t_; +#endif private: std::chrono::time_point t_start_, t_stop_; diff --git a/lite/core/program.cc b/lite/core/program.cc index ce6bd3a36cd1d852f2d50f69c4be9e31b84b3f60..0d0fd22e8767c68434d9193cd7383e45a890d1f8 100755 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -13,6 +13,7 @@ // limitations under the License. 
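The summary's new GOPs and GOPS columns are plain bookkeeping over `OpCharacter`: GOPs is 1e-9 × `macs`, and since the averaged lap time is in milliseconds, throughput comes out as 1e-6 × `macs` / avg_ms. A small self-contained sketch of those conversions plus the `DimToStr`-style shape formatting follows; the numbers in it are made up for illustration.

```cpp
// Sketch of the bookkeeping behind the new summary columns: shape strings
// like "1x3x224x224" (cf. OpCharacter::DimToStr) and the GOPs/GOPS values
// derived from `macs` and the averaged lap time in milliseconds.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

std::string DimToStr(const std::vector<int64_t>& dim) {
  if (dim.empty()) return "NotImpl";
  std::string s;
  for (size_t i = 0; i < dim.size(); ++i) {
    s += std::to_string(dim[i]);
    if (i + 1 != dim.size()) s += "x";
  }
  return s;
}

int main() {
  float macs = 1.85e9f;  // work recorded for the kernel (made-up value)
  float avg_ms = 5.3f;   // Timer lap average, in milliseconds
  float gops = 1e-9f * macs;                   // "GOPs" column: total work
  float gops_per_sec = 1e-6f * macs / avg_ms;  // "GOPS" column: work per second
  std::printf("in=%s  GOPs=%.2f  GOPS=%.2f\n",
              DimToStr({1, 3, 224, 224}).c_str(), gops, gops_per_sec);
  return 0;
}
```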
#include "lite/core/program.h" +#include #include #include "lite/model_parser/cpp/block_desc.h" #include "lite/model_parser/cpp/op_desc.h" @@ -20,7 +21,7 @@ #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" -#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE #include "lite/core/profile/precision_profiler.h" #endif @@ -72,7 +73,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { std::unordered_map origin_var_maps; auto& main_block = *desc->GetBlock(0); auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { + for (size_t i = 0; i < var_size; i++) { auto v = main_block.GetVar(i); auto name = v->Name(); origin_var_maps.emplace(name, *v); @@ -85,48 +86,54 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { auto* scope = op->scope(); auto in_names = op->op_info()->input_names(); auto out_names = op->op_info()->output_names(); - for (auto& in_name : in_names) { - auto it = origin_var_maps.find(in_name); + + std::vector var_names; + var_names.insert(var_names.end(), in_names.begin(), in_names.end()); + var_names.insert(var_names.end(), out_names.begin(), out_names.end()); + std::sort(var_names.begin(), var_names.end()); + var_names.erase(std::unique(var_names.begin(), var_names.end()), + var_names.end()); + + for (auto& var_name : var_names) { + auto it = origin_var_maps.find(var_name); if (it != origin_var_maps.end()) { auto* v = main_block.AddVar(); v->SetName((it->second).Name()); v->SetType((it->second).GetType()); v->SetPersistable((it->second).Persistable()); + if ((it->second).Name() != "feed" && (it->second).Name() != "fetch") { + v->SetShape((it->second).GetShape()); + v->SetDataType((it->second).GetDataType()); + } } else { // New created vars must be LOD_TENSOR auto* v = main_block.AddVar(); - v->SetName(in_name); + v->SetName(var_name); v->SetType(cpp::VarDesc::Type::LOD_TENSOR); std::string in_arg_name; - op->op_info()->GetInputArgname(in_name, &in_arg_name); + op->op_info()->GetInputArgname(var_name, &in_arg_name); auto type = kernel->GetInputDeclType(in_arg_name); if (type->IsTensor()) { - auto tensor = scope->FindVar(in_name)->GetMutable(); + auto tensor = scope->FindVar(var_name)->GetMutable(); v->SetPersistable(tensor->persistable()); - } else { - CHECK(false) << "unsupported var type"; - } - } - } + if ((it->second).Name() != "feed" && (it->second).Name() != "fetch") { + v->SetShape(tensor->dims().data()); + switch (tensor->precision()) { +#define SET_DATATYPE(precision__, data_type) \ + case PrecisionType::precision__: \ + v->SetDataType(data_type); \ + break - for (auto& out_name : out_names) { - auto it = origin_var_maps.find(out_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(out_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string out_arg_name; - op->op_info()->GetOutputArgname(out_name, &out_arg_name); - auto type = kernel->GetOutputDeclType(out_arg_name); - if (type->IsTensor()) { - auto tensor = scope->FindVar(out_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); + SET_DATATYPE(kFloat, VarDescAPI::VarDataType::FP32); + SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8); + SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16); + SET_DATATYPE(kInt32, 
VarDescAPI::VarDataType::INT32); + SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64); +#undef SET_DATATYPE + default: + LOG(FATAL) << "unknown precision type"; + } + } } else { CHECK(false) << "unsupported var type"; } @@ -136,30 +143,41 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { +#ifdef LITE_WITH_PRECISION_PROFILE + auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); + std::string precision_profiler_summary = + inst_precision_profiler.GetSummaryHeader(); +#endif + for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; #endif + std::string op_type = inst.op()->op_info()->Type(); VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr() << " on Target " << TargetToStr(inst.kernel()->target()); -#ifndef LITE_WITH_FPGA - if (op_type == "feed" || op_type == "fetch") continue; +#ifdef LITE_WITH_CUDA + if (inst.need_sync()) { + inst.Sync(); + } #endif inst.Run(); -#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA - LITE_PRECISION_PROFILE(inst) + precision_profiler_summary += + inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); -#endif // LITE_WITH_PROFILE + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1); +#endif +#ifdef LITE_WITH_PRECISION_PROFILE + LOG(INFO) << "\n" << precision_profiler_summary; +#endif } void Program::Build(const cpp::ProgramDesc& prog) { @@ -285,6 +303,13 @@ void Instruction::Run() { op_->InferShape(); kernel_->Launch(); has_run_ = true; + +#ifdef LITE_WITH_PROFILE + if (first_epoch_for_profiler_) { + SetProfileRuntimeOpInfo(profiler_->GetOpCharacter(profile_id_)); + first_epoch_for_profiler_ = false; + } +#endif } STL::ostream& operator<<(STL::ostream& os, const Instruction& other) { diff --git a/lite/core/program.h b/lite/core/program.h index c845a17c52c0c565e339a13e093f3e8f59e8d4a7..5e25a5fcda3168b6c914d8b8dc9caf9e12390cd9 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -23,6 +23,9 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/model_parser/cpp/program_desc.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif namespace paddle { namespace lite { @@ -108,18 +111,39 @@ struct Instruction { bool is_feed_fetch_op() const { return is_feed_fetch_op_; } +#ifdef LITE_WITH_CUDA + bool need_sync() const { + if (kernel_->target() == TargetType::kCUDA) { + return kernel_->mutable_context()->As().need_sync(); + } else { + // the io_copy kernel has synced, so cpu kernels don't need sync.. 
+ return false; + } + } + void Sync() const { kernel_->mutable_context()->As().Sync(); } +#endif + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; if (op_->Type() != "feed" && op_->Type() != "fetch") { profile::OpCharacter ch; + ch.op_lite = static_cast(const_cast(op())); ch.target = kernel()->target(); ch.op_type = op_->Type(); ch.kernel_name = kernel()->name(); + ch.kernel_attr = kernel()->name().substr(ch.op_type.size() + 1, + kernel()->name().size()); + // append `ch.kernel_func_name` in StopTiming profile_id_ = profiler->NewTimer(ch); kernel_->SetProfiler(profiler_, profile_id_); } } + + void SetProfileRuntimeOpInfo(paddle::lite::profile::OpCharacter* ch) { + auto* op_lite = static_cast(ch->op_lite); + op_lite->GetOpRuntimeInfo(ch); + } #endif private: @@ -132,6 +156,7 @@ struct Instruction { #ifdef LITE_WITH_PROFILE profile::Profiler* profiler_; int profile_id_{-1}; + bool first_epoch_for_profiler_{true}; #endif // LITE_WITH_PROFILE }; diff --git a/lite/core/program_fake_utils.h b/lite/core/program_fake_utils.h index edcbb101aa5ddb090cc585a16597967cb5114936..fbee253872237bce08f3f67b948da79becbae21a 100644 --- a/lite/core/program_fake_utils.h +++ b/lite/core/program_fake_utils.h @@ -30,9 +30,9 @@ Program FakeProgram() { auto add_fc = [&](int id, std::string x) { // create variables - std::string w1 = "w" + std::to_string(id); - std::string b1 = "b" + std::to_string(id); - std::string out1 = "out" + std::to_string(id); + std::string w1 = "w" + paddle::lite::to_string(id); + std::string b1 = "b" + paddle::lite::to_string(id); + std::string out1 = "out" + paddle::lite::to_string(id); auto w1v = program.scope()->Var(w1)->GetMutable(); auto b1v = program.scope()->Var(b1)->GetMutable(); auto out1v = program.scope()->Var(out1)->GetMutable(); diff --git a/lite/core/scope.cc b/lite/core/scope.cc index 775652e2a0d3c962c17dc796ef5f1d381411fa50..d87360a1da8215332c71739bbfa2660977f4f74c 100644 --- a/lite/core/scope.cc +++ b/lite/core/scope.cc @@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const { return nullptr; } +// AttributeVarNames will get persistive attribute names stored in parent scope +std::vector Scope::AttributeVarNames() const { + std::vector resulted_keys; + const Scope *cur_scope = this; + while (cur_scope->parent()) { + cur_scope = cur_scope->parent(); + auto keys = cur_scope->LocalVarNames(); + resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end()); + } + // remove feed and fetch + std::vector skiped_vars = {"feed", "fetch"}; + for (int i = 0; i < skiped_vars.size(); i++) { + auto iter = + std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]); + while (iter != resulted_keys.end()) { + resulted_keys.erase(iter); + iter = + std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]); + } + } + return resulted_keys; +} + std::vector Scope::LocalVarNames() const { std::vector keys; for (const auto &item : vars_) { diff --git a/lite/core/scope.h b/lite/core/scope.h index 2593c365224a0564caa27cf10eee1f917b90c342..aa3a8a1bfb7f4bf1cc00b548c0b0962ce8d73663 100644 --- a/lite/core/scope.h +++ b/lite/core/scope.h @@ -45,6 +45,8 @@ class Scope final { const Scope* parent() const { return parent_; } + // Get attribute params stored in parent scopes. + std::vector AttributeVarNames() const; // Following the legacy scope interface. 
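`Scope::AttributeVarNames()` strips `feed` and `fetch` from the collected names by re-searching the vector after every erase. The same filtering is more commonly written with the erase-remove idiom, one linear pass per skipped name; a small equivalent sketch (not the framework code):

```cpp
// Equivalent filtering with the erase-remove idiom: each skipped name is
// removed in a single linear pass instead of a re-search after every erase.
#include <algorithm>
#include <string>
#include <vector>

void RemoveSkippedVars(std::vector<std::string>* names) {
  const std::vector<std::string> skipped = {"feed", "fetch"};
  for (const auto& s : skipped) {
    names->erase(std::remove(names->begin(), names->end(), s), names->end());
  }
}
```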
std::vector LocalVarNames() const; diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index 38a6be6767eae62f9d91c9c11811bc49639331bf..197ee4ddbcd5df62dd0f8a15eba39e2a880f7125 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -32,8 +32,8 @@ value_type DDimLite::production() const { } value_type DDimLite::count(int start, int end) const { - start = std::max(start, 0); - end = std::min(end, static_cast(data_.size())); + start = (std::max)(start, 0); + end = (std::min)(end, static_cast(data_.size())); if (end < start) { return 0; } @@ -45,8 +45,8 @@ value_type DDimLite::count(int start, int end) const { } DDimLite DDimLite::Slice(int start, int end) const { - start = std::max(start, 0); - end = std::min(end, static_cast(data_.size())); + start = (std::max)(start, 0); + end = (std::min)(end, static_cast(data_.size())); std::vector new_dim(end - start); for (int i = start; i < end; i++) { new_dim[i - start] = data_[i]; @@ -75,6 +75,7 @@ void TensorLite::ShareDataWith(const TensorLite &other) { target_ = other.target_; lod_ = other.lod_; memory_size_ = other.memory_size_; + precision_ = other.precision_; } void TensorLite::CopyDataFrom(const TensorLite &other) { @@ -82,6 +83,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { target_ = other.target_; lod_ = other.lod_; memory_size_ = other.memory_size_; + precision_ = other.precision_; buffer_->CopyDataFrom(*other.buffer_, memory_size_); } @@ -96,6 +98,21 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { return mutable_data(memory_size); } +void TensorLite::ResetBuffer(std::shared_ptr buffer, + size_t memory_size) { + CHECK_EQ(offset_, 0u) + << "Only the offset is supported to zero when the Buffer is reset."; + if (buffer_) { + CHECK_LE(memory_size_, buffer->space()) + << "The space of buffer is not enough to store the tensor."; + CHECK_LE(memory_size, buffer->space()) + << "The buffer is smaller than the specified minimum size."; + } + buffer_ = buffer; + memory_size_ = memory_size; + target_ = buffer->target(); +} + #ifdef LITE_WITH_OPENCL template <> const cl::Image2D *TensorLite::data() const { @@ -103,8 +120,8 @@ const cl::Image2D *TensorLite::data() const { return static_cast(buffer_->data()); } -template <> // use int16_t represent half float -const cl::Image2D *TensorLite::data() const { +template <> // use uint16_t represent half float +const cl::Image2D *TensorLite::data() const { if (nullptr == buffer_->data()) return nullptr; return static_cast(buffer_->data()); } diff --git a/lite/core/tensor.h b/lite/core/tensor.h index 04e540002b553a0e0f7db0144fd970bdb6a4d9ed..2209e524f413b4cedf255566bfc1b6b1f1229f8d 100755 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -102,9 +102,10 @@ using LoD = std::vector>; class TensorLite { public: TensorLite() : buffer_(std::make_shared()) {} + explicit TensorLite(std::shared_ptr buffer) : buffer_(buffer) {} template - void Assign(DType *data, const DimT &dim) { + void Assign(const DType *data, const DimT &dim) { Resize(dim); auto *dst = mutable_data(Target); CopySync( @@ -178,6 +179,11 @@ class TensorLite { (static_cast(buffer_->data()) + offset_)); } + void *raw_data() { + return static_cast( + (static_cast(buffer_->data()) + offset_)); + } + void clear() { buffer_->Free(); offset_ = 0; @@ -195,6 +201,8 @@ class TensorLite { void CopyDataFrom(const TensorLite &other); + void ResetBuffer(std::shared_ptr buffer, size_t memory_size); + TargetType target() const { return target_; } template @@ -260,8 +268,8 @@ bool TensorCompareWith(const TensorT &a, 
const TensorT &b) { template <> const cl::Image2D *TensorLite::data() const; -template <> // use int16_t represent half float -const cl::Image2D *TensorLite::data() const; +template <> // use uint16_t represent half float +const cl::Image2D *TensorLite::data() const; #endif } // namespace lite diff --git a/lite/core/type_system.cc b/lite/core/type_system.cc index 276d0c4a349794bed0ece755c924cf789a7cf54e..aaafd29841f44e671460a4c45babc7a8f663dacf 100644 --- a/lite/core/type_system.cc +++ b/lite/core/type_system.cc @@ -21,9 +21,9 @@ namespace lite { size_t ParamTypeRegistry::KernelIdTy::hash() const { std::hash h; size_t hash = h(kernel_type); - hash = hash_combine(hash, place.hash()); - hash = hash_combine(hash, std::hash()(static_cast(io))); - hash = hash_combine(hash, std::hash()(arg_name)); + lite::CombineHash(place.hash(), &hash); + lite::CombineHash(std::hash()(static_cast(io)), &hash); + lite::CombineHash(std::hash()(arg_name), &hash); return hash; } @@ -48,8 +48,7 @@ const Type *Type::GetTensorTy(TargetType target, // NOTE quite naive implementation here, but not performance sensitive. DataType::ID type_id = DataType::ID::Tensor; -#define HASH_ONE(x) v = hash_combine(v, hasher(static_cast(x))) - +#define HASH_ONE(x) CombineHash(hasher(static_cast(x)), &v); std::hash hasher; size_t v = hasher(static_cast(type_id)); HASH_ONE(target); @@ -80,8 +79,7 @@ const Type *Type::GetTensorListTy(TargetType target, static std::map type_repo; DataType::ID type_id = DataType::ID::TensorList; -#define HASH_ONE(x) v = hash_combine(v, hasher(static_cast(x))) - +#define HASH_ONE(x) CombineHash(hasher(static_cast(x)), &v); std::hash hasher; size_t v = hasher(static_cast(type_id)); HASH_ONE(target); diff --git a/lite/core/type_system.h b/lite/core/type_system.h index aeddf965c3b999750c7cca3595cc9f669b32d50e..2cf8366a2a1cbb6eb0c5f4e3dff3e4ac2623ff66 100644 --- a/lite/core/type_system.h +++ b/lite/core/type_system.h @@ -177,8 +177,9 @@ static bool TargetCompatibleTo(const Type& a, const Type& b) { return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); }; if (a.IsVoid() || b.IsVoid()) return true; - if (a.IsTensor() || b.IsTensor()) { - if (a.IsTensor() && b.IsTensor()) { + if (a.IsTensor() || b.IsTensor() || a.IsTensorList() || b.IsTensorList()) { + if ((a.IsTensor() && b.IsTensor()) || + (a.IsTensorList() && b.IsTensorList())) { return is_host(a.target()) ? 
is_host(b.target()) : a.target() == b.target(); } diff --git a/lite/core/types.cc b/lite/core/types.cc index 4ea383333d519ac2c481dce459ca49124a64df32..a19c5ed0a33986237ce03213875929d34a2fb363 100644 --- a/lite/core/types.cc +++ b/lite/core/types.cc @@ -67,31 +67,31 @@ STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k) { template <> Type StdTypeToRepr() { - return Type::_int32; + return Type::INT32; } template <> Type StdTypeToRepr() { - return Type::_int64; + return Type::INT64; } template <> Type StdTypeToRepr() { - return Type::_float32; + return Type::FLOAT32; } template <> Type StdTypeToRepr() { - return Type::_float64; + return Type::Float64; } template <> Type StdTypeToRepr>() { - return Type::_char_list; + return Type::CHARLIST; } template <> Type StdTypeToRepr() { - return Type::_string; + return Type::STRING; } template <> Type StdTypeToRepr() { - return Type::_bool; + return Type::BOOL; } } // namespace core diff --git a/lite/core/types.h b/lite/core/types.h index 8f154f9dd509d3627750ecbf301923a2296252d1..66dc44746a7496d9805e8cc2b6bf2df89b33ddbf 100644 --- a/lite/core/types.h +++ b/lite/core/types.h @@ -29,23 +29,23 @@ namespace core { */ // TODO(Superjomn) unify all the type representation across the lite framework. enum class Type { - _unk = -1, - // primary types - _int32, - _int64, - _float32, - _float64, - _bool, - _string, + UNK = -1, + // primary typesINT32, + INT32, + INT64, + FLOAT32, + Float64, + BOOL, + STRING, // primary list type - _char_list, + CHARLIST, // list types - _list, + LIST, // enum type - _enum, - _float16, + ENUM, + FLOAT16, // number of types - __num__, + NUM, }; enum class FluidType { @@ -81,7 +81,7 @@ enum class FluidType { template Type StdTypeToRepr() { - return Type::_unk; + return Type::UNK; } template <> Type StdTypeToRepr(); @@ -92,6 +92,8 @@ Type StdTypeToRepr(); template <> Type StdTypeToRepr(); template <> +Type StdTypeToRepr(); +template <> Type StdTypeToRepr>(); template <> Type StdTypeToRepr(); diff --git a/lite/core/version.h.in b/lite/core/version.h.in index d34c32073b852a50b5d26984ed4812ac4f38a870..da2d5f3ed99631973d97a94741e1711391237261 100644 --- a/lite/core/version.h.in +++ b/lite/core/version.h.in @@ -53,9 +53,9 @@ static std::string version() { static int64_t int_version(const std::string& version) { const std::vector vec = Split(version, "."); if (vec.size() == 3) { - return std::stoi(vec[0]) * MAJOR_COEFF + - std::stoi(vec[1]) * MINOR_COEFF + - std::stoi(vec[2]) * PATCH_COEFF; + return atoi(vec[0].c_str()) * MAJOR_COEFF + + atoi(vec[1].c_str()) * MINOR_COEFF + + atoi(vec[2].c_str()) * PATCH_COEFF; } return -1; } diff --git a/lite/core/workspace.h b/lite/core/workspace.h index 117b80aaa7863719536d8dbec70cf38c7ba04efc..54efb6699ac6df63286b26843f8d79b7c84949f1 100644 --- a/lite/core/workspace.h +++ b/lite/core/workspace.h @@ -69,6 +69,13 @@ class WorkSpace { } #endif +#if defined(LITE_WITH_MLU) + static WorkSpace& Global_MLU() { + thread_local std::unique_ptr x(new WorkSpace(TARGET(kMLU))); + return *x; + } +#endif + private: explicit WorkSpace(TargetType x) : target_(x) {} diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index 6fb0a11c2e623f295a2c9b31ff7c3146f9fc5b98..6f93c879d87e3668abc2dfc6757679e0988d64dd 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -8,12 +8,29 @@ 2. 
人脸识别和佩戴口罩判断的Demo -参考[源码编译](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/source_compile/)准备编译环境。 +目前,PaddleLite提供了shell端的人脸识别和佩戴口罩判断的Demo,首先基于已经准备好的Demo进行演示,然后介绍如何基于代码编译Demo并执行。 -执行下面命令,下载PaddleLite代码。 +**下载Demo并执行** + +下载压缩包[mask_demo](https://paddle-inference-dist.cdn.bcebos.com/PaddleLiteDemo/mask_demo_v2.6.tgz),解压到本地,其中包括编译好的可执行文件、模型文件、测试图片、PaddleLite 2.6版本动态库。 + +电脑连接安卓手机,在电脑shell端进入 `mask_demo` 目录。 + +执行 `sh run.sh`,会将文件push到手机端、执行口罩检测、pull结果图片。 + +在电脑端查看 `test_img_result.jpg`,即是口罩检测结果。 + + +**编译Demo并执行** + +参考[预测库编译](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html)准备编译环境。 + +执行下面命令,下载PaddleLite代码,切换到2.6版本分支。 ```shell git clone https://github.com/PaddlePaddle/Paddle-Lite.git cd Paddle-Lite +git fetch origin release/v2.6:release/v2.6 +git checkout release/v2.6 ``` 进入PaddleLite根目录,编译预测库。 @@ -24,53 +41,41 @@ cd Paddle-Lite --arm_lang=gcc \ --android_stl=c++_static \ --build_extra=ON \ - --shutdown_log=OFF \ + --with_log=ON \ full_publish ``` -进入编译目录,下载模型和图片的压缩包,编译可执行文件。 +编译完成后,进入Demo编译目录,执行脚本,会编译可执行文件,同时将可执行文件、预测库、模型、图片保存到 `mask_demo` 文件中。 ```shell cd build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mask_detection -wget https://paddle-inference-dist.bj.bcebos.com/mask_detection.tar.gz -tar zxvf mask_detection.tar.gz -make +sh prepare.sh ``` -当然,大家也可以通过PaddleHub下载人脸检测模型和口罩佩戴判断模型。 +当然,大家也可以通过PaddleHub下载人脸检测模型和口罩佩戴判断模型,然后使用 `opt`工具转换,最后替换 `mask_demo` 文件中的模型文件。 ``` -# 下载paddlehub以后,通过python执行以下代码 +# 参考[文档](https://github.com/PaddlePaddle/PaddleHub)安装PaddleHub + +# 参考[文档](https://www.paddlepaddle.org.cn/hubdetail?name=pyramidbox_lite_mobile_mask&en_category=ObjectDetection)安装模型,执行 hub install pyramidbox_lite_mobile_mask==1.3.0 + +#通过python执行以下代码,将模型保存在test_program文件夹之中,人脸检测和口罩佩戴判断模型分别存储在pyramidbox_lite和mask_detector之中。文件夹中的__model__是模型结构文件,__param__文件是权重文件 import paddlehub as hub pyramidbox_lite_mobile_mask = hub.Module(name="pyramidbox_lite_mobile_mask") -# 将模型保存在test_program文件夹之中 -pyramidbox_lite_mobile_mask.processor.save_inference_model(dirname="test_program") -# 通过以上命令,可以获得人脸检测和口罩佩戴判断模型,分别存储在pyramidbox_lite和mask_detector之中。文件夹中的__model__是模型结构文件,__param__文件是权重文件。 -# 从PaddleHub下载的是预测模型,需要使用PaddleLite提供的model_optimize_tools对预测模型进行转换,请参考[模型转换文档](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/model_optimize_tool/)。 -``` +pyramidbox_lite_mobile_mask.processor.save_inference_model(dirname="test_program") -电脑连接安卓手机,将可执行文件、测试图片、模型文件、预测库push到安卓手机上。 -``` -adb push mask_detection /data/local/tmp/ -adb push test.jpg /data/local/tmp/ -adb push face_detection /data/local/tmp -adb push mask_classification /data/local/tmp -adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb shell chmod +x /data/local/tmp/mask_detection +# 从PaddleHub下载的是预测模型,需要使用PaddleLite提供的 opt 对预测模型进行转换,请参考[模型转换文档](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/model_optimize_tool/)。 ``` -进入安卓手机,执行demo。 -``` -adb shell -cd /data/local/tmp -export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH -./mask_detection face_detection mask_classification test.jpg -``` +电脑连接安卓手机,在电脑shell端进入 `mask_demo` 目录。 -回到电脑端,将结果取出,查看如下效果图。 -``` -adb pull /data/local/tmp/test_mask_detection_result.jpg ./ -``` +执行 `sh run.sh`,会将文件push到手机端、执行口罩检测、pull结果图片。 + +在电脑端查看 `test_img_result.jpg`,即是口罩检测结果,如下图。 + +![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/75131866-bae64300-570f-11ea-9cad-17acfaea1cfc.jpg) -![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/74279176-6200cd00-4d55-11ea-9fc0-83cfc2b3b37d.jpg) 
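The model converted by `opt` is what the demo executables load through Paddle Lite's light API. A rough sketch of that loading step is shown below; `MobileConfig` and `set_model_from_file` are taken from the public light-API interface and should be treated as indicative, and the mask demo's own source chains two such predictors (face detection, then mask classification) around OpenCV pre/post-processing.

```cpp
// Rough sketch: load an opt-converted (*.nb) model with the light API and
// run one inference. API names assumed from the public Paddle Lite interface.
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "paddle_api.h"  // NOLINT

using namespace paddle::lite_api;  // NOLINT

std::shared_ptr<PaddlePredictor> LoadLightModel(const std::string& nb_path) {
  MobileConfig config;
  config.set_model_from_file(nb_path);  // model file produced by the opt tool
  return CreatePaddlePredictor<MobileConfig>(config);
}

std::vector<float> RunOnce(PaddlePredictor* predictor,
                           const std::vector<float>& chw_input,
                           const std::vector<int64_t>& input_shape) {
  std::unique_ptr<Tensor> input(std::move(predictor->GetInput(0)));
  input->Resize(input_shape);
  std::copy(chw_input.begin(), chw_input.end(), input->mutable_data<float>());
  predictor->Run();
  std::unique_ptr<const Tensor> output(std::move(predictor->GetOutput(0)));
  int64_t numel = 1;
  for (auto d : output->shape()) numel *= d;
  return std::vector<float>(output->data<float>(),
                            output->data<float>() + numel);
}
```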
+注:mask_detetion.cc 中的缩放因子shrink, 检测阈值detect_threshold, 可供自由配置: + - 缩放因子越大,模型运行速度越慢,检测准确率越高。 + - 检测阈值越高,人脸筛选越严格,检测出的人脸框可能越少。 3. 编译并运行全量api的demo(注:当编译模式为tiny_pubish时将不存在该demo) ```shell diff --git a/lite/demo/cxx/cuda_demo/CMakeLists.txt b/lite/demo/cxx/cuda_demo/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f057a1f189fdb92ff33f00d5ceacc83f7fc28c5d --- /dev/null +++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 2.8) +project(demo CXX C) + +add_definitions(-DLITE_WITH_CUDA) + +set(TARGET demo) +set(CMAKE_CXX_FLAGS "-std=c++11 -O3") + +set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx") +set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") + +include_directories("${LITE_ROOT}/include") +link_directories("${LITE_ROOT}/lib") +link_directories("${PROTOBUF_ROOT}/lib") +# cuda lib +link_directories("/usr/local/cuda/lib64/") + +add_executable(${TARGET} ${TARGET}.cc) + +set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so) +set(DEPS ${DEPS} protobuf-lite) +set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart") + +target_link_libraries(${TARGET} ${DEPS}) diff --git a/lite/demo/cxx/cuda_demo/demo.cc b/lite/demo/cxx/cuda_demo/demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..593e73cf83cd491fd8e33e415d17106dc8f4ce14 --- /dev/null +++ b/lite/demo/cxx/cuda_demo/demo.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +void RunModel(std::string model_dir) { + // 1. Create CxxConfig + CxxConfig config; + config.set_model_file(model_dir + "/__model__"); + config.set_param_file(model_dir + "/__params__"); + config.set_valid_places({ + Place{TARGET(kCUDA), PRECISION(kFloat)}, + }); + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + int num = 1; + int channels = 3; + int height = 608; + int width = 608; + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({num, channels, height, width}); + // fake input data + std::vector data(num * channels * height * width, 0); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = i % 10 * 0.1; + } + input_tensor->CopyFromCpu(data.data()); + std::unique_ptr size_tensor(std::move(predictor->GetInput(1))); + size_tensor->Resize({1, 2}); + std::vector size_data{608, 608}; + size_tensor->CopyFromCpu(size_data.data()); + + // 4. Run predictor + predictor->Run(); + + // 5. 
Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + std::vector out_cpu(ShapeProduction(output_tensor->shape()), 0); + std::cout << "output size is " << ShapeProduction(output_tensor->shape()) + << std::endl; + output_tensor->CopyToCpu(out_cpu.data()); + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << out_cpu[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); + return 0; +} diff --git a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7 index dd6d4b0960160e140e2f051b78814d2fee08d5e0..486ebf3bc34fa6fa0fd7bc5b4805c1fc757adf2b 100644 --- a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv7 @@ -43,7 +43,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY mask_detection: fetch_opencv mask_detection.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS) -mask_detection.o: mask_detection.cc +mask_detection.o: fetch_opencv mask_detection.cc $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc fetch_opencv: diff --git a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8 index c2f601ed2f68c342b47c5add451f84c537f978de..5bc714eb8831fd53ca0093fce6f70f9bec28815b 100644 --- a/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mask_detection/Makefile.android.armv8 @@ -43,7 +43,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY mask_detection: fetch_opencv mask_detection.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS) -mask_detection.o: mask_detection.cc +mask_detection.o: fetch_opencv mask_detection.cc $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc fetch_opencv: diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 index d659a316cd856fd550e83b125573409f239b8cf2..4a63563c4ff12b825e881327ec77adc5b2f03aeb 100644 --- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 @@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) ############################################################### # How to use one of static libaray: # @@ -40,7 +40,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS # 1. Comment above line using `libpaddle_light_api_shared.so` # 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` -#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) test_model_cv: fetch_opencv test_model_cv.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 index c80b07d5c029a3624a514e07375fd08e8770da25..70d6bed52b84be7d050ef15ab483e8d06342c82d 100644 --- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 @@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) ############################################################### # How to use one of static libaray: # # `libpaddle_api_full_bundled.a` # @@ -39,7 +39,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS # 1. Comment above line using `libpaddle_light_api_shared.so` # 2. Undo comment below line using `libpaddle_api_light_bundled.a` -#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) test_model_cv: fetch_opencv test_model_cv.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..39c2caa20bd566a2bb4480d302447187bc7a5e7a --- /dev/null +++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 @@ -0,0 +1,97 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include + +CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS) + 
+LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared +LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a +LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared +LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a + +########## +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + +test_helper.o: fetch_opencv test_helper.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc + +classification_full.o: fetch_opencv classification_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc + +classification_light.o: fetch_opencv classification_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc + +classification_full_shared: fetch_opencv classification_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +classification_full_static: fetch_opencv classification_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +classification_light_shared: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS} + +classification_light_static: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +###### +yolov3_full.o: fetch_opencv yolov3_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc + +yolov3_light.o: fetch_opencv yolov3_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc + +yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS} + +yolov3_light_static: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +##### +all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static + +clean: + rm -f *.o 
+ rm -f classification_full_shared + rm -f classification_full_static + rm -f classification_light_shared + rm -f classification_light_static + rm -f yolov3_full_shared + rm -f yolov3_full_static + rm -f yolov3_light_shared + rm -f yolov3_light_static diff --git a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..556fe9c772fc4a39d13ba9649c854c32b3370d8f --- /dev/null +++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8 @@ -0,0 +1,97 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include + +CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS) + +LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared +LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a +LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared +LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a + +########## +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + +test_helper.o: fetch_opencv test_helper.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc + +classification_full.o: fetch_opencv classification_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc + +classification_light.o: fetch_opencv classification_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc + +classification_full_shared: fetch_opencv classification_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +classification_full_static: fetch_opencv classification_full.o test_helper.o + $(CC) 
$(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +classification_light_shared: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS} + +classification_light_static: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +###### +yolov3_full.o: fetch_opencv yolov3_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc + +yolov3_light.o: fetch_opencv yolov3_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc + +yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS} + +yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +##### +all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static + +clean: + rm -f *.o + rm -f classification_full_shared + rm -f classification_full_static + rm -f classification_light_shared + rm -f classification_light_static + rm -f yolov3_full_shared + rm -f yolov3_full_static + rm -f yolov3_light_shared + rm -f yolov3_light_static diff --git a/lite/demo/cxx/mask_detection/mask_detection.cc b/lite/demo/cxx/mask_detection/mask_detection.cc index 748b84365fc70aa59171a6bf8847f554308fdc8c..fe78f5d8d35ea65288c09a2dc63e0f25d3a3ecb1 100644 --- a/lite/demo/cxx/mask_detection/mask_detection.cc +++ b/lite/demo/cxx/mask_detection/mask_detection.cc @@ -81,6 +81,29 @@ void neon_mean_scale(const float* din, } } +cv::Mat crop_img(const cv::Mat& img, + cv::Rect rec, + int res_width, + int res_height) { + float xmin = rec.x; + float ymin = rec.y; + float w = rec.width; + float h = rec.height; + float center_x = xmin + w / 2; + float center_y = ymin + h / 2; + cv::Point2f center(center_x, center_y); + float max_wh = std::max(w / 2, h / 2); + float scale = res_width / (2 * max_wh * 1.5); + cv::Mat rot_mat = cv::getRotationMatrix2D(center, 0.f, scale); + rot_mat.at(0, 2) = + rot_mat.at(0, 2) - (center_x - res_width / 2.0); + rot_mat.at(1, 2) = + rot_mat.at(1, 2) - (center_y - res_width / 2.0); + cv::Mat affine_img; + cv::warpAffine(img, affine_img, rot_mat, cv::Size(res_width, res_height)); + return affine_img; +} + void pre_process(const cv::Mat& img, int width, int height, @@ -89,8 +112,12 @@ void pre_process(const cv::Mat& img, float* data, bool is_scale = false) { cv::Mat resized_img; - cv::resize( - img, resized_img,
cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + if (img.cols != width || img.rows != height) { + cv::resize( + img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + } else { + resized_img = img; + } cv::Mat imgf; float scale_factor = is_scale ? 1.f / 256 : 1.f; resized_img.convertTo(imgf, CV_32FC3, scale_factor); @@ -98,12 +125,12 @@ void pre_process(const cv::Mat& img, neon_mean_scale(dimg, data, width * height, mean, scale); } -void RunModel(std::string det_model_dir, - std::string class_model_dir, +void RunModel(std::string det_model_file, + std::string class_model_file, std::string img_path) { // Prepare cv::Mat img = imread(img_path, cv::IMREAD_COLOR); - float shrink = 0.2; + float shrink = 0.4; int width = img.cols; int height = img.rows; int s_width = static_cast(width * shrink); @@ -111,11 +138,12 @@ void RunModel(std::string det_model_dir, // Detection MobileConfig config; - config.set_model_dir(det_model_dir); + config.set_model_from_file(det_model_file); // Create Predictor For Detction Model std::shared_ptr predictor = CreatePaddlePredictor(config); + std::cout << "Load detecion model succeed." << std::endl; // Get Input Tensor std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); @@ -136,9 +164,10 @@ void RunModel(std::string det_model_dir, auto* outptr = output_tensor0->data(); auto shape_out = output_tensor0->shape(); int64_t out_len = ShapeProduction(shape_out); + std::cout << "Detecting face succeed." << std::endl; // Filter Out Detection Box - float detect_threshold = 0.3; + float detect_threshold = 0.7; std::vector detect_result; for (int i = 0; i < out_len / 6; ++i) { if (outptr[1] >= detect_threshold) { @@ -158,10 +187,11 @@ void RunModel(std::string det_model_dir, } // Classification - config.set_model_dir(class_model_dir); + config.set_model_from_file(class_model_file); // Create Predictor For Classification Model predictor = CreatePaddlePredictor(config); + std::cout << "Load classification model succeed." << std::endl; // Get Input Tensor std::unique_ptr input_tensor1(std::move(predictor->GetInput(0))); @@ -172,10 +202,14 @@ void RunModel(std::string det_model_dir, int detect_num = detect_result.size(); std::vector classify_mean = {0.5f, 0.5f, 0.5f}; std::vector classify_scale = {1.f, 1.f, 1.f}; - float classify_threshold = 0.5; for (int i = 0; i < detect_num; ++i) { cv::Rect rec_clip = detect_result[i].rec; - cv::Mat roi = img(rec_clip); + cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h); + + // uncomment two lines below, save roi img to disk + // std::string roi_name = "roi_" + paddle::lite::to_string(i) + // + ".jpg"; + // imwrite(roi_name, roi); // Do PreProcess pre_process(roi, @@ -191,56 +225,81 @@ void RunModel(std::string det_model_dir, // Get Output Tensor std::unique_ptr output_tensor1( - std::move(predictor->GetOutput(1))); + std::move(predictor->GetOutput(0))); auto* outptr = output_tensor1->data(); + float prob = outptr[1]; // Draw Detection and Classification Results - cv::rectangle(img, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); - std::string text = outptr[1] > classify_threshold ? 
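// The crop_img() helper added above builds its warp from two pieces: a uniform
// scale that fits a square 1.5x the larger half-size of the detection box into the
// output, and a translation that moves the box centre onto the output centre.
// The standalone sketch below reproduces just that arithmetic so the numbers are
// easy to check; the box (100, 80, 60, 90) and the 128x128 output size are made-up
// values for illustration, not taken from the demo.
#include <algorithm>
#include <iostream>

int main() {
  float xmin = 100.f, ymin = 80.f, w = 60.f, h = 90.f;  // hypothetical detection box
  int res_width = 128;                                  // output is square in the demo

  float center_x = xmin + w / 2;                  // 130
  float center_y = ymin + h / 2;                  // 125
  float max_wh = std::max(w / 2, h / 2);          // 45
  float scale = res_width / (2 * max_wh * 1.5f);  // 128 / 135 ~= 0.95

  // With a zero angle, getRotationMatrix2D(center, 0, scale) yields
  // [scale, 0, (1 - scale) * cx; 0, scale, (1 - scale) * cy]; crop_img then shifts
  // the translation so the box centre lands on the output centre. (The demo uses
  // res_width for both axes, which only matters for non-square outputs.)
  float tx = (1 - scale) * center_x - (center_x - res_width / 2.f);
  float ty = (1 - scale) * center_y - (center_y - res_width / 2.f);
  float mapped_cx = scale * center_x + tx;  // = res_width / 2 = 64

  std::cout << "scale=" << scale << " tx=" << tx << " ty=" << ty
            << " mapped_cx=" << mapped_cx << std::endl;
  return 0;
}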
"wear mask" : "no mask"; - int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; - double font_scale = 1.f; - int thickness = 1; + bool flag_mask = prob > 0.5f; + cv::Scalar roi_color; + std::string text; + if (flag_mask) { + text = "MASK: "; + roi_color = cv::Scalar(0, 255, 0); + } else { + text = "NO MASK: "; + roi_color = cv::Scalar(0, 0, 255); + prob = 1 - prob; + } + std::string prob_str = std::to_string(prob * 100); + int point_idx = prob_str.find_last_of("."); + + text += prob_str.substr(0, point_idx + 3) + "%"; + int font_face = cv::FONT_HERSHEY_SIMPLEX; + double font_scale = 0.38; + float thickness = 1; cv::Size text_size = cv::getTextSize(text, font_face, font_scale, thickness, nullptr); - float new_font_scale = rec_clip.width * 0.7 * font_scale / text_size.width; - text_size = - cv::getTextSize(text, font_face, new_font_scale, thickness, nullptr); + + int top_space = std::max(0.35 * text_size.height, 2.0); + int bottom_space = top_space + 2; + int right_space = 0.05 * text_size.width; + int back_width = text_size.width + right_space; + int back_height = text_size.height + top_space + bottom_space; + + // Configure text background + cv::Rect text_back = + cv::Rect(rec_clip.x, rec_clip.y - back_height, back_width, back_height); + + // Draw roi object, text, and background + cv::rectangle(img, rec_clip, roi_color, 1); + cv::rectangle(img, text_back, cv::Scalar(225, 225, 225), -1); cv::Point origin; - origin.x = rec_clip.x + 5; - origin.y = rec_clip.y + text_size.height + 5; + origin.x = rec_clip.x; + origin.y = rec_clip.y - bottom_space; cv::putText(img, text, origin, font_face, - new_font_scale, - cv::Scalar(0, 255, 255), - thickness, - cv::LINE_AA); + font_scale, + cv::Scalar(0, 0, 0), + thickness); std::cout << "detect face, location: x=" << rec_clip.x << ", y=" << rec_clip.y << ", width=" << rec_clip.width - << ", height=" << rec_clip.height - << ", wear mask: " << (outptr[1] > classify_threshold) - << std::endl; + << ", height=" << rec_clip.height << ", wear mask: " << flag_mask + << ", prob: " << prob << std::endl; } // Write Result to Image File int start = img_path.find_last_of("/"); int end = img_path.find_last_of("."); std::string img_name = img_path.substr(start + 1, end - start - 1); - std::string result_name = img_name + "_mask_detection_result.jpg"; + std::string result_name = img_name + "_result.jpg"; cv::imwrite(result_name, img); + std::cout << "write result to file: " << result_name << ", success." 
+ << std::endl; } int main(int argc, char** argv) { if (argc < 3) { std::cerr << "[ERROR] usage: " << argv[0] - << " detction_model_dir classification_model_dir image_path\n"; + << " detction_model_file classification_model_file image_path\n"; exit(1); } - std::string detect_model_dir = argv[1]; - std::string classify_model_dir = argv[2]; + std::string detect_model_file = argv[1]; + std::string classify_model_file = argv[2]; std::string img_path = argv[3]; - RunModel(detect_model_dir, classify_model_dir, img_path); + RunModel(detect_model_file, classify_model_file, img_path); return 0; } diff --git a/lite/demo/cxx/mask_detection/prepare.sh b/lite/demo/cxx/mask_detection/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..e736b145590e08160a27931ba6f8198c0aef992a --- /dev/null +++ b/lite/demo/cxx/mask_detection/prepare.sh @@ -0,0 +1,24 @@ +# make +make -j + +# mkdir +gf=mask_demo +if [ -d ${gf} ];then + rm -rf ${gf} +fi +mkdir ${gf} + +# collect files +cp run.sh ${gf} +cp mask_detection ${gf} +cp ../../../cxx/lib/libpaddle_light_api_shared.so ${gf} + +if [ ! -f "mask_models_img.tar.gz" ]; +then + wget -c https://paddle-inference-dist.cdn.bcebos.com/PaddleLiteDemo/mask_models_img.tar.gz +fi +tar zxf mask_models_img.tar.gz +mv mask_models_img ${gf} + +# clean +make clean diff --git a/lite/demo/cxx/mask_detection/run.sh b/lite/demo/cxx/mask_detection/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..536b63c32844fe022664e417151aead5ef0e279e --- /dev/null +++ b/lite/demo/cxx/mask_detection/run.sh @@ -0,0 +1,12 @@ +adb push ../mask_demo /data/local/tmp/ + +mask_demo_path="/data/local/tmp/mask_demo" + +adb shell "cd ${mask_demo_path} \ + && export LD_LIBRARY_PATH=${mask_demo_path}:${LD_LIBRARY_PATH} \ + && ./mask_detection \ + mask_models_img/pyramidbox_lite_opt2.nb \ + mask_models_img/mask_detector_opt2.nb \ + mask_models_img/test_img.jpg" + +adb pull ${mask_demo_path}/test_img_result.jpg . diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc index d0cf59e185e1330b7d8487d562afa0af29236007..518040ebd07bb4e8940f6a885cddd4f3c98143f3 100644 --- a/lite/demo/cxx/mobile_classify/mobile_classify.cc +++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc @@ -126,7 +126,7 @@ void pre_process(const cv::Mat& img, neon_mean_scale(dimg, data, width * height, means, scales); } -void RunModel(std::string model_dir, +void RunModel(std::string model_file, std::string img_path, const std::vector& labels, const int topk, @@ -134,7 +134,7 @@ void RunModel(std::string model_dir, int height) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. 
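// The demos in this patch all move from set_model_dir() to set_model_from_file(),
// which loads a single optimized .nb file produced by the opt tool. A minimal,
// self-contained sketch of that flow is below, mirroring the calls used by the
// demos; the model path and the 1x3x224x224 input shape are placeholders, and
// error handling is omitted.
#include <iostream>
#include <memory>
#include "paddle_api.h"  // NOLINT

using namespace paddle::lite_api;  // NOLINT

int main() {
  MobileConfig config;
  config.set_model_from_file("./mobilenet_v1_opt.nb");  // hypothetical .nb file
  // Models optimized before release/v2.3.0 are loaded with set_model_dir() instead.

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);

  std::unique_ptr<Tensor> input(std::move(predictor->GetInput(0)));
  input->Resize({1, 3, 224, 224});
  auto* in_data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in_data[i] = 1.f;

  predictor->Run();

  std::unique_ptr<Tensor> output(std::move(predictor->GetOutput(0)));
  std::cout << "first output value: " << output->data<float>()[0] << std::endl;
  return 0;
}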
Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -169,12 +169,12 @@ void RunModel(std::string model_dir, int main(int argc, char** argv) { if (argc < 4) { std::cerr << "[ERROR] usage: " << argv[0] - << " model_dir image_path label_file\n"; + << " model_file image_path label_file\n"; exit(1); } - printf("parameter: model_dir, image_path and label_file are necessary \n"); + printf("parameter: model_file, image_path and label_file are necessary \n"); printf("parameter: topk, input_width, input_height, are optional \n"); - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; std::string label_file = argv[3]; std::vector labels; @@ -190,6 +190,6 @@ int main(int argc, char** argv) { height = atoi(argv[6]); } - RunModel(model_dir, img_path, labels, topk, width, height); + RunModel(model_file, img_path, labels, topk, width, height); return 0; } diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 0c9da1a76422edae45dfeec5d38556a5e2322a85..2a819883fa316bd1898c063912800b57804218db 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -18,6 +18,11 @@ #include "paddle_api.h" // NOLINT #include "paddle_use_passes.h" // NOLINT +#if defined(_WIN32) +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#endif + using namespace paddle::lite_api; // NOLINT DEFINE_string(model_dir, "", "Model dir path."); diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 1f7c4522f159dd080b5965fb383ab6624df3db4e..3d09c071aa7ecbe51f1723cad314f2aedcdb2bd7 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -12,8 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include #include +#include #include + #include "paddle_api.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -24,13 +29,59 @@ int64_t ShapeProduction(const shape_t& shape) { return res; } -void RunModel(std::string model_dir) { +std::string ShapePrint(const shape_t& shape) { + std::string shape_str{""}; + for (auto i : shape) { + shape_str += std::to_string(i) + " "; + } + return shape_str; +} + +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +void RunModel(std::string model_dir, + const shape_t& input_shape, + size_t repeats, + size_t warmup, + size_t print_output_elem, + size_t power_mode) { // 1. Set MobileConfig MobileConfig config; config.set_model_from_file(model_dir); // NOTE: To load model transformed by model_optimize_tool before // release/v2.3.0, plese use `set_model_dir` API as listed below. // config.set_model_dir(model_dir); + config.set_power_mode(static_cast(power_mode)); // 2. 
Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -38,31 +89,115 @@ void RunModel(std::string model_dir) { // 3. Prepare input data std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize({1, 3, 224, 224}); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); auto* data = input_tensor->mutable_data(); for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { data[i] = 1; } // 4. Run predictor - predictor->Run(); + for (size_t widx = 0; widx < warmup; ++widx) { + predictor->Run(); + } + + double sum_duration = 0.0; // millisecond; + double max_duration = 1e-5; + double min_duration = 1e5; + double avg_duration = -1; + for (size_t ridx = 0; ridx < repeats; ++ridx) { + auto start = GetCurrentUS(); + + predictor->Run(); + + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + max_duration = duration > max_duration ? duration : max_duration; + min_duration = duration < min_duration ? duration : min_duration; + std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration + << " ms" << std::endl; + } + avg_duration = sum_duration / static_cast(repeats); + std::cout << "\n======= benchmark summary =======\n" + << "input_shape(NCHW):" << ShapePrint(input_shape) << "\n" + << "model_dir:" << model_dir << "\n" + << "warmup:" << warmup << "\n" + << "repeats:" << repeats << "\n" + << "max_duration:" << max_duration << "\n" + << "min_duration:" << min_duration << "\n" + << "avg_duration:" << avg_duration << "\n"; // 5. Get output - std::unique_ptr output_tensor( - std::move(predictor->GetOutput(0))); - std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; - for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + std::cout << "\n====== output summary ====== " << std::endl; + size_t output_tensor_num = predictor->GetOutputNames().size(); + std::cout << "output tensor num:" << output_tensor_num << std::endl; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + std::unique_ptr output_tensor = + predictor->GetOutput(tidx); + std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; + auto out_shape = output_tensor->shape(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, ShapeProduction(out_shape)); + auto out_std_dev = compute_standard_deviation( + out_data, ShapeProduction(out_shape), true, out_mean); + + std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " elem num:" << ShapeProduction(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " standard deviation:" << out_std_dev << std::endl; + std::cout << "output tensor " << tidx << " mean value:" << out_mean << std::endl; + + // print output + if (print_output_elem) { + for (int i = 0; i < ShapeProduction(out_shape); ++i) { + std::cout << "out[" << tidx << "][" << i + << "]:" << output_tensor->data()[i] << std::endl; + } + } } } int main(int argc, char** argv) { - if (argc < 2) { - std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; - exit(1); + shape_t input_shape{1, 3, 224, 224}; // shape_t ==> std::vector + int repeats = 10; + int warmup = 10; + int print_output_elem = 0; + + if (argc > 2 && argc < 9) { + std::cerr << "usage: ./" << argv[0] << "\n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " " << std::endl; + 
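// The benchmark harness added above warms the predictor up, then times
// predictor->Run() over `repeats` iterations and reports min/max/avg latency.
// The same warmup-then-measure pattern is sketched standalone below with
// std::chrono instead of gettimeofday(), so it also builds where <sys/time.h>
// is unavailable; DummyWorkload() merely stands in for the real Run() call and
// is not part of the demo.
#include <algorithm>
#include <chrono>
#include <iostream>

static void DummyWorkload() {
  volatile double acc = 0;
  for (int i = 0; i < 1000000; ++i) acc += i * 0.5;
}

int main() {
  const int warmup = 10;
  const int repeats = 10;
  for (int i = 0; i < warmup; ++i) DummyWorkload();  // untimed warm-up runs

  double sum_ms = 0.0, max_ms = 0.0, min_ms = 1e30;
  for (int i = 0; i < repeats; ++i) {
    auto t0 = std::chrono::steady_clock::now();
    DummyWorkload();  // replace with predictor->Run()
    auto t1 = std::chrono::steady_clock::now();
    double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    sum_ms += ms;
    max_ms = std::max(max_ms, ms);
    min_ms = std::min(min_ms, ms);
  }
  std::cout << "avg: " << sum_ms / repeats << " ms, min: " << min_ms
            << " ms, max: " << max_ms << " ms" << std::endl;
  return 0;
}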
return 0; } + std::string model_dir = argv[1]; - RunModel(model_dir); + if (argc >= 9) { + input_shape[0] = atoi(argv[2]); + input_shape[1] = atoi(argv[3]); + input_shape[2] = atoi(argv[4]); + input_shape[3] = atoi(argv[5]); + repeats = atoi(argv[6]); + warmup = atoi(argv[7]); + print_output_elem = atoi(argv[8]); + } + // set arm power mode: + // 0 for big cluster, high performance + // 1 for little cluster + // 2 for all cores + // 3 for no bind + size_t power_mode = 0; + + RunModel( + model_dir, input_shape, repeats, warmup, print_output_elem, power_mode); + return 0; } diff --git a/lite/demo/cxx/ssd_detection/ssd_detection.cc b/lite/demo/cxx/ssd_detection/ssd_detection.cc index 2408afcbf64a24924eca119a9d9481dc030250c9..0be4561cd8d083f26e562c2346da217bb4b48283 100644 --- a/lite/demo/cxx/ssd_detection/ssd_detection.cc +++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc @@ -162,10 +162,10 @@ std::vector detect_object(const float* data, return rect_out; } -void RunModel(std::string model_dir, std::string img_path) { +void RunModel(std::string model_file, std::string img_path) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -199,11 +199,11 @@ void RunModel(std::string model_dir, std::string img_path) { int main(int argc, char** argv) { if (argc < 3) { - std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + std::cerr << "[ERROR] usage: " << argv[0] << " model_file image_path\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; - RunModel(model_dir, img_path); + RunModel(model_file, img_path); return 0; } diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md index 36d2985a4fd4f243027f8caab9b6c5a8beb94cad..21574a9bf9fd0ebb3ecf1663f49beed93fdf51bb 100644 --- a/lite/demo/cxx/test_cv/README.md +++ b/lite/demo/cxx/test_cv/README.md @@ -1,5 +1,5 @@ # 图像预测库的使用 -1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式 +1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish or tiny_publish模式 example: ```shell set BUILD_WITH_CV=ON or LITE_WITH_CV=ON @@ -8,7 +8,7 @@ set BUILD_WITH_CV=ON or LITE_WITH_CV=ON --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static -full_publish +tiny_publish ``` 2. 
准备模型和优化模型 @@ -17,7 +17,7 @@ example: wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz ./lite/tools/build.sh build_optimize_tool -./build.model_optimize_tool/lite/api/model_optimize_tool +./build.opt/lite/api/opt --optimize_out_type=naive_buffer --optimize_out=model_dir --model_dir=model_dir @@ -68,7 +68,8 @@ make adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push test_model_cv /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/ -adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " @@ -119,7 +120,8 @@ make adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push test_img_propress /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/ -adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 " diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc index c2cbd66cc0a15a1032141641d83fbf8db85d20bf..1fe632d387cb5ed7a94ad1fcc37d4313b452d368 100644 --- a/lite/demo/cxx/test_cv/test_img_prepross.cc +++ b/lite/demo/cxx/test_cv/test_img_prepross.cc @@ -28,362 +28,1034 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::DataLayoutType LayoutType; using namespace paddle::lite_api; // NOLINT -void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT +// crop point +int flag_left_x = 50; +int flag_left_y = 50; +void fill_with_mat(cv::Mat& mat, uint8_t* src, int num) { // NOLINT for (int i = 0; i < mat.rows; i++) { for (int j = 0; j < mat.cols; j++) { - int tmp = (i * mat.cols + j) * 3; - cv::Vec3b& rgb = mat.at(i, j); - rgb[0] = src[tmp]; - rgb[1] = src[tmp + 1]; - rgb[2] = src[tmp + 2]; - } - } -} -void test_img(std::vector cluster_id, - std::vector thread_num, - std::string img_path, - std::string dst_path, - ImageFormat srcFormat, - ImageFormat dstFormat, - int width, - int height, - float rotate, - FlipParam flip, - LayoutType layout, - std::string model_dir, - int test_iter = 1) { - // init - // paddle::lite::DeviceInfo::Init(); - // read img and pre-process - cv::Mat img = imread(img_path, cv::IMREAD_COLOR); - float means[3] = {0.485f, 0.456f, 0.406f}; - float scales[3] = {0.229f, 0.224f, 0.225f}; - int srch = img.rows; - int srcw = img.cols; - for (auto& cls : cluster_id) { - for (auto& th : thread_num) { - std::cout << "cluster: " << cls << ", threads: " << th << std::endl; - // 1. Set MobileConfig - MobileConfig config; - config.set_model_dir(model_dir); - config.set_power_mode((PowerMode)cls); - config.set_threads(th); - std::cout << "model: " << model_dir; - - // 2. 
Create PaddlePredictor by MobileConfig - std::shared_ptr predictor = - CreatePaddlePredictor(config); - - // 3. Prepare input data from image - std::unique_ptr input_tensor(predictor->GetInput(0)); - - /* - imread(img_path, param) - IMREAD_UNCHANGED(<0) 表示加载原图,不做任何改变 - IMREAD_GRAYSCALE ( 0)表示把原图作为灰度图像加载进来 - IMREAD_COLOR (>0) 表示把原图作为RGB图像加载进来 - */ - cv::Mat img; - if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { - img = imread(img_path, cv::IMREAD_COLOR); - } else if (srcFormat == ImageFormat::GRAY) { - img = imread(img_path, cv::IMREAD_GRAYSCALE); + if (num == 1) { + int tmp = (i * mat.cols + j); + } else if (num == 2) { + int tmp = (i * mat.cols + j) * 2; + cv::Vec2b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } else if (num == 3) { + int tmp = (i * mat.cols + j) * 3; + cv::Vec3b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } else if (num == 4) { + int tmp = (i * mat.cols + j) * 4; + cv::Vec4b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + rgb[3] = src[tmp + 3]; } else { - printf("this format %d does not support \n", srcFormat); + std::cout << "it is not support" << std::endl; return; } - if (img.empty()) { - std::cout << "opencv read image " << img_path.c_str() << " failed" - << std::endl; - return; - } - int srch = img.rows; - int srcw = img.cols; - int dsth = height; - int dstw = width; - - std::cout << " input tensor size, num= " << 1 << ", channel= " << 1 - << ", height= " << srch << ", width= " << srcw - << ", srcFormat= " << (ImageFormat)srcFormat << std::endl; - // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, - if (srcFormat == ImageFormat::GRAY) { - std::cout << "srcFormat: GRAY" << std::endl; - } - if (srcFormat == ImageFormat::BGR) { - std::cout << "srcFormat: BGR" << std::endl; - } - if (srcFormat == ImageFormat::RGB) { - std::cout << "srcFormat: RGB" << std::endl; - } - std::cout << " output tensor size, num=" << 1 << ", channel=" << 1 - << ", height=" << dsth << ", width=" << dstw - << ", dstFormat= " << (ImageFormat)dstFormat << std::endl; + } + } +} - if (dstFormat == ImageFormat::GRAY) { - std::cout << "dstFormat: GRAY" << std::endl; - } - if (dstFormat == ImageFormat::BGR) { - std::cout << "dstFormat: BGR" << std::endl; - } - if (dstFormat == ImageFormat::RGB) { - std::cout << "dstFormat: RGB" << std::endl; - } +double compare_diff(uint8_t* data1, uint8_t* data2, int size, uint8_t* diff_v) { + double diff = 0.0; + for (int i = 0; i < size; i++) { + double val = abs(data1[i] - data2[i]); + diff_v[i] = val; + diff = val > diff ? 
val : diff; + } + return diff; +} +void print_data(const uint8_t* data, int size) { + for (int i = 0; i < size; i++) { + if ((i + 1) % 10 == 0) { + std::cout << std::endl; + } + } + std::cout << std::endl; +} +bool test_convert(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + ImageFormat srcFormat, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; - std::cout << "Rotate = " << rotate << ", Flip = " << flip - << ", Layout = " << static_cast(layout) << std::endl; - if (static_cast(layout) != 1 && static_cast(layout) != 3) { - std::cout << "this layout" << static_cast(layout) - << " is no support" << std::endl; + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // convert bgr-gray + if (dstFormat == srcFormat) { + cv::Rect rect(0, 0, dstw, dsth); + im_resize = img(rect); + } else if ((dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB) && + srcFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_resize, cv::COLOR_GRAY2BGR); + } else if ((srcFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGBA) && + dstFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_resize, cv::COLOR_BGR2GRAY); + } else if (dstFormat == srcFormat) { + printf("convert format error \n"); + return false; } - int size = 3 * srch * srcw; - if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { - size = 3 * srch * srcw; - } else if (srcFormat == ImageFormat::GRAY) { - size = srch * srcw; + clock_t end = clock(); + to_cv += (end - begin); + } + } + + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageConvert(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; + + std::cout << "---opencv convert run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite convert run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "lite out: " << std::endl; + print_data(diff_v, out_size); + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/convert.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == 
ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - uint8_t* src = img.data; + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); - int out_size = srch * srcw; - int resize = dstw * dsth; + std::cout << "convert successed!" << std::endl; + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return true; + } + } + delete[] resize_cv; + delete[] resize_lite; + return false; +} + +bool test_flip(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + FlipParam flip, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; + + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + cv::flip(img, im_resize, flip); + clock_t end = clock(); + to_cv += (end - begin); + } + } + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageFlip(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; + + std::cout << "---opencv flip run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite flip run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, out_size); + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/flip.jpg"; + cv::Mat resize_mat; + int num = 1; if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { - out_size = 3 * srch * srcw; - resize = 3 * dsth * dstw; + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; } else if (dstFormat == ImageFormat::GRAY) { - out_size = srch * srcw; - resize = dsth * dstw; + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - // out - uint8_t* lite_dst = new uint8_t[out_size]; - uint8_t* resize_tmp = new uint8_t[resize]; - uint8_t* tv_out_ratote = new uint8_t[out_size]; - uint8_t* tv_out_flip = new uint8_t[out_size]; - std::vector shape_out = {1, 3, srch, srcw}; - - input_tensor->Resize(shape_out); - Tensor dst_tensor = *input_tensor; - std::cout << "opencv compute" << std::endl; - cv::Mat im_convert; - cv::Mat im_resize; - cv::Mat im_rotate; - cv::Mat im_flip; - double to_1 = 0; - double to_2 = 0; - double to_3 = 0; - double to_4 = 0; - double to1 = 0; - for 
(int i = 0; i < test_iter; i++) { - clock_t start = clock(); - clock_t begin = clock(); - // convert bgr-gray - if (dstFormat == srcFormat) { - im_convert = img; - } else if (dstFormat == ImageFormat::BGR && - srcFormat == ImageFormat::GRAY) { - cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR); - } else if (srcFormat == ImageFormat::BGR && - dstFormat == ImageFormat::GRAY) { - cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY); - } else if (dstFormat == srcFormat) { - printf("convert format error \n"); - return; - } - clock_t end = clock(); - to_1 += (end - begin); - - begin = clock(); - // resize default linear - cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); - end = clock(); - to_2 += (end - begin); - - begin = clock(); - // rotate 90 - if (rotate == 90) { - cv::flip(im_convert.t(), im_rotate, 1); - } else if (rotate == 180) { - cv::flip(im_convert, im_rotate, -1); - } else if (rotate == 270) { - cv::flip(im_convert.t(), im_rotate, 0); - } - end = clock(); - to_3 += (end - begin); - - begin = clock(); - // flip - cv::flip(im_convert, im_flip, flip); - end = clock(); - to_4 += (end - begin); - clock_t ovet = clock(); - to1 += (ovet - start); + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + std::cout << "flip successed!" << std::endl; + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return true; + } + } + delete[] resize_cv; + delete[] resize_lite; + return false; +} + +bool test_rotate(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + float rotate, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; + + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // rotate 90 + if (rotate == 90) { + cv::flip(img.t(), im_resize, 1); + } else if (rotate == 180) { + cv::flip(img, im_resize, -1); + } else if (rotate == 270) { + cv::flip(img.t(), im_resize, 0); } + clock_t end = clock(); + to_cv += (end - begin); + } + } + // lite + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageRotate(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; - std::cout << "Paddle-lite compute" << std::endl; - double lite_to = 0; - double lite_to_1 = 0; - double lite_to_2 = 0; - double lite_to_3 = 0; - double lite_to_4 = 0; - double lite_to_5 = 0; - TransParam tparam; - tparam.ih = srch; - tparam.iw = srcw; - tparam.oh = dsth; - tparam.ow = dstw; - tparam.flip_param = flip; - tparam.rotate_param = rotate; - - ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); - - for (int i = 0; i < test_iter; ++i) { - clock_t start = clock(); - clock_t begin = clock(); - image_preprocess.imageConvert(src, lite_dst); - clock_t end = clock(); - lite_to_1 += (end - begin); - - begin = clock(); - image_preprocess.imageResize(lite_dst, resize_tmp); - end = clock(); - lite_to_2 += (end - begin); - - begin = clock(); - image_preprocess.imageRotate( - lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90); - end = clock(); - lite_to_3 += (end - begin); - - begin = clock(); - 
image_preprocess.imageFlip( - lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip); - end = clock(); - lite_to_4 += (end - begin); - - clock_t over = clock(); - lite_to += (over - start); - - begin = clock(); - image_preprocess.image2Tensor(lite_dst, - &dst_tensor, - (ImageFormat)dstFormat, - srcw, - srch, - layout, - means, - scales); - end = clock(); - lite_to_5 += (end - begin); + std::cout << "---opencv rotate run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite rotate run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, out_size); + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/rotate.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - to_1 = 1000 * to_1 / CLOCKS_PER_SEC; - to_2 = 1000 * to_2 / CLOCKS_PER_SEC; - to_3 = 1000 * to_3 / CLOCKS_PER_SEC; - to_4 = 1000 * to_4 / CLOCKS_PER_SEC; - to1 = 1000 * to1 / CLOCKS_PER_SEC; - std::cout << "opencv convert run time: " << to_1 - << "ms, avg: " << to_1 / test_iter << std::endl; - std::cout << "opencv resize run time: " << to_2 - << "ms, avg: " << to_2 / test_iter << std::endl; - std::cout << "opencv rotate run time: " << to_3 - << "ms, avg: " << to_3 / test_iter << std::endl; - std::cout << "opencv flip time: " << to_4 - << "ms, avg: " << to_4 / test_iter << std::endl; - std::cout << "opencv total run time: " << to1 - << "ms, avg: " << to1 / test_iter << std::endl; - std::cout << "------" << std::endl; - - lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC; - lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC; - lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC; - lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC; - lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC; - lite_to = 1000 * lite_to / CLOCKS_PER_SEC; - std::cout << "lite convert run time: " << lite_to_1 - << "ms, avg: " << lite_to_1 / test_iter << std::endl; - std::cout << "lite resize run time: " << lite_to_2 - << "ms, avg: " << lite_to_2 / test_iter << std::endl; - std::cout << "lite rotate run time: " << lite_to_3 - << "ms, avg: " << lite_to_3 / test_iter << std::endl; - std::cout << "lite flip time: " << lite_to_4 - << "ms, avg: " << lite_to_4 / test_iter << std::endl; - std::cout << "lite total run time: " << lite_to - << "ms, avg: " << lite_to / test_iter << std::endl; - std::cout << "lite img2tensor time: " << lite_to_5 - << "ms, avg: " << lite_to_5 / test_iter << std::endl; - std::cout << "------" << std::endl; - - 
double max_ratio = 0; - double max_diff = 0; - const double eps = 1e-6f; + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + std::cout << "rotate successed!" << std::endl; + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return true; + } + } + delete[] resize_cv; + delete[] resize_lite; + return false; +} + +bool test_resize(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; + + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + cv::resize(img, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); + clock_t end = clock(); + to_cv += (end - begin); + } + } + // param + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageResize(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; + + std::cout << "---opencv resize run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite resize run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 10) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, out_size); + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return false; + } else { // save_img std::cout << "write image: " << std::endl; std::string resize_name = dst_path + "/resize.jpg"; - std::string convert_name = dst_path + "/convert.jpg"; - std::string rotate_name = dst_path + "/rotate.jpg"; - std::string flip_name = dst_path + "/flip.jpg"; - cv::Mat resize_mat(dsth, dstw, CV_8UC3); - cv::Mat convert_mat(srch, srcw, CV_8UC3); - cv::Mat rotate_mat; - if (rotate == 90 || rotate == 270) { - rotate_mat = cv::Mat(srcw, srch, CV_8UC3); - } else { - rotate_mat = cv::Mat(srch, srcw, CV_8UC3); + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; + } + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + std::cout << "resize successed!" 
<< std::endl; + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return true; + } + } + delete[] resize_cv; + delete[] resize_lite; + return false; +} + +bool test_crop(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + ImageFormat dstFormat, + int left_x, + int left_y, + int dstw, + int dsth, + std::string dst_path, + int test_iter = 1) { + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + + cv::Mat im_resize; + + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + cv::Rect rect(left_x, left_y, dstw, dsth); + im_resize = img(rect); + clock_t end = clock(); + to_cv += (end - begin); + } + } + // lite + int srcw = img.cols; + int srch = img.rows; + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + image_preprocess.imageCrop( + src, resize_lite, dstFormat, srcw, srch, left_x, left_y, dstw, dsth); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; + std::cout << "---opencv crop run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite crop run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + diff = 0; + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, out_size); + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/crop.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - cv::Mat flip_mat(srch, srcw, CV_8UC3); - fill_with_mat(resize_mat, resize_tmp); - fill_with_mat(convert_mat, lite_dst); - fill_with_mat(rotate_mat, tv_out_ratote); - fill_with_mat(flip_mat, tv_out_flip); - cv::imwrite(convert_name, convert_mat); + fill_with_mat(resize_mat, resize_lite, num); cv::imwrite(resize_name, resize_mat); - cv::imwrite(rotate_name, rotate_mat); - cv::imwrite(flip_name, flip_mat); - delete[] lite_dst; - delete[] resize_tmp; - delete[] tv_out_ratote; - delete[] tv_out_flip; + std::cout << "crop successed!" 
<< std::endl; + delete[] diff_v; + delete[] resize_cv; + delete[] resize_lite; + return true; + } + } + delete[] resize_cv; + delete[] resize_lite; + return false; +} +void test_custom(bool has_img, // input is image + std::string img_path, + std::string in_txt, + std::string dst_path, + ImageFormat srcFormat, + ImageFormat dstFormat, + int srcw, + int srch, + int dstw, + int dsth, + float rotate, + FlipParam flip, + int test_iter = 1) { + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + cv::Mat img; + uint8_t* src = nullptr; + int in_size = 0; + if (has_img) { + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = imread(img_path, cv::IMREAD_COLOR); + } else if (srcFormat == ImageFormat::GRAY) { + img = imread(img_path, cv::IMREAD_GRAYSCALE); + } else { + printf("this format %d does not support \n", srcFormat); + return; + } + srcw = img.cols; + srch = img.rows; + src = img.data; + } + bool cv_run = true; + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + cv_run = false; + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + in_size = 3 * srch * srcw; + std::cout << "srcFormat: BGR/RGB" << std::endl; + } else if (srcFormat == ImageFormat::RGBA || srcFormat == ImageFormat::BGRA) { + in_size = 4 * srch * srcw; + std::cout << "srcFormat: BGRA/RGBA" << std::endl; + } else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + in_size = (3 * srch * srcw) / 2; + cv_run = false; + std::cout << "srcFormat: NV12/NV12" << std::endl; + } + int out_size = dstw * dsth; + // out + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } else if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + out_size = 3 * dsth * dstw; + std::cout << "dstFormat: BGR/RGB" << std::endl; + } else if (dstFormat == ImageFormat::RGBA || dstFormat == ImageFormat::BGRA) { + out_size = 4 * dsth * dstw; + std::cout << "dstFormat: BGRA/RGBA" << std::endl; + } else if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) { + out_size = (3 * dsth * dstw) / 2; + cv_run = false; + std::cout << "dstFormat: NV12/NV12" << std::endl; + } + + if (!has_img) { + src = new uint8_t[in_size]; + // read txt + FILE* fp = fopen(in_txt.c_str(), "r"); + for (int i = 0; i < in_size; i++) { + fscanf(fp, "%d\n", &src[i]); + } + fclose(fp); + int num = 1; + if (srcFormat == ImageFormat::GRAY) { + img = cv::Mat(srch, srcw, CV_8UC1); + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = cv::Mat(srch, srcw, CV_8UC3); + num = 3; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + img = cv::Mat(srch, srcw, CV_8UC4); + num = 4; + } else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + img = cv::Mat(srch, srcw, CV_8UC2); + num = 2; + std::cout << "CV not support NV12"; + } + fill_with_mat(img, src, num); + std::string name = dst_path + "input.jpg"; + cv::imwrite(name, img); // shurutup + } + + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = srch; + tparam.ow = srcw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; + + TransParam tparam1; + tparam1.ih = srch; + tparam1.iw = srcw; + tparam1.oh = dsth; + tparam1.ow = dstw; + tparam1.flip_param = flip; + tparam1.rotate_param = rotate; + + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + std::cout << "cv_run: " << cv_run << std::endl; + std::cout << "image crop testing" << std::endl; + bool res 
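// A condensed sketch of the ImagePreprocess flow that test_custom() drives above:
// TransParam carries the source/destination geometry plus flip/rotate settings, and
// the two-argument imageConvert/imageResize/imageRotate/imageFlip calls read from a
// raw source buffer and write into a caller-owned destination buffer. The 224x224
// input, 112x112 output, and BGR-only setup are made-up values for illustration;
// the header name and the using-declarations assume the same header and namespace
// as the typedefs at the top of this test, so adjust them if your tree differs.
#include <cstdint>
#include <vector>
#include "paddle_image_preprocess.h"  // NOLINT

using paddle::lite::utils::cv::FlipParam;
using paddle::lite::utils::cv::ImageFormat;
using paddle::lite::utils::cv::ImagePreprocess;
using paddle::lite::utils::cv::TransParam;

int main() {
  const int srcw = 224, srch = 224, dstw = 112, dsth = 112;
  std::vector<uint8_t> src(3 * srcw * srch, 128);  // dummy BGR input
  std::vector<uint8_t> dst(3 * dstw * dsth, 0);    // resized output

  TransParam tp;
  tp.iw = srcw;
  tp.ih = srch;
  tp.ow = dstw;
  tp.oh = dsth;
  tp.flip_param = static_cast<FlipParam>(0);  // same int-to-enum cast the test uses
  tp.rotate_param = 90;

  ImagePreprocess preprocess(ImageFormat::BGR, ImageFormat::BGR, tp);
  preprocess.imageResize(src.data(), dst.data());  // 224x224 -> 112x112, as in test_resize
  return 0;
}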
= test_crop(cv_run, + src, + img, + image_preprocess, + in_size, + out_size, + dstFormat, + flag_left_x, + flag_left_y, + dstw, + dsth, + dst_path, + test_iter); + if (!res) { + return; + } + std::cout << "image convert testing" << std::endl; + bool re = test_convert(cv_run, + src, + img, + image_preprocess, + in_size, + out_size, + srcFormat, + dstFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } + std::cout << "image resize testing" << std::endl; + tparam.oh = dsth; + tparam.ow = dstw; + ImagePreprocess image_preprocess1(srcFormat, srcFormat, tparam1); + re = test_resize(cv_run, + src, + img, + image_preprocess1, + in_size, + out_size, + srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (!re) { + return; + } + + std::cout << "image rotate testing" << std::endl; + if (rotate == 90 || rotate == 270) { + tparam.oh = srcw; + tparam.ow = srch; + dsth = srcw; + dstw = srch; + } else { + tparam.oh = srch; + tparam.ow = srcw; + dsth = srch; + dstw = srcw; + } + ImagePreprocess image_preprocess2(srcFormat, srcFormat, tparam); + re = test_rotate(cv_run, + src, + img, + image_preprocess2, + in_size, + out_size, + rotate, + srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (!re) { + return; + } + tparam.oh = srch; + tparam.ow = srcw; + ImagePreprocess image_preprocess3(srcFormat, srcFormat, tparam); + std::cout << "image flip testing" << std::endl; + re = test_flip(cv_run, + src, + img, + image_preprocess3, + in_size, + out_size, + flip, + srcFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } +} + +#if 0 +void test_all_r(std::string dst_path, int test_iter = 1) { + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + cv::Mat img; + uint8_t* src = nullptr; + int in_size = 0; + for (auto& srcFormat : {1, 3, 4, 11}) { + for (auto& dstFormat : {1, 3, 4, 11}) { + for (auto& srcw : {10, 112, 200}) { + for (auto& srch : {10, 224, 400}) { + for (auto& dstw : {12, 224, 180}) { + for (auto& dsth : {12, 224, 320}) { + for (auto& flip : {-1, 0, 1}) { + for (auto& rotate : {90, 180, 270}) { + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = srch; + tparam.ow = srcw; + tparam.flip_param = (FlipParam)flip; + tparam.rotate_param = rotate; + + TransParam tparam1; + tparam1.ih = srch; + tparam1.iw = srcw; + tparam1.oh = dsth; + tparam1.ow = dstw; + tparam1.flip_param = (FlipParam)flip; + tparam.rotate_param = rotate; + + ImagePreprocess image_preprocess( + (ImageFormat)srcFormat, (ImageFormat)dstFormat, tparam); + ImagePreprocess image_preprocess1( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam1); + ImagePreprocess image_preprocess2( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam); + int h = srch; + int w = srcw; + if (rotate == 90 || rotate == 270) { + tparam.oh = srcw; + h = srcw; + tparam.ow = srch; + w = srch; + } + ImagePreprocess image_preprocess3( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam); + int in_size = srcw * srch; + int out_size = dstw * dsth; + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + } else if (srcFormat == ImageFormat::BGR || + srcFormat == ImageFormat::RGB) { + in_size = 3 * srch * srcw; + std::cout << "srcFormat: BGR/RGB" << std::endl; + } else if (srcFormat == ImageFormat::RGBA || + srcFormat == ImageFormat::BGRA) { + in_size = 4 * srch * srcw; + std::cout << "srcFormat: BGRA/RGBA" << std::endl; + } else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + in_size = (3 * srch * srcw) / 2; + std::cout << 
"srcFormat: NV12/NV12" << std::endl; + } + // out + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } else if (dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB) { + out_size = 3 * dsth * dstw; + std::cout << "dstFormat: BGR/RGB" << std::endl; + } else if (dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA) { + out_size = 4 * dsth * dstw; + std::cout << "dstFormat: BGRA/RGBA" << std::endl; + } else if (dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21) { + out_size = (3 * dsth * dstw) / 2; + std::cout << "dstFormat: NV12/NV12" << std::endl; + } + // init + uint8_t* src = new uint8_t[in_size]; + for (int i = 0; i < in_size; i++) { + src[i] = i % 255; + } + cv::Mat img; + int num = 1; + bool cv_run = true; + if (srcFormat == ImageFormat::GRAY) { + img = cv::Mat(srch, srcw, CV_8UC1); + cv_run = false; + } else if (srcFormat == ImageFormat::BGR || + srcFormat == ImageFormat::RGB) { + img = cv::Mat(srch, srcw, CV_8UC3); + num = 3; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + img = cv::Mat(srch, srcw, CV_8UC4); + num = 4; + } else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + img = cv::Mat(srch, srcw, CV_8UC2); + num = 2; + cv_run = false; + } + fill_with_mat(img, src, num); + std::string name = dst_path + "input.jpg"; + cv::imwrite(name, img); // shurutup + // convert + bool convert = true; + if (srcFormat == 11 || dstFormat == 11) { + // NV12, cv not support + convert = false; + cv_run = false; + } + if (convert) { + std::cout << "image convert testing"; + bool re = test_convert(cv_run, + src, + img, + image_preprocess, + in_size, + out_size, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } + } + + // resize + std::cout << "image resize testing"; + bool re = test_resize(cv_run, + src, + img, + image_preprocess1, + in_size, + out_size, + (ImageFormat)srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (convert && !re) { + return; + } + // rotate + std::cout << "image rotate testing"; + + re = test_rotate(cv_run, + src, + img, + image_preprocess3, + in_size, + out_size, + rotate, + (ImageFormat)srcFormat, + h, + w, + dst_path, + test_iter); + if (convert && !re) { + return; + } + // flip + std::cout << "image rotate testing"; + re = test_flip(cv_run, + src, + img, + image_preprocess2, + in_size, + out_size, + (FlipParam)flip, + (ImageFormat)srcFormat, + srch, + srcw, + dst_path, + test_iter); + if (convert && !re) { + return; + } + } + } + } + } + } + } } } } +#endif int main(int argc, char** argv) { if (argc < 7) { std::cerr << "[ERROR] usage: " << argv[0] - << " image_path dst_apth srcFormat dstFormat width height\n"; + << " has_img image_path/txt_path dst_apth srcFormat dstFormat " + "dstw dsth " + << "[options] srcw srch flip rotate test_iter\n "; exit(1); } - std::string image_path = argv[1]; - std::string dst_path = argv[2]; - int srcFormat = atoi(argv[3]); - int dstFormat = atoi(argv[4]); - int width = atoi(argv[5]); - int height = atoi(argv[6]); + bool has_img = atoi(argv[1]); + std::string path = argv[2]; + std::string dst_path = argv[3]; + int srcFormat = atoi(argv[4]); + int dstFormat = atoi(argv[5]); + int dstw = atoi(argv[6]); + int dsth = atoi(argv[7]); + int srcw = 100; + int srch = 100; int flip = -1; float rotate = 90; - int layout = 1; - std::string model_dir = "mobilenet_v1"; - if (argc > 7) { - model_dir = argv[7]; - } - if (argc > 8) { - flip = 
atoi(argv[8]); - } - if (argc > 9) { - rotate = atoi(argv[9]); - } - if (argc > 10) { - layout = atoi(argv[10]); + int test_iter = 10; + if (!has_img) { + std::cout << "It needs srcw and srch"; + srcw = atoi(argv[8]); + srch = atoi(argv[9]); + if (argc > 10) { + flip = atoi(argv[10]); + } + if (argc > 11) { + rotate = atoi(argv[11]); + } + if (argc > 12) { + test_iter = atoi(argv[12]); + } + } else { + if (argc > 8) { + flip = atoi(argv[8]); + } + if (argc > 9) { + rotate = atoi(argv[9]); + } + if (argc > 10) { + flag_left_x = atoi(argv[10]); + flag_left_y = atoi(argv[11]); + } + if (argc > 12) { + test_iter = atoi(argv[12]); + } } - test_img({3}, - {1, 2, 4}, - image_path, - dst_path, - (ImageFormat)srcFormat, - (ImageFormat)dstFormat, - width, - height, - rotate, - (FlipParam)flip, - (LayoutType)layout, - model_dir, - 20); + test_custom(has_img, + path, + path, + dst_path, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + srcw, + srch, + dstw, + dsth, + rotate, + (FlipParam)flip, + test_iter); +#if 0 + test_all_r(dst_path, test_iter); +#endif return 0; } diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc index 24f408bf4a55ea2d499e39902201597c0e8c6e4e..caa085eecb81e54859c1bdd5cd7c0654175b7a9a 100644 --- a/lite/demo/cxx/test_cv/test_model_cv.cc +++ b/lite/demo/cxx/test_cv/test_model_cv.cc @@ -111,7 +111,7 @@ void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { #endif } -void RunModel(std::string model_dir, +void RunModel(std::string model_file, std::string img_path, std::vector input_shape, PowerMode power_mode, @@ -120,7 +120,7 @@ void RunModel(std::string model_dir, int warmup = 0) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -161,7 +161,7 @@ void RunModel(std::string model_dir, } std::cout << "================== Speed Report ===================" << std::endl; - std::cout << "Model: " << model_dir + std::cout << "Model: " << model_file << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup << ", repeats: " << test_iter << ", avg time: " << lps / test_iter @@ -187,10 +187,10 @@ void RunModel(std::string model_dir, int main(int argc, char** argv) { if (argc < 7) { std::cerr << "[ERROR] usage: " << argv[0] - << " model_dir image_path input_shape\n"; + << " model_file image_path input_shape\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; std::vector input_shape; input_shape.push_back(atoi(argv[3])); @@ -213,7 +213,7 @@ int main(int argc, char** argv) { if (argc > 10) { warmup = atoi(argv[10]); } - RunModel(model_dir, + RunModel(model_file, img_path, input_shape, (PowerMode)power_mode, diff --git a/lite/demo/cxx/test_libs/README.md b/lite/demo/cxx/test_libs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..06fa4613581966b1e1839bdabc89cb52ca25c0a2 --- /dev/null +++ b/lite/demo/cxx/test_libs/README.md @@ -0,0 +1,7 @@ +**测试PaddleLite C++预测库** + +1、编译full_publish预测库,需要打开build_extra,比如 `./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON full_publish` + +2、进入编译产出的目录,比如 `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/test_libs`,执行 `sh prepare.sh`,得到所有测试文件在 `test_lite_lib_files` 文件中 + +3、将 `test_lite_lib_files` 文件push到手机上,进入手机端 `test_lite_lib_files` 
目录,执行 `sh run.sh`,查看log信息统计测试结果,其中涵盖测试light库、full库、动态库和静态库。 diff --git a/lite/demo/cxx/test_libs/classification_full.cc b/lite/demo/cxx/test_libs/classification_full.cc new file mode 100644 index 0000000000000000000000000000000000000000..2515d6abd89b6714ff731bed28f4e8e8c5c3dd75 --- /dev/null +++ b/lite/demo/cxx/test_libs/classification_full.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(model_dir, + "", + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, + "", + "the filename of model file. When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, + "", + "the filename of param file, set param_file when the model is " + "combined formate."); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_double(out_max_value, 0.0, "The max value in output tensor"); +DEFINE_double(threshold, + 1e-3, + "If the max value diff is smaller than threshold, pass test"); +DEFINE_int32(out_max_value_index, 65, "The max value index in output tensor"); + +// Optimize model for ARM CPU. 
+// If the model is not combined, set model_filename and params_filename as empty +void OptModel(const std::string& load_model_dir, + const std::string& model_filename, + const std::string& params_filename, + const std::string& save_model_path) { + paddle::lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + if (!model_filename.empty() && !params_filename.empty()) { + config.set_model_file(load_model_dir + "/" + model_filename); + config.set_param_file(load_model_dir + "/" + params_filename); + } + std::vector vaild_places = { + paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)}, + }; + config.set_valid_places(vaild_places); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + std::string cmd_str = "rm -rf " + save_model_path; + int ret = system(cmd_str.c_str()); + if (ret == 0) { + std::cout << "Delete old optimized model " << save_model_path << std::endl; + } + predictor->SaveOptimizedModel(save_model_path, + paddle::lite_api::LiteModelType::kNaiveBuffer); + std::cout << "Load model from " << load_model_dir << std::endl; + std::cout << "Save optimized model to " << save_model_path << std::endl; +} + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const float out_max_value, + const int out_max_value_index, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + + std::cout << "max_value:" << max_value << std::endl; + std::cout << "max_index:" << max_index << std::endl; + std::cout << "max_value_ground_truth:" << out_max_value << std::endl; + std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl; + if (max_index != out_max_value_index || + fabs(max_value - out_max_value) > threshold) { + std::cerr << "----------Fail Test.---------- \n\n"; + } else { + std::cout << "----------Pass Test.---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." 
<< std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--model_dir: the path of not optimized model \n" + "--model_filename: the model filename of not optimized model \n" + "--param_filename: the param filename of not optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_max_value: The max value in output tensor \n" + "--threshold: If the max value diff is smaller than threshold,\n" + " pass test. Default 1e-3.\n" + "--out_max_value_index: The max value index in output tensor \n"; + exit(1); + } + + const int height = 224; + const int width = 224; + std::string model_dir = FLAGS_model_dir; + if (model_dir.back() == '/') { + model_dir.pop_back(); + } + std::string optimized_model_path = model_dir + "_opt2"; + OptModel(FLAGS_model_dir, + FLAGS_model_filename, + FLAGS_param_filename, + optimized_model_path); + std::string run_model_path = optimized_model_path + ".nb"; + + // Run test + Run(run_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + FLAGS_out_max_value, + FLAGS_out_max_value_index, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/classification_light.cc b/lite/demo/cxx/test_libs/classification_light.cc new file mode 100644 index 0000000000000000000000000000000000000000..91d981e1fc991bef48da97847eddee9e724fe654 --- /dev/null +++ b/lite/demo/cxx/test_libs/classification_light.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
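The full-API classification test above chains two steps: convert the Fluid model into an optimized NaiveBuffer file, then load that file with the light runtime and check the top-1 result. A condensed sketch of that flow, reusing the `OptModel` and `Run` helpers defined in `classification_full.cc` and the mobilenetv1 ground-truth values from `run.sh` (not a new API, only a restatement):

```cpp
// Sketch only: OptModel/Run are the helpers from classification_full.cc above;
// paths and expected values follow run.sh.
std::string model_dir = "models_imgs/models/mobilenetv1";
std::string opt_path = model_dir + "_opt2";
OptModel(model_dir, "", "", opt_path);            // non-combined model: empty filenames
Run(opt_path + ".nb",
    "",                                           // no raw image
    "models_imgs/images/classification.jpg.txt",  // preprocessed input from the txt file
    0.936887f,                                    // expected top-1 value
    65,                                           // expected top-1 index
    1e-3f,
    224,
    224);
```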
+ +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(optimized_model_path, "", "the path of optimized model"); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_double(out_max_value, 0.0, "The max value in output tensor"); +DEFINE_double(threshold, + 1e-3, + "If the max value diff is smaller than threshold, pass test"); +DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor"); + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const float out_max_value, + const int out_max_value_index, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + + std::cout << "max_value:" << max_value << std::endl; + std::cout << "max_index:" << max_index << std::endl; + std::cout << "max_value_ground_truth:" << out_max_value << std::endl; + std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl; + if (max_index != out_max_value_index || + fabs(max_value - out_max_value) > threshold) { + std::cerr << "----------Fail Test---------- \n\n"; + } else { + std::cout << "----------Pass Test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_optimized_model_path.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--optimized_model_path: the path of optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_max_value: The max value in output tensor \n" + "--threshold: If the max value diff is smaller than threshold,\n" + " pass test. 
Default 1e-3.\n" + "--out_max_value_index: The max value index in output tensor \n"; + exit(1); + } + + const int height = 224; + const int width = 224; + // Run test + Run(FLAGS_optimized_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + FLAGS_out_max_value, + FLAGS_out_max_value_index, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/prepare.sh b/lite/demo/cxx/test_libs/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..ff1aca7cf3bc68777b7172e4497c40888778a1ae --- /dev/null +++ b/lite/demo/cxx/test_libs/prepare.sh @@ -0,0 +1,30 @@ +make clean +make all -j + +gf=test_lite_lib_files +if [ -d ${gf} ];then + rm -rf ${gf} +fi +mkdir ${gf} + +mv classification_full_shared ${gf} +mv classification_full_static ${gf} +mv classification_light_shared ${gf} +mv classification_light_static ${gf} +mv yolov3_full_shared ${gf} +mv yolov3_full_static ${gf} +mv yolov3_light_shared ${gf} +mv yolov3_light_static ${gf} +cp run.sh ${gf} + +make clean + +cp -r ../../../cxx/ ${gf} +mv ${gf}/cxx ${gf}/lite + +if [ ! -f "test_libs_models_imgs.tgz" ];then + wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz +fi +tar zxf test_libs_models_imgs.tgz +mv test_libs_models_imgs ${gf} +mv ${gf}/test_libs_models_imgs ${gf}/models_imgs diff --git a/lite/demo/cxx/test_libs/run.sh b/lite/demo/cxx/test_libs/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..d5624e32e0d2c90aa17a3d13969dbdb6385c6d74 --- /dev/null +++ b/lite/demo/cxx/test_libs/run.sh @@ -0,0 +1,76 @@ +export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH} + +# mobilenetv1 +model_name="mobilenetv1" +input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65" +echo "Test ${model_name}: light_shared, light_static, full_shared, full_static." + +./classification_light_shared ${input_params} \ + --optimized_model_path=models_imgs/models/mobilenetv1.nb + +./classification_light_static ${input_params} \ + --optimized_model_path=models_imgs/models/mobilenetv1.nb + +./classification_full_shared ${input_params} \ + --model_dir=models_imgs/models/mobilenetv1 + +./classification_full_static ${input_params} \ + --model_dir=models_imgs/models/mobilenetv1 + +# mobilenetv2 +model_name="mobilenetv2" +input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.868888 \ + --out_max_value_index=65" +echo "Test ${model_name}: light_shared, light_static, full_shared, full_static." + +./classification_light_shared ${input_params} \ + --optimized_model_path=models_imgs/models/mobilenetv2.nb + +./classification_light_static ${input_params} \ + --optimized_model_path=models_imgs/models/mobilenetv2.nb + +./classification_full_shared ${input_params} \ + --model_dir=models_imgs/models/mobilenetv2 + +./classification_full_static ${input_params} \ + --model_dir=models_imgs/models/mobilenetv2 + +# shufflenetv2 +model_name="shufflenetv2" +input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.776729 \ + --out_max_value_index=65" +echo "Test ${model_name}: light_shared, light_static, full_shared, full_static." 
+ +./classification_light_shared ${input_params} \ + --optimized_model_path=models_imgs/models/shufflenetv2.nb + +./classification_light_static ${input_params} \ + --optimized_model_path=models_imgs/models/shufflenetv2.nb + +./classification_full_shared ${input_params} \ + --model_dir=models_imgs/models/shufflenetv2 + +./classification_full_static ${input_params} \ + --model_dir=models_imgs/models/shufflenetv2 + +# yolov3 +model_name="yolov3" +input_params="--img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014" +echo "Test ${model_name}: light_shared, light_static, full_shared, full_static." + +./yolov3_light_shared ${input_params} \ + --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb + +./yolov3_light_static ${input_params} \ + --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb + +./yolov3_full_shared ${input_params} \ + --model_dir=models_imgs/models/yolov3_mobilenetv1 + +./yolov3_full_static ${input_params} \ + --model_dir=models_imgs/models/yolov3_mobilenetv1 diff --git a/lite/demo/cxx/test_libs/test_helper.cc b/lite/demo/cxx/test_libs/test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..450579c90d66f952f32ac70353f4867cee94e007 --- /dev/null +++ b/lite/demo/cxx/test_libs/test_helper.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test_helper.h" // NOLINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" + +double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + +std::vector GetIntNumsFromStr(const std::string& str) { + std::vector nums; + std::string tmp_str = str; + while (!tmp_str.empty()) { + int num = atoi(tmp_str.data()); + nums.push_back(num); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return nums; +} + +std::vector GetDoubleNumsFromStr(const std::string& str) { + std::vector nums; + std::string tmp_str = str; + while (!tmp_str.empty()) { + double num = atof(tmp_str.data()); + nums.push_back(num); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return nums; +} + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) / scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) / scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) / scale[2]; + } +} + +// Process img and set it as input +void process_img(const cv::Mat& img, + int width, + int height, + float* dest_data, + float* means, + float* scales) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, dest_data, width * height, means, scales); +} diff --git a/lite/demo/cxx/test_libs/test_helper.h b/lite/demo/cxx/test_libs/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..3ef42af571925fd556538747cd21b72e925329bc --- /dev/null +++ b/lite/demo/cxx/test_libs/test_helper.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" + +double GetCurrentUS(); + +int64_t ShapeProduction(const std::vector& shape); + +std::vector GetIntNumsFromStr(const std::string& str); +std::vector GetDoubleNumsFromStr(const std::string& str); + +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale); + +void process_img(const cv::Mat& img, + int width, + int height, + float* dst_data, + float* means, + float* scales); diff --git a/lite/demo/cxx/test_libs/yolov3_full.cc b/lite/demo/cxx/test_libs/yolov3_full.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0e69f9042f6ebf8ed68626b52889fac59f73c18 --- /dev/null +++ b/lite/demo/cxx/test_libs/yolov3_full.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(model_dir, + "", + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, + "", + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, + "", + "the filename of param file, set param_file when the model is " + "combined formate."); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_string(out_values, + "", + "The output values, separated by colon and comma"); +DEFINE_double(threshold, + 1e-3, + "If the output value diff is smaller than threshold, pass test"); + +void OptModel(const std::string& load_model_dir, + const std::string& model_filename, + const std::string& params_filename, + const std::string& save_model_path) { + paddle::lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + if (!model_filename.empty() && !params_filename.empty()) { + config.set_model_file(load_model_dir + "/" + model_filename); + config.set_param_file(load_model_dir + "/" + params_filename); + } + std::vector vaild_places = { + paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)}, + }; + config.set_valid_places(vaild_places); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + std::string cmd_str = "rm -rf " + save_model_path; + int ret = system(cmd_str.c_str()); + if (ret == 0) { + std::cout << "Delete old optimized model " << save_model_path << std::endl; + } + predictor->SaveOptimizedModel(save_model_path, + paddle::lite_api::LiteModelType::kNaiveBuffer); + std::cout << "Load model from " << load_model_dir << std::endl; + std::cout << "Save optimized model to " << save_model_path << std::endl; +} + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const std::vector& out_values, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + auto shape_tensor = predictor->GetInput(1); + shape_tensor->Resize({1, 2}); + auto* shape_data = shape_tensor->mutable_data(); + shape_data[0] = height; + shape_data[1] = width; + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + bool is_pass = true; + for (int i = 0; i < output_num && i < out_values.size(); i++) { + std::cout << "id:" << i << " out_data:" << out_data[i] + << " gt_data:" << out_values[i] << std::endl; + if (fabs(out_data[i] - out_values[i]) > threshold) { + is_pass = false; + } + } + if 
(is_pass) { + std::cout << "----------Pass test---------- \n\n"; + } else { + std::cout << "----------Fail test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--model_dir: the path of not optimized model \n" + "--model_filename: the model filename of not optimized model \n" + "--param_filename: the param filename of not optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_values: The output values, separated by colon and comma.\n" + "--threshold: If the out value diff is smaller than threshold,\n" + " pass test. Default 1e-3.\n"; + exit(1); + } + + const int height = 608; + const int width = 608; + std::vector out_values = GetDoubleNumsFromStr(FLAGS_out_values); + + std::string model_dir = FLAGS_model_dir; + if (model_dir.back() == '/') { + model_dir.pop_back(); + } + std::string optimized_model_path = model_dir + "_opt2"; + OptModel(FLAGS_model_dir, + FLAGS_model_filename, + FLAGS_param_filename, + optimized_model_path); + std::string run_model_path = optimized_model_path + ".nb"; + + // Run test + Run(run_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + out_values, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/yolov3_light.cc b/lite/demo/cxx/test_libs/yolov3_light.cc new file mode 100644 index 0000000000000000000000000000000000000000..b31151c8fc2384ec24f2f908d156f4200db279d7 --- /dev/null +++ b/lite/demo/cxx/test_libs/yolov3_light.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
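The YOLOv3 variant follows the same optimize-then-run pattern, but it compares a whole vector of output values (parsed from `--out_values` by `GetDoubleNumsFromStr`) and feeds a 608x608 input together with an image-shape tensor. A condensed sketch using the `OptModel` and `Run` helpers defined in `yolov3_full.cc` and the ground-truth string from `run.sh`:

```cpp
// Sketch only: OptModel/Run are the helpers from yolov3_full.cc above;
// the model path and expected values are taken from run.sh.
std::string model_dir = "models_imgs/models/yolov3_mobilenetv1";
OptModel(model_dir, "", "", model_dir + "_opt2");
std::vector<double> gt =
    GetDoubleNumsFromStr("0,0.153605,174.494,199.729,562.075,604.014");
Run(model_dir + "_opt2.nb",
    "",                                   // no raw image
    "models_imgs/images/yolov3.jpg.txt",  // preprocessed input from the txt file
    gt,                                   // expected leading output values
    1e-3f,
    608,
    608);
```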
+ +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(optimized_model_path, "", "the path of the optimized model"); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_string(out_values, + "", + "The output values, separated by colon and comma"); +DEFINE_double(threshold, + 1e-3, + "If the output value diff is smaller than threshold, pass test"); + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const std::vector& out_values, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + auto shape_tensor = predictor->GetInput(1); + shape_tensor->Resize({1, 2}); + auto* shape_data = shape_tensor->mutable_data(); + shape_data[0] = height; + shape_data[1] = width; + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + bool is_pass = true; + for (int i = 0; i < output_num && i < out_values.size(); i++) { + std::cout << "id:" << i << " out_data:" << out_data[i] + << " gt_data:" << out_values[i] << std::endl; + if (fabs(out_data[i] - out_values[i]) > threshold) { + is_pass = false; + } + } + if (is_pass) { + std::cout << "----------Pass test---------- \n\n"; + } else { + std::cout << "----------Fail test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_optimized_model_path.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--optimized_model_path: the path of optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_values: The output values, separated by colon and comma.\n" + "--threshold: If the out value diff is smaller than threshold,\n" + " pass test. 
Default 1e-3.\n"; + exit(1); + } + + const int height = 608; + const int width = 608; + std::vector out_values = GetDoubleNumsFromStr(FLAGS_out_values); + + // Run test + Run(FLAGS_optimized_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + out_values, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/train_demo/README.md b/lite/demo/cxx/train_demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..56f4513d45676a1deb51bfb93096db156ddd0449 --- /dev/null +++ b/lite/demo/cxx/train_demo/README.md @@ -0,0 +1,191 @@ + +# Introduction + 我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 + + 你可以通过book库中的 +[文档](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html) +和 +[源码](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line) +进一步了解“波士顿房价预测”这一任务的定义及其建模过程, +其使用线性回归(Linear Regression) +模型做建模。本文主要介绍如何将其迁移至Paddle-Lite进行训练。 + +注:这是一篇使用C++ API做模型训练的教程,其他API暂时不支持训练功能。 + +# Requirements + +- 一部安卓手机,用于运行训练程序 +- 装了Paddle (version: 1.7.0) 的python + +# Quick start + +## Step1 build paddle-lite + +请按照[paddle-lite官方文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) 的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: + +```shell +## 配置环境 +wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz --no-check-certificate +tar xzf cmake-3.10.3-Linux-x86_64.tar.gz +export PATH=${PWD}'/cmake-3.10.3-Linux-x86_64/bin':$PATH + +wget https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip +unzip android-ndk-r17c-linux-x86_64.zip +export NDK_ROOT=/opt/android-ndk-r17c + +## 编译 +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv7 \ + --build_extra=ON \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_train=ON full_publish +``` + +产物: + +```shell +Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so +``` + +## Step2 编译lr_trainer + +```shell +cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/ +sh run_build.sh /path/to/your/Paddle-Lite/build.lite.android.armv7.gcc/ /path/to/your/android-ndk-r17c +``` + +产物: +```shell +bin/ +`-- demo_trainer +``` + +## Step3 download model and run it! + +在你的笔记本电脑上,用usb连接到手机,开启开发者模式,在任意目录下执行: + +```shell +local_path=/data/local/tmp/linear_regression +adb shell "mkdir "${local_path} + +# download model and push to mobile +wget http://paddle-tar.bj.bcebos.com/paddle-lite/lite_lr_model.tar.gz +tar -zxvf lite_lr_model.tar.gz +adb push lite_lr_model/housing.data ${local_path} +adb push lite_lr_model/model_dir ${local_path} + +# push lib and executable file to moblie +adb push libpaddle_full_api_shared.so ${local_path} +adb push demo_trainer ${local_path} +adb shell chmod +x ${local_path}/demo_trainer + +# run it! 
+adb shell "export LD_LIBRARY_PATH="${local_path}" && export LIBRARY_PATH="${local_path}" && cd "${local_path}" && ./demo_trainer true" +``` + +期望结果: + +``` +sample 0: Loss: 564.317 +sample 1: Loss: 463.9 +sample 2: Loss: 1197.54 +sample 3: Loss: 1093.83 +sample 4: Loss: 1282.76 +sample 5: Loss: 792.097 +sample 6: Loss: 491.776 +sample 7: Loss: 698.496 +sample 8: Loss: 248.445 +sample 9: Loss: 325.135 +``` + +# 更多细节 +上面提到的模型是直接下载得到的,如果你想自己生成,可以执行以下命令: + +```shell +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite/lite/demo/cxx/train_demo/ +python train.py --save_model +``` + +产物: + +```shell +model_dir/ +|-- fc_0.b_0 +|-- fc_0.w_0 +|-- learning_rate_0 +`-- __model__ + +md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d +``` + +如果你想生成自己的模型用于训练,可以参考`train.py`中保存模型的方式。 + +# 与Paddle训练结果做校对 + +## 前10个Loss值 + +为了验证paddle与lite的一致性,我们控制模型参数一致、数据一致、batch size = 1的情况下,训练10个batch, 记录了二者的loss值。 + +python + paddle 命令: + +```shell + fluid train.py --num_steps=10 --batch_size=1 +``` + +python + paddle 结果: + +```shell +Train cost, Step 0, Cost 564.317017 +Train cost, Step 1, Cost 463.900238 +Train cost, Step 2, Cost 1197.537354 +Train cost, Step 3, Cost 1093.833008 +Train cost, Step 4, Cost 1282.760254 +Train cost, Step 5, Cost 792.097351 +Train cost, Step 6, Cost 491.775848 +Train cost, Step 7, Cost 698.496033 +Train cost, Step 8, Cost 248.444885 +Train cost, Step 9, Cost 325.135132 +``` + +c++ 与 paddle-lite命令: +``` +./demo_trainer true +``` + +c++ 与 paddle-lite结果: +``` +sample 0: Loss: 564.317 +sample 1: Loss: 463.9 +sample 2: Loss: 1197.54 +sample 3: Loss: 1093.83 +sample 4: Loss: 1282.76 +sample 5: Loss: 792.097 +sample 6: Loss: 491.776 +sample 7: Loss: 698.496 +sample 8: Loss: 248.445 +sample 9: Loss: 325.135 +``` + +## Loss 曲线 + +控制训练时的batch size为20,每个epoch对训练数据做全局shuffle,训练100个epoch后,paddle和lite的loss曲线对比如下。 + +![lr_loss](image/lr_loss.png) + +如果想复现上述效果,paddle+python的运行命令为: + +``` +git clone https://github.com/PaddlePaddle/book.git +cd book/01.fit_a_line +python train.py +``` + +lite + c++的运行命令为: +``` +./demo_trainer false +``` diff --git a/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b41808352a186e8ed434c0cf9364a9cae7d3928e --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 2.8) +set (CMAKE_CXX_STANDARD 11) + +# Project's name + +if(NOT DEFINED LITE_ROOT) + message(FATAL_ERROR "please set LITE_ROOT with + -DLITE_ROOT=/path/to/your/build.lite.android.armv7.gcc/") +endif() + +project(demo_trainer) +# Set the output folder where your program will be created +set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin) +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) + +# The following folder will be included +include_directories("include") +include_directories("${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/include") + +add_executable(demo_trainer ${PROJECT_SOURCE_DIR}/demo_trainer.cc ${PROJECT_SOURCE_DIR}/data_reader.cc) + +TARGET_LINK_LIBRARIES(demo_trainer +"${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so") diff --git a/lite/demo/cxx/train_demo/cplus_train/data_reader.cc b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..4546e2e5fecc17321e8126485022b4ac30876747 --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc @@ -0,0 
+1,109 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/data_reader.h" +#include + +using std::string; +using std::vector; + +int FEATURE_NUM = 13; +float rate = 0.8; + +int get_samples(string line, vector* feature, float* label) { + std::istringstream reader(line); + std::vector numbers; + do { + // read as many numbers as possible. + for (float number; reader >> number;) { + numbers.push_back(number); + } + // consume and discard token from stream. + if (reader.fail()) { + reader.clear(); + std::string token; + reader >> token; + } + } while (!reader.eof()); + + assert(numbers.size() == FEATURE_NUM + 1); + for (int i = 0; i < FEATURE_NUM; i++) { + feature->push_back(numbers[i]); + } + *label = numbers[FEATURE_NUM]; + return 0; +} + +int normalize(const vector>& origin_features, + vector>* features, + float rate) { + int inf = std::numeric_limits::max(); + vector min_vec(FEATURE_NUM, static_cast(inf)); + vector max_vec(FEATURE_NUM, -(static_cast(inf))); + vector sum_vec(FEATURE_NUM, 0); + vector avg_vec(FEATURE_NUM, 0); + + for (int i = 0; i < origin_features.size(); i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + min_vec[j] = min(min_vec[j], origin_features[i][j]); + max_vec[j] = max(max_vec[j], origin_features[i][j]); + sum_vec[j] += origin_features[i][j]; + } + } + + for (int i = 0; i < FEATURE_NUM; i++) { + avg_vec[i] = sum_vec[i] / origin_features.size(); + } + + for (int i = 0; i < origin_features.size() * rate - 1; i++) { + vector feat; + for (int j = 0; j < FEATURE_NUM; j++) { + feat.push_back((origin_features[i][j] - avg_vec[j]) / + (max_vec[j] - min_vec[j])); + } + features->push_back(feat); + } +} + +int read_samples(const string fname, + vector>* features, + vector* labels) { + fstream fin; + fin.open(fname); + if (!static_cast(fin)) { + return 1; + } + vector> origin_features; + vector lines; + string line; + while (getline(fin, line)) { + lines.push_back(line); + } + fin.close(); + + for (int i = 0; i < lines.size(); i++) { + vector feat; + float lbl = 0; + get_samples(lines[i], &feat, &lbl); + origin_features.push_back(feat); + if (i < lines.size() * rate - 1) { + labels->push_back(lbl); + } + } + + cout << "finish read fata" << endl; + normalize(origin_features, features, rate); + assert(features->size() == labels->size()); + return 0; +} diff --git a/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc new file mode 100644 index 0000000000000000000000000000000000000000..f035078fff35c4b2c0b41d0de84d2621c550d14e --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
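`read_samples` above loads `housing.data`, keeps roughly the first 80% of the rows (`rate = 0.8`) as training samples, and normalizes every feature as `(x - avg) / (max - min)` using statistics computed over the whole file. A minimal standalone driver for it, assuming `housing.data` sits in the working directory as in the adb commands earlier, could look like this sketch:

```cpp
// Sketch: exercise data_reader.h on its own. The path is an assumption.
#include <iostream>
#include <vector>
#include "include/data_reader.h"

int main() {
  std::vector<std::vector<float>> features;
  std::vector<float> labels;
  if (read_samples("housing.data", &features, &labels) != 0) {
    std::cerr << "failed to open housing.data" << std::endl;
    return 1;
  }
  // Each sample carries FEATURE_NUM (13) normalized features and one label.
  std::cout << features.size() << " training samples, "
            << features[0].size() << " features each" << std::endl;
  return 0;
}
```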
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "include/data_reader.h" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +class LRModel { + public: + void InitModel() { + // 1. Set CxxConfig + CxxConfig config; + config.set_model_dir("model_dir"); + std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; + config.set_valid_places(valid_places); + predictor_ = CreatePaddlePredictor(config); + } + + float Predict(const vector>& features, + const vector& labels) { + // Create Tensor + assert(features.size() == labels.size()); + int batch_size = features.size(); + std::unique_ptr input_tensor(std::move(predictor_->GetInput(0))); + input_tensor->Resize(shape_t({batch_size, FEATURE_NUM})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + data[FEATURE_NUM * i + j] = features[i][j]; + } + } + std::unique_ptr y_tensor(std::move(predictor_->GetInput(1))); + y_tensor->Resize(shape_t({batch_size, 1})); + auto* y_data = y_tensor->mutable_data(); + for (int i = 0; i < batch_size; i++) { + y_data[i] = labels[i]; + } + predictor_->Run(); + std::unique_ptr output_tensor( + std::move(predictor_->GetOutput(0))); + return output_tensor->data()[0]; + } + + private: + std::shared_ptr predictor_; +}; + +int shuffle(vector>* features, vector* labels) { + assert(features->size() == labels->size()); + vector index; + for (int i = 0; i < features->size(); i++) { + index.push_back(i); + } + random_shuffle(index.begin(), index.end()); + + vector> tmp_features; + vector tmp_labels; + + for (int i = 0; i < features->size(); i++) { + tmp_features.push_back((*features)[index[i]]); + tmp_labels.push_back((*labels)[index[i]]); + } + + for (int i = 0; i < features->size(); i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + (*features)[i][j] = tmp_features[i][j]; + } + (*labels)[i] = tmp_labels[i]; + } + return 0; +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + cerr << "usage: ./demo_trainer is_small" << endl; + cerr << " if is_small is true, the batch size is set to 1, " << endl; + cerr << " and it will only runs for 10 steps." 
<< endl; + return 1; + } + string is_small = argv[1]; + vector> features; + vector labels; + read_samples("housing.data", &features, &labels); + cout << "sample count: " << features.size() << " " << endl; + + std::shared_ptr local_model(new LRModel()); + local_model->InitModel(); + + if (is_small == "true") { + cout << "small mode" << endl; + for (int i; i < 10; i++) { + vector> batch_feature; + vector batch_label; + batch_feature.push_back(features[i]); + batch_label.push_back(labels[i]); + auto loss = local_model->Predict(batch_feature, batch_label); + cout << "sample " << i << ": " << loss << endl; + } + } else if (is_small == "false") { + // shuffle + cout << "full model" << endl; + int epoch = 100; + int batch_size = 20; + int step = 0; + for (int i; i < epoch; i++) { + shuffle(&features, &labels); + for (int j = 0; + j < ceil(static_cast(features.size()) / batch_size); + j++) { + int start_idx = j * batch_size; + int end_idx = + min((j + 1) * batch_size, static_cast(features.size())); + auto batch_feature = vector>(features.begin() + start_idx, + features.begin() + end_idx); + auto batch_label = + vector(labels.begin() + start_idx, labels.begin() + end_idx); + auto loss = local_model->Predict(batch_feature, batch_label); + if (step % 10 == 0) { + std::cout << "batch: " << i << ", step: " << step + << ", Loss: " << loss << endl; + } + step += 1; + } + } + } else { + cerr << "wrong arg for is_small: " << is_small << endl; + } +} diff --git a/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..050e929c9135ac939dac747e2e4a2490397a4c3d --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h @@ -0,0 +1,37 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include + +using std::string; +using std::vector; +using std::cerr; +using std::cout; +using std::endl; +using std::min; +using std::max; +using std::fstream; + +extern int FEATURE_NUM; + +int get_samples(string line, const vector& feature, float* label); +int read_samples(const string fname, + vector>* features, + vector* labels); diff --git a/lite/demo/cxx/train_demo/cplus_train/run_build.sh b/lite/demo/cxx/train_demo/cplus_train/run_build.sh new file mode 100644 index 0000000000000000000000000000000000000000..4fb444ebd1ecda40db2d69c24016cb78bacdc0ad --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/run_build.sh @@ -0,0 +1,21 @@ + +rm -rf build +mkdir build +cd build + +LITE_ROOT=$1 +NDK_ROOT=$2 + + +cmake .. \ + -DLITE_ROOT=${LITE_ROOT} \ + -DNDK_ROOT=${NDK_ROOT} \ + -DCMAKE_TOOLCHAIN_FILE=${NDK_ROOT}/build/cmake/android.toolchain.cmake \ + -DANDROID_TOOLCHAIN=gcc \ + -DANDROID_ABI="armeabi-v7a" \ + -DANDROID_PLATFORM=android-23 \ + -DANDROID=true \ + -DANDROID_STL=c++_static +make +cd .. 
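`demo_trainer` wraps the full API in the `LRModel` class defined above: `InitModel()` loads `model_dir` through `CxxConfig`, and each call to `Predict()` runs the program once on a batch and returns the fetched loss. One step of the `is_small == "true"` path, condensed into a sketch that assumes `features` and `labels` were already filled by `read_samples`:

```cpp
// Sketch of a single training step with the LRModel wrapper from demo_trainer.cc.
LRModel model;
model.InitModel();  // loads ./model_dir via CxxConfig with an ARM/float place
std::vector<std::vector<float>> batch_feature{features[0]};  // batch size 1
std::vector<float> batch_label{labels[0]};
float loss = model.Predict(batch_feature, batch_label);  // one Run(), returns the loss
std::cout << "sample 0: Loss: " << loss << std::endl;
```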
+# ./bin/demo_trainer diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/lite/demo/cxx/train_demo/image/lr_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..626cb57ecd5d4cf50fd4d0b8aaadcc29146ca19b Binary files /dev/null and b/lite/demo/cxx/train_demo/image/lr_loss.png differ diff --git a/lite/demo/cxx/train_demo/train.py b/lite/demo/cxx/train_demo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..37825a5cc472990664f68cb38dbf7ee7859286b8 --- /dev/null +++ b/lite/demo/cxx/train_demo/train.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +import argparse + +import math +import numpy + +import paddle +import paddle.fluid as fluid + + +def parse_args(): + parser = argparse.ArgumentParser("fit_a_line") + parser.add_argument( + '--save_model', + action='store_true', + help="Whether to save main program") + parser.add_argument( + '--num_steps', + type=int, + default=1000000000000, + help="train steps") + parser.add_argument( + '--num_epochs', type=int, default=100, help="number of epochs.") + parser.add_argument( + '--batch_size', type=int, default=20, help="batch size.") + parser.add_argument( + '--shuffle', + action='store_true', + help="Whether to shuffle train data.") + args = parser.parse_args() + return args + +# For training test cost +def train_test(executor, program, reader, feeder, fetch_list): + accumulated = 1 * [0] + count = 0 + for data_test in reader(): + outs = executor.run( + program=program, feed=feeder.feed(data_test), fetch_list=fetch_list) + accumulated = [x_c[0] + x_c[1][0] for x_c in zip(accumulated, outs)] + count += 1 + return [x_d / count for x_d in accumulated] + + +def main(): + if args.shuffle: + print("doing shuffle") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=args.batch_size) + else: + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=args.batch_size) + + # feature vector of length 13 + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + + main_program = fluid.default_main_program() + startup_program = fluid.default_startup_program() + + main_program.random_seed = 90 + startup_program.random_seed = 90 + + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(cost) + + test_program = main_program.clone(for_test=True) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_loss) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + num_epochs = args.num_epochs + + # main train loop. 
+ feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe.run(startup_program) + if args.save_model: + fluid.io.save_persistables(exe, "model_dir") + + # add feed and fetch op + feeded_var_names = ['x', 'y'] + fetch_var_names = ['mean_0.tmp_0'] + fluid.io.prepend_feed_ops(main_program, feeded_var_names) + fluid.io.append_fetch_ops(main_program, fetch_var_names) + with open("model_dir/__model__", "wb") as f: + f.write(main_program.desc.serialize_to_string()) + + with open("debug_main_program", "w") as f: + f.write(str(main_program)) + print("train model saved to model_dir") + return + + train_prompt = "Train cost" + step = 0 + for pass_id in range(num_epochs): + for data_train in train_reader(): + avg_loss_value, = exe.run( + main_program, + feed=feeder.feed(data_train), + fetch_list=[avg_loss]) + print("%s, Step %d, Cost %f" % + (train_prompt, step, avg_loss_value[0])) + if step == args.num_steps - 1: + return + step += 1 + + if math.isnan(float(avg_loss_value[0])): + sys.exit("got NaN loss, training failed.") + + +if __name__ == '__main__': + args = parse_args() + main() diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4b8497ebb30630b91d0eee9ebde389ae10f0e2c --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 2.8) + +set(TARGET mobilenet_full_api) + +# 1. path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. compile options +add_definitions(-std=c++11 -g -O3 -pthread) +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +target_link_libraries(${TARGET} -lpaddle_full_api_shared) +target_link_libraries(${TARGET} -liomp5) +target_link_libraries(${TARGET} -ldl) diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/build.sh b/lite/demo/cxx/x86_mobilenetv1_full_demo/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..c9570e326e361d40b9a2b857dc97a1caf1450a92 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/build.sh @@ -0,0 +1,6 @@ +mkdir ./build +cd ./build +cmake .. +make +cd .. +rm -rf ./build diff --git a/lite/demo/cxx/x86_mobilenetv1_full_demo/mobilenet_full_api.cc b/lite/demo/cxx/x86_mobilenetv1_full_demo/mobilenet_full_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..c2837e0fdd9bfaa9fc146dff9daee963f707b886 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_full_demo/mobilenet_full_api.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
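A note on the C++ sources that follow: several of them lost their angle-bracket template arguments when this patch text was pasted (for example "std::shared_ptr predictor" and "mutable_data()"). As a reference, the core of the x86 full-API demo below reads roughly as follows with explicit types restored; the type names are reconstructed from the public Paddle-Lite C++ API in paddle_api.h and should be checked against that header rather than taken as authoritative.

    #include <memory>
    #include <string>
    #include "paddle_api.h"  // NOLINT

    using namespace paddle::lite_api;  // NOLINT

    void RunFullApiSketch(const std::string& model_dir) {
      CxxConfig config;
      config.set_model_dir(model_dir);
      config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
                               Place{TARGET(kHost), PRECISION(kFloat)}});
      std::shared_ptr<PaddlePredictor> predictor =
          CreatePaddlePredictor<CxxConfig>(config);

      std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
      input_tensor->Resize({1, 3, 224, 224});
      float* data = input_tensor->mutable_data<float>();
      int64_t num = 1;
      for (auto d : input_tensor->shape()) num *= d;
      for (int64_t i = 0; i < num; ++i) data[i] = 1.f;

      predictor->Run();

      std::unique_ptr<const Tensor> output_tensor(
          std::move(predictor->GetOutput(0)));
      const float* out = output_tensor->data<float>();
      (void)out;  // the real demo prints every 100th element
    }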
+ +#include +#include +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +void RunModel(std::string model_dir) { + // 1. Create CxxConfig + CxxConfig config; + config.set_model_dir(model_dir); + config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)}}); + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 3, 224, 224}); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + + // 4. Run predictor + predictor->Run(); + + // 5. Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); + return 0; +} diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e85b8fe67e1a8be859d4d7a9a95a9008802a7521 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 2.8) + +set(TARGET mobilenet_light_api) + +# 1. path to Paddle-Lite lib and mklml lib +set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx") +set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/") + +# 2. link mklml and Paddle-Lite directory +link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib) +include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include) + +# 3. compile options +add_definitions(-std=c++11 -g -O3 -pthread) +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) + +# 4.add executable output +add_executable(${TARGET} ${TARGET}.cc) +target_link_libraries(${TARGET} -lpaddle_light_api_shared) +target_link_libraries(${TARGET} -liomp5) +target_link_libraries(${TARGET} -ldl) diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/build.sh b/lite/demo/cxx/x86_mobilenetv1_light_demo/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..c9570e326e361d40b9a2b857dc97a1caf1450a92 --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/build.sh @@ -0,0 +1,6 @@ +mkdir ./build +cd ./build +cmake .. +make +cd .. +rm -rf ./build diff --git a/lite/demo/cxx/x86_mobilenetv1_light_demo/mobilenet_light_api.cc b/lite/demo/cxx/x86_mobilenetv1_light_demo/mobilenet_light_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..763a3fe8871398dda37e5302d24b8cf1659cf6ce --- /dev/null +++ b/lite/demo/cxx/x86_mobilenetv1_light_demo/mobilenet_light_api.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +void RunModel(std::string model_name) { + // 1. Create MobileConfig + MobileConfig config; + config.set_model_from_file(model_name); + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 3, 224, 224}); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + + // 4. Run predictor + predictor->Run(); + + // 5. Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); + return 0; +} diff --git a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc index a9beb1ed28de1f3c28eb5c03b3b660d518ee10c5..d34319050392c74c3fa552bd24c0ea24245ced99 100644 --- a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc +++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc @@ -182,10 +182,10 @@ std::vector detect_object(const float* data, return rect_out; } -void RunModel(std::string model_dir, std::string img_path) { +void RunModel(std::string model_file, std::string img_path) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -228,11 +228,11 @@ void RunModel(std::string model_dir, std::string img_path) { int main(int argc, char** argv) { if (argc < 3) { - std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + std::cerr << "[ERROR] usage: " << argv[0] << " model_file image_path\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; - RunModel(model_dir, img_path); + RunModel(model_file, img_path); return 0; } diff --git a/lite/demo/java/README.md b/lite/demo/java/README.md index 904726d744b7bda075cee05830903a470d52cf54..4cf651a829e6b43607fe12ab21454d52408528e8 100644 --- a/lite/demo/java/README.md +++ b/lite/demo/java/README.md @@ -24,7 +24,7 @@ cmake .. 
\ -DLITE_WITH_ARM=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DWITH_TESTING=OFF \ --DLITE_SHUTDOWN_LOG=ON \ +-DLITE_WITH_LOG=OFF \ -DLITE_ON_TINY_PUBLISH=ON \ -DARM_TARGET_OS=android -DARM_TARGET_ARCH_ABI=armv8 -DARM_TARGET_LANG=gcc diff --git a/lite/demo/python/mobilenetv1_full_api.py b/lite/demo/python/mobilenetv1_full_api.py index a31469e3e8da81f3753dc5d241d4ef39ac03832f..c3a6bd077be5978f1ecaf9b040b119e50117d62b 100644 --- a/lite/demo/python/mobilenetv1_full_api.py +++ b/lite/demo/python/mobilenetv1_full_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/demo/python/mobilenetv1_light_api.py b/lite/demo/python/mobilenetv1_light_api.py index a44427092bae88aa41b3b1d0684cfcf36835b3d2..5847c7819366b654dd9d5b5cbe2108b54da7b04c 100644 --- a/lite/demo/python/mobilenetv1_light_api.py +++ b/lite/demo/python/mobilenetv1_light_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index d33a77c4bfcefbc349d453de05dcbb7c27707a19..9c96459993e55b441ea795c4f2cb58f40846c0d9 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "lite/fluid/data_type.h" #include #include diff --git a/lite/fluid/lod.h b/lite/fluid/lod.h index 36386f7eb967f31ec258681fe17222a928aa7b4b..b1f2f14a0a4534e588d18237826858812740db69 100644 --- a/lite/fluid/lod.h +++ b/lite/fluid/lod.h @@ -19,7 +19,7 @@ namespace paddle { namespace lite { namespace fluid { -using LoD = std::vector>; +using LoD = std::vector>; static LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 40c95415546d99a66abf2d6f3595ae8695c4df86..2416278ad74068d28f6de523c55513891b08cc72 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 0d8f4d0d192f3563d00bb66778ca4e13a17b93b1..6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id, switch (type) { case AttrType::INT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::FLOAT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::BOOLEAN: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::STRING: return "\"" + 
desc.GetAttr(name) + "\""; case AttrType::FLOATS: { diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index 58a7959f4eb34cb438bf0e25b49b36110435cc6b..d316eac43f99664fa71cba54b3ab5360852300a0 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -153,16 +153,16 @@ class Module { private: std::string WeightUniqueName() const { - return "w_" + std::to_string(weight_counter_++); + return "w_" + paddle::lite::to_string(weight_counter_++); } std::string TmpVarUniqueName() const { - return "tmp_" + std::to_string(tmp_var_counter_++); + return "tmp_" + paddle::lite::to_string(tmp_var_counter_++); } std::string OpUniqueName() const { - return "op_" + std::to_string(op_counter_++); + return "op_" + paddle::lite::to_string(op_counter_++); } std::string KernelUniqueName() const { - return "kernel_" + std::to_string(kernel_counter_++); + return "kernel_" + paddle::lite::to_string(kernel_counter_++); } std::string DataRepr(const std::string &raw_data, PrecisionType dtype); diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 4e0092b392eb31ce81f2a410ea86002b343f0aec..17a836b17183d69b0e2a15b46b7a2097c323312f 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -10,4 +10,7 @@ add_subdirectory(opencl) add_subdirectory(fpga) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(mlu) +add_subdirectory(apu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/kernels/apu/CMakeLists.txt b/lite/kernels/apu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f51a8291f582ba022cffa999b5c19a91ca2d45d8 --- /dev/null +++ b/lite/kernels/apu/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(bridges) + +add_kernel(subgraph_compute_apu APU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_apu neuron_adapter subgraph_bridge_engine ${apu_subgraph_bridges}) diff --git a/lite/kernels/apu/bridges/CMakeLists.txt b/lite/kernels/apu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b42af5a6fe79bbb8417c2a6a37a86c30f4a0f8b --- /dev/null +++ b/lite/kernels/apu/bridges/CMakeLists.txt @@ -0,0 +1,30 @@ +if(NOT LITE_WITH_APU) + return() +endif() + + +lite_cc_library(subgraph_bridge_utility_apu SRCS utility.cc DEPS tensor neuron_adapter) +lite_cc_library(subgraph_bridge_graph_apu SRCS graph.cc DEPS subgraph_bridge_utility_apu) + +set(apu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_apu subgraph_bridge_graph_apu) + +lite_cc_library(subgraph_bridge_conv_op_apu SRCS conv_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_elementwise_ops_apu SRCS elementwise_ops.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps}) + + +set(apu_subgraph_bridges + subgraph_bridge_registry + subgraph_bridge_utility_apu + subgraph_bridge_conv_op_apu + subgraph_bridge_elementwise_ops_apu + subgraph_bridge_act_op_apu + subgraph_bridge_softmax_op_apu + subgraph_bridge_fc_op_apu + subgraph_bridge_pool_op_apu + CACHE INTERNAL "apu_subgraph_bridges") + +message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}") diff --git a/lite/kernels/apu/bridges/act_op.cc 
b/lite/kernels/apu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c2451d640eb52f6da88c4cd91bbf4ccd95f49152 --- /dev/null +++ b/lite/kernels/apu/bridges/act_op.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + return SUCCESS; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(relu, kAPU, paddle::lite::subgraph::apu::ActConverter); diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca6e0ff2ac3930fe5cab9230dbbefa0af0a864ab --- /dev/null +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -0,0 +1,556 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
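On act_op.cc above: the relu converter currently stops after looking up its input and output tensors, so it registers the bridge but emits no Neuron operation yet. A hedged sketch of the missing tail, written in the style of the conv and fc bridges later in this patch, might look like the fragment below. NEURON_RELU is an assumption (mirroring NNAPI's ANEURALNETWORKS_RELU); xType/outType, dims_x/dims_out, model and graph are presumed to be set up as in the other converters.

    // Hypothetical continuation of ActConverter (not part of this patch).
    std::shared_ptr<Node> x_node = nullptr;
    if (graph->Has(x_name)) {
      x_node = graph->Get(x_name);
    } else {
      NeuronModel_addOperand(model, &xType);  // input operand
      x_node = graph->Add(x_name, dims_x);
    }
    NeuronModel_addOperand(model, &outType);  // output operand
    auto out_node = graph->Add(out_name, dims_out);

    std::vector<uint32_t> in_idx = {x_node->index()};
    std::vector<uint32_t> out_idx = {out_node->index()};
    if (NeuronModel_addOperation(model, NEURON_RELU, in_idx.size(), &in_idx[0],
                                 out_idx.size(),
                                 &out_idx[0]) != NEURON_NO_ERROR) {
      LOG(WARNING) << "[APU] Add relu op failed";
      return FAILED;
    }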
+ +#include "lite/operators/conv_op.h" +#include +#include +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + int neuron_errCode; + VLOG(3) << "[APU] Converting [" << op_type << "]"; + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + auto dilations = op_info->GetAttr>("dilations"); + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); + std::string act_type = + with_act ? op_info->GetAttr("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? op_info->GetAttr("leaky_relu_alpha") + : 0.f; + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + bool is_depthwise_mode = ic == groups && oc == groups; + VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the input size." 
+ << paddings.size(); + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + float input_scale; + float output_scale; + std::vector weight_scale; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("weight_scale")) + weight_scale = op_info->GetAttr>("weight_scale"); + if (op_info->HasAttr("output_scale")) + output_scale = op_info->GetAttr("output_scale"); + VLOG(3) << "has output scale:" << output_scale; + } else { + return FAILED; + } + } else { + return FAILED; + } + + VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups + << " ,dilations: " << dilations[0] << ":" << dilations[1]; + VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; + VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims + << " ,weight_scale size: " << weight_scale.size(); + VLOG(3) << "filter_dims: " << filter_dims + << " ,memory_size: " << filter->memory_size() + << " ,data_size: " << filter->data_size(); + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + inType.dimensions = &dims_in[0]; + + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Graph has " << input_name; + // input operand already exist + input_node = graph->Get(input_name); + } else { + // add input operand + if (graph->IsInput(input_name)) { + // Insert transpose for NCHW -> NHWC + insert_transpose_node( + ctx, + input_name, + "transpose_" + input_name, + {input_dims[0], input_dims[1], input_dims[2], input_dims[3]}, + dims_in, + {0, 2, 3, 1}, + inType.scale, + inType.zeroPoint); + + // change input_name + input_name = "transpose_" + input_name; + input_node = graph->Get(input_name); + if (input_node == nullptr) return subgraph::FAILED; + } else { + NeuronModel_addOperand(model, &inType); // input + input_node = graph->Add(input_name, dims_in); + } + } + VLOG(3) << "input node idx" << input_node->index() + << ": input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] + << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; + + // Add bias type + NeuronOperandType biasType; + + // Add filter type + // filter NCHW -> NHWC + Tensor transpose_filter; + std::vector dims_filter; + + if (is_depthwise_mode) { + transpose_filter.Resize({1, + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}); + dims_filter = {1, + (uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}; + transpose(filter->data(), + transpose_filter.mutable_data(), + dims_filter, + {0, 2, 3, 1}); + + dims_filter = {(uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}; + } else { + transpose_filter.Resize({(uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[1]}); + dims_filter = {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[1], + 
(uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}; + transpose(filter->data(), + transpose_filter.mutable_data(), + dims_filter, + {0, 2, 3, 1}); + + dims_filter = {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[1]}; + } + + NeuronOperandType filterType; + NeuronOperandType channelFilterType; + NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; + if (1 == weight_scale.size()) { + // Per layer type + filterType.type = NEURON_TENSOR_QUANT8_ASYMM; + filterType.scale = weight_scale[0]; + filterType.zeroPoint = 128; + filterType.dimensionCount = filter_dims.size(); + filterType.dimensions = &dims_filter[0]; + biasType.scale = inType.scale * filterType.scale; + } else { + // Per channel type + channelFilterType.type = NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL; + channelFilterType.scale = 0.0f; + channelFilterType.zeroPoint = 0; + channelFilterType.dimensionCount = filter_dims.size(); + channelFilterType.dimensions = &dims_filter[0]; + + // Per channel setting + if (is_depthwise_mode) + symmPerChannelQuantParams.channelDim = 3; + else + symmPerChannelQuantParams.channelDim = 0; + symmPerChannelQuantParams.scaleCount = weight_scale.size(); + symmPerChannelQuantParams.scales = weight_scale.data(); + biasType.scale = 0; + } + + std::shared_ptr filter_node = nullptr; + if (1 == weight_scale.size()) { + NeuronModel_addOperand(model, &filterType); // 1: filter + filter_node = graph->Add(filter_name, dims_filter); + VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" + << weight_scale[0] << ": filterType: " << filterType.dimensions[0] + << ":" << filterType.dimensions[1] << ":" + << filterType.dimensions[2] << ":" << filterType.dimensions[3]; + memcpy(filter->mutable_data(), + transpose_filter.mutable_data(), + filter->memory_size()); + neuron_errCode = NeuronModel_setOperandValue( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + } else { + NeuronModel_addOperand(model, &channelFilterType); // 1: filter + filter_node = graph->Add(filter_name, dims_filter); + VLOG(3) << "chennel filter node idx: " << filter_node->index() + << " ,scale_count:" << weight_scale.size() + << " weight_scale[0]:" << weight_scale.data()[0] + << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" + << channelFilterType.dimensions[1] << ":" + << channelFilterType.dimensions[2] << ":" + << channelFilterType.dimensions[3]; + memcpy(filter->mutable_data(), + transpose_filter.mutable_data(), + filter->memory_size()); + neuron_errCode = NeuronModel_setOperandValue( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + neuron_errCode = NeuronModel_setOperandSymmPerChannelQuantParams( + model, filter_node->index(), &symmPerChannelQuantParams); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set per channel filter params fail:" << neuron_errCode; + return subgraph::FAILED; + } + } + + // Add biasType node value + // A 1-D tensor, of shape [depth_out], specifying the bias. + // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias + // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 + // and bias_scale of 0. 
The actual scale of each value 'i' is equal + // to bias_scale[i] = input_scale * filter_scale[i]. + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + std::vector dims_bias; + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + + biasType.dimensionCount = bias_dims.size(); + for (int i = 0; i < bias_dims.size(); i++) + dims_bias.push_back(bias_dims[i]); + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "node idx" << bias_node->index() << ": Bias name: " << bias_name + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << bias_dims; + } else { + biasType.dimensionCount = 1; + dims_bias = {(uint32_t)output_dims[1]}; + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // 2: bias + bias_node = graph->Add(filter_name + "_default_bias", dims_bias); + VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias " + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << dims_bias.size(); + } + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {1}; + + std::shared_ptr paddingL_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 3: padding left + paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); + + std::shared_ptr paddingR_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 4: padding right + paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32); + + std::shared_ptr paddingT_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 5: padding top + paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); + + std::shared_ptr paddingB_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 6: padding bottom + paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); + + std::shared_ptr strideW_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 7: stride width + strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); + + std::shared_ptr strideH_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 8: stride height + strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); + + std::shared_ptr dm_node = nullptr; + if (is_depthwise_mode) { + NeuronModel_addOperand(model, &int32Type); // 9: depthwise multiplier + dm_node = graph->Add(filter_name + "_dm", dims_int32); + } + + std::shared_ptr fuse_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 9/10: fuse + fuse_node = graph->Add(filter_name + "_fuse", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + if (graph->IsOutput(output_name)) + outType.scale = output_scale / 127; + else + outType.scale = output_scale; + outType.zeroPoint = 128; + outType.dimensionCount = output_dims.size(); + std::vector dims_out = {(uint32_t)output_dims[0], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3], + (uint32_t)output_dims[1]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr output_node = nullptr; + if (graph->Has(output_name)) { + output_node = graph->Get(output_name); + } else { + // add output operand + if (graph->IsOutput(output_name)) { + 
NeuronModel_addOperand(model, &outType); // output + output_node = graph->Add("transpose_" + output_name, dims_out); + } else { + NeuronModel_addOperand(model, &outType); // output + output_node = graph->Add(output_name, dims_out); + } + } + VLOG(3) << "output node idx: " << output_node->index() + << ": output_scale: " << outType.scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add bias value + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + int32_t* int32_bias_data = + reinterpret_cast(bias->mutable_data()); + float2int32( + bias->data(), input_scale, weight_scale, int32_bias_data); + + VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " + << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " + << int32_bias_data[3]; + neuron_errCode = NeuronModel_setOperandValue( + model, bias_node->index(), bias->raw_data(), bias->memory_size()); + } else { + auto int32_bias = std::make_shared(); + int32_bias->Resize({1, output_dims[1]}); + int32_bias->mutable_data(); + VLOG(3) << "bais_default: " << int32_bias->memory_size(); + memset(int32_bias->mutable_data(), 0, int32_bias->memory_size()); + neuron_errCode = NeuronModel_setOperandValue(model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); + bias_node->set_data(int32_bias); + } + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set bias operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + + VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":" + << paddings[2] << ":" << paddings[3]; + // Add padding value + int32_t padding_val[1]; + padding_val[0] = paddings[2]; + NeuronModel_setOperandValue( + model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + NeuronModel_setOperandValue( + model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + NeuronModel_setOperandValue( + model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + NeuronModel_setOperandValue( + model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0]; + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // width + NeuronModel_setOperandValue( + model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // height + NeuronModel_setOperandValue( + model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + // Add fuse + int32_t fuse_val[1] = {0}; + if (act_type == "relu") { + fuse_val[0] = 1; + } else if (act_type == "relu1") { + fuse_val[0] = 2; + } else if (act_type == "relu6") { + fuse_val[0] = 3; + } else if (!act_type.empty()) { + fuse_val[0] = 0; + LOG(WARNING) << "Support act_type: " << act_type; + return FAILED; + } + + if (is_depthwise_mode) { + int32_t dm = oc / ic; + NeuronModel_setOperandValue( + model, dm_node->index(), &dm, sizeof(int32_t) * 1); + VLOG(3) << "depthwise multiplier:" << dm; + + // Depthwise conv + NeuronModel_setOperandValue( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias + paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding 
right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + dm_node->index(), // 9: depthwise multiplier + fuse_node->index()}; // 10 : fuse + + std::vector addOutIndex = {output_node->index()}; + neuron_errCode = NeuronModel_addOperation(model, + NEURON_DEPTHWISE_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + NeuronModel_setOperandValue( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias + paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + fuse_node->index()}; // 9: fuse + + std::vector addOutIndex = {output_node->index()}; + neuron_errCode = NeuronModel_addOperation(model, + NEURON_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return FAILED; + } + + if (graph->IsOutput(output_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node( + ctx, + "transpose_" + output_name, + output_name, + dims_out, + {output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + output_node = graph->Get(output_name); + if (output_node == nullptr) return subgraph::FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kAPU, + paddle::lite::subgraph::apu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kAPU, + paddle::lite::subgraph::apu::ConvConverter); diff --git a/lite/kernels/apu/bridges/elementwise_ops.cc b/lite/kernels/apu/bridges/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c637e0fe746ce2a4d2b42dc902d62279967e73c --- /dev/null +++ b/lite/kernels/apu/bridges/elementwise_ops.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + + auto y_name = op_info->Input("Y").front(); + auto y = scope->FindMutableTensor(y_name); + auto y_dims = y->dims(); + + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + auto axis = op_info->GetAttr("axis"); + + // Act node + if (op_type == "fusion_elementwise_add_activation" || + op_type == "fusion_elementwise_sub_activation" || + op_type == "fusion_elementwise_mul_activation" || + op_type == "fusion_elementwise_div_activation") { + auto act_type = op_info->GetAttr("act_type"); + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kAPU, + paddle::lite::subgraph::apu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kAPU, + paddle::lite::subgraph::apu::ElementwiseConverter); diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a00a35f9a0766b4fb4f02d05419a0ae42354ca37 --- /dev/null +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
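On elementwise_ops.cc above: like the relu bridge, the converter currently reads its inputs and the fused activation type but emits no Neuron operation. If it were completed in the style of the conv and fc bridges, the add case would presumably end with something like the fragment below. NEURON_ADD is an assumption (mirroring NNAPI), and x_node, y_node, out_node and fuse_node are presumed to have been added with quantized operand types exactly as in fc_op.cc.

    // Hypothetical completion of ElementwiseConverter for elementwise_add.
    int32_t fuse_val[1] = {0};  // 0: no fused activation (same convention as conv_op.cc)
    NeuronModel_setOperandValue(
        model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);

    std::vector<uint32_t> addInIndex = {
        x_node->index(), y_node->index(), fuse_node->index()};
    std::vector<uint32_t> addOutIndex = {out_node->index()};
    int neuron_errCode = NeuronModel_addOperation(model,
                                                  NEURON_ADD,  // assumption
                                                  addInIndex.size(),
                                                  &addInIndex[0],
                                                  addOutIndex.size(),
                                                  &addOutIndex[0]);
    if (NEURON_NO_ERROR != neuron_errCode) {
      LOG(WARNING) << "Add elementwise op fail";
      return FAILED;
    }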
+ +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "]"; + + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_GE(input_dims.size(), 2UL); + auto w_name = op_info->Input("W").front(); + auto w = scope->FindMutableTensor(w_name); + auto w_dims = w->dims(); + CHECK_EQ(w_dims.size(), 2UL); + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + + int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + int m = input_dims.Slice(0, in_num_col_dims).production(); + int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production(); + int n = w_dims[1]; + CHECK_EQ(k * n, w_dims.production()); + VLOG(3) << "[APU] input dims: " << input_dims << " w dims: " << w_dims + << " out_dims: " << out_dims << " m: " << m << " k: " << k + << " n: " << n; + + float input_scale = 1.0f; + float out_scale = 1.0f; + std::vector w_scale; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("weight_scale")) + w_scale = op_info->GetAttr>("weight_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + return FAILED; + } + } else { + return FAILED; + } + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + + inType.dimensions = &dims_in[0]; + std::shared_ptr in_node = nullptr; + if (graph->Has(input_name)) { + // input operand already exist + in_node = graph->Get(input_name); + VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index(); + } else { + // add input operand + NeuronModel_addOperand(model, &inType); // 0: input + in_node = graph->Add(input_name, dims_in); + } + VLOG(3) << "input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << " : " + << inType.dimensions[1] << " : " << inType.dimensions[2] << " : " + << inType.dimensions[3]; + + NeuronOperandType wType; + wType.type = NEURON_TENSOR_QUANT8_ASYMM; + wType.scale = w_scale[0]; + wType.zeroPoint = 128; + wType.dimensionCount = w_dims.size(); + std::vector dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]}; + wType.dimensions = &dims_w[0]; + NeuronModel_addOperand(model, &wType); // 1: weight + std::shared_ptr w_node = nullptr; + w_node = graph->Add(w_name, dims_w); + VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0] + << ", wType dimensions: " << wType.dimensions[0] << " : " + << wType.dimensions[1] << ", memory size: " << w->memory_size(); + + // Add bias type + NeuronOperandType biasType; + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + biasType.scale = 
input_scale * w_scale[0]; + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + + biasType.dimensionCount = bias_dims.size(); + std::vector dims_bias = {(uint32_t)bias_dims[0]}; + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims + << ", bias scale: " << biasType.scale + << " ,memory size: " << bias->memory_size(); + } else { + biasType.dimensionCount = 1; + std::vector dims_bias = {(uint32_t)n}; + biasType.dimensions = &dims_bias[0]; + NeuronModel_addOperand(model, &biasType); // 2: bias + bias_node = graph->Add(w_name + "_default_bias", dims_bias); + } + + // Add fuse type + NeuronOperandType fuseType; + fuseType.type = NEURON_INT32; + fuseType.dimensionCount = 0; + std::vector dims_int32 = {0}; + NeuronModel_addOperand(model, &fuseType); // 3: fuse + std::shared_ptr fuse_node = nullptr; + fuse_node = graph->Add(w_name + "_fuse", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = 2; + std::vector dims_out = {(uint32_t)out_dims[0], out_dims[1]}; + outType.dimensions = &dims_out[0]; + VLOG(3) << "out_scale: " << out_scale + << ", outType: " << outType.dimensions[0] << " : " + << outType.dimensions[1]; + NeuronModel_addOperand(model, &outType); // output + std::shared_ptr out_node = nullptr; + out_node = graph->Add(out_name, dims_out); + + int8_t* w_data = w->mutable_data(); + Tensor transpose_filter; + // Original dimension + transpose_filter.Resize({(uint32_t)w_dims[1], (uint32_t)w_dims[0]}); + transpose_filter.mutable_data(); + transposeAsym(w->data(), + transpose_filter.mutable_data(), + {1, 1, (uint32_t)w_dims[0], (uint32_t)w_dims[1]}, + {0, 1, 3, 2}); + memcpy(w->mutable_data(), + transpose_filter.mutable_data(), + w->memory_size()); + int neuron_errCode = NeuronModel_setOperandValue( + model, w_node->index(), w->raw_data(), w->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set W operand value fail:" << neuron_errCode + << ",index: " << w_node->index(); + return FAILED; + } + + // Add bias if bias tensor exists + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + int32_t* int32_bias_data = + reinterpret_cast(bias->mutable_data()); + float2int32(bias->data(), input_scale, w_scale, int32_bias_data); + + VLOG(3) << int32_bias_data[0] << ":" << int32_bias_data[1] << ":" + << int32_bias_data[2] << ":" << int32_bias_data[3]; + neuron_errCode = + NeuronModel_setOperandValue(model, + bias_node->index(), + bias->raw_data(), + bias->memory_size()); // 2: bias + } else { + auto int32_bias = std::make_shared(); + int32_bias->Resize({1, out_dims[1]}); + int32_bias->mutable_data(); + memset(int32_bias->mutable_data(), 0, int32_bias->memory_size()); + VLOG(3) << "default: " << int32_bias->memory_size(); + neuron_errCode = + NeuronModel_setOperandValue(model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); // 2: bias + bias_node->set_data(int32_bias); + } + // Add fuse value + int32_t fuse_val[1] = {0}; + 
NeuronModel_setOperandValue( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); // 3: fuse + + std::vector addInIndex = {in_node->index(), + w_node->index(), + bias_node->index(), + fuse_node->index()}; + std::vector addOutIndex = {out_node->index()}; + neuron_errCode = NeuronModel_addOperation(model, + NEURON_FULLY_CONNECTED, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(fc, kAPU, paddle::lite::subgraph::apu::FCConverter); diff --git a/lite/kernels/apu/bridges/graph.cc b/lite/kernels/apu/bridges/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..515853aa26a1d84339c61047b5d3be20478b5ca3 --- /dev/null +++ b/lite/kernels/apu/bridges/graph.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/bridges/graph.h" +#include +#include "lite/kernels/apu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + + if (it != nodes_.end()) { + LOG(FATAL) << "[APU] Node" << name << " is redefined."; + return -1; + } else { + VLOG(3) << " Add: " << name << " : " << node->index(); + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + operandIdx_ += 1; + it->second.push_back(node); + + return it->second.size(); +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/apu/bridges/graph.h b/lite/kernels/apu/bridges/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..2eca1e3f1a76c6448d8f894efa1b2bf42d16cbb8 --- /dev/null +++ b/lite/kernels/apu/bridges/graph.h @@ -0,0 +1,109 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
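A detail worth keeping in mind for graph.cc above and the Graph/Node declarations that follow: a Node captures the value of operandIdx_ at construction time, and Graph::Add only increments the counter afterwards, so Node::index() matches the Neuron operand index only when every graph->Add() is paired with exactly one NeuronModel_addOperand() call, in the same order. The bridges above follow this pattern; a minimal illustration is below (the operand name is made up, and model/graph are the converter-local pointers).

    // One NeuronModel_addOperand per graph->Add, in the same order, so that
    // Node::index() can be passed straight to NeuronModel_setOperandValue.
    NeuronOperandType int32Type;
    int32Type.type = NEURON_INT32;
    int32Type.dimensionCount = 0;

    NeuronModel_addOperand(model, &int32Type);               // Neuron side
    auto stride_node = graph->Add("conv0_stride_w", {1});    // bookkeeping side

    int32_t stride_w = 2;
    NeuronModel_setOperandValue(
        model, stride_node->index(), &stride_w, sizeof(int32_t));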
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/backends/apu/neuron_adapter.h" +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +// Graph and node is defined to collect all of converted HiAI IR nodes +class Node { + public: + Node(int32_t operand_idx, std::vector shape) + : idx_(operand_idx), shape_(shape) {} + + void set_shape(std::vector shape) { shape_ = shape; } + + uint32_t index() { return idx_; } + std::vector shape() const { return shape_; } + void set_data(std::shared_ptr data) { data_ = data; } + + private: + int32_t idx_; + std::vector shape_; + std::shared_ptr data_{nullptr}; +}; + +class Graph { + public: + int Add(const std::string& name, std::shared_ptr node); + + // Variable, const or data node + std::shared_ptr Add(const std::string& name, + std::vector shape) { + CHECK(shape.size()) << name << " : " << shape.size(); + auto node = std::make_shared(operandIdx_, shape); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + + return node; + } + + void set_model(NeuronModel* model) { model_ = model; } + NeuronModel* model() { return model_; } + + void set_input_names(const std::vector input_names) { + input_names_ = input_names; + } + + bool IsInput(const std::string& name) { + for (int i = 0; i < input_names_.size(); i++) { + if (input_names_[i] == name) return true; + } + return false; + } + + bool IsOutput(const std::string& name) { + for (int i = 0; i < output_names_.size(); i++) { + if (output_names_[i] == name) return true; + } + return false; + } + + void set_output_names(const std::vector output_names) { + output_names_ = output_names; + } + + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[APU] Node " << name << " not found."; + return nodes_.at(name).back(); + } + + bool Has(const std::string& name) { + return nodes_.find(name) != nodes_.end(); + } + + private: + NeuronModel* model_; + std::unordered_map>> nodes_; + int32_t operandIdx_ = 0; + std::vector input_names_; + std::vector output_names_; +}; + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/apu/bridges/paddle_use_bridges.h b/lite/kernels/apu/bridges/paddle_use_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..e3e68afc6c7c18d2b8d68361ac09de2abf2b684c --- /dev/null +++ b/lite/kernels/apu/bridges/paddle_use_bridges.h @@ -0,0 +1,24 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
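paddle_use_bridges.h below is the other half of the REGISTER_SUBGRAPH_BRIDGE calls that close each bridge source: the register macro creates the (op_type, kAPU) converter entry, and the USE_SUBGRAPH_BRIDGE declaration forces that registration to be linked into the final binary. Adding a new op therefore needs both lines; as a hypothetical illustration for a sigmoid bridge (not part of this patch), the two additions would be:

    // In the bridge source, e.g. lite/kernels/apu/bridges/act_op.cc:
    REGISTER_SUBGRAPH_BRIDGE(sigmoid,
                             kAPU,
                             paddle::lite::subgraph::apu::ActConverter);

    // In lite/kernels/apu/bridges/paddle_use_bridges.h:
    USE_SUBGRAPH_BRIDGE(sigmoid, kAPU);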
+ +#pragma once + +USE_SUBGRAPH_BRIDGE(relu, kAPU); +USE_SUBGRAPH_BRIDGE(conv2d, kAPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kAPU); +USE_SUBGRAPH_BRIDGE(elementwise_add, kAPU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU); +USE_SUBGRAPH_BRIDGE(fc, kAPU); +USE_SUBGRAPH_BRIDGE(pool2d, kAPU); +USE_SUBGRAPH_BRIDGE(softmax, kAPU); diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2bda76ab99af727276102e884f84534b77a59586 --- /dev/null +++ b/lite/kernels/apu/bridges/pool_op.cc @@ -0,0 +1,273 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pool_op.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "] "; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto paddings = op_info->GetAttr>("paddings"); + + // pool mode + if ((pooling_type == "max") || (pooling_type == "avg")) { + } else { + LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type; + return FAILED; + } + + // pad mode + int pad_mode = 0; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = 6; + } else if (padding_algorithm == "VALID") { + pad_mode = 5; + } + + // paddings and strides + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the inputs size."; + + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } + auto strides = op_info->GetAttr>("strides"); + lite::operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + + // Add x tensor type + float x_scale = 1.0f; + float out_scale = 1.0f; + if (op_info->HasAttr("enable_int8")) { + if 
(op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + x_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = x_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector dims_x = {(uint32_t)x_dims[0], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3], + (uint32_t)x_dims[1]}; + xType.dimensions = &dims_x[0]; + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + LOG(INFO) << "Graph has " << x_name; + // input operand already exist + x_node = graph->Get(x_name); + } else { + // add input operand + NeuronModel_addOperand(model, &xType); // 0: x + x_node = graph->Add(x_name, dims_x); + } + VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":" + << xType.dimensions[1] << ":" << xType.dimensions[2] << ":" + << xType.dimensions[3]; + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {0}; + + std::shared_ptr paddingL_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 1: padding left + paddingL_node = graph->Add(x_name + "_padding_left", dims_int32); + + std::shared_ptr paddingR_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 2: padding right + paddingR_node = graph->Add(x_name + "_padding_right", dims_int32); + + std::shared_ptr paddingT_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 3: padding top + paddingT_node = graph->Add(x_name + "_padding_top", dims_int32); + + std::shared_ptr paddingB_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 4: padding bottom + paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32); + + std::shared_ptr strideW_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 5: stride width + strideW_node = graph->Add(x_name + "_stride_width", dims_int32); + + std::shared_ptr strideH_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 6: stride height + strideH_node = graph->Add(x_name + "_stride_height", dims_int32); + + std::shared_ptr filterW_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 7: filter width + filterW_node = graph->Add(x_name + "_filter_width", dims_int32); + + std::shared_ptr filterH_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 8: filter height + filterH_node = graph->Add(x_name + "_filter_height", dims_int32); + + std::shared_ptr fuse_node = nullptr; + NeuronModel_addOperand(model, &int32Type); // 9: fuse + fuse_node = graph->Add(x_name + "_fuse", dims_int32); + + // Add out type + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = out_dims.size(); + std::vector dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3], + (uint32_t)out_dims[1]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr out_node = nullptr; + if (graph->Has(out_name)) { + out_node = graph->Get(out_name); + } else { + NeuronModel_addOperand(model, &outType); // out + out_node = graph->Add(out_name, dims_out); + } + VLOG(3) << "output_scale: " << x_scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << 
outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add padding value + int32_t padding_val[1]; + padding_val[0] = paddings[2]; + NeuronModel_setOperandValue( + model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + NeuronModel_setOperandValue( + model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + NeuronModel_setOperandValue( + model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + NeuronModel_setOperandValue( + model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // width + NeuronModel_setOperandValue( + model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // height + NeuronModel_setOperandValue( + model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + // Add filter + int32_t filter_val[1]; + filter_val[0] = global_pooling ? x_dims[3] : ksize[1]; // width + NeuronModel_setOperandValue( + model, filterW_node->index(), filter_val, sizeof(int32_t) * 1); + filter_val[0] = global_pooling ? x_dims[2] : ksize[0]; // height + NeuronModel_setOperandValue( + model, filterH_node->index(), filter_val, sizeof(int32_t) * 1); + + // Add fuse + int32_t fuse_val[1] = {0}; + NeuronModel_setOperandValue( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + + std::vector addInIndex = {x_node->index(), + paddingL_node->index(), + paddingR_node->index(), + paddingT_node->index(), + paddingB_node->index(), + strideW_node->index(), + strideH_node->index(), + filterW_node->index(), + filterH_node->index(), + fuse_node->index()}; + std::vector addOutIndex = {out_node->index()}; + + int neuron_errCode; + if (pooling_type == "max") { + neuron_errCode = NeuronModel_addOperation(model, + NEURON_MAX_POOL_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + neuron_errCode = NeuronModel_addOperation(model, + NEURON_AVERAGE_POOL_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kAPU, + paddle::lite::subgraph::apu::PoolConverter); diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a289ac987b9fa300cb548d190b6e46b67f24c44 --- /dev/null +++ b/lite/kernels/apu/bridges/softmax_op.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
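Editor's note: the pool2d bridge above hands Neuron the explicit-padding POOL_2D signature (input, pad left/right/top/bottom, stride w/h, filter w/h, fuse code) and reorders tensor dimensions from Paddle's NCHW to Neuron's NHWC. A small sketch of that dimension shuffle, assuming a 4-D lite::DDim input:

  #include <cstdint>
  #include <vector>
  // NCHW (Paddle) -> NHWC (Neuron) reordering, as done when filling
  // NeuronOperandType::dimensions for x and out in the converter above.
  std::vector<uint32_t> ToNHWC(const paddle::lite::DDim& d) {
    return {static_cast<uint32_t>(d[0]),   // N
            static_cast<uint32_t>(d[2]),   // H
            static_cast<uint32_t>(d[3]),   // W
            static_cast<uint32_t>(d[1])};  // C
  }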
+ +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "]"; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + CHECK_GE(x_dims.size(), 2UL); + auto x_rank = x_dims.size(); + auto out_name = op_info->Output("Out").front(); + + // Check output shape + auto axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis += x_rank; + } + + float input_scale = 1.0f; + float out_scale = 1.0f; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + + // Check output scale + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = input_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector dims_x; + for (int i = 0; i < x_dims.size(); i++) dims_x.push_back(x_dims[i]); + xType.dimensions = &dims_x[0]; + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + // input operand already exist + x_node = graph->Get(x_name); + VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index(); + } else { + // add input operand + NeuronModel_addOperand(model, &xType); // 0: input + x_node = graph->Add(x_name, dims_x); + } + VLOG(3) << "input_scale size: " << input_scale + << " ,x_dims size: " << x_dims.size() << " ,x_dims: " << x_dims; + + // Add beta operand + std::vector dims_int32 = {0}; + NeuronOperandType betaType; + betaType.type = NEURON_FLOAT32; + betaType.dimensionCount = 0; + NeuronModel_addOperand(model, &betaType); // 1: beta + std::shared_ptr beta_node = nullptr; + beta_node = graph->Add(x_name + "_beta", dims_int32); + + // Add axis operand + NeuronOperandType axisType; + axisType.type = NEURON_INT32; + axisType.dimensionCount = 0; + NeuronModel_addOperand(model, &axisType); // 2: axis + std::shared_ptr axis_node = nullptr; + axis_node = graph->Add(x_name + "_axis", dims_int32); + + // Add out operand + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale / 127; + outType.zeroPoint = 128; + outType.dimensionCount = x_dims.size(); + outType.dimensions = &dims_x[0]; + NeuronModel_addOperand(model, &outType); // 3: output + std::shared_ptr out_node = nullptr; + out_node = graph->Add(out_name, dims_x); + VLOG(3) << "output_scale: " << out_scale; + + float beta_val[] = {1.0f}; + NeuronModel_setOperandValue( + model, beta_node->index(), beta_val, sizeof(float) * 1); + + int32_t axis_val[1]; + axis_val[0] = axis; + NeuronModel_setOperandValue( + model, axis_node->index(), axis_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + x_node->index(), beta_node->index(), axis_node->index()}; + std::vector addOutIndex = {out_node->index()}; + int 
neuron_errCode = NeuronModel_addOperation(model, + NEURON_SOFTMAX, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kAPU, + paddle::lite::subgraph::apu::SoftmaxConverter); diff --git a/lite/kernels/apu/bridges/utility.cc b/lite/kernels/apu/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..c91e81476e519a28ebf851f42f2916c9d7c38dd8 --- /dev/null +++ b/lite/kernels/apu/bridges/utility.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/bridges/utility.h" +#include +#include "lite/kernels/apu/bridges/graph.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +void insert_transpose_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + std::vector axis, + float scale, + int32_t zeroPoint) { + int neuron_errCode; + auto graph = static_cast(ctx); + auto model = graph->model(); + + // Add input + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = scale; + inType.zeroPoint = zeroPoint; + inType.dimensionCount = input_shape.size(); + inType.dimensions = &input_shape[0]; + + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Has " << input_name; + input_node = graph->Get(input_name); + } else { + neuron_errCode = NeuronModel_addOperand(model, &inType); // input + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + VLOG(3) << "Add " << input_name; + input_node = graph->Add(input_name, input_shape); + } + + // Add perm + NeuronOperandType permsType; + permsType.type = NEURON_TENSOR_INT32; + permsType.dimensionCount = 1; + uint32_t dims_perms[1] = {4}; + permsType.dimensions = dims_perms; + + neuron_errCode = NeuronModel_addOperand(model, &permsType); // perm + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + std::shared_ptr perms_node = nullptr; + perms_node = graph->Add(input_name + "_perms", {4}); + + VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" + << axis[3]; + // &axis[0], 
sizeof(int32_t) * axis.size()); + neuron_errCode = NeuronModel_setOperandValue( + model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + + // Add output + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = scale; + outType.zeroPoint = zeroPoint; + outType.dimensionCount = output_shape.size(); + outType.dimensions = &output_shape[0]; + + NeuronModel_addOperand(model, &outType); // output + std::shared_ptr output_node = nullptr; + output_node = graph->Add(output_name, output_shape); + + std::vector addInIndex = {input_node->index(), // 0: input + perms_node->index()}; // 1: perm + + std::vector addOutIndex = {output_node->index()}; + + neuron_errCode = NeuronModel_addOperation(model, + NEURON_TRANSPOSE, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + } +} + +void transpose(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis) { + int old_index = -1; + int new_index = -1; + int dim[4] = {0}; + std::vector shape = input_shape; + VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] + << ":" << input_shape[3]; + VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; + for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] + + dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; + new_index = + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; + + output_data[new_index] = input_data[old_index]; + } + } + } + } +} + +void transposeAsym(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis) { + int old_index = -1; + int new_index = -1; + int dim[4] = {0}; + std::vector shape = input_shape; + VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] + << ":" << input_shape[3]; + VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; + for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] + + dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; + new_index = + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; + + output_data[new_index] = input_data[old_index] + 128; // per layer + } + } + } + } +} + +void float2int32(const float* bias_data, + float input_scale, + std::vector weight_scale, + int32_t* int32_bias_data) { + for (int i = 0; i < weight_scale.size(); i++) { + int32_bias_data[i] = bias_data[i] / (input_scale * weight_scale[i]); + } +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/apu/bridges/utility.h b/lite/kernels/apu/bridges/utility.h new file mode 100644 index 
0000000000000000000000000000000000000000..ece26566ae8c55f9551bf4eab0e8ba6419b9ef89
--- /dev/null
+++ b/lite/kernels/apu/bridges/utility.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/utils/macros.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace apu {
+
+// Type/tensor converters for converting Paddle type/tensor to Neuron type/tensor
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname);
+
+void insert_transpose_node(void* ctx,
+                           const std::string& input_name,
+                           const std::string& output_name,
+                           std::vector<uint32_t> input_shape,
+                           std::vector<uint32_t> output_shape,
+                           std::vector<int32_t> axis,
+                           float scale,
+                           int32_t zeroPoint);
+
+void transpose(const int8_t* input_data,
+               uint8_t* output_data,
+               std::vector<uint32_t> input_shape,
+               std::vector<int32_t> axis);
+
+void transposeAsym(const int8_t* input_data,
+                   uint8_t* output_data,
+                   std::vector<uint32_t> input_shape,
+                   std::vector<int32_t> axis);
+
+void float2int32(const float* bias_data,
+                 float input_scale,
+                 std::vector<float> weight_scale,
+                 int32_t* int32_bias_data);
+
+}  // namespace apu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6009e71e05c33f6dedfd995020612e112c888d36
--- /dev/null
+++ b/lite/kernels/apu/subgraph_compute.cc
@@ -0,0 +1,243 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
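Editor's note: float2int32 in utility.cc above is the usual per-channel bias requantization; a float bias is divided by input_scale * weight_scale[c] so it lands on the same scale as the int32 accumulator of a quantized conv/fc. A worked example under assumed scales (the code as written truncates rather than rounds):

  #include <cstdint>
  // With input_scale = 0.5f and weight_scale[c] = 0.25f, the per-channel bias
  // scale is 0.125f, so a float bias of 2.0f becomes 2.0f / 0.125f = 16.
  int32_t bias_q = static_cast<int32_t>(2.0f / (0.5f * 0.25f));  // 16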
+ +#include "lite/kernels/apu/subgraph_compute.h" +#include +#include +#include +#include +#include "lite/backends/apu/device.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/paddle_use_bridges.h" +#include "lite/kernels/apu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace apu { + +int SubgraphEngine::BuildDeviceProgram() { + unsigned int version; + Neuron_getVersion(&version); + VLOG(3) << "Neuron Adapter version: " << version; + + int status = 0; + subgraph::apu::Graph graph; + int neuron_errCode = NeuronModel_create(&model_); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to create model"; + return subgraph::FAILED; + } + graph.set_model(model_); + graph.set_input_names(input_names_); + graph.set_output_names(output_names_); + + // Convert all of ops and their input vars and weights and added into the APU + // NIR graph + const auto& bridges = subgraph::Registry::Instance(); + for (auto& inst : origin_program_) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kAPU))) { + return subgraph::FAILED; + } + + auto kernel = inst.kernel(); + status |= + bridges.Select(op_type, TARGET(kAPU))(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return subgraph::FAILED; + } + } + + // Get input tensor + std::vector ins; + origin_itensors_.resize(input_names_.size()); + origin_idims_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + origin_idims_[i] = origin_itensors_[i]->dims(); + VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":" + << origin_idims_[i].production(); + // Get input index + int idx; + if (graph.Has(input_names_[i])) { + ins.push_back(graph.Get(input_names_[i])->index()); + VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); + } else { + LOG(WARNING) << "Fail to find input: " << input_names_[i]; + return subgraph::FAILED; + } + } + + // Get output tensor + std::vector outs; + origin_otensors_.resize(output_names_.size()); + origin_odims_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + origin_odims_[i] = origin_otensors_[i]->dims(); + VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":" + << origin_odims_[i].production(); + origin_otensors_[i]->mutable_data(); + // Get input index + if (graph.Has(output_names_[i])) { + outs.push_back(graph.Get(output_names_[i])->index()); + VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); + } else { + LOG(WARNING) << "Fail to find output: " << output_names_[i]; + return subgraph::FAILED; + } + } + + VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size(); + // Set subgraph input/output + NeuronModel_identifyInputsAndOutputs( + model_, ins.size(), &ins[0], outs.size(), &outs[0]); + neuron_errCode = NeuronModel_finish(model_); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; + return subgraph::FAILED; + } + VLOG(3) << "[APU] APU NIR model created!"; + + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, 
NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + auto start_time = GetCurrentUS(); + compilation_ = lite::apu::Device::Global().Build(model_); + if (compilation_ == nullptr) { + LOG(WARNING) << "[APU] Build APU DLA model failed!"; + return subgraph::FAILED; + } + VLOG(3) << "[APU] APU DLA model created, Build cost " + << GetCurrentUS() - start_time << " us"; + + return status; +} + +int SubgraphEngine::LaunchDeviceProgram() { + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + + auto start_time = GetCurrentUS(); + NeuronExecution* run = NULL; + int neuron_errCode = NeuronExecution_create(compilation_, &run); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] Build APU runtime failed!"; + return subgraph::FAILED; + } + + // Set input buffer + Tensor input_temp; + for (size_t i = 0; i < origin_itensors_.size(); i++) { + input_temp.Resize({origin_idims_[i]}); + uint8_t* input_data = input_temp.mutable_data(); + memcpy(input_data, + origin_itensors_[i]->raw_data(), + origin_itensors_[i]->memory_size()); + for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { + input_data[j] += (uint8_t)128; + } + NeuronExecution_setInput( + run, i, NULL, input_data, origin_itensors_[i]->memory_size()); + } + + // Set output buffer + for (size_t i = 0; i < origin_otensors_.size(); i++) { + NeuronExecution_setOutput( + run, + i, + NULL, + reinterpret_cast(origin_otensors_[i]->raw_data()), + origin_otensors_[i]->memory_size()); + } + + neuron_errCode = NeuronExecution_compute(run); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to run execution!" << neuron_errCode; + return subgraph::FAILED; + } + + for (size_t i = 0; i < origin_otensors_.size(); i++) { + int8_t* output_data = origin_otensors_[i]->mutable_data(); + VLOG(3) << "output size:" << origin_otensors_[i]->memory_size(); + for (int j = 0; j < origin_otensors_[i]->data_size(); j++) { + output_data[j] -= (int8_t)128; + } + } + NeuronExecution_free(run); + VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; + return 0; +} + +SubgraphEngine::~SubgraphEngine() { + if (compilation_) { + NeuronCompilation_free(compilation_); + } + if (model_) { + NeuronModel_free(model_); + } +} + +void SubgraphCompute::PrepareForRun() { + auto& param = this->Param(); + engine_.reset(new SubgraphEngine(ctx_.get(), + param.sub_block_idx, + param.sub_block_desc, + param.input_data_names, + param.output_data_names, + param.scope)); + CHECK(engine_); + engine_->Build(); +} + +void SubgraphCompute::Run() { + CHECK(engine_); + engine_->Launch(); +} + +} // namespace apu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(subgraph, + kAPU, + kInt8, + kNCHW, + paddle::lite::kernels::apu::SubgraphCompute, + def) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ecd8a38343cd1f62bb5a3bf8e948384b90cfe826 --- /dev/null +++ b/lite/kernels/apu/subgraph_compute.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "NeuronAdapter.h" +#include "lite/core/kernel.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace apu { + +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext *ctx, + int block_idx, + cpp::BlockDesc *block_desc, + const std::vector &input_names, + const std::vector &output_names, + Scope *scope) + : subgraph::Engine( + ctx, block_idx, block_desc, input_names, output_names, scope) {} + + ~SubgraphEngine(); + + protected: + int BuildDeviceProgram() override; + int LaunchDeviceProgram() override; + + NeuronModel *model_; + NeuronCompilation *compilation_; +}; + +class SubgraphCompute + : public KernelLite { + public: + using param_t = operators::SubgraphParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr engine_; +}; + +} // namespace apu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 60d5e3b5e234ef19cd144100d07441eb4acf48de..1f9cd45d616bf0af753a4bfbda2e4cf8c79a78f5 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. 
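Editor's note on LaunchDeviceProgram in subgraph_compute.cc above: Paddle Lite's int8 tensors are symmetric around zero, while the operands were declared as NEURON_TENSOR_QUANT8_ASYMM with zeroPoint = 128, so the engine shifts every input byte by +128 before the run and every output byte by -128 afterwards. A scalar sketch of that shift, for illustration only:

  #include <cstdint>
  // int8 (symmetric, zero point 0) -> uint8 (asymmetric, zero point 128) and back.
  inline uint8_t ToAsymm(int8_t v) { return static_cast<uint8_t>(v + 128); }
  inline int8_t FromAsymm(uint8_t v) { return static_cast<int8_t>(v - 128); }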
-if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() @@ -40,8 +40,6 @@ add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -56,18 +54,17 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) ## 3. extra kernels add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_conv_compute_arm ARM extra SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_prod_compute_arm ARM extra SRCS reduce_prod_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -77,31 +74,35 @@ add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute. 
add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) - # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm) + +# 4. 
training kernels +add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) @@ -121,5 +122,4 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm) - lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm) endif() diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index d50049d48748cf7ec43485a12fa7c65c0171a63d..085e914c6e05c26d3031a4cfdac3c39d31f40f6d 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -169,6 +169,54 @@ void RsqrtCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void SquareCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_square( + x_data, output_data, x_dims.production(), ctx.threads()); +} + +void HardSwishCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + float threshold = param.hard_swish_threshold; + float scale = param.hard_swish_scale; + float offset = param.hard_swish_offset; + lite::arm::math::act_hard_swish(x_data, + output_data, + x_dims.production(), + threshold, + scale, + offset, + ctx.threads()); +} + +void ReciprocalCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_reciprocal( + x_data, output_data, x_dims.production(), ctx.threads()); +} + +void AbsCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_abs( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -260,3 +308,31 @@ REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL( + square, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SquareCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); 
+REGISTER_LITE_KERNEL(hard_swish, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::HardSwishCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +REGISTER_LITE_KERNEL(reciprocal, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ReciprocalCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +REGISTER_LITE_KERNEL( + abs, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AbsCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index ba1318ea36d01d1c3352679e7b5de12d013c0e84..2e9774637b7a9156197ffeff5f4bca13a20620bb 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -139,6 +139,42 @@ class RsqrtCompute : public KernelLite { virtual ~RsqrtCompute() = default; }; +class SquareCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~SquareCompute() = default; +}; + +class HardSwishCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~HardSwishCompute() = default; +}; + +class ReciprocalCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~ReciprocalCompute() = default; +}; + +class AbsCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~AbsCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/activation_grad_compute.cc b/lite/kernels/arm/activation_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..137668fa5e0d1bd07e838b3040a31e084a7475c8 --- /dev/null +++ b/lite/kernels/arm/activation_grad_compute.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
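Editor's note: the HardSwish kernel registered above follows the Paddle hard_swish definition, out = x * min(max(x + offset, 0), threshold) / scale, with defaults threshold = 6, scale = 6, offset = 3. A scalar sketch of what act_hard_swish is expected to compute (reference only, not the vectorized ARM implementation):

  #include <algorithm>
  inline float hard_swish_ref(float x, float threshold, float scale, float offset) {
    float t = std::min(std::max(x + offset, 0.f), threshold);
    return x * t / scale;
  }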
+ +#include "lite/kernels/arm/activation_grad_compute.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void SquareGradCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + lite::arm::math::act_square_grad(x_data, + out_grad_data, + x_grad_data, + out_grad_dims.production(), + ctx.threads()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(square_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::SquareGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_grad_compute.h b/lite/kernels/arm/activation_grad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ef03f58fa8cd499192aa6edfe3a7c51b49b14f65 --- /dev/null +++ b/lite/kernels/arm/activation_grad_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
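Editor's note: SquareGradCompute below relies on d(x^2)/dx = 2x, so act_square_grad is expected to produce x_grad[i] = 2 * x[i] * out_grad[i]. A scalar reference sketch:

  // Reference loop for the square_grad kernel (illustrative, single-threaded).
  void square_grad_ref(const float* x, const float* dout, float* dx, int n) {
    for (int i = 0; i < n; ++i) {
      dx[i] = 2.f * x[i] * dout[i];  // chain rule: d(x^2)/dx * upstream gradient
    }
  }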
+ +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class SquareGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~SquareGradCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute.cc b/lite/kernels/arm/argmax_compute.cc index ad279e8f8e1f80639c0b2512f89595d01ef062fd..dda38809875e46835c99b35e564473056391d2c6 100644 --- a/lite/kernels/arm/argmax_compute.cc +++ b/lite/kernels/arm/argmax_compute.cc @@ -30,6 +30,9 @@ void ArgmaxCompute::Run() { lite::Tensor* input = param.X; lite::Tensor* output = param.Out; int axis = param.Axis; + if (axis < 0) { + axis += input->dims().size(); + } lite::arm::math::argmax_func(input, axis, output); return; @@ -47,5 +50,5 @@ REGISTER_LITE_KERNEL(arg_max, paddle::lite::kernels::arm::ArgmaxCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc index 58bdf18474ae69b2bdb863b9818dab41e25bf17b..034d57cdaba77130b319d203c3ae0616720c9d31 100644 --- a/lite/kernels/arm/argmax_compute_test.cc +++ b/lite/kernels/arm/argmax_compute_test.cc @@ -33,7 +33,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { int axis = param.Axis; auto x_data = x->data(); - auto output_data = output->mutable_data(); + auto output_data = output->mutable_data(); DDim x_dims = x->dims(); DDim output_dims = output->dims(); @@ -59,7 +59,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { std::greater>()); // out - dtype* out_ptr = output_data + n * out_channel + k; + auto* out_ptr = output_data + n * out_channel + k; *out_ptr = vec[0].second; } } @@ -115,12 +115,12 @@ TEST(argmax_arm, compute) { param.Axis = axis; argmaxOp.SetParam(param); argmaxOp.Launch(); - auto* output_data = output.mutable_data(); + auto* output_data = output.mutable_data(); // obtain output_ref_data param.Out = &output_ref; argmax_compute_ref(param); - auto* output_ref_data = output_ref.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); // compare for (int i = 0; i < output.dims().production(); i++) { diff --git a/lite/kernels/arm/assign_value_compute.cc b/lite/kernels/arm/assign_value_compute.cc index 45f28ba36369cc79d70d683894c8a934b9308863..1d097e336f156966689823f4ef6d0d36bc536545 100644 --- a/lite/kernels/arm/assign_value_compute.cc +++ b/lite/kernels/arm/assign_value_compute.cc @@ -58,9 +58,9 @@ void AssignValueCompute::Run() { REGISTER_LITE_KERNEL(assign_value, kARM, - kFloat, + kAny, kNCHW, paddle::lite::kernels::arm::AssignValueCompute, def) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/assign_value_compute.h b/lite/kernels/arm/assign_value_compute.h index f0c33f865bb770adc64a1727521fad10d0516ede..32b1fb41ab733dc3827496833a633dd415f098b9 100644 --- a/lite/kernels/arm/assign_value_compute.h +++ b/lite/kernels/arm/assign_value_compute.h @@ -22,7 +22,7 @@ namespace lite { namespace kernels { namespace arm { -class AssignValueCompute : public KernelLite { +class AssignValueCompute : 
public KernelLite { public: using param_t = operators::AssignValueParam; diff --git a/lite/kernels/arm/beam_search_compute.cc b/lite/kernels/arm/beam_search_compute.cc index 5ac53b3b96d0ba676e2909d6102e9edded5e9a92..437ba070b7eaf2d6edc8ecd2dd161f57c8fac345 100644 --- a/lite/kernels/arm/beam_search_compute.cc +++ b/lite/kernels/arm/beam_search_compute.cc @@ -20,8 +20,6 @@ namespace lite { namespace kernels { namespace arm { -void BeamSearchCompute::PrepareForRun() {} - void BeamSearchCompute::Run() { auto& ctx = this->ctx_->template As(); auto& param = this->Param(); @@ -50,11 +48,17 @@ REGISTER_LITE_KERNEL(beam_search, kNCHW, paddle::lite::kernels::arm::BeamSearchCompute, def) - .BindInput("pre_ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("pre_scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("selected_ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("selected_scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("parent_idx", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("pre_ids", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("pre_scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("selected_ids", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("selected_scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("parent_idx", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); diff --git a/lite/kernels/arm/beam_search_compute.h b/lite/kernels/arm/beam_search_compute.h index ebd72732bb25e826c24f20cd28588b170f344268..854696e5b9f40b480f2c92592245e52f46bc8f14 100644 --- a/lite/kernels/arm/beam_search_compute.h +++ b/lite/kernels/arm/beam_search_compute.h @@ -25,10 +25,6 @@ namespace arm { class BeamSearchCompute : public KernelLite { public: - using param_t = operators::BeamSearchParam; - - void PrepareForRun() override; - void Run() override; ~BeamSearchCompute() {} diff --git a/lite/kernels/arm/beam_search_decode_compute.cc b/lite/kernels/arm/beam_search_decode_compute.cc index 49ca51bf697f272dacf55db655bc237aff2cc460..bbd17d98c6ab3096039a5741dd236467ab577f27 100644 --- a/lite/kernels/arm/beam_search_decode_compute.cc +++ b/lite/kernels/arm/beam_search_decode_compute.cc @@ -38,7 +38,7 @@ const size_t kSentenceLevel = 1; template struct Sentence { - std::vector word_ids; + std::vector word_ids; std::vector scores; }; @@ -73,7 +73,7 @@ struct BeamSearchDecoder { std::vector source_level_lod = {0}; std::vector sentence_level_lod = {0}; - std::vector id_data; + std::vector id_data; std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { @@ -114,14 +114,14 @@ struct BeamSearchDecoder { lod.push_back(source_level_lod); lod.push_back(sentence_level_lod); - *(id_tensor->mutable_lod()) = lod; + id_tensor->set_lod(lod); id_tensor->Resize({static_cast(id_data.size())}); - auto id_ptr = id_tensor->mutable_data(); + auto id_ptr = id_tensor->mutable_data(); TargetCopy( - TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float)); + TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(int64_t)); - *(score_tensor->mutable_lod()) = lod; + score_tensor->set_lod(lod); 
score_tensor->Resize({static_cast(score_data.size())}); auto score_ptr = score_tensor->mutable_data(); TargetCopy(TARGET(kARM), @@ -169,7 +169,7 @@ struct BeamSearchDecoder { ++candidate_idx) { prefix_idx_vector.push_back(prefix_idx); size_t idx = prefix_idx_vector.size() - 1; - auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_id = cur_ids.data()[candidate_idx]; auto cur_score = cur_scores.data()[candidate_idx]; sentence_vector.at(idx).word_ids.push_back(cur_id); sentence_vector.at(idx).scores.push_back(cur_score); @@ -184,7 +184,7 @@ struct BeamSearchDecoder { cur_ids.lod().at(kSentenceLevel)[prefix_idx]; for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { auto candidate_idx = prefix_idx_vector.at(idx); - auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_id = cur_ids.data()[candidate_idx]; auto cur_score = cur_scores.data()[candidate_idx]; if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { // to skip redundant end tokens @@ -293,8 +293,12 @@ REGISTER_LITE_KERNEL(beam_search_decode, kNCHW, paddle::lite::kernels::arm::BeamSearchDecodeCompute, def) - .BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Ids", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Scores", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("SentenceIds", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("SentenceScores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc index 525e5aefd63474cfac09900e9c411ca5e5868311..6dac97dcbc59991d4680ab1a98a54a900573f631 100644 --- a/lite/kernels/arm/calib_compute.cc +++ b/lite/kernels/arm/calib_compute.cc @@ -23,24 +23,24 @@ namespace lite { namespace kernels { namespace arm { -void CalibComputeFp32ToInt8::Run() { - auto& param = this->Param(); +template +void CalibComputeFp32ToInt8::Run() { + auto& param = this->template Param(); std::vector scale = {param.scale}; - const auto* din = param.input->data(); - auto* dout = param.output->mutable_data(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); lite::arm::math::fp32_to_int8( din, dout, scale.data(), 1, 1, param.input->numel()); - return; } -void CalibComputeInt8ToFp32::Run() { - auto& param = this->Param(); - const auto* din = param.input->data(); +template +void CalibComputeInt8ToFp32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); std::vector scale = {param.scale}; - auto* dout = param.output->mutable_data(); + auto* dout = param.output->template mutable_data(); lite::arm::math::int8_to_fp32( din, dout, scale.data(), 1, 1, param.input->numel()); - return; } } // namespace arm @@ -48,43 +48,116 @@ void CalibComputeInt8ToFp32::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) 
.Finalize(); -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) + +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h index 8d9a32bc245579b861607389bac3a4258a0e7abe..a4c8b4c1232101416e95171d70ab629f6a37177b 100644 --- a/lite/kernels/arm/calib_compute.h +++ b/lite/kernels/arm/calib_compute.h @@ -21,8 +21,9 @@ namespace lite { namespace kernels { namespace arm { +template class CalibComputeFp32ToInt8 - : public KernelLite { + : public KernelLite { public: using param_t = operators::CalibParam; @@ -33,8 +34,9 @@ class CalibComputeFp32ToInt8 private: }; +template class CalibComputeInt8ToFp32 - : public KernelLite { + : public KernelLite { public: using param_t = operators::CalibParam; diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index 0b92317ac51b0af24443ec24436f6a483198dbbc..25a2bc6edaa130c8f13f91e62d27a4e3bc97eac1 100755 --- 
a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -77,7 +77,7 @@ void CastCompute::Run() { } // namespace paddle REGISTER_LITE_KERNEL( - cast, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::CastCompute, def) + cast, kARM, kAny, kNCHW, paddle::lite::kernels::arm::CastCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/cast_compute.h b/lite/kernels/arm/cast_compute.h index d342a405ad593b8457b2899fa3ee6ae843d8f792..1f8da056a8be61de20b5d6e98e455e850b9c9f8d 100644 --- a/lite/kernels/arm/cast_compute.h +++ b/lite/kernels/arm/cast_compute.h @@ -23,7 +23,7 @@ namespace lite { namespace kernels { namespace arm { -class CastCompute : public KernelLite { +class CastCompute : public KernelLite { public: using param_t = operators::CastParam; diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc deleted file mode 100644 index 6118cbc6e403645cada84d2434497b084636a4a3..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/compare_compute.cc +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/compare_compute.h" -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -#define COMPARE_FUNCTOR(name, op) \ - template \ - struct _##name##Functor { \ - inline bool operator()(const T &a, const T &b) const { return a op b; } \ - }; - -COMPARE_FUNCTOR(Equal, ==); -COMPARE_FUNCTOR(NotEqual, !=); -COMPARE_FUNCTOR(LessThan, <); -COMPARE_FUNCTOR(LessEqual, <=); -COMPARE_FUNCTOR(GreaterThan, >); -COMPARE_FUNCTOR(GreaterEqual, >=); - -template <> -struct _EqualFunctor { - inline bool operator()(const float &a, const float &b) const { - // It is safe to cast a and b to double. - return fabs(static_cast(a - b)) < 1e-8; - } -}; - -template <> -struct _NotEqualFunctor { - inline bool operator()(const float &a, const float &b) const { - return !_EqualFunctor()(a, b); - } -}; - -inline void get_mid_dims(const lite::DDim &x_dims, - const lite::DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post) { - *pre = 1; - *n = 1; - *post = 1; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - - for (int i = 0; i < y_dims.size(); ++i) { - (*n) *= y_dims[i]; - } - - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } -} -template