diff --git a/.gitignore b/.gitignore index ed131bdbbad6bd4dad500fa29f40a29fddeb7593..9823f8c945c1be8e717b622a993d402c49517b7c 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources + +build* diff --git a/CMakeLists.txt b/CMakeLists.txt index e3f7a211d70920aa74765b976af6939d55a328ab..377e58d3ac7c37271d2a813b22912528c556164b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) +lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) @@ -76,6 +77,7 @@ lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. if(ANDROID OR IOS OR ARMLINUX) @@ -130,7 +132,8 @@ endif() if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) message(STATUS "Building the mobile framework") include(cross_compiling/postproject) - include(cross_compiling/npu) # check and prepare NPU DDK + include(device/npu) # check and prepare NPU DDK + include(device/xpu) # check and prepare XPU SDK # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # So the following third party dependencies are not needed. 
@@ -171,7 +174,7 @@ endif() ######################################################################################## if(LITE_WITH_XPU) - include(xpu) + include(device/xpu) endif() include(external/mklml) # download mklml package diff --git a/README.md b/README.md index 22b84888294b5ef60c3d91d7a7909aef8f601d81..b72e4bc9307ba9e12f1252455668bd07f80f6029 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ # Paddle Lite -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. -For tutorials, please see [PaddleLite Document](https://paddlepaddle.github.io/Paddle-Lite/). +For tutorials, please see [PaddleLite Document](https://paddle-lite.readthedocs.io/zh/latest/). 
## Key Features diff --git a/README_cn.md b/README_cn.md index 11d3967fe8ce88826ca982b71d96268c1a7e5c3a..4f5cd9254d42b4dc02035cb3ecfc8280b0e1c1ac 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,13 +1,13 @@ # Paddle Lite -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) Paddle Lite为Paddle-Mobile的升级版,定位支持包括手机移动端在内更多场景的轻量化高效预测,支持更广泛的硬件和平台,是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外,也兼容支持其他训练框架产出的模型。 -完整使用文档位于 [PaddleLite 文档](https://paddlepaddle.github.io/Paddle-Lite/) 。 +完整使用文档位于 [PaddleLite 文档](https://paddle-lite.readthedocs.io/zh/latest/) 。 ## 特性 diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 752b22461d9d1c36b3ca6a0bfe472a5dcc3ab976..d38c78f62fa2bed4f4483355de0683f1f5b7656b 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -122,6 +122,9 @@ if (LITE_WITH_ARM) endif() endif() +if (LITE_WITH_TRAIN) + add_definitions("-DLITE_WITH_TRAIN") +endif() if (WITH_ARM_DOTPROD) add_definitions("-DWITH_ARM_DOTPROD") diff --git a/cmake/cross_compiling/findar.cmake b/cmake/cross_compiling/findar.cmake index bcb0dc70fd811a5041244dedb4a4bcf5b540dc3a..0f86231e49cdca274da27b596305144251a65f4b 100644 --- a/cmake/cross_compiling/findar.cmake +++ b/cmake/cross_compiling/findar.cmake @@ -23,7 +23,7 @@ endif() get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH) -find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH}) +find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH} NO_DEFAULT_PATH) if(NOT AR_TOOL) message(ERROR "Failed to find AR_TOOL in ${AR_PATH}") diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 7466b3e6d438277ad31020f76665bf689df436f5..3db715ba74945d9e501637af5ef3086e4f11b294 100644 --- 
a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -57,10 +57,14 @@ function(check_linker_flag) endforeach() set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE) endfunction() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if (LITE_ON_TINY_PUBLISH) - if(NOT LITE_WITH_PYTHON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + if((NOT LITE_WITH_PYTHON)) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + endif() + if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") diff --git a/cmake/cross_compiling/npu.cmake b/cmake/device/npu.cmake similarity index 83% rename from cmake/cross_compiling/npu.cmake rename to cmake/device/npu.cmake index c22bb1db4fbf8a7370ff3e7c9aca40cc94d550a2..88598f4690a157b20ac1873d84ad13c2f8652725 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/device/npu.cmake @@ -17,15 +17,16 @@ if(NOT LITE_WITH_NPU) endif() if(NOT DEFINED NPU_DDK_ROOT) - set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) - if(NOT NPU_DDK_ROOT) - message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON") - endif() + set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) + if(NOT NPU_DDK_ROOT) + message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON") + endif() endif() message(STATUS "NPU_DDK_ROOT: ${NPU_DDK_ROOT}") find_path(NPU_DDK_INC NAMES HiAiModelManagerService.h - PATHS ${NPU_DDK_ROOT}/include NO_DEFAULT_PATH) + PATHS ${NPU_DDK_ROOT}/include + NO_DEFAULT_PATH) if(NOT NPU_DDK_INC) message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") endif() @@ -34,21 +35,24 @@ 
include_directories("${NPU_DDK_ROOT}/include") set(NPU_SUB_LIB_PATH "lib64") if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(NPU_SUB_LIB_PATH "lib64") + set(NPU_SUB_LIB_PATH "lib64") endif() if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(NPU_SUB_LIB_PATH "lib") + set(NPU_SUB_LIB_PATH "lib") endif() find_library(NPU_DDK_HIAI_FILE NAMES hiai - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) find_library(NPU_DDK_IR_FILE NAMES hiai_ir - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") @@ -76,6 +80,3 @@ endif() set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") - - - diff --git a/cmake/xpu.cmake b/cmake/device/xpu.cmake similarity index 74% rename from cmake/xpu.cmake rename to cmake/device/xpu.cmake index 2112f6b658f5f89b20d63c957cd0b979299c350b..099833ee4cf80968671036cffe89329506bbf091 100644 --- a/cmake/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -17,15 +17,16 @@ if(NOT LITE_WITH_XPU) endif() if(NOT DEFINED XPU_SDK_ROOT) - set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) - if(NOT XPU_SDK_ROOT) - message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") - endif() + set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) + if(NOT XPU_SDK_ROOT) + message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") + endif() endif() message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") find_path(XPU_SDK_INC NAMES xtcl.h - PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl + NO_DEFAULT_PATH) if(NOT XPU_SDK_INC) message(FATAL_ERROR "Can not 
find xtcl.h in ${XPU_SDK_ROOT}/include") endif() @@ -34,7 +35,8 @@ include_directories("${XPU_SDK_ROOT}/XTCL/include") include_directories("${XPU_SDK_ROOT}/XTDK/include") find_library(XPU_SDK_XTCL_FILE NAMES xtcl - PATHS ${XPU_SDK_ROOT}/XTCL/so) + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) if(NOT XPU_SDK_XTCL_FILE) message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") @@ -45,7 +47,8 @@ else() endif() find_library(XPU_SDK_TVM_FILE NAMES tvm - PATHS ${XPU_SDK_ROOT}/XTCL/so) + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) if(NOT XPU_SDK_TVM_FILE) message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") @@ -56,7 +59,8 @@ else() endif() find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_API_FILE) message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}") @@ -67,7 +71,8 @@ else() endif() find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_RT_FILE) message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}") @@ -78,18 +83,12 @@ else() endif() find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) - -if(NOT XPU_SDK_XPU_JITC_FILE) - message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}") - add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE}) -endif() + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) if(NOT XPU_SDK_LLVM_FILE) message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") @@ -99,7 +98,7 @@ else() set_property(TARGET xpu_sdk_llvm PROPERTY 
IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0") -set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") -set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") +set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/cmake/lite.cmake b/cmake/lite.cmake index fd40fa437b52ff33089b55c6cfb7df6604a0530d..265de3fbf68542f1b1525257887cbfaa4d1c4d62 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -275,6 +275,11 @@ set(host_kernels CACHE INTERNAL "host kernels") set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt") file(WRITE ${kernels_src_list} "") # clean + +# file to record faked kernels for opt python lib +set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt") +file(WRITE ${fake_kernels_src_list} "") # clean + if(LITE_BUILD_TAILOR) set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list") file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) @@ -303,62 +308,74 @@ function(add_kernel TARGET device level) return() endif() - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - # the source list will collect for model_optimize_tool to fake kernel generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() - endif() - - # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated, - # no need to continue the compilation of the true kernel source. 
- if (LITE_ON_MODEL_OPTIMIZE_TOOL) - return() - endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - if ("${device}" STREQUAL "Host") set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") if (NOT LITE_WITH_ARM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") if (NOT LITE_WITH_X86) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + elseif (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "NPU") if (NOT LITE_WITH_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "XPU") if (NOT LITE_WITH_XPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "FPGA") if (NOT LITE_WITH_FPGA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "BM") if (NOT LITE_WITH_BM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "OPENCL") if (NOT 
LITE_WITH_OPENCL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") @@ -366,6 +383,9 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "CUDA") if (NOT LITE_WITH_CUDA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "") diff --git a/cmake/mlu.cmake b/cmake/mlu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b73ab16462b83e952807289d511fdb95ad74c6cd --- /dev/null +++ b/cmake/mlu.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_MLU) + return() +endif() + +if(NOT DEFINED NEUWARE_HOME) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + if(NOT NEUWARE_HOME) + message(FATAL_ERROR "Must set NEUWARE_HOME or env NEUWARE_HOME when LITE_WITH_MLU=ON") + endif() +endif() + +message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}") +find_path(CNML_INC NAMES cnml.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNML_INC) + message(FATAL_ERROR "Can not find cnml.h in ${NEUWARE_HOME}/include") +endif() + +find_path(CNRT_INC NAMES cnrt.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNRT_INC) + message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include") +endif() + +include_directories("${NEUWARE_HOME}/include") + +find_library(CNML_LIB_FILE NAMES cnml + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNML_LIB_FILE) + message(FATAL_ERROR "Can not find CNML Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNML Library: ${CNML_LIB_FILE}") + add_library(cnml_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE}) +endif() + +find_library(CNRT_LIB_FILE NAMES cnrt + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNRT_LIB_FILE) + message(FATAL_ERROR "Can not find CNRT Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNRT Library: ${CNRT_LIB_FILE}") + add_library(cnrt_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE}) +endif() diff --git a/docs/advanced_user_guides/index.rst b/docs/advanced_user_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/api_reference/cv.md b/docs/api_reference/cv.md new file mode 100644 index 0000000000000000000000000000000000000000..5110e40c423c39e33feb084fa0d09c89ddd13d16 --- /dev/null +++ b/docs/api_reference/cv.md @@ -0,0 +1,263 @@ +# CV图像预处理API + +请把编译脚本`Paddle-Lite/lite/too/build.sh`中`BUILD_CV`变量设置为`ON`, 
其他编译参数设置请参考[源码编译](../user_guides/source_compile), 以确保 Lite 可以正确编译。这样`CV`图像的加速库就会编译进去,且会生成`paddle_image_preprocess.h`的API文件 + +- 硬件平台: `ARM` +- 操作系统:`MAC` 和 `LINUX` + +## CV 图像预处理功能 + +Lite 支持不同颜色空间的图像相互转换 `Convert` 、缩放 `Resize` 、翻转 `Flip`、旋转 `Rotate` 和图像数据转换为 `Tensor` 存储`ImageToTensor` 功能,下文将详细介绍每个功能的API接口。 + +### CV 枚举变量和结构体变量 + +- 颜色空间 +```cpp +enum ImageFormat { + RGBA = 0, + BGRA, + RGB, + BGR, + GRAY, + NV21 = 11, + NV12, +}; +``` +- 翻转参数 +```cpp +enum FlipParam { + X = 0, // flip along the X axis + Y, // flip along the Y axis + XY // flip along the XY axis +}; +``` +- 转换参数 +```cpp +typedef struct { + int ih; // input height + int iw; // input width + int oh; // outpu theight + int ow; // output width + FlipParam flip_param; // flip, support x, y, xy + float rotate_param; // rotate, support 90, 180, 270 +} TransParam; +``` + +### ImagePreprocess 类的成员变量 + +`ImagePreprocess` 类含有以下三个私有成员变量,通过构造函数进行初始化。 +```cpp +private: + ImageFormat srcFormat_; // input image color format + ImageFormat dstFormat_; // output image color format + TransParam transParam_; // image transform parameter + +// init +ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, TransParam param) { + this->srcFormat_ = srcFormat; + this->dstFormat_ = dstFormat; + this->transParam_ = param; +} +``` + +### 颜色空间转换 Convert + +`Convert` 函数支持颜色空间:GRAY、NV12(NV21)、RGB(BGR)和RGBA(BGRA) + ++ 目前支持以下颜色空间的相互转换: + - GRAY2BGR + - GRAY2RGB + - BGR2RGB + - BGRA2BGR + - BGRA2RGB + - RGBA2RGB + - RGBA2BGR + - BGRA2RGBA + ++ 目前支持以下颜色空间的单向转换: + - NV12—BGR + - NV21—BGR + - NV12—RGB + - NV21—RGB + - NV12—BGRA + - NV21—BGRA + - NV12—RGBA + - NV21—RGBA + ++ `Convert` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageCovert(const uint8_t* src, + uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat); + ``` + + + 第一个 `imageCovert` 接口,缺省参数来源于 `ImagePreprocess` 类的成员变量。故在初始化 `ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: 
+ - param srcFormat:`ImagePreprocess` 类的成员变量`srcFormat_` + - param dstFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + + - 第二个`imageCovert` 接口,可以直接使用 + +### 缩放 Resize + +`Resize` 功能支持颜色空间:GRAY、NV12(NV21)、RGB(BGR)和RGBA(BGRA) +`Resize` 功能目前支持的方法:`bilinear` + ++ `Resize` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, ImageFormat srcFormat, int srcw, int srch, int dstw, int dsth); + ``` + + + 第一个`imageResize` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.iw` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.ih` + - param dstw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param dsth:`ImagePreprocess` 类的成员变量`transParam_.ow` + + - 第二个`imageResize` 接口,可以直接使用 + +### 旋转 Rotate + +`Rotate` 功能支持颜色空间:GRAY、RGB(BGR)和RGBA(BGRA) +`Rotate` 功能目前支持的角度:90、180 和 270 + ++ `Rotate` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, ImageFormat srcFormat, int srcw, int srch, float degree); + ``` + + + 第一个`imageRotate` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + - param degree:`ImagePreprocess` 类的成员变量`transParam_.rotate_param` + + - 第二个`imageRotate` 接口,可以直接使用 + +### 翻转 Flip + +`Flip` 功能支持颜色空间:GRAY、RGB(BGR)和RGBA(BGRA) +`Flip` 功能目前支持的功能:沿X轴翻转、沿Y轴翻转和沿XY轴翻转 + ++ `Flip` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, 
ImageFormat srcFormat, int srcw, int srch, FlipParam flip_param); + ``` + + + 第一个`imageFlip` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + - param flip_param:`ImagePreprocess` 类的成员变量`transParam_.flip_param` + + - 第二个`imageFlip` 接口,可以直接使用 + +### Image2Tensor + +`Image2Tensor` 功能支持颜色空间:RGB(BGR)和RGBA(BGRA) +`Image2Tensor` 功能目前支持的Layout:`NCHW`和 `NHWC` +`Image2Tensor` 不仅完成图像转换为`Tensor`数据处理,而且还完成了图像数据的归一化处理 + ++ `Image2Tensor` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, LayoutType layout, float* means, float* scales); + // 方法二 + void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, ImageFormat srcFormat, srcw, int srch, LayoutType layout, float* means, float* scales; + ``` + + + 第一个`image2Tensor` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + + - 第二个`image2Tensor` 接口,可以直接使用 + + + +## CV 图像预处理 Demo 示例 + +例子:输入 `1920x1080` 大小的 `NV12` 图像src,输出 `960x540` 大小 `RGB` 格式的图像dst;然后,完成 `90` 度旋转和沿 `X` 轴翻转功能;最后,用 `NHWC` 格式存储在Tensor里。 + +定义 `ImagePreprocess` 类的对象,初始化成员变量 + +```cpp +// init +srcFormat = ImageFormat::NV12; +dstFormat = ImageFormat::RGB; +srch = 1920; +srcw = 1080; +dsth = 960; +dstw = 540; +flip_param = FlipParam::X; +degree = 90; +layout = LayoutType::NHWC +// 方法一: +TransParam tparam; +tparam.ih = srch; +tparam.iw = srcw; +tparam.oh = dsth; +tparam.ow = dstw; +tparam.flip_param = flip_param; +tparam.rotate_param = degree; +ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); +// 方法二: +ImagePreprocess image_preprocess(); +``` + +### imageConvert Demo + +```cpp +// 方法一: 
+image_preprocess.imageCovert(src, lite_dst); +// 方法二: +image_preprocess.imageCovert(src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); +``` + +### imageResize Demo + +```cpp +// 方法一: +image_preprocess.imageResize(lite_dst, resize_tmp); +// 方法二: +image_preprocess.imageResize(lite_dst,resize_tmp, (ImageFormat)dstFormat, srcw, +srch, dstw, dsth); +``` + +### imageRotate Demo + +```cpp +// 方法一: +image_preprocess.imageRotate(resize_tmp, tv_out_ratote); +// 方法二: +image_preprocess.imageRotate(resize_tmp,tv_out_ratote, (ImageFormat)dstFormat, dstw, dsth, degree); +``` + +### imageFlip Demo + +```cpp +// 方法一: +image_preprocess.imageFlip(tv_out_ratote, tv_out_flip); +// 方法二: +image_preprocess.imageFlip(tv_out_ratote, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip_param); +``` + +### image2Tensor Demo + +```cpp +// 方法一: +image_preprocess.image2Tensor(tv_out_flip, &dst_tensor, layout, means, scales); +// 方法二: +image_preprocess.image2Tensor(tv_out_flip, &dst_tensor,(ImageFormat)dstFormat, dstw, dsth, layout, means, scales); +``` diff --git a/docs/api_reference/cxx_api_doc.md b/docs/api_reference/cxx_api_doc.md index 38385a4267d5727d9c5c7d985d3457dd011e203c..0b0f1f3d9b321959ef1f6210010da69fc0ffc7b8 100644 --- a/docs/api_reference/cxx_api_doc.md +++ b/docs/api_reference/cxx_api_doc.md @@ -1,5 +1,5 @@ -# C++ API文档 +# C++ API ## CreatePaddlePredictor @@ -260,14 +260,14 @@ class MobileConfig; `MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、模型的内存地址(从内存加载模型时使用)、能耗模式、工作线程数等等。 -*注意:输入的模型需要使用[Model Optimize Tool](../model_optimize_tool)转化为NaiveBuffer格式的优化模型。* +*注意:输入的模型需要使用[Model Optimize Tool](../user_guides/model_optimize_tool)转化为NaiveBuffer格式的优化模型。* 示例: ```c++ MobileConfig config; // 设置NaiveBuffer格式模型目录,从文件加载模型时使用 -config.set_model_dir(FLAGS_model_dir); +config.set_model_from_file(); // 设置工作线程数 config.set_threads(4); // 设置能耗模式 @@ -277,13 +277,13 @@ config.set_power_mode(LITE_POWER_HIGH); std::shared_ptr predictor = CreatePaddlePredictor(config); 
``` -### `set_model_from_file(model_dir)` +### `set_model_from_file(model_file)` 设置模型文件,当需要从磁盘加载模型时使用。 参数: -- `model_dir(std::string)` - 模型文件路径 +- `model_file(std::string)` - 模型文件路径 返回:`None` @@ -589,7 +589,7 @@ for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { 根据名称获取输出Tensor的指针。 -**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。 +**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../user_guides/model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。 参数: diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/api_reference/java_api_doc.md b/docs/api_reference/java_api_doc.md new file mode 100644 index 0000000000000000000000000000000000000000..3ef8edb6e68daef0a86c04d7bb216106d36b26d5 --- /dev/null +++ b/docs/api_reference/java_api_doc.md @@ -0,0 +1,394 @@ +# Java API + +## MobileConfig + +```java +public class MobileConfig extends ConfigBase; +``` + +`MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。 + +*注意:输入的模型需要使用Model Optimize Tool转化为NaiveBuffer格式的优化模型。* + +示例: + +```java +MobileConfig config = new MobileConfig(); +// 设置NaiveBuffer格式模型目录 +config.setModelFromFile(modelfile); +// 设置能耗模式 +config.setPowerMode(PowerMode.LITE_POWER_HIGH); +// 设置工作线程数 +config.setThreads(1); + +// 根据MobileConfig创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); +``` + +### ``setModelFromFile(model_file)`` + +设置模型文件夹路径。 + +参数: + +- `model_file(String)` - 模型文件路径 + +返回:`None` + +返回类型:`void` + + + +### ``setModelDir(model_dir)`` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。 + +设置模型文件夹路径。 + +参数: + +- `model_dir(String)` - 模型文件夹路径 + 
+返回:`None` + +返回类型:`void` + + + +### ``setModelFromBuffer(model_buffer)`` + +设置模型的内存数据,当需要从内存加载模型时使用。 + +参数: + +- `model_buffer(str)` - 内存中的模型数据 + +返回:`None` + +返回类型:`void` + + + +### `getModelDir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`String` + + + +### `setPowerMode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式。 + +返回:`None` + +返回类型:`void` + + + +### `getPowerMode()` + +获取设置的CPU能耗模式。 + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `setThreads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。* + +参数: + +- `threads(int)` - 工作线程数。默认为1。 + +返回:`None` + +返回类型:`void` + + + +### `getThreads()` + +获取设置的工作线程数。 + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## PaddlePredictor + +```java +public class PaddlePredictor; +``` + +`PaddlePredictor`是Paddle-Lite的预测器。用户可以根据PaddlePredictor提供的接口使用MobileConfig创建新的预测器、设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```java +// 设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelDir(modelPath); + +// 创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +Tensor input = predictor.getInput(0); +input.resize(dims); +input.setData(inputBuffer); + +// 执行预测 +predictor.run(); + +// 获取输出数据 +Tensor output = predictor.getOutput(0); +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` + + + +### `CreatePaddlePredictor(config)` + +```java +public static PaddlePredictor createPaddlePredictor(ConfigBase config); +``` + +`CreatePaddlePredictor`用来根据`ConfigBase`动态创建预测器,目前Java API支持使用MobileConfig`。框架会根据您在config中指定的模型路径、能耗模型、工作线程数等自动创建一个预测器。 + +参数: + +- `config(ConfigBase,目前应使用MobileConfig)` - 创建预测器的配置信息 + +返回:根据config创建完成的预测器 + +返回类型:`PaddlePredictor` + + 
+ +### `getInput(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `getOutput(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出Tensor + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:预测执行状态,成功返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `getVersion()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`String` + +## PowerMode + +```java +public enum PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + +示例: + +```java +MobileConfig config = new MobileConfig(); +// 设置NaiveBuffer格式模型目录 +config.setModelDir(modelPath); +// 设置能耗模式 +config.setPowerMode(PowerMode.LITE_POWER_HIGH); + +// 根据MobileConfig创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); +``` + +PowerMode详细说明如下: + +| 选项 | 说明 | +| :------------------: | ------------------------------------------------------------ | +| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 | +| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 | +| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 | +| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 | +| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | +| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | + + + +## Tensor + +```c++ +public class Tensor; +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置维度、数据等。 + +*注意:用户应使用`PaddlePredictor`的`getInput`和`getOuput`接口获取输入/输出的`Tensor`。* + +示例: + +```java +// 导入Java API +import com.baidu.paddle.lite.MobileConfig; +import com.baidu.paddle.lite.Tensor; +import 
com.baidu.paddle.lite.Predictor; +import com.baidu.paddle.lite.PowerMode; + +// 设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelDir(modelPath); + +// 创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +// 获取输入Tensor +Tensor input = predictor.getInput(0); +// 设置输入维度 +input.resize(dims); +// 设置输入数据 +input.setData(inputBuffer); + +// 执行预测 +predictor.run(); + +// 获取输出Tensor +Tensor result = predictor.getOutput(0); +// 获取输出数据 +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` + +### `resize(dims)` + +设置Tensor的维度信息。 + +参数: + +- `dims(long[])` - 维度信息 + +返回:设置成功返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `shape()` + +获取Tensor的维度信息。 + +参数: + +- `None` + +返回:Tensor的维度信息 + +返回类型:`long[]` + + + +### `setData(data)` + +设置Tensor数据。 + +参数: + +- `data(float[])` - 需要设置的数据 + +返回:成功则返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `getFloatData()` + +获取Tensor的底层float型数据。 + +参数: + +- `None` + +返回:`Tensor`底层数据 + +返回类型:`float[]` diff --git a/docs/api_reference/python_api_doc.md b/docs/api_reference/python_api_doc.md new file mode 100755 index 0000000000000000000000000000000000000000..b4c9e1715ccae9d194aa29fea30f41b3496ec0ae --- /dev/null +++ b/docs/api_reference/python_api_doc.md @@ -0,0 +1,800 @@ +# Python API + +## create_paddle_predictor + +```python +CxxPredictor create_paddle_predictor(config); # config为CxxConfig类型 +LightPredictor create_paddle_predictor(config); # config为MobileConfig类型 +``` + +`create_paddle_predictor`函数用来根据`CxxConfig`或`MobileConfig`构建预测器。 + +示例: + +```python +from lite_core import * + +# 设置CxxConfig +config = CxxConfig() +config.set_model_dir() +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = 
create_paddle_predictor(config) +``` + +参数: + +- `config(CxxConfig或MobileConfig)` - 用于构建Predictor的配置信息。 + +返回:预测器`predictor` + +返回类型:`CxxPredictor`或`LightPredictor` + +## CxxConfig + +```python +class CxxConfig; +``` + +`CxxConfig`用来配置构建CxxPredictor的配置信息,如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。 + +示例: + +```python +from lite_core import * + +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置工作线程数 +config.set_threads(4); +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) +# 设置valid places +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = create_paddle_predictor(config) +``` + +### `set_model_dir(model_dir)` + +设置模型文件夹路径,当需要从磁盘加载非combined模型时使用。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + +返回类型:`None` + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`str` + + + +### `set_model_file(model_file)` + +设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `model_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `model_file()` + +获取设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型文件路径 + +返回类型:`str` + + + +### `set_param_file(param_file)` + +设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `param_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `param_file()` + +获取设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型参数文件路径 + +返回类型:`str` + + + +### `set_valid_places(valid_places)` + +设置可用的places列表。 + +参数: + +- `valid_places(list)` - 可用place列表。 + +返回类型:`None` + +示例: + +```python +from lite_core import * + +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置valid places +# 注意,valid_places列表中Place的排序表明了用户对Place的偏好程度,如用户想优先使用ARM上Int8精度的 +# kernel,则应把Place(TargetType.ARM, PrecisionType.INT8)置于valid_places列表的首位。 +places = [Place(TargetType.ARM, PrecisionType.INT8), + Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = 
create_paddle_predictor(config) +``` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`PowerMode.LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`None` + + + +### `power_mode()` + +获取设置的CPU能耗模式。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`None` + + + +### `threads()` + +获取设置的工作线程数。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## MobileConfig + +```python +class MobileConfig; +``` + +`MobileConfig`用来配置构建LightPredictor的配置信息,如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。 + +示例: + +```python +from lite_core import * + +config = MobileConfig() +# 设置NaiveBuffer格式模型目录 +config.set_model_from_file() +# 设置工作线程数 +config.set_threads(4); +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) + +# 根据MobileConfig创建LightPredictor +predictor = create_paddle_predictor(config) +``` + +### `set_model_from_file(model_file)` + +**注意**:`model_file`应该是经过`opt`优化后产生的`NaiveBuffer`格式的模型。 + +设置模型文件夹路径。 + +参数: + +- `model_file(str)` - 模型文件路径 + +返回:`None` + +返回类型:`None` + + + +### `set_model_dir(model_dir)` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。`model_dir`应该是经过`Model Optimize Tool`优化后产生的`NaiveBuffer`格式的模型。 + +设置模型文件夹路径。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + +返回类型:`None` + + + +### `set_model_from_buffer(model_buffer)` + +设置模型的内存数据,当需要从内存加载模型时使用。 + +参数: + +- `model_buffer(str)` - 内存中的模型数据 + +返回:`None` + +返回类型:`void` + + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`str` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`PowerMode.LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + 
+参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`None` + + + +### `power_mode()` + +获取设置的CPU能耗模式。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`None` + + + +### `threads()` + +获取设置的工作线程数。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## CxxPredictor + +```c++ +class CxxPredictor +``` + +`CxxPredictor`是Paddle-Lite的预测器,由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```python +from __future__ import print_function +from lite_core import * + +# 1. 设置CxxConfig +config = CxxConfig() +if args.model_file != '' and args.param_file != '': + config.set_model_file(args.model_file) + config.set_param_file(args.param_file) +else: + config.set_model_dir(args.model_dir) +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 2. 创建CxxPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `get_input(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `get_output(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出`Tensor` + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:`None` + +返回类型:`None` + + + +### `get_version()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`str` + +## LightPredictor + +```c++ +class LightPredictor +``` + +`LightPredictor`是Paddle-Lite的预测器,由`create_paddle_predictor`根据`MobileConfig`进行创建。用户可以根据LightPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```python +from __future__ import print_function +from lite_core import * + +# 1. 设置MobileConfig +config = MobileConfig() +config.set_model_dir(args.model_dir) + +# 2. 创建LightPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `get_input(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `get_output(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出`Tensor` + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:`None` + +返回类型:`None` + + + +### `get_version()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`str` + +## TargetType + +```python +class TargetType; +``` +`TargetType`为目标设备硬件类型,用户可以根据应用场景选择硬件平台类型。 + +枚举型变量`TargetType`的所有可能取值包括: + +`{X86, CUDA, ARM, OpenCL, FPGA, NPU}` + + +## PrecisionType +```python +class PrecisionType {FP32}; +``` +`PrecisionType`为模型中Tensor的数据精度,默认值为FP32(float32)。 + +枚举型变量`PrecisionType`的所有可能取值包括: + +`{FP32, INT8, INT32, INT64}` + + + + +## DataLayoutType + +```python +class DataLayoutType {NCHW}; +``` +`DataLayoutType`为Tensor的数据格式,默认值为NCHW(number, channel, height, weigth)。 + +枚举型变量`DataLayoutType`的所有可能取值包括: + +` {NCHW, NHWC}` + + + +## Place +```python +class Place{ + TargetType target; + PrecisionType precision{FP32}; + DataLayoutType layout{NCHW} +} +``` +`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合,说明运行时的设备类型、数据精度和数据格式。 + +示例: +```python +from lite_core import * + +Place{TargetType(ARM), PrecisionType(FP32), DataLayoutType(NCHW)} +``` + + + +## PowerMode + +```python +class PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + +示例: + +```python +from lite_core import * + +config = MobileConfig() +# 设置NaiveBuffer格式模型目录 +config.set_model_dir() +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) + +# 根据MobileConfig创建LightPredictor +predictor = create_paddle_predictor(config) +``` + +PowerMode详细说明如下: + +| 选项 | 说明 | +| :------------------: | 
------------------------------------------------------------ | +| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 | +| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 | +| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 | +| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 | +| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | +| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | + + + +## Tensor + +```c++ +class Tensor +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置Shape、数据、LoD信息等。 + +*注意:用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。* + +示例: + +```python +from __future__ import print_function +from lite_core import * + +# 1. 设置CxxConfig +config = CxxConfig() +if args.model_file != '' and args.param_file != '': + config.set_model_file(args.model_file) + config.set_param_file(args.param_file) +else: + config.set_model_dir(args.model_dir) +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 2. 创建CxxPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `resize(shape)` + +设置Tensor的维度信息。 + +参数: + +- `shape(list)` - 维度信息 + +返回:`None` + +返回类型:`None` + + + +### `shape()` + +获取Tensor的维度信息。 + +参数: + +- `None` + +返回:Tensor的维度信息 + +返回类型:`list` + + + +### `float_data()` + +获取Tensor的持有的float型数据。 + +示例: + +```python +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +参数: + +- `None` + +返回:`Tensor`持有的float型数据 + +返回类型:`list` + + + +### `set_float_data(float_data)` + +设置Tensor持有float数据。 + +示例: + +```python +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) +``` + +参数: + +- `float_data(list)` - 待设置的float型数据 + +返回:`None` + +返回类型:`None` + + + +### `set_lod(lod)` + +设置Tensor的LoD信息。 + +参数: + +- `lod(list[list])` - Tensor的LoD信息 + +返回:`None` + +返回类型:`None` + + + +### `lod()` + +获取Tensor的LoD信息 + +参数: + +- `None` + +返回:`Tensor`的LoD信息 + +返回类型:`list[list]` diff --git a/docs/benchmark/benchmark.md b/docs/benchmark/benchmark.md index efb0805fddc0bd62a2b21a130018edaa9213e0cf..2868d0e7e573d83a0fa804732c80744e566e78d3 100644 --- a/docs/benchmark/benchmark.md +++ b/docs/benchmark/benchmark.md @@ -1,4 +1,4 @@ -# Benchmark 数据 +# 性能数据 可以参考[benchmark_tools](benchmark_tools),推荐**一键benchmark**。 @@ -15,14 +15,12 @@ * int8模型 * mobilenet_v1 * mobilenet_v2 - * resnet50 * 测试机器(android ndk ndk-r17c) * 骁龙855 * xiaomi mi9, snapdragon 855 * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz - * 骁龙845 * xiaomi mi8, 845 * 2.8GHz(大四核),1.7GHz(小四核) @@ -30,20 +28,12 @@ * 骁龙835 * xiaomi mix2, snapdragon 835 * 2.45GHz(大四核),1.9GHz(小四核) - - * 骁龙625 - * oppo R9s, snapdragon625 - * A53 x 8, big core@2.0GHz - - * 骁龙653 - * 360 N5, snapdragon 653 - * 4 x A73@2.0GHz + 4 x A53@1.4GHz - + * 麒麟970 * HUAWEI Mate10 * 测试说明 - * branch: release/2.0.0 + * branch: release/v2.3.0 * warmup=10, repeats=30,统计平均时间,单位是ms * 
当线程数为1时,```DeviceInfo::Global().SetRunMode```设置LITE_POWER_HIGH,否者设置LITE_POWER_NO_BIND * 模型的输入图像的维度是{1, 3, 224, 224},输入图像的每一位数值是1 @@ -55,78 +45,59 @@ #### paddlepaddle model - 骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15 -mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84 -shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66 -squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71 -mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55 +mobilenet_v1 |33.27 |19.52 |11.14 |31.72 |18.76 |10.24 | +mobilenet_v2 |29.08 |15.79 |9.25 |25.89 |14.17 |8.38 | +shufflenet_v2 |4.40 |3.09 |2.30 |4.28 |3.02 |2.35 | +squeezenet_v1.1 |19.96 |12.61 |8.76 |18.25 |11.46 |7.97 | +mnasnet |21.00 |12.54 |7.28 |19.65 |11.65 |6.96 | - -骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64 -mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63 -shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97 -squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06 -mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09 - +mobilenet_v1 |66.36 |35.97 |19.45 |62.66 |33.87 |17.85 | +mobilenet_v2 |45.86 |25.53 |14.6 |41.58 |23.24 |13.39 | +shufflenet_v2 |7.58 |4.89 |3.41 |7.44 |4.91 |3.58 | +squeezenet_v1.1 |37.15 |22.74 |13.51 |34.69 |21.27 |12.74 | +mnasnet |40.09 |21.73 |11.91 |38.19 |21.02 |12.11 | -麒麟980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 -----| ---- | ---- | ---- | ---- |---- |---- -threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41 -mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38 -shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82 -squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66 -mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68 -麒麟970|armv7 | armv7 | 
armv7 |armv8 | armv8 |armv8 +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43 -mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53 -shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91 -squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62 -mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56 +mobilenet_v1 |96.98 |53.92 |32.24 |89.31 |48.02 |27.58 | +mobilenet_v2 |67.72 |37.66 |23.82 |60.10 |34.36 |21.05 | +shufflenet_v2 |10.72 |6.62 |4.63 |10.10 |6.44 |4.63 | +squeezenet_v1.1 |53.89 |33.28 |20.73 |50.83 |32.31 |19.51 | +mnasnet |59.55 |33.53 |20.32 |56.21 |31.58 |19.06 | #### caffe model 骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 | -mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 | -shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 | - - -骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 -----| ---- | ---- | ---- | ---- |---- |---- -threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 | -mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 | -shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 | +mobilenet_v1 |33.36 |19.45 |11.26 |31.63 |18.74 |10.31 | +mobilenet_v2 |31.63 |19.21 |11.61 |28.34 |17.14 |10.16 | +shufflenet_v2 |4.46 |3.08 |2.32 |4.26 |2.98 |2.35 | -麒麟980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 | -mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 | -shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 | +mobilenet_v1 |66.32 |35.83 |19.56 |62.52 |33.79 |17.91 | +mobilenet_v2 |58.46 |32.69 |18.56 |53.72 |29.86 |16.80 | +shufflenet_v2 |7.65 |4.82 |3.46 |7.55 
|4.97 |3.62 | -麒麟970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 | -mobilenet_v2 |87.40 |50.25 |31.85 |85.55 |48.11 |28.24 | -shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 | +mobilenet_v1 |95.38 |54.09 |32.03 |95.05 |48.33 |27.54 | +mobilenet_v2 |88.46 |48.98 |30.23 |79.28 |44.64 |27.10 | +shufflenet_v2 |10.07 |6.51 |4.61 |10.31 |6.50 |4.66 | #### int8量化模型测试数据 @@ -136,6 +107,7 @@ threads num|1 |2 |4 |1 |2 |4 | mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 | mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 | + 骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | diff --git a/docs/benchmark/benchmark_tools.md b/docs/benchmark/benchmark_tools.md index 60341762b70772bc46196b836050714b9d43228b..3cf1486307ad79a47dfbfe199e3d6d708c99db4b 100644 --- a/docs/benchmark/benchmark_tools.md +++ b/docs/benchmark/benchmark_tools.md @@ -1,4 +1,4 @@ -# Benchmark 测试方法 +# 测试方法 本文将会介绍,在**Ubuntu:16.04交叉编译环境**下,用安卓手机在终端测试Paddle-Lite的性能,并介绍两种Benchmark方法: @@ -57,7 +57,7 @@ wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/bench #### 方式二:由源码编译benchmark_bin文件 -根据[源码编译](../source_compile)准备编译环境,拉取PaddleLite最新release发布版代码,并在仓库根目录下,执行: +根据[源码编译](../user_guides/source_compile)准备编译环境,拉取PaddleLite最新release发布版代码,并在仓库根目录下,执行: ```shell ########################################### @@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true > 不同手机,不同版本,测试模型的性能数据不同。 ```shell -run benchmark armv7 +run benchmark armv8 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 159.8427 ms --- mobilenet_v1 avg = 235.0072 ms --- mobilenet_v2 avg = 173.0387 ms --- shufflenet_v2 avg = 76.0040 ms --- squeezenet_v11 avg = 164.2957 ms 
+mnasnet min = 19.83500 max = 19.38500 average = 19.65503 +mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983 +mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623 +shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890 +squeezenet min = 17.67400 max = 17.47900 average = 17.57677 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 83.1287 ms --- mobilenet_v1 avg = 121.6029 ms --- mobilenet_v2 avg = 86.6175 ms --- shufflenet_v2 avg = 41.5761 ms --- squeezenet_v11 avg = 87.8678 ms +mnasnet min = 11.85600 max = 11.72000 average = 11.77127 +mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593 +mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450 +shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400 +squeezenet min = 12.07100 max = 11.33400 average = 11.41253 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 73.3880 ms --- mobilenet_v1 avg = 119.0739 ms --- mobilenet_v2 avg = 85.3050 ms --- shufflenet_v2 avg = 38.0762 ms --- squeezenet_v11 avg = 64.2201 ms +mnasnet min = 7.19300 max = 7.02600 average = 7.08480 +mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267 +mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707 +shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477 +squeezenet min = 8.60000 max = 7.85200 average = 7.98407 -------------------------------------- -run benchmark armv8 +run benchmark armv7 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 165.3073 ms --- mobilenet_v1 avg = 306.0188 ms --- mobilenet_v2 avg = 195.1884 ms --- shufflenet_v2 avg = 99.3692 ms --- squeezenet_v11 avg = 156.6971 ms +mnasnet min = 20.98300 max = 20.81400 average = 20.92527 +mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490 +mobilenetv2 min = 25.91400 max = 25.61700 average = 25.73097 +shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757 +squeezenet min = 19.31800 max = 19.20000 average = 19.26530 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg 
= 90.2290 ms --- mobilenet_v1 avg = 157.0007 ms --- mobilenet_v2 avg = 118.1607 ms --- shufflenet_v2 avg = 68.6804 ms --- squeezenet_v11 avg = 91.3090 ms +mnasnet min = 12.59900 max = 12.46600 average = 12.52207 +mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897 +mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843 +shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863 +squeezenet min = 12.87900 max = 12.12900 average = 12.22530 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 179.9730 ms --- mobilenet_v1 avg = 204.0684 ms --- mobilenet_v2 avg = 181.6486 ms --- shufflenet_v2 avg = 123.2728 ms --- squeezenet_v11 avg = 412.9046 ms +mnasnet min = 7.31400 max = 7.12900 average = 7.20357 +mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383 +mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907 +shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360 +squeezenet min = 8.27000 max = 8.10600 average = 8.19000 -------------------------------------- ``` diff --git a/docs/benchmark/index.rst b/docs/benchmark/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/demo_guides/android_app_demo.md b/docs/demo_guides/android_app_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..7c40e1eb52bec0112b98fac7b1c49ef79273089f --- /dev/null +++ b/docs/demo_guides/android_app_demo.md @@ -0,0 +1,133 @@ +# Android Demo + +## 多种应用场景 + 
+我们提供的Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo),其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。涵盖[人脸识别](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/face_detection_demo)、[人像分割](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/human_segmentation_demo)、[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)4个应用场景。 + +### 1. 人脸识别 + +人脸检测是Paddle-Lite提供的人像检测demo。在移动端上提供了高精度、实时的人脸检测能力,能处理基于人脸检测的业务场景。在移动端预测的效果图如下: + +

     

+ +### 2. 人像分割 + +人像分割是Paddle-Lite 提供的图像分割demo ,在移动端上提供了实时的人像分割能力,可以应用证件照自动抠图、面积测量、智能交通(标记车道和交通标志)等场景。 在移动端预测的效果图如下: + +

     

+ +### 3. 图像分类 + +图像分类是Paddle-Lite 提供的图像处理demo ,在移动端上提供了实时的物体识别能力,可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下: + +

     

+ +### 4. 物体检测 + +物体检测是Paddle-Lite 提供的图像识别demo ,在移动端上提供了检测多个物体的位置、名称、位置及数量的能力。可以应用到视频监控(是否有违规物体或行为)、工业质检(微小瑕疵的数量和位置)、医疗诊断(细胞计数、中药识别)等场景。在移动端预测的效果图如下: + +

     

+ +## Android demo部署方法 + +下面我们以 **目标检测示例(object_detection_demo)** 为例讲解如何部署。 + +**目的**:将基于Paddle-Lite预测库的Android APP 部署到手机,实现物体检测 + +**需要的环境**: Android Studio、Android手机(开启USB调试模式)、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程 + +**部署步骤**: + +1、 目标检测的Android示例位于 `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo` + +2、用Android Studio 打开object_detection_demo工程 (本步骤需要联网)。 + +3、手机连接电脑,打开**USB调试**和**文件传输模式**,在Android Studio上连接自己的手机设备(手机需要开启允许从 USB安装软件权限) + +![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png) + +4、按下 Run按钮,自动编译APP并安装到手机。(该过程会自动下载Paddle-Lite预测库和模型,需要联网) + +成功后效果如下,图一:APP安装到手机 图二: APP打开后的效果,会自动识别图片中的物体并标记 + +

     

+ +## Android demo结构讲解 + +Android 示例的代码结构如下图所示: + +

+ + + 1、 Predictor.java: 预测代码 + +```shell +# 位置: +object_detection_demo/app/src/main/java/com/baidu/paddle/lite/demo/object_detection/Predictor.java +``` + + 2、 model.nb : 模型文件 (opt 工具转化后Paddle-Lite模型);pascalvoc_label_list:训练模型时的`labels`文件 + +```shell +# 位置: +object_detection_demo/app/src/main/assets/models/ssd_mobilenet_v1_pascalvoc_for_cpu/model.nb +object_detection_demo/app/src/main/assets/labels/pascalvoc_label_list +``` + + 3、 libpaddle_lite_jni.so、PaddlePredictor.jar:Paddle-Lite Java 预测库与Jar包 + +```shell +# 位置 +object_detection_demo/app/src/main/jniLibs/arm64-v8a/libpaddle_lite_jni.so +object_detection_demo/app/libs/PaddlePredictor.jar +``` + + 4、 build.gradle : 定义编译过程的 gradle 脚本。(不用改动,定义了自动下载Paddle-Lite预测和模型的过程) + +```shell +# 位置 +object_detection_demo/app/build.gradle +``` + + + +## 代码讲解 (使用Paddle-Lite Java API 执行预测) + +Android 示例基于Java API 开发,调用Paddle-Lite Java API包括以下五步。更详细的API 描述参考: [Paddle-Lite Java API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。 + +```c++ +// 导入Java API +import com.baidu.paddle.lite.MobileConfig; +import com.baidu.paddle.lite.Tensor; +import com.baidu.paddle.lite.Predictor; +import com.baidu.paddle.lite.PowerMode; + +// 1. 写入配置:设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelFromFile(); // 设置Paddle-Lite模型路径 +config.setPowerMode(PowerMode.LITE_POWER_NO_BIND); // 设置CPU运行模式 +config.setThreads(4); // 设置工作线程数 + +// 2. 创建 PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 3. 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +Tensor input = predictor.getInput(0); +input.resize(dims); +input.setData(inputBuffer); + +// 4. 执行预测 +predictor.run(); + +// 5. 
获取输出数据 +Tensor result = predictor.getOutput(0); +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` diff --git a/docs/demo_guides/cpp_demo.md b/docs/demo_guides/cpp_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..55abd3a70fe23dd0e8798d6a772ee216140c2875 --- /dev/null +++ b/docs/demo_guides/cpp_demo.md @@ -0,0 +1,266 @@ +# C++ Demo + +## 1. 下载最新版本预测库 + +预测库下载界面位于[Paddle-Lite官方预编译库](../user_guides/release_lib),可根据需求选择合适版本。 + +以**Android-ARMv8架构**为例,可以下载以下版本: + + +|ARM Version|build_extra|arm_stl|target|下载| +|:-------:|:-----:|:-----:|:-----:|:-------:| +|armv8|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.tiny_publish.tar.gz)| + +**解压后内容如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/1inference_lib.png) + +## 2. 转化模型 + +PaddlePaddle的原生模型需要经过[opt]()工具转化为Paddle-Lite可以支持的naive_buffer格式。 + +以`mobilenet_v1`模型为例: + +(1)下载[mobilenet_v1模型](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz)后解压: + +```shell +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxf mobilenet_v1.tar.gz +``` + +**如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/3inference_model.png) + +(2)下载[opt工具](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt)。放入同一文件夹,终端输入命令转化模型: + +```shell +wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt +chmod +x opt +./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=./mobilenet_v1_opt +``` + +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/2opt_model.png) + + + +## 3. 
编写预测程序 + +准备好预测库和模型,我们便可以编写程序来执行预测。我们提供涵盖图像分类、目标检测等多种应用场景的C++示例demo可供参考,位于`inference_lite_lib.android.armv8/demo/cxx`。 + +以mobilenet_v1预测为例:`mobile_light`为mobilenet_v1预测示例,可以直接调用。 + +**示例如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/4light_demo.png) + + + +## 4. 编译 + +预测程序需要编译为Android可执行文件。 + +以mobilenet_v1模型为例,C++示例位于`inference_lite_lib.android.armv8/demo/cxx/mobile_light` + +```shell +cd inference_lite_lib.android.armv8/demo/cxx/mobile_light +``` + +编译demo + +```shell +make +``` + +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/5compile_demo.png) + +## 5. 执行预测 + +通过adb工具将可执行文件推送到手机上执行预测 + +(1)保证电脑已经安装adb工具,手机以"USB调试"、"文件传输模式"连接到电脑。 + +``` shell +adb devices #查看adb设备是否已被识别 +``` + +**连接如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/6adb_devices.png) + +(2)准备预测库、模型和预测文件 + +1、将模型、动态库和预测文件放入同一文件夹: + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/7files.png) + +**注意**:动态预测库文件位于: `inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` + +2、文件推送到手机: + +``` shell +chmod +x mobilenetv1_light_api +adb push mobilenet_v1_opt.nb /data/local/tmp +adb push libpaddle_light_api_shared.so /data/local/tmp +adb push mobilenetv1_light_api /data/local/tmp +``` +**效果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/8push_file.png) + +(3)执行预测 + +```shell +adb shell 'cd /data/local/tmp && export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp && mobilenetv1_light_api ./mobilenet_v1_opt.nb' +``` +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/9result.png) + +上图的`Output`为mobilenet_v1模型在全1输入时,得到的预测输出。至此,Paddle-Lite的C++ demo执行完毕。 + + + + + +## 注:如何在代码中使用 API + +C++代码调用Paddle-Lite执行预测库仅需以下五步: + +(1)引用头文件和命名空间 + +```c++ +#include "paddle_api.h" +using namespace paddle::lite_api; +``` + +(2)指定模型文件,创建Predictor + +```C++ +// 1. Set MobileConfig, model_file_path is +// the path to the model file. 
+MobileConfig config; +config.set_model_from_file(model_file_path); +// 2. Create PaddlePredictor by MobileConfig +std::shared_ptr<PaddlePredictor> predictor = + CreatePaddlePredictor<MobileConfig>(config); +``` + +(3)设置模型输入 (下面以全一输入为例) + +```c++ +std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0))); +input_tensor->Resize({1, 3, 224, 224}); +auto* data = input_tensor->mutable_data<float>(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} +``` + +(4)执行预测 + +```c++ +predictor->Run(); +``` + +(5)获得预测结果 + +```c++ +std::unique_ptr<Tensor> output_tensor( + std::move(predictor->GetOutput(0))); +// 转化为数据 +auto output_data=output_tensor->data<float>(); +``` + + + + + +## 其他cxx_demo的编译与预期结果 + +### Light API Demo + +```shell +cd ../mobile_light +make +adb push mobilenetv1_light_api /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_light_api +adb shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt " +``` + + +### 图像分类 Demo + +```shell +cd ../mobile_classify +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +make +adb push mobile_classify /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push labels.txt /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_classify +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1.opt /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +``` + +### 目标检测 Demo + +```shell +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb push mobile_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && 
/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_detection_result.jpg ./ +``` + +### light API Demo 运行结果 + +运行成功后 ,将在控制台输出预测结果的前10个类别的预测概率: + +```shell +Output dim: 1000 +Output[0]: 0.000191 +Output[100]: 0.000160 +Output[200]: 0.000264 +Output[300]: 0.000211 +Output[400]: 0.001032 +Output[500]: 0.000110 +Output[600]: 0.004829 +Output[700]: 0.001845 +Output[800]: 0.000202 +Output[900]: 0.000586 +``` + +### 图像分类 Demo 运行结果 + +运行成功后 ,将在控制台输出预测结果的前5个类别的类型索引、名字和预测概率: + +```shell +parameter: model_dir, image_path and label_file are necessary +parameter: topk, input_width, input_height, are optional +i: 0, index: 285, name: Egyptian cat, score: 0.482870 +i: 1, index: 281, name: tabby, tabby cat, score: 0.471593 +i: 2, index: 282, name: tiger cat, score: 0.039779 +i: 3, index: 287, name: lynx, catamount, score: 0.002430 +i: 4, index: 722, name: ping-pong ball, score: 0.000508 +``` + +### 目标检测 Demo 运行结果 + +运行成功后 ,将在控制台输出检测目标的类型、预测概率和坐标: + +```shell +running result: +detection image size: 935, 1241, detect object: person, score: 0.996098, location: x=187, y=43, width=540, height=592 +detection image size: 935, 1241, detect object: person, score: 0.935293, location: x=123, y=639, width=579, height=597 +``` diff --git a/docs/user_guides/cuda.md b/docs/demo_guides/cuda.md similarity index 73% rename from docs/user_guides/cuda.md rename to docs/demo_guides/cuda.md index 45597057bb18c44b60234459f9a49a59b54135f6..8b3e76acef590bda19a59388017added6a0b8d52 100644 --- a/docs/user_guides/cuda.md +++ b/docs/demo_guides/cuda.md @@ -1,4 +1,4 @@ -# Lite基于CUDA的模型预测 +# PaddleLite使用CUDA预测部署 Lite支持在x86_64,arm64架构上(如:TX2)进行CUDA的编译运行。 @@ -28,7 +28,27 @@ cd Paddle-Lite ./lite/tools/build.sh --build_python=ON cuda ``` -编译结束会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 +## 编译结果说明 + +cuda的编译结果位于 `build_cuda/inference_lite_lib` +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件,目前为空 + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- 
`include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 + - 打包的动态态库文件: + - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 + +3、 `third_party` 文件夹:第三方库文件 + +4、 `demo` 文件夹:c++ demo. + +如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 ## 运行 @@ -36,7 +56,6 @@ cd Paddle-Lite 一: 下载darknet_yolov3模型,模型信息请参考[这里](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3) - ``` # 下载模型 wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz @@ -47,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg 二: 运行 -**NOTE:**此处示例使用的是python接口,后续会开放C++接口以及示例。 +**NOTE:**此处示例使用的是python接口。 ``` python #-*- coding: utf-8 -*- @@ -107,4 +126,14 @@ print (output_tensor.float_data()[:6]) ``` -**NOTE:** 对CUDA的支持还在持续开发中。 +**NOTE:** 此处示例使用的是C++接口。 + +``` +cd build_cuda/inference_lite_lib/demo/cxx/ +mkdir build && cd build +cmake .. +make +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz +tar -zxf yolov3_infer.tar.gz +./demo yolov3_infer +``` diff --git a/docs/demo_guides/fpga.md b/docs/demo_guides/fpga.md new file mode 100644 index 0000000000000000000000000000000000000000..f7885fd3b7f6600fe890332d2805a386008659e5 --- /dev/null +++ b/docs/demo_guides/fpga.md @@ -0,0 +1,106 @@ +# PaddleLite使用FPGA预测部署 + +Paddle Lite支持基于arm的FPGA zu3/zu5/zu9的模型预测,提供armv8的交叉编译 + +Lite基于FPGA运行模型需要相应的FPGA驱动,目前只支持百度[Edgeboard开发板](https://ai.baidu.com/tech/hardware/deepkit) + +## Lite实现FPGA简介 + +Lite支持FPGA作为后端硬件进行模型推理,其主要特性如下: + +- Lite中FPGA的kernel(feed、fetch除外)均以FP16、NHWC的格式作为输入输出格式,所有的weights和bias仍为FP32、NCHW的格式,feed的输入和fetch的输出均为FP32、NCHW格式的数据,在提升计算速度的同时能做到用户对数据格式无感知 + +- 对于FPGA暂不支持的kernel,均会切回arm端运行,实现arm+FPGA混合布署运行 + +- 目前FPGA成本功耗都较低,Lite基于FPGA的模型性能远远好于arm端,可作为边缘设备首选硬件 + +## 编译 + +需要提前准备带有FPGAdrv.ko的FPGA开发板(如edgeboard开发板)和Lite代码 + +CMAKE编译选项: + +- 设置`LITE_WITH_FPGA=ON`和`LITE_WITH_ARM=ON` + +其他编译选项与ARM编译相同,可以参考[“Paddle 
Lite在Docker下的ARM编译”](../user_guides/source_compile)。 +示例如下: +```shell + cmake .. \ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DLITE_WITH_OPENMP=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DWITH_TESTING=OFF \ + -DLITE_WITH_FPGA=ON \ + -DARM_TARGET_OS=armlinux + make publish_inference -j2 +``` +Lite提供FPGA编译脚本,位于lite/tools/build_FPGA.sh,在Lite根目录执行该脚本即可编译 + +## 运行示例 + +- **运行文件准备** + +下面以Resnet50模型为例,介绍如何使用edgeboard开发板实现模型运行 + +```bash +#连接开发板,并利用screen命令启动 [本机执行] +screen /dev/cu.SLAB_USBtoUART 115200 +#查看开发板ip并ssh登录到开发板,假设开发板ip为192.0.1.1 [本机执行] +ssh root@192.0.1.1 + +#在开发板上建立目录workspace,拷贝FPGA驱动FPGAdrv.ko到workspace目录 [开发板执行] +mkdir workspace && scp $DRIVER_PATH/FPGAdrv.ko workspace + +#将Lite中编译好的测试程序拷贝到开发板workspace目录 [本机执行] +scp $LITE_ROOT/build_FPGA/lite/api/test_resnet50_FPGA root@$EDGEBOARD_IP:workspace/ +#把Resnet50的模型和参数scp到开发板workspace目录 [本机执行] +scp -r $LITE_ROOT/build_FPGA/lite/third_party/install/resnet50/ root@$EDGEBOARD_IP:workspace/ + +#在运行模型前需要加载FPGA驱动 [开发板执行] +insmod FPGAdrv.ko +#给测试程序添加可运行权限 [开发板执行] +chmod +x test_resnet50_FPGA +``` + +- **使用FPGA进行模型预测** + +```bash +#以下命令均在开发板上运行 +#直接运行单测程序 +./test_resnet50_FPGA --model_dir=resnet50 +#如果需要测试性能,可以用repeats参数设置模型运行次数(如1000),同时可以设置预热次数(如10)来让硬件事先运行到稳定水平 +./test_resnet50_FPGA --model_dir=resnet50 --repeats=1000 --warmup=10 +``` + +## 如何在Code中使用 + +在Lite中使用FPGA与ARM相似,具体的区别如下: + +- 由于fpga运行模式为fp16精度、nhwc布局,所以需要修改相应的`valid_place` +- fpga不需要device的初始化和运行模式设置 + +代码示例: +```cpp +lite::Predictor predictor; +std::vector valid_places( + {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},Place{TARGET(kARM)}); + +predictor.Build(model_dir, "", "", valid_places); + +auto* input_tensor = predictor.GetInput(0); +input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); +auto* data = input_tensor->mutable_data(); +auto item_size = input_tensor->dims().production(); +//假设设置输入数据全为1 +for (int i = 0; i < item_size; i++) { + 
data[i] = 1; +} + +predictor.Run(); +auto* out = predictor.GetOutput(0); +``` diff --git a/docs/demo_guides/ios_app_demo.md b/docs/demo_guides/ios_app_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..2d9bbcbf83e1703a116d65c7ce8379638bd13cfe --- /dev/null +++ b/docs/demo_guides/ios_app_demo.md @@ -0,0 +1,129 @@ +# iOS Demo + +## 多种应用场景 + +我们提供Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo),其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。iOS demo涵盖[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)2个应用场景。 + +### 1. 图像分类 + +图像分类是Paddle-Lite 提供的图像处理demo ,在移动端上提供了实时的物体识别能力,可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下: + +

     

+ +### 2. 物体检测 + +物体检测是Paddle-Lite 提供的图像识别demo ,在移动端上提供了检测多个物体的位置、名称、位置及数量的能力。可以应用到视频监控(是否有违规物体或行为)、工业质检(微小瑕疵的数量和位置)、医疗诊断(细胞计数、中药识别)等场景。在移动端预测的效果图如下: + +

     

+ +## iOS demo部署方法 + +下面我们以**目标检测(object_detection_demo)**为例讲解如何部署iOS工程。 + +**目的**:将基于Paddle-Lite预测库的iOS APP部署到苹果手机,实现物体检测。 + +**需要的环境**:Mac 电脑上安装Xcode、苹果手机、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程 + +**部署步骤**: + +1、 目标检测的iOS示例位于 `Paddle-Lite-Demo\PaddleLite-ios-demo\object_detection_demo` + +2、终端中执行 `download_dependencies.sh`脚本自动下载模型和Paddle-Lite预测库 + +```shell +cd PaddleLite-ios-demo # 1. 终端中进入 Paddle-Lite-Demo\PaddleLite-ios-demo +sh download_dependencies.sh # 2. 执行脚本下载依赖项 (需要联网) +``` + +下载完成后会出现提示: `Extract done ` + +3、用Xcode打开`object_detection_demo/detection_demo.xcodeproj`文件,修改工程配置。 +依次修改 `General/Identity`和`Signing&Capabilities`属性,替换为自己的工程代号和团队名称。(必须修改,不然无法通过编译) + +![Xcode1](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode1.png) + + + +![Xcode2](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode2.png) + +4、 IPhone手机连接电脑,在Xcode中连接自己的手机 (第一次连接IPhone到电脑时,需要在IPhone的`设置->通用->设备管理`中选择本电脑并信任) + +

+ +5、按下左上角的 Run按钮,自动编译APP并安装到手机。在苹果手机中设置信任该APP(进入`设置->通用->设备管理`,选中新安装的APP并`验证该应用`) + +成功后效果如下,图一:APP安装到手机 图二: APP打开后的效果,会自动识别图片中的物体并标记 + +

     

+ +## iOS demo结构讲解 + +iOS 示例的代码结构如下图所示: + +

+ + 1、 mobilenetv1-ssd: 模型文件 (opt 工具转化后Paddle-Lite模型) + +```shell +# 位置: +ios-detection_demo/detection_demo/models/mobilenetv1-ssd +``` + + 2、 libpaddle_api_light_bundled.a、paddle_api.h : Paddle-Lite C++ 预测库和头文件 + +```shell +# 位置: +# iOS预测库 +ios-detection_demo/detection_demo/lib/libpaddle_api_light_bundled.a +# 预测库头文件 +ios-detection_demo/detection_demo/include/paddle_api.h +ios-detection_demo/detection_demo/include/paddle_use_kernels.h +ios-detection_demo/detection_demo/include/paddle_use_ops.h +``` + + 3、 ViewController.mm:主要预测代码 + +```shell +# 位置 +ios-detection_demo/detection_demo/ViewController.mm +``` + +## 代码讲解 (如何使用Paddle-Lite C++ API 执行预测) + +IOS 示例基于C++ API 开发,调用Paddle-Lite C++ API包括以下五步。更详细的API 描述参考: [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。 + +```c++ +#include +// 引入C++ API +#include "paddle_lite/paddle_api.h" +#include "paddle_lite/paddle_use_ops.h" +#include "paddle_lite/paddle_use_kernels.h" + +// 1. 设置MobileConfig +MobileConfig config; +config.set_model_from_file(); // 设置NaiveBuffer格式模型路径 +config.set_power_mode(LITE_POWER_NO_BIND); // 设置CPU运行模式 +config.set_threads(4); // 设置工作线程数 + +// 2. 创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); + +// 3. 设置输入数据 +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +input_tensor->Resize({1, 3, 224, 224}); +auto* data = input_tensor->mutable_data(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} + +// 4. 执行预测 +predictor->run(); + +// 5. 
获取输出数据 +std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); +std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; +for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; +} +``` diff --git a/docs/demo_guides/java_demo.md b/docs/demo_guides/java_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..ad37e7b95dbd439ccc7393af27140a404e16cf07 --- /dev/null +++ b/docs/demo_guides/java_demo.md @@ -0,0 +1,99 @@ +# Java Demo + +本节中,Java demo 完整代码位于 [demo/java](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite/demo/java) 。 + +要编译和跑起Android demo 程序 PaddlePredictor,你需要准备: + +1. 一台能运行安卓程序的安卓手机 +2. 一台带有AndroidStudio的开发机 + +## 编译 + +首先在PaddleLite的开发 [Docker镜像](../user_guides/source_compile) 中,拉取最新PaddleLite代码,编译对应你手机架构的预测库, +下面我们以arm8 架构举例。进入paddlelite 目录,运行以下命令: + +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + --android_stl=c++_static \ + tiny_publish +``` + +命令完成后查看要存在 + +``` +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/java/so/libpaddle_lite_jni.so +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/java/jar/PaddlePredictor.jar +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android +``` + +libpaddle_lite_jni.so为 PaddleLite c++ 动态链接库,PaddlePredictor.jar为 Java jar 包,两者包含 PaddleLite Java API,接下来 Android Java 代码会使用这些api。android文件夹中则是Android demo。 + +## 准备 demo 需要的其他文件 + +Demo 除了代码,还需要准备在Android工程目录下配置好JNI .so 库(上节提到的`libpaddle_lite_jni.so`),Java .jar 包(上文提到的`PaddlePredictor.jar` ),和模型文件。我们提供了自动化的脚本和手动拷贝两种方法,用户可以根据自己需要选择: + +### 脚本方法 + +进入 `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android`,我们准备了一个脚本`prepare_demo.bash`,脚本输入一个参数,为你要拷贝的.so 对应的架构文件夹名。 + +例如运行 + +``` +bash prepare_demo.bash arm8 +``` + +该脚本自动下载并解压缩模型文件,拷贝了 .jar 
包进demo,还有生成的.so包进`PaddlePredictor/app/src/main/jinLibs/架构文件夹下`, +在我们这个例子里,armv8 就是架构文件夹。备注:这种方式构建的 demo 在 armv8 手机运行正常。如果要demo 程序在别的手机架构(如 armv7)上也运行正常,需要添加别的架构。 + +### 手动拷贝方法 + +接下来我们介绍手动拷贝,如果使用了脚本,那么可以跳过以下手动方法的介绍。 + +### 把 .so 动态库和 .jar 拷贝进安卓demo程序: + +1. 将PaddlePredictor 载入到AndroidStudio。 +2. 将`libpaddle_lite_jni.so`拷贝进 `PaddlePredictor/app/src/main/jinLibs/架构文件夹下` ,比如文件夹arm8里要包含该 .so文件。 +3. 将 `PaddlePredictor.jar` 拷贝进 `PaddlePredictor/app/libs` 下 + +### 把demo使用到的模型文件拷贝进安卓程序: + +下载我们的5个模型文件,并解压缩到 `PaddlePredictor/app/src/main/assets` 这个文件夹中 +需要拷贝的模型文件和下载地址: + +``` +inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz +lite_naive_model_opt.nb http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz +mobilenet_v1_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz +mobilenet_v2_relu_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz +resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz +``` + +下载完后,assets文件夹里要包含解压后的上面五个模型文件夹,但demo里不需要保存原压缩.tar.gz 文件。 + +注意:输入的模型要求为naive buffer存储格式,您可以通过 [**Model Optimize Tool**](../user_guides/model_optimize_tool) 将fluid模型转为naive buffer存储格式。 + +## 运行 Android 程序结果 + +以上准备工作完成,就可以开始Build 、安装、和运行安卓demo程序。当你运行PaddlePredictor 程序时,大概会等10秒,然后看到类似以下字样: + +``` +lite_naive_model output: 50.213173, -28.872887 +expected: 50.2132, -28.8729 + +inception_v4_simple test:true +time: xxx ms + +resnet50 test:true +time: xxx ms + +mobilenet_v1 test:true +time: xxx ms + +mobilenet_v2 test:true +time: xxx ms +``` + +该 demo 程序跑我们的 5 个模型,第一个模型结果将真正的头两个数字输出,并在第二行附上期望的正确值。你应该要看到他们的误差小于0.001。后面四个模型如果你看到 `test:true` 字样,说明模型输出通过了我们在 demo 程序里对其输出的测试。time 代表该测试花费的时间。 diff --git a/docs/demo_guides/npu.md b/docs/demo_guides/npu.md new file mode 100644 index 0000000000000000000000000000000000000000..0bdec8d73a881c186d9c4141e2d59a1b2bf11d8b --- /dev/null +++ b/docs/demo_guides/npu.md @@ -0,0 +1,128 @@ +# 
PaddleLite使用NPU(华为)预测部署 + +Paddle Lite是首款支持华为自研达芬奇架构NPU(Kirin 810/990 SoC搭载的NPU)的预测框架。 +原理是在线分析Paddle模型,将Paddle算子转成HiAI IR后,调用HiAI IR/Builder/Runtime APIs生成并执行HiAI模型。 + +## 已支持的设备 + +- 华为nova5、nova5i pro、mate30、mate30 pro、mate30 5G、荣耀v30,以及即将推出的mate40、p40。据华为透露,今后上市的大部分手机都会搭载其自研达芬奇架构NPU。 + +## 已支持的模型 + +- MobileNetV1 +- MobileNetV2 +- ResNet-18/50 +- ShuffleNetV2 +- CycleGAN (暂时需要华为内部rom的支持) +- 百度内部业务模型(由于涉密,不方便透露具体细节) + +## 已支持(或部分支持)的Paddle算子 + +- sigmoid +- relu +- tanh +- relu_clipped +- leaky_relu +- softsign +- hard_sigmoid +- batch_norm +- concat +- conv2d +- depthwise_conv2d +- conv2d_transpose +- dropout +- elementwise_add +- elementwise_sub +- elementwise_mul +- elementwise_div +- fusion_elementwise_add_activation +- fusion_elementwise_sub_activation +- fusion_elementwise_mul_activation +- fusion_elementwise_div_activation +- fc +- bilinear_interp +- nearest_interp +- matmul +- mul +- pad2d +- pool2d +- reduce_mean +- reshape +- reshape2 +- scale +- shuffle_channel +- softmax +- split +- sqrt +- square +- transpose +- transpose2 +- unsqueeze +- unsqueeze2 +- instance_norm (暂时需要华为内部rom的支持) +- layer_norm (暂时需要华为内部rom的支持) + +## 编译支持NPU的Paddle Lite库 + +- 从https://developer.huawei.com/consumer/cn/hiai/下载华为HiAI DDK后解压到任意路径(注意:华为提供了多个版本的DDK,我们需要下载针对麒麟810/990芯片HiAI Foundation开发套件,例如最新的[DDK V310版本](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip))。 +- 将HiAI DDK中的ai_ddk_lib目录拷贝至Paddle Lite源码根目录后,使用[NPU编译脚本](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_npu.sh)编译full_publish和tiny_publish。 + +注意:以下是HiAI DDK V310版解压后的目录结构,需要将ai_ddk_lib目录拷贝至Paddle Lite源码根目录。 +```shell +- app_sample +- ddk + - ai_ddk_lib + - include + - lib # for armv7 + - lib64 # for armv8 +- document +- tools +``` + +- full_publish and tiny_publish for armv8,由于HiAI DDK的armv7和armv8的so库均基于c++_shared构建,因此,建议使用c++_shared编译Paddle Lite。 +```shell +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc 
--android_stl=c++_shared full_publish +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared tiny_publish +``` + +- full_publish and tiny_publish for armv7 +```shell +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared full_publish +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared tiny_publish +``` + +注意:为了保证编译环境一致,建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置,然后再执行上述命令。 + +## 优化生成NPU模型 + +- model_optimize_tool工具已经支持生成NPU模型,仅需要将valid_targets设置为npu,arm即可,具体参考[模型转化方法](../user_guides/model_optimize_tool)。 +```shell +./model_optimize_tool --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=npu,arm \ + --record_tailoring_info =(true|false) +``` +- model_optimize_tool生成的模型只是标记了NPU支持的Paddle算子,并没有真正生成NPU HiAI模型,只有在执行时才会将标记的Paddle算子转成HiAI IR,最终生成并执行HiAI模型,具体实现参考PR[2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576)。 +- 不同模型,不同型号(ROM版本)的华为手机,在执行阶段,由于某些Paddle算子无法完全转成HiAI IR,或目标手机的HiAI版本过低等原因,可能导致HiAI模型无法成功生成,在这种情况下,Paddle Lite会调用CPU版算子进行运算完成整个预测任务。 + +## 通过JAVA接口加载并执行NPU模型 + +- 使用方法和[Java实例](java_demo)一致,无需额外设置任何参数,只需将模型换成NPU模型即可。[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中的Image Classification Demo for Android是同时支持CPU和NPU两种模型的图像分类Demo。 + +注意:在拷贝libpaddle_lite_jni.so的时候,由于依赖HiAI DDK so和libc++_shared.so库,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,拷到libpaddle_lite_jni.so同级目录下。 + +## 通过C++接口加载并执行NPU模型 + +- 使用方法和[C++实例](cpp_demo)一致,同样无需额外设置任何参数,只需将模型换成NPU模型即可。 + +注意:1)不能使用安卓模拟器,需要使用真实设备,且必须是支持NPU的华为手机。2)在使用adb push命令向手机推送目标程序时,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,推送到目标程序同级目录下。 + + +## 其它说明 + +- 华为达芬奇架构的NPU内部大量采用float16进行运算,因此,预测结果会存在偏差,但大部分情况下精度不会有较大损失,可参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中Image Classification Demo 
for Android对同一张图片CPU与NPU的预测结果。 +- 华为Kirin 810/990 Soc搭载的自研达芬奇架构的NPU,与Kirin 970/980 Soc搭载的寒武纪NPU不一样,同样的,与Hi3559A、Hi3519A使用的NNIE也不一样,Paddle Lite只支持华为自研达芬奇架构NPU。 +- 我们正在持续增加能够适配HiAI IR的Paddle算子bridge/converter,以便适配更多Paddle模型,同时华为研发同学也在持续对HiAI IR性能进行优化。 diff --git a/docs/user_guides/opencl.md b/docs/demo_guides/opencl.md similarity index 53% rename from docs/user_guides/opencl.md rename to docs/demo_guides/opencl.md index e9533af1ff6e2447a8e4d389df90cdb457f58fb2..e255038575796f0c1079f47fb859f8402ac79c1f 100644 --- a/docs/user_guides/opencl.md +++ b/docs/demo_guides/opencl.md @@ -1,4 +1,4 @@ -# Lite基于OpenCL的ARM GPU预测 +# PaddleLite使用OpenCL预测部署 Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环境下armv8、armv7的交叉编译。 @@ -11,18 +11,45 @@ Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环 详见 **源码编译指南-环境准备** 章节。 -### 编译选项 - -|参数|介绍|值| -|--------|--------|--------| -|--arm_os|代表目标操作系统|目前仅支持且默认为`android`| -|--arm_abi|代表体系结构类型,支持armv8和armv7|默认为`armv8`即arm64-v8a;`armv7`即armeabi-v7a| -|--arm_lang|代表编译目标文件所使用的编译器|默认为gcc,支持 gcc和clang两种| - ### 编译Paddle-Lite OpenCL库范例 注:以android-armv8-opencl的目标、Docker容器的编译开发环境为例,CMake3.10,android-ndk-r17c位于`/opt/`目录下。 +#### 针对 Lite 用户的编译命令(无单元测试,有编译产物) + +- `arm_os`: `[android]`,目前不支持linux; +- `arm_abi`: `[armv7 | armv8]`; +- `arm_lang`: `[gcc]`,目前不支持clang; +- `build_extra`: `[OFF | ON]`,编译全量op和kernel,体积会大,编译时间长; +- `build_cv`: `[OFF | ON]`,编译arm cpu neon实现的的cv预处理模块; +- `android_stl`: `[c++_shared | c++_static]`,paddlelite的库以何种方式链接`android_stl`,选择`c++_shared`得到的动态库体积更小,但使用时候记得上传paddlelite所编译版本(armv7或armv8)一致的`libc++_shared.so`(来自Android-NDK); +注:调用`./lite/tools/build.sh`执行编译。 + +```bash +# 假设当前位于处于Lite源码根目录下 + +# 导入NDK_ROOT变量,注意检查您的安装目录若与本示例不同 +export NDK_ROOT=/opt/android-ndk-r17c + +# 删除上一次CMake自动生成的.h文件 +rm ./lite/api/paddle_use_kernels.h +rm ./lite/api/paddle_use_ops.h + +# 根据指定编译参数编译 +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + --build_extra=OFF \ + --build_cv=OFF \ + --android_stl=c++_shared \ + opencl +``` + +#### 针对 Lite 
开发者的编译命令(有单元测试,编译产物) + +注:调用`./lite/tools/ci_build.sh`执行编译,该命令会编译armv7和armv8的opencl库。虽然有编译产物,但因编译单元测试,编译产物包体积可能较大,不推荐使用。 + ```bash # 假设当前位于处于Lite源码根目录下 @@ -38,16 +65,20 @@ rm ./lite/api/paddle_use_ops.h --arm_os=android \ --arm_abi=armv8 \ --arm_lang=gcc \ - build_test_arm_opencl + build_opencl ``` +注:如果要调试cl kernel,假设已经完成上述脚本编译(已生成cmake文件)。调试只需要修改`./lite/backends/opencl/cl_kernel/`下对应的kernel文件,保存后在项目根目录执行`python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc`,该命令会自动将修改后,再切到build目录下执行`make publish_inference`或者你要编译的单测的可执行文件名,cl kernel文件的内容会随着编译自动打包到产物包如 .so 中或者对应单测可执行文件中。 + +### 编译产物说明 + 编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,这里仅罗列关键产物: - `cxx`:该目录是编译目标的C++的头文件和库文件; - `demo`:该目录包含了两个demo,用来调用使用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`,分别对应`mobile_full`和`mobile_light`文件夹。编译对应的demo仅需在`mobile_full`或`mobile_light`文 - `mobile_full`:使用cxx config,可直接加载fluid模型,若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏,详细见代码注释; - - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型; -- `opencl`:该目录存放opencl实现的相关kernel。 + - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型。 +注:`opencl`实现的相关kernel已经打包到动态库中。 ```bash . 
@@ -65,40 +96,23 @@ rm ./lite/api/paddle_use_ops.h | |-- libpaddle_api_light_bundled.a | |-- libpaddle_full_api_shared.so | `-- libpaddle_light_api_shared.so -|-- demo -| `-- cxx -| |-- Makefile.def -| |-- README.md -| |-- include -| | |-- paddle_api.h -| | |-- paddle_lite_factory_helper.h -| | |-- paddle_place.h -| | |-- paddle_use_kernels.h -| | |-- paddle_use_ops.h -| | `-- paddle_use_passes.h -| |-- mobile_full -| | |-- Makefile -| | `-- mobilenetv1_full_api.cc -| `-- mobile_light -| |-- Makefile -| `-- mobilenetv1_light_api.cc -`-- opencl - `-- cl_kernel - |-- buffer - | |-- depthwise_conv2d_kernel.cl - | |-- elementwise_add_kernel.cl - | |-- fc_kernel.cl - | |-- im2col_kernel.cl - | |-- layout_kernel.cl - | |-- mat_mul_kernel.cl - | |-- pool_kernel.cl - | `-- relu_kernel.cl - |-- cl_common.h - `-- image - |-- channel_add_kernel.cl - |-- elementwise_add_kernel.cl - |-- pool_kernel.cl - `-- relu_kernel.cl +`-- demo + `-- cxx + |-- Makefile.def + |-- README.md + |-- include + | |-- paddle_api.h + | |-- paddle_lite_factory_helper.h + | |-- paddle_place.h + | |-- paddle_use_kernels.h + | |-- paddle_use_ops.h + | `-- paddle_use_passes.h + |-- mobile_full + | |-- Makefile + | `-- mobilenetv1_full_api.cc + `-- mobile_light + |-- Makefile + `-- mobilenetv1_light_api.cc ``` 调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`见下一部分运行示例。 @@ -109,48 +123,9 @@ rm ./lite/api/paddle_use_ops.h 下面以android、ARMv8、gcc的环境为例,介绍3个示例,分别如何在手机上执行基于OpenCL的ARM GPU推理过程。 - -**注意:** 以下命令均在Lite源码根目录下运行。在3个示例前,下面这段命令都先要执行用来准备环境: - -```bash -# 在/data/local/tmp目录下创建OpenCL文件目录 -adb shell mkdir -p /data/local/tmp/opencl -adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer -adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image - -# 将OpenCL的kernels文件推送到/data/local/tmp/opencl目录下 -adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/ -adb push lite/backends/opencl/cl_kernel/buffer/* /data/local/tmp/opencl/cl_kernel/buffer/ -adb push 
lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/ -``` - ### 运行示例1: 编译产物demo示例 ```bash -###################################################################### -# 编译mobile_full的demo # -###################################################################### -# 步骤: # -# 0.确保编译Paddle-Lite时编译了OpenCL; # -# 1.编辑`mobilenetv1_full_api.cc`代码, 开启`DEMO_USE_OPENCL`的宏; # -# 2.在产物目录`demo/cxx/mobile_full`下编译`mobile_full`的demo; # -# 3.上传demo, 模型, opencl kernel文件到手机; # -# 4.运行demo得到预期结果. # -###################################################################### -adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 -chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api -adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/ -adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 - -# use mobile_full run mobilenet_v1 -# `GLOG_v` is log level -adb shell "export GLOG_v=0; \ - /data/local/tmp/opencl/mobilenetv1_full_api \ - --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ - --optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model" - - - ###################################################################### # 编译mobile_light的demo # ###################################################################### @@ -158,33 +133,40 @@ adb shell "export GLOG_v=0; \ # 0.确保编译Paddle-Lite时编译了OpenCL; # # 1.编译model_optimize_tool并对模型优化, `targets`参数为`opencl`; # # 2.在产物目录`demo/cxx/mobile_light`下编译`mobile_light`的demo; # -# 3.上传demo, 模型, opencl kernel文件到手机; # +# 3.上传demo, 模型文件到手机; # # 4.运行demo得到预期结果. 
# ###################################################################### +# 在/data/local/tmp目录下创建OpenCL文件目录 +adb shell mkdir -p /data/local/tmp/opencl # use model_optimize_tool to optimize model ./build.model_optimize_tool/lite/api/model_optimize_tool \ --model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ --optimize_out_type=naive_buffer \ - --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ + --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/mobilenetv1_opt \ --valid_targets=opencl -adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 +adb shell mkdir /data/local/tmp/opencl/mobilenet_v1/ chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/ -adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 +adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/mobilenetv1_opt.nb /data/local/tmp/opencl/ # use mobile_light run mobilenet_v1 -adb shell "export GLOG_v=5; \ +adb shell "export GLOG_v=1; \ /data/local/tmp/opencl/mobilenetv1_light_api \ - --model_dir=/data/local/tmp/opencl/" + /data/local/tmp/opencl/mobilenetv1_opt.nb" ``` +**注:** `GLOG_v`是指定需要显示VLOG的日志级别,默认为0。权重参数会在第一次运行时加载,所以第一次执行时间略长。一般将warmup的值设为10,repeats值设为多次。 + ### 运行示例2: test_mobilenetv1单元测试 - **运行文件准备** ```bash +# 在/data/local/tmp目录下创建OpenCL文件目录 +adb shell mkdir -p /data/local/tmp/opencl + # 将mobilenet_v1的模型文件推送到/data/local/tmp/opencl目录下 adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1 adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/ @@ -195,42 +177,26 @@ adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/loc - **执行OpenCL推理过程** -使用如下命令运行OpenCL程序。其中: 
- -- `--cl_path`指定了OpenCL的kernels文件即cl\_kernel所在目录; -- `--modle_dir`指定了模型文件所在目录。 - ```bash adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1 -adb shell /data/local/tmp/opencl/test_mobilenetv1 \ - --cl_path=/data/local/tmp/opencl \ - --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ - --warmup=1 \ - --repeats=1 +adb shell "export GLOG_v=1; \ + /data/local/tmp/opencl-image/test_mobilenetv1 \ + --model_dir=/data/local/tmp/opencl-image/mobilenetv1_fluid/ \ + --warmup=10 \ + --repeats=100" ``` -**注意:** 因为权重参数均会在Op Kernel第一次运行时进行加载,所以第一次的执行时间会略长。一般将warmup的值设为1,repeats值设为多次。 - ### 运行示例3: test_layout_opencl单元测试 -- **运行文件准备** - -```bash -# 将OpenCL单元测试程序test_layout_opencl,推送到/data/local/tmp/opencl目录下 -adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/ -``` - - -OpenCL推理过程** - ```bash +adb shell mkdir -p /data/local/tmp/opencl adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl -adb shell /data/local/tmp/opencl/test_layout_opencl +adb shell "export GLOG_v=4; \ + /data/local/tmp/opencl/test_layout_opencl" ``` - -# 如何在Code中使用 +### 如何在Code中使用 见运行示例1的demo代码: diff --git a/docs/advanced_user_guides/x86.md b/docs/demo_guides/x86.md similarity index 53% rename from docs/advanced_user_guides/x86.md rename to docs/demo_guides/x86.md index 7cb08683440312b0349662699b05e99df0cb6df1..c65ca99006b924488ceee50489e3d5654bae990c 100644 --- a/docs/advanced_user_guides/x86.md +++ b/docs/demo_guides/x86.md @@ -1,6 +1,6 @@ -# 使用X86预测库 +# PaddleLite使用X86预测部署 -Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../installation/source_compile)。 +Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../user_guides/source_compile)。 (注意:非docker Linux环境需要是Ubuntu16.04) @@ -9,8 +9,8 @@ Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考 1、 下载代码 ```bash git clone https://github.com/PaddlePaddle/Paddle-Lite.git -#需要切换到 release/v2.0.0之后版本 -git checkout +# 切换到release分支 +git checkout release/v2.3 ``` 2、 源码编译 @@ -42,43 +42,56 @@ x86编译结果位于 
`build.lite.x86/inference_lite_lib` ## x86预测API使用示例 +1、我们提供Linux环境下x86 API运行mobilenet_v1的示例:[mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip)。下载解压后内容如下: + +![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png) + +`mobilenet_v1`为模型文件、`lib`和`include`分别是Paddle-Lite的预测库和头文件、`third_party`下是编译时依赖的第三方库`mklml`、`mobilenet_full_api.cc`是x86示例的源代码、`build.sh`为编译的脚本。 + +2、demo内容与使用方法 + +``` bash +# 1、编译 +sh build.sh +``` +编译结果为当前目录下的 `mobilenet_full_api ` +``` bash +# 2、执行预测 +mobilenet_full_api mobilenet_v1 +``` +`mobilenet_v1`为当前目录下的模型路径,`mobilenet_full_api`为第一步编译出的可执行文件。 + +3、示例源码`mobilenet_full_api.cc` + ```c++ -#include #include #include -#include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT +#include "paddle_api.h" -using namespace paddle::lite_api; // NOLINT -DEFINE_string(model_dir, "", "Model dir path."); -DEFINE_string(optimized_model_dir, "", "Optimized model dir."); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); +using namespace paddle::lite_api; // NOLINT int64_t ShapeProduction(const shape_t& shape) { int64_t res = 1; for (auto i : shape) res *= i; return res; } -void RunModel() { - // 1. Set CxxConfig - CxxConfig config; - config.set_model_file(FLAGS_model_dir + "model"); - config.set_param_file(FLAGS_model_dir + "params"); - - config.set_valid_places({ - lite_api::Place{TARGET(kX86), PRECISION(kFloat)} - }); +void RunModel(std::string model_dir) { + // 1. Create CxxConfig + CxxConfig config; + config.set_model_dir(model_dir); + config.set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)} + }); // 2. Create PaddlePredictor by CxxConfig std::shared_ptr predictor = CreatePaddlePredictor(config); // 3. 
Prepare input data std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize(shape_t({1, 3, 224, 224})); + input_tensor->Resize({1, 3, 224, 224}); auto* data = input_tensor->mutable_data(); for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { data[i] = 1; @@ -90,15 +103,21 @@ void RunModel() { // 5. Get output std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl; + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - std::cout << "Output[" << i << "]:" << output_tensor->data()[i] << std::endl; + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; } } int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - RunModel(); + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); return 0; } + ``` diff --git a/docs/advanced_user_guides/add_layout.md b/docs/develop_guides/add_layout.md similarity index 99% rename from docs/advanced_user_guides/add_layout.md rename to docs/develop_guides/add_layout.md index 11e504f93c2b1bcaefaa06c0a5f51aea0995884e..26b7a07cc5788ee6e7fa36206c2432f5fc3def1c 100644 --- a/docs/advanced_user_guides/add_layout.md +++ b/docs/develop_guides/add_layout.md @@ -1,4 +1,4 @@ -# 如何增加Layout +# 新增Layout Paddle-Lite中Place包含了Target、Layout、Precision信息,用来注册和选择模型中的具体Kernel。下面以增加Place中的layout:`ImageDefault`、`ImageFolder`、`ImageNW`为例,讲解如何增加新Layout。 diff --git a/docs/advanced_user_guides/add_new_pass.md b/docs/develop_guides/add_new_pass.md similarity index 99% rename from docs/advanced_user_guides/add_new_pass.md rename to docs/develop_guides/add_new_pass.md index 93b27cd038642c702cd213adffcc378dc852a1b3..5740b7978f18cfad5754c0f77a8208bece565893 100644 --- 
a/docs/advanced_user_guides/add_new_pass.md +++ b/docs/develop_guides/add_new_pass.md @@ -1,5 +1,4 @@ - -# 新增Pass方法 +# 新增Pass 本文从三个方面介绍了`Lite`中的`Pass`结构:**Pass是什么**、**Pass的实现与接口**、**Pass的一般注册流程**。最后以`Fc_fuse_pass`为例介绍了`fusion_pass`的作用与注册方法。 diff --git a/docs/advanced_user_guides/add_operation.md b/docs/develop_guides/add_operation.md similarity index 99% rename from docs/advanced_user_guides/add_operation.md rename to docs/develop_guides/add_operation.md index 525832f8a9d7341c3124498084e05b160358b2ad..1aa955fa6a1b260fd3a17401e658e33b2b862fd9 100644 --- a/docs/advanced_user_guides/add_operation.md +++ b/docs/develop_guides/add_operation.md @@ -1,4 +1,4 @@ -# 新增OP的方法 +# 新增OP 以下以添加argmax为例,详细说明新增op的方法。 diff --git a/docs/develop_guides/architecture-intro.md b/docs/develop_guides/architecture-intro.md new file mode 100644 index 0000000000000000000000000000000000000000..f49f0525e122de9da19bacb441dfa84ab0eef7ca --- /dev/null +++ b/docs/develop_guides/architecture-intro.md @@ -0,0 +1,245 @@ +# 架构详解 + +这篇文档会从开发者角度详细介绍开发 Paddle-Lite 需要的相关信息。 + +## 设计及思考 + +近年来,各种深度学习预估硬件层出不穷,从手机APP到车载设备,再到音箱,均需要部署深度学习预测,且有如下共性需求: + +1. 高性能 +2. 硬件支持和扩展容易 +3. 轻量级部署 + +Paddle-Lite 的架构方面便是定向参考如上需求设计实现的,具体地 + +- 高性能方面 + - 通过 MIR(Machine IR) 实现精细复杂的计算图的分析和优化 + - 执行期 Kernel 的简单设计,几乎没有额外调度开销 + - 适当的硬件层抽象,框架支持各个硬件后端中做特定的调度实现 +- 轻量级部署方面 + - 拆分分析和执行两个阶段,执行阶段轻量级实现,可以单独部署 + - 轻量级 Op 和 Kernel 设计 +- 硬件支持和扩展方面 + - 通过 MIR 支撑带硬件和执行信息的宏观分析优化 + - TypeSystem 抽象带硬件的不同计算模式的表示,实现整个计算图的强类型推导,以及执行状态机的静态分析 + +Paddle-Lite 的架构尝试从强类型推导的角度建模支持多硬件,多种计算模式(不同量化精度、不同的 data layout等)的混合计算,从而实现宏观上的各异硬件和计算模式的混合。 + +框架部分已经经过 FPGA,GPU,NPU 等异构硬件的打磨,各项能力也在完善中。 + +## 重要模块介绍 + +### OpLite + +[OpLite](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/op_lite.h#L52) 是 Paddle-Lite 中的 Operator,用户扩展单个硬件时,最多的就是扩展 Op 和 Kernel。 + +重要方法如下: + +```c++ +class OpLite : public Registry { + public: + // Check the shape. + virtual bool CheckShape() const { return true; } + // Inference the outputs' shape.
+ virtual bool InferShape() const { return true; } + // Link the external execution environ to internal context. + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope); +}; +``` + +其中,分析期执行 + +- `AttachImpl` + +执行期执行 + +- `CheckShape` +- `InferShape` + +扩展须知: + +1. `CheckShape` 只在第一个 batch 执行,所以耗时不敏感 + +2. `InferShape` 需要在每个 batch 执行,应该严格耗时 + + 1. 可以通过添加 member variable 的方式,对其中一部分信息增加 cache,比如 + + ```c++ + class XXOp : public OpLite { + void InferShape() { + int batch_size = param().input.shape[0]; + if (!shape_cache_.empty()) { + shape_cache_[0] = batch_size; + param().output->Resize(shape_cache_); + } + } + + private: + shape_t shape_cache_; + } + ``` + + + +### OpParam + +[OpParam](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/operators/op_params.h) 用于存储执行期 Kernel 需要的各项参数。 所有字段可以直接存储(比如指针或者 `int`),以避免执行中获取参数的延迟。 + +因为没有需求,OpParam 暂时没有设置基类。 + +实际例子: + +```c++ +// For Softmax op +struct SoftmaxParam { + lite::Tensor* x{}; + lite::Tensor* output{}; + int axis{-1}; +}; +``` + +OpLite 的 `AttachImpl` 方法就用于构建 `OpParam` ,复制传递给 `Kernel` 用于执行。 + +OpParam 是执行期的重要模块,需要严格保证性能,相应的扩展要求: + +1. 字段的获取必须是低延迟的,可以直接用指针,或者直接复制值 +2. 避免执行无关信息混入,包括 debug 信息 +3. 命名需要与 Paddle OpDesc 中的信息严格一致,以降低功能对齐和理解的难度 + +### Kernel + +```c++ +template +class KernelLite : public KernelBase { + public: + // Run the kernel. 
+ virtual void Run() { CHECK(false) << "Not Implemented"; } + + TargetType target() const override { return Target; } + PrecisionType precision() const override { return Precision; } + DataLayoutType layout() const override { return DataLayout; } + Place place() const override { return Place{Target, Precision, DataLayout}; } + std::string name() const override; +}; +``` + +由于是执行期的重要概念,因此 Kernel 设计得非常简单高效。 + +其中,执行期的 `Run` 是其唯一重要的接口,其中包含具体的计算逻辑。 + +模板中的参数主要用于方便多硬件编译,以及自解释: + +- Target: 执行硬件 +- Precision: 主要的计算精度 +- DataLayout:主要计算的 data layout + +这部分信息用于帮助挑选 kernel,具体的值并不严格。 + + + +Kernel 的注册需要用到 TypeSystem,不光对 Kernel 本身的特性进行描述,对其输入和输出均进行详尽的定义。 + +例如 FullyConnected 的注册 + +```c++ +REGISTER_LITE_KERNEL( + fc, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FcCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +``` + +Kernel自身定义是 `kARM` 的,也就是ARM上的kernel,主要的计算精度是 `kFloat`,主要的 Data layout 是 `kNCHW`。 + +接着会对其所有的输入和输出做详细定义,比如看 `Input` 输入的定义是 `LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))`,也就是声明其 Target 是 `kARM`, PRECISION 是 `kFloat`,Data Layout 是 `kNCHW`。 + +这里的设计思想是类似C++中的函数重载,同一个 Kernel(的名字),在重载了其输入输出的类型之后可以是不同的kernel。 + +#### 扩展须知 + +1. 模板参数选用计算中主要的来表示 + 1. 比如,scale kernel,同时能接受 `float` 和 `int` 的输入,但其不算量化 kernel,那应该设置为 `Precision=float`,代表常规的计算精度中使用 +2. Kernel 输入输出的定义需要足够精确,是什么类型就是什么类型;框架会根据其输入输出的定义来动态构建状态机,否则会出现分析期和执行期的状态机不一致,造成未定义行为 + +### MIR + +MIR 类似于 LLVM 里的 IR,只是加上了硬件和执行期的信息参与分析优化。 + +Pass 是MIR中的模块化策略,其输入和输出都是 SSA Graph.
+ +框架会自动基于模型的Program 构建 SSA Graph,之后按 [Optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/optimizer.h) 中定义的pass的顺序调用一系列 Pass。 + +#### Op Fusion + +MIR 中的 [PatternMatcher](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/pattern_matcher.h) 实现了简单有效的基于图的模板识别的算法,相关的 op fusion 的图操作可以基于此实现。 + +实际的例子可以参考 [fc_fuse_pass.h](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/fusion/fc_fuse_pass.h)。 + +### TypeSystem + +TypeSystem 是 Paddle-Lite 中构建复杂计算图的基础模块,核心思想是协助 SSA Graph 构建一个状态机,表示其中不同的状态。 + +这里的 Type 主要包含下面四组信息,更多的信息可以按需扩展: + +- TargetType +- Precision +- DataLayout +- device id,用于表示卡号 + + + +状态机的表示: + +```python +Tensor0(kARM, kFloat, kNCHW) --pass--> Tensor1(kOpenCL, kFloat, kNCHW) +``` + +MIR 会识别出,Tensor0 和 Tensor1 的硬件位置不同,因此触发相应的 Pass 插入对应的 cast op 来进行 type cast,比如 + +``` +Tensor0(kARM, kFloat, kNCHW) --pass-> IoCopyOp(kARM, kOpenCL) --pass-> Tensor1(kOpenCL, kFloat, kNCHW) +``` + +### KernelContext + +KernelContext 是硬件支持的核心封装,主要用于为 Kernel 提供执行期的硬件上下文。 + +KernelContext 的设计类似于 OpParam,两者均没有基类;对于 KernelContext,其假定是,不同的硬件间的接口和逻辑可能完全不同,比如 kARM 和 kCUDA,因此不设定基类,也不需要提供统一的接口来封装不同硬件行为。 + +不同硬件的 KernelContext 直接与该硬件对应的 Kernel 对接。 + +KernelContext 的行为可以被 MIR 在分析期确定和调度。 + +注意事项: + +1. 由于是执行期概念,KernelContext 也需要注意性能和轻量化 +2.
移动端部署时只会部署执行期,因此 MIR 和 KernelContext 会拆开,因此 KernelContext 相应的设置需要能够序列化到 ProgramDesc 中,以便执行期载入和执行 + +## 扩展硬件后端 + +### 扩展现有的硬件后端 + +主要是扩充 Op 和 Kernel 的工作,如果需要 fuse,则参考 MIR 章节,增加相应的fuse pass便可,具体地,可以参考 + +- [fc_op](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/operators/fc_op.h) 实现类似的 Op +- [fc_compute](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/kernels/arm/fc_compute.h) 实现类似的 Kernel +- [fc_fuse_pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/fusion/fc_fuse_pass.h) 实现fuse逻辑,并注册到 [optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/optimizer.h) + +### 扩展全新硬件后端 + +需要额外扩充如下模块,让框架能够支撑硬件执行: + +- TypeSystem,需要扩充其中相关的 type + - 相关 [enum](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/api/paddle_place.h#L44) +- MIR,需要扩展其中的 type cast 相关的 pass + - [TargetType cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.cc) 用于拷贝不同硬件上的tensor + - [Data layout cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.h) 用于转化不同的 data layout + - [Precision cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_precision_cast_pass.h) 用于转化不同 tensor 的量化精度 +- KernelContext,具体地可以参考 + - [ARM context](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/context.h#L91) + - 需要注意的是,硬件 context 的接口只服务于该硬件的 kernel + - context 有分析期和执行期两个阶段,如果分析期没有特殊的优化,则无需考虑;否则,需要注意将分析期的信息整理并序列化到离线模型中,用于执行期直接加载。 diff --git a/docs/develop_guides/for-developer.md b/docs/develop_guides/for-developer.md new file mode 100644 index 0000000000000000000000000000000000000000..fc7bd412ee5091552c7244a621f9e298496973a4 --- /dev/null +++ b/docs/develop_guides/for-developer.md @@ -0,0 +1,14 @@ +# 开发基础须知 + +可以参考 [Paddle 
开发者文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/local_dev_guide.html)。 + +## 提交PR + +需要在 commit message 里加上 `test=develop` 才能触发 CI + +## 版本发布检查清单 + +1. 所有 feature 梳理,确认状态 +2. 所有 QA 测试结果梳理,确认版本可靠 +3. Release note 确认 review 通过 +4. 确认需要 release 的 binary 编译完毕 diff --git a/docs/develop_guides/index.rst b/docs/develop_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/index.rst b/docs/index.rst index d7359f1d0508f8e85824f450ca07f095d047f90c..5e8cb6b2148af4a7f68faf602bdb617743e48e1b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,10 +13,12 @@ Welcome to Paddle-Lite's documentation! introduction/tech_highlights introduction/architecture + introduction/support_hardware + introduction/support_operation_list .. toctree:: :maxdepth: 1 - :caption: Benchmark数据和方法 + :caption: Benchmark :name: sec-benchmark benchmark/benchmark @@ -24,46 +26,67 @@ Welcome to Paddle-Lite's documentation! .. toctree:: :maxdepth: 1 - :caption: 安装 - :name: sec-install - - installation/source_compile - -.. toctree:: - :maxdepth: 1 - :caption: 使用指南 + :caption: 使用方法 :name: sec-user-guides + user_guides/tutorial + user_guides/release_lib + user_guides/source_compile + user_guides/x2paddle user_guides/model_optimize_tool + user_guides/post_quant_with_data + user_guides/post_quant_no_data + user_guides/model_quantization + user_guides/debug user_guides/library_tailoring - user_guides/cuda - user_guides/opencl .. 
toctree:: :maxdepth: 1 - :caption: 进阶使用指南 - - advanced_user_guides/support_operation_list - advanced_user_guides/add_operation - advanced_user_guides/add_layout - advanced_user_guides/model_quantization - advanced_user_guides/add_new_pass - advanced_user_guides/x86 + :caption: 部署示例 + :name: sec-demo_guides + + demo_guides/cpp_demo + demo_guides/java_demo + demo_guides/android_app_demo + demo_guides/ios_app_demo + demo_guides/x86 + demo_guides/cuda + demo_guides/opencl + demo_guides/fpga + demo_guides/npu .. toctree:: :maxdepth: 1 - :caption: 开发者文档 + :caption: API文档 + + api_reference/cxx_api_doc + api_reference/java_api_doc + api_reference/python_api_doc + api_reference/cv + +.. toctree:: + :maxdepth: 1 + :caption: 开发者贡献 + + develop_guides/for-developer + develop_guides/architecture-intro + develop_guides/add_operation + develop_guides/add_layout + develop_guides/add_new_pass .. toctree:: :maxdepth: 1 - :caption: API文档 + :caption: Roadmap + :name: sec-roadmap - api_reference/cxx_api_doc + introduction/roadmap .. toctree:: :maxdepth: 1 :caption: FAQ + introduction/faq + .. toctree:: :maxdepth: 1 :caption: paddle-mobile diff --git a/docs/installation/library.md b/docs/installation/library.md deleted file mode 100644 index ef2f8fdb18ade439d620b348738cbb752d5bd8b6..0000000000000000000000000000000000000000 --- a/docs/installation/library.md +++ /dev/null @@ -1,61 +0,0 @@ - -# 预测库说明 - -Paddle-Lite的编译结果为预测库文件(包括静态库和动态库),具体编译过程参考[源码编译](./source_compile)。 - -Lite预测库分为**基础预测库**和**全量预测库**:基础预测库只打包了基础模型需要的基础算子,预测库体积较小;全量预测库打包了所有的Lite算子,可以支持更多的模型,但是预测库的体积也更大。 编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译基础预测库,`--build_extra=ON`时编译全量的预测库。 - -## 基础预测库 - -### 编译方法 -编译时设置`--build_extra=OFF` (默认值) 或不指定即可编译出基础预测库。例如: - -``` -./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish -``` - -### 基础预测库支持的功能 - -(1)支持基础CV模型 - -(2)支持基础的in8量化模型 - -(3)支持[benchmark测试](../benchmark/benchmark) - - -### 基础预测库支持的基础模型: - -1. 
fluid基础模型(paddle model 提供的基础模型9个) - -``` -mobileNetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 -mobileNetV2 resnet50 unet squeezenet_v11 -``` - -2. int8量化模型模型 - -``` -mobilenet_v1 mobilenet_v2 resnet50 -``` - -### 特点 - 轻量级预测库,体积更小,支持常用的基础模型。 - - - -## 全量预测库 - -### 编译方法 -编译时设置`--build_extra=ON` 即可编译出全量预测库。例如: - -``` -./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish -``` -### 全量预测库功能 - -(1) 基础预测库所有功能 - -(2)支持所有Paddle-Lite中注册的所有算子 - -### 特点 - 支持更多的硬件平台和算子,可以支持更多模型但体量更大。 diff --git a/docs/introduction/faq.md b/docs/introduction/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..768b92a31b42934d454bfa3afbee6f8dba1ef462 --- /dev/null +++ b/docs/introduction/faq.md @@ -0,0 +1,8 @@ +# FAQ 常见问题 + +问题或建议可以发Issue,为加快问题解决效率,可先检索是否有类似问题,我们也会及时解答! +欢迎加入Paddle-Lite百度官方QQ群:696965088 + +1. 在Host端采用交叉编译方式编译PaddleLite,将编译后的libpaddle_light_api_shared.so和可执行程序放到板卡上运行,出现了如下图所示的错误,怎么解决? +![host_target_compiling_env_miss_matched](https://user-images.githubusercontent.com/9973393/75761527-31b8b700-5d74-11ea-8a9a-0bc0253ee003.png) +- 原因是Host端的交叉编译环境与Target端板卡的运行环境不一致,导致libpaddle_light_api_shared.so链接的GLIBC库高于板卡环境的GLIBC库。目前有四种解决办法(为了保证编译环境与官方一致,推荐第一种方式):1)在Host端,参考[源码编译](../user_guides/source_compile)中的Docker方式重新编译libpaddle_light_api_shared.so;2)在Host端,使用与Target端版本一致的ARM GCC和GLIBC库重新编译libpaddle_light_api_shared.so;3)在Target端板卡上,参考[源码编译](../user_guides/source_compile)中的ARM Linux本地编译方式重新编译libpaddle_light_api_shared.so;4)在Target端板卡上,将GLIBC库升级到和Host端一致的版本,即GLIBC2.27。 diff --git a/docs/introduction/roadmap.md b/docs/introduction/roadmap.md new file mode 100644 index 0000000000000000000000000000000000000000..0c5b5366041ff4cf406fe5d9d67833925c7795f8 --- /dev/null +++ b/docs/introduction/roadmap.md @@ -0,0 +1,32 @@ +# Road map + +这篇文档会介绍 Paddle-Lite 近期对外的开源版本和计划。 + +其中包含的 feature 为最小集合,按最终发布的版本为准。 + + +## 2.0.0-beta1-prerelease + +预计发布 *2019-8-26 ~ 2days* + +- 完善编译和 benchmark 文档 +- 
增加第三方依赖代码的离线下载功能,加速编译过程 +- 去掉 `tiny_publish` 模式下无关的第三方代码下载,可以不依赖任何第三方 + +## 2.0.0-beta1 + +预计发布 *2019-9-1~2days* + +- `model_optimize_tool` 从 ARM 上执行修改为 Host 上执行,只从 kernel 分布来确定计算图优化;后续硬件针对优化会发布新的工具; +- Paddle 模型支持参数 composed 的格式 +- 增加分层编译来控制常用模型的部署库的大小,分两个模式 `basic`, `extra`;默认 `basic` 模式只发布核心的op 和kernel;将控制流相关的Op和kernel 折叠进 `extra` 按需编译 +- 增加 INT8 量化,从 PaddleSlim 训练到 PaddleLite 部署完整案例 +- 支持内存中加载模型,以支持 APP 的简易加密 + +## 2.3 + +[v2.3 project](https://github.com/PaddlePaddle/Paddle-Lite/milestone/3?closed=1) + +## 2.6 + +[v2.6 project](https://github.com/PaddlePaddle/Paddle-Lite/milestones/v2.6) diff --git a/docs/introduction/support_hardware.md b/docs/introduction/support_hardware.md new file mode 100644 index 0000000000000000000000000000000000000000..b1a6823d26d4fe8838afee00732707608b836599 --- /dev/null +++ b/docs/introduction/support_hardware.md @@ -0,0 +1,45 @@ + +# 支持硬件 + + +## ARM CPU +Paddle Lite支持[ARM Cortex-A系列处理器](https://en.wikipedia.org/wiki/ARM_Cortex-A),支持列表如下: +### 32bit(ARMv7a) +- Cortex-A5 +- Cortex-A7 +- Cortex-A8 +- Cortex-A9 +- Cortex-A12 +- Cortex-A15 +- Cortex-A17(RK3288) +- Cortex-A32 +### 64bit(ARMv7a, ARMv8a) +- Cortex-A35 +- Cortex-A53(树莓派3) +- Cortex-A55 +- Cortex-A57(Nvidia tx1,Nvidia tx2, 高通810等) +- Cortex-A72(麒麟95X,高通820, RK3399,树莓派4等) +- Cortex-A73(麒麟960,麒麟970,高通835, 联发科X30等) +- Cortex-A75(高通845等) +- Cortex-A76(麒麟980,麒麟990,高通855,高通730,联发科G90等) +- Cortex-A77 +- ARMv8-A compatible(Apple A系列处理器, Nvidia tegra, Qualcomm Kryo, Falkor, Samsung Mongoose) + +## 移动端GPU +Paddle Lite支持移动端GPU和Nvidia端上GPU设备,支持列表如下: +- ARM Mali G 系列 +- Qualcomm Adreno 系列 +- Nvidia tegra系列: tx1, tx2, nano, xavier + +## NPU +Paddle Lite支持NPU,支持列表如下: +- 华为达芬奇架构NPU + +## FPGA +Paddle Lite支持FPGA,支持列表如下: +- 百度Edgeboard系列:ZU9, ZU5, ZU3 + +## XPU +Paddle Lite支持XPU,支持列表如下: +- 百度昆仑818-100芯片 +- 百度昆仑818-300芯片 diff --git a/docs/advanced_user_guides/support_operation_list.md b/docs/introduction/support_operation_list.md similarity index 96% rename from
docs/advanced_user_guides/support_operation_list.md rename to docs/introduction/support_operation_list.md index 7c2ceb0ff819f7f1676308a33ec88f5eab820e57..7a60cf46e424dfe610a0541c9e364cf6e5d98531 100644 --- a/docs/advanced_user_guides/support_operation_list.md +++ b/docs/introduction/support_operation_list.md @@ -1,44 +1,31 @@ -# 支持OP列表 +# 支持OP -## Ops +## Ops (共计158个算子) +### Basic Operators (默认编译的算子) - affine_channel -- anchor_generator - arg_max -- assign -- assign_value -- attention_padding_mask -- axpy - batch_norm -- beam_search -- beam_search_decode - bilinear_interp -- box_clip - box_coder - calib -- calib_once - cast -- collect_fpn_proposals - concat -- conditional_block - conv2d - conv2d_transpose -- crop -- decode_bboxes - density_prior_box - depthwise_conv2d -- distribute_fpn_proposals - dropout - elementwise_add - elementwise_div - elementwise_max - elementwise_mul - elementwise_sub -- equal - exp - expand - fake_channel_wise_dequantize_max_abs - fake_dequantize_max_abs +- fake_quantize_abs_max - fake_quantize_dequantize_moving_average_abs_max - fake_quantize_moving_average_abs_max - fake_quantize_range_abs_max @@ -55,6 +42,72 @@ - fusion_elementwise_max_activation - fusion_elementwise_mul_activation - fusion_elementwise_sub_activation +- gelu +- grid_sampler +- hard_sigmoid +- instance_norm +- io_copy +- io_copy_once +- layout +- leaky_relu +- log +- matmul +- mean +- mul +- multiclass_nms +- nearest_interp +- pad2d +- pool2d +- prelu +- prior_box +- range +- reduce_mean +- relu +- relu6 +- relu_clipped +- reshape +- reshape2 +- rsqrt +- scale +- search_fc +- sequence_topk_avg_pooling +- shuffle_channel +- sigmoid +- slice +- softmax +- softsign +- split +- sqrt +- square +- squeeze +- squeeze2 +- stack +- subgraph +- swish +- tanh +- transpose +- transpose2 +- unsqueeze +- unsqueeze2 +- yolo_box + +### Extra Operators (打开 `--build_extra=ON`开关才会编译) + +- anchor_generator +- assign +- assign_value +- attention_padding_mask +- axpy +- beam_search +- 
beam_search_decode +- box_clip +- calib_once +- collect_fpn_proposals +- conditional_block +- crop +- decode_bboxes +- distribute_fpn_proposals +- equal - gather - generate_proposals - graph_op @@ -62,21 +115,14 @@ - greater_than - gru - gru_unit -- hard_sigmoid - im2sequence - increment -- instance_norm -- io_copy -- io_copy_once - is_empty - layer_norm -- layout - layout_once -- leaky_relu - less_equal - less_than - lod_reset -- log - logical_and - logical_not - logical_or @@ -85,37 +131,18 @@ - lookup_table_v2 - lrn - match_matrix_tensor -- matmul -- mean - merge_lod_tensor -- mul -- multiclass_nms -- nearest_interp - negative - norm -- notequal -- pad2d -- pool2d +- not_equal - power -- prelu -- prior_box -- range - read_from_array - reduce_max -- reduce_mean - reduce_prod - reduce_sum -- relu -- relu6 -- relu_clipped -- reshape -- reshape2 - roi_align -- rsqrt -- scale - search_aligned_mat_mul - search_attention_padding_mask -- search_fc - search_grnn - search_group_padding - search_seq_arithmetic @@ -130,32 +157,15 @@ - sequence_reshape - sequence_reverse - sequence_softmax -- sequence_topk_avg_pooling - shape -- shuffle_channel -- sigmoid -- slice -- softmax -- softsign -- split - split_lod_tensor -- sqrt -- square -- squeeze -- squeeze2 -- stack -- swish -- tanh - top_k -- transpose -- transpose2 - uniform_random -- unsqueeze -- unsqueeze2 - var_conv_2d - while - write_to_array -- yolo_box + + ## Kernels diff --git a/docs/user_guides/debug.md b/docs/user_guides/debug.md new file mode 100644 index 0000000000000000000000000000000000000000..93395b25fae772954f83a1128cdb7e86c9eee994 --- /dev/null +++ b/docs/user_guides/debug.md @@ -0,0 +1,89 @@ +# 调试 + +## Profiler工具 + +Basic profiler 用于 CPU 上kernel 耗时的统计。 + +### 开启方法: + +参照 [编译安装](../user_guides/source_compile) 中的**full_publish**部分进行环境配置,在 cmake 时添加 `-DLITE_WITH_PROFILE=ON` ,就可以开启相应支持。 + +### 使用示例: + +在模型执行完毕后,会自动打印类似如下 profiler 的日志 + +``` + kernel average min max count + feed/def/1/4/2 0 0 0 1 + 
conv2d/def/4/1/1 1175 1175 1175 1 + conv2d/def/4/1/1 1253 1253 1253 1 + depthwise_conv2d/def/4/1/1 519 519 519 1 + conv2d/def/4/1/1 721 721 721 1 + elementwise_add/def/4/1/1 18 18 18 1 + conv2d/def/4/1/1 2174 2174 2174 1 + depthwise_conv2d/def/4/1/1 380 380 380 1 + conv2d/def/4/1/1 773 773 773 1 + elementwise_add/def/4/1/1 2 2 2 1 + conv2d/def/4/1/1 1248 1248 1248 1 + depthwise_conv2d/def/4/1/1 492 492 492 1 + conv2d/def/4/1/1 1150 1150 1150 1 + elementwise_add/def/4/1/1 33 33 33 1 + elementwise_add/def/4/1/1 3 3 3 1 + conv2d/def/4/1/1 1254 1254 1254 1 + depthwise_conv2d/def/4/1/1 126 126 126 1 +``` + +## Debug工具 + +**Lite Model Debug Tool** 是用来检查Paddle-Lite框架与Paddle-Fluid框架运行时tensor(包括variable与weight)之间diff信息的基础工具。 + +### 编译方法: + +1. 参照 [编译安装](../user_guides/source_compile) 中的**full_publish**部分进行环境配置和编译。 +2. 在生成的`build`目录下,执行`make lite_model_debug_tool`,`lite_model_debug_tool`产出在编译目录的`lite/tools/debug`目录下。 + +### 工作流程: + +1. 运行 `/bin/bash check_model.sh --model_dir= --build_root_dir= debug_cpp_stage` 获得模型在Paddle-Lite框架下的运行拓扑信息、variables信息和weights信息。运行后拓扑信息将会存储在默认名为 `topo_file.txt` 的文件中,variables和weights信息将会存储在默认名为 `tensor_cpp.txt` 的文件中。 +2. 运行 `/bin/bash check_model.sh --model_dir= --build_root_dir= debug_py_stage`执行fluid框架预测以获取相同模型在fluid框架下的variable与weight信息(注意:我们使用fluid的python api运行fluid模型,因此您在运行此步之前应确保已正确安装fluid的python api)。然后debug tool将会自动比较Paddle-Lite框架输出的信息和Paddle-Fluid框架输出的信息来检查是否存在运行时diff。 执行Paddle-Fluid框架,输出的信息将会存储在默认名为 `tensor_py.txt` 的文件中,相应的diff信息将会存储在默认名为 `diff.txt`的文件中(默认情况下,只会输出执行拓扑序中第一个有diff的variable相关的信息)。 + +### 注意事项: + +1. 输出的结果是在**执行完一次预测后**输出的相应变量/权重的最终值,因此如果您在预测过程进行过诸如变量复用/子图融合等优化方法,则相应的输出可能会出现偏差。 +2. 默认情况下debug tools将以全1作为输入进行比对。 +3. 默认情况下,为了保证与Paddle-Fluid框架的结果可比对,debug tool将会禁用掉所有的Paddle-Lite的优化策略。 +4.
Paddle-Lite框架的执行环境由与您的编译选项有关,比如您开启了LITE_WITH_ARM编译选项,那debug tool的`debug_cpp_stage`也需要在ARM平台下运行。 + +### Diff信息输出: + +如果debug tool检测到diff信息,那么在`diff.txt`中将会输出类似以下结构信息 + +```c++ +>>>>>>>>>>>>>>>>>>DIFF VARIABLE: dropout_0.tmp_0<<<<<<<<<<<<<<<<<<< +dropout (X:pool2d_7.tmp_0) (Mask:dropout_0.tmp_1 Out:dropout_0.tmp_0) +--------------- Tensor File info --------------- +pool2d_7.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ... +dropout_0.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ... +--------------- Fluid Tensor info --------------- +pool2d_7.tmp_0 {1,1536,1,1} 0.7498912 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015033395 0.6216395 0.14709876 0.63672537 0.0 0.0 0.0041093696 0.7847073 0.0 0.07048465 0.23359808 0.8401219 0.23919891 0.1128789 0.0 0.1553514 0.3069055 0.0 0.0 0.8609365 0.22103554 ... +dropout_0.tmp_0 {1,1536,1,1} 0.599913 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.012026716 0.4973116 0.117679015 0.5093803 0.0 0.0 0.0032874958 0.62776583 0.0 0.056387722 0.18687847 0.67209756 0.19135913 0.090303116 0.0 0.12428112 0.2455244 0.0 0.0 0.68874925 ... +``` + +其中第二行为op相关信息,标明了执行哪个op出现了diff及其对应的输入输出变量名。Tensor File info为Paddle-Lite框架的输出信息,而Fluid Tensor info为Paddle-Fluid框架的相应输出信息。 +示例中的`dropout_0.tmp_1`没有相应的tensor信息是因为工具检测到其在预测的后序流程中未被使用,因此不会对预测结果造成影响,从而将其自动屏蔽掉以保证输出尽量简洁。 + +### 其他选项: + +| Option | Description | +| --------------------------- | ------------------------------------------------------------ | +| --input_file | 输入文件名,不同field以逗号分隔,相同field内以空格分隔, 只有文件中的第一行输入信息会被使用. 
如果您不指定input_file,那么所有输入将会被置为1。注意:`debug_py_stage`目前不支持多field输入。 | +| --cpp_topo_file | 存储运行时拓扑信息,由`debug_cpp_stage`写入并且由`debug_py_stage`读取使用。 默认为`topo_file.txt` 。 | +| --cpp_tensor_file | 存储`debug_cpp_stage` 在运行拓扑序下的输出信息,默认为 `tensor_cpp.txt` 。 | +| --tensor_names | 如果此选项不为空,那么只输出由此选项中指定名字的variable/weight信息,名字间用逗号分隔。 | +| --tensor_output_length | 输出数据的长度,默认为全部输出。 | +| --py_threshold | 判断diff发生的阈值,默认为 `1e-5` 。 | +| --py_tensor_file | 存储`debug_py_stage` 在运行拓扑序下的输出信息,默认为`tensor_py.txt`. | +| --py_output_file | diff信息的存储文件,默认为`diff.txt`。 | +| --py_only_output_first_diff | 是否只输出运行时拓扑序中第一个有diff的var/op信息,默认为true | + +您可以参考 `check_model.sh` 脚本中的代码以获得更多细节. diff --git a/docs/user_guides/index.rst b/docs/user_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/user_guides/library.md b/docs/user_guides/library.md new file mode 100644 index 0000000000000000000000000000000000000000..20f16322c67cc9d10d2f667fa2ca7bceb83e338b --- /dev/null +++ b/docs/user_guides/library.md @@ -0,0 +1,57 @@ + +# `build_extra`参数说明: + +Lite预测库分为**基础预测库**和**全量预测库(with_extra)**:基础预测库只包含基础CV算子(OP),体积较小;全量预测库包含所有Lite算子,体积较大,支持模型较多。 + +编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译**基础预测库**,`--build_extra=ON`时编译**全量预测库**。 + +## 基础预测库( [基础OP列表](../advanced_user_guides/support_operation_list.html#basic-operators) ) + + +### 支持功能 + +(1)87个[基础OP](../advanced_user_guides/support_operation_list.html#basic-operators) (2)9个基础模型 (3)3个int8量化模型 + + +### 支持的模型 + +1. fluid基础模型(来源:[paddle-models](https://github.com/PaddlePaddle/models) ) + +``` +mobilenetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 +mobilenetV2 resnet50 unet squeezenet_v11 +``` + +2.
int8量化模型 + +``` +mobilenet_v1 mobilenet_v2 resnet50 +``` + +### 特点 + 轻量级预测库,体积更小,支持常用模型。 + +### 编译方法 +编译时设置`--build_extra=OFF` (默认值) 编译出基础预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish +``` + + +## 全量预测库( [OP列表](../advanced_user_guides/support_operation_list.html#op) ) + + +### 支持功能 + + Paddle-Lite中的全量算子( [基础OP](../advanced_user_guides/support_operation_list.html#basic-operators) + [Extra OP](../advanced_user_guides/support_operation_list.html#extra-operators-build-extra-on) ) + +### 特点 + 包含更多算子、支持更多模型,但体量更大。 + +### 编译方法 +设置`--build_extra=ON` 可编译出全量预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish +``` diff --git a/docs/user_guides/library_tailoring.md b/docs/user_guides/library_tailoring.md index 5ba12cf819945ab2f182f672a2c96123bc12e070..cf0641b7314f112e9cb7ac4f0a9094bdbdaa7ca6 100644 --- a/docs/user_guides/library_tailoring.md +++ b/docs/user_guides/library_tailoring.md @@ -1,5 +1,5 @@ -# 裁剪预测库方法 +# 裁剪预测库 Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编译会将所有已注册的operator打包到预测库中,造成库文件体积膨胀;**裁剪预测库**能针对具体的模型,只打包优化后该模型需要的operator,有效降低预测库文件大小。 @@ -39,7 +39,7 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 例如: ```bash -./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish +./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish ``` **注意**:上面命令中的`../mobilenet_v1NB`是第1步得到的转化模型的输出路径 @@ -88,9 +88,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 #include #include #include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -182,4 
+179,4 @@ int main(int argc, char** argv) { 1. 模型集合**必须**均为combined参数模型或均为非combined参数模型。 2. 使用非combined参数模型时,模型拓扑文件名应为`__model__`,使用非combined参数模型时,集合中各模型的拓扑与参数名应相同,分别由`--model_filename`和`--param_filename`指定。 3. 模型集合**必须**均为INT8量化模型或均为非INT8量化模型。 -4. 需要使用Paddle-Lite 最新版本(release/v2.1.0之后)代码编译出的model_optimize_tool。 +4. 需要使用Paddle-Lite `release/v2.1.0`之后版本代码编译出的模型优化工具。 diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index fccc6d8b23c78474257d11399d121816f57fc422..c3d5f527048519e851cc8b9e785dc39668e971a4 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -1,20 +1,26 @@ -# 模型转化方法 +# 模型优化工具 opt -Lite架构在预测过程中表现出来的高性能得益于其丰富的优化组件,其中包括量化、子图融合、混合调度、Kernel优选等等策略。为了使优化过程更加方便易用,我们提供了**opt**来自动完成优化步骤,输出一个轻量的、最优的可执行模型。具体使用方法介绍如下: +Paddle-Lite 提供了多种策略来自动优化原始的训练模型,其中包括量化、子图融合、混合调度、Kernel优选等等方法。为了使优化过程更加方便易用,我们提供了**opt** 工具来自动完成优化步骤,输出一个轻量的、最优的可执行模型。 -**注意**:release/v2.2.0之前的模型转化工具名称为`model_optimize_tool`,从release/v2.3开始模型转化工具名称修改为`opt` +具体使用方法介绍如下: + +**注意**:`v2.2.0` 之前的模型转化工具名称为`model_optimize_tool`,从 `v2.3` 开始模型转化工具名称修改为 `opt` ## 准备opt 当前获得opt方法有三种: -1. 我们提供当前develop分支编译结果下载:[opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt)、[opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) -release/v2.2.0之前版本的model_optimize_tool: [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool)、[model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) - -2. 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择release版本下载对应的转化工具`opt` +1. **推荐!** 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择release版本下载对应的转化工具`opt` (release/v2.2.0之前的转化工具为model_optimize_tool、release/v2.3.0之后为opt) +2. 
本文提供`release/v2.3`和`release/v2.2.0`版本的优化工具下载 + +|版本 | Linux | MacOS| +|---|---|---| +| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) | +|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) | -3. 可以下载Paddle-Lite源码,从源码编译出opt工具 + +3. 如果 release 列表里的工具不符合您的环境,可以下载Paddle-Lite 源码,源码编译出opt工具 ```bash git clone https://github.com/PaddlePaddle/Paddle-Lite.git cd Paddle-Lite @@ -22,11 +28,11 @@ git checkout ./lite/tools/build.sh build_optimize_tool ``` 编译结果位于`Paddle-Lite/build.opt/lite/api/opt` -**注意**:从源码编译opt前需要先[安装Paddle-Lite的开发环境](../installation/source_compile)。 +**注意**:从源码编译opt前需要先[安装Paddle-Lite的开发环境](source_compile)。 ## 使用opt -opt是x86平台上的可执行文件,需要在PC端运行:包括Linux终端和Mac终端。 +opt是 x86 平台上的可执行文件,需要在PC端运行:支持Linux终端和Mac终端。 ### 帮助信息 执行opt时不加入任何输入选项,会输出帮助信息,提示当前支持的选项: @@ -36,7 +42,10 @@ opt是x86平台上的可执行文件,需要在PC端运行:包括Linux终端 ![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png) ### 功能一:转化模型为Paddle-Lite格式 -opt可以将PaddlePaddle支持的模型转化为Paddle-Lite支持的模型格式,期间执行的操作包括:将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积;执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等性能指标。 +opt可以将PaddlePaddle的部署模型格式转化为Paddle-Lite 支持的模型格式,期间执行的操作包括: + +- 将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积 +- 执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等效果 模型优化过程: @@ -54,7 +63,10 @@ PaddlePaddle模型有两种保存格式: **使用示例**:转化`mobilenet_v1`模型 ``` -./opt --model_dir=./mobilenet_v1 --valid_targets=arm --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1_opt +./opt --model_dir=./mobilenet_v1 \ + --valid_targets=arm \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_opt ``` 以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型,优化后的模型文件为`mobilenet_v1_opt.nb`,转化结果如下图所示: @@ 
-71,7 +83,6 @@ PaddlePaddle模型有两种保存格式: --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ --valid_targets=(arm|opencl|x86|npu|xpu) \ - --prefer_int8_kernel=(true|false) \ --record_tailoring_info =(true|false) ``` @@ -83,12 +94,12 @@ PaddlePaddle模型有两种保存格式: | --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | | --optimize_out | 优化模型的输出路径。 | | --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | -| --prefer_int8_kernel | 若待优化模型为int8量化模型(如量化训练得到的量化模型),则设置该选项为true以使用int8内核函数进行推理加速,默认为false。 | | --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | * 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 * 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 -* 优化后的模型包括__model__.nb和param.nb文件。 +* 优化后的模型为以`.nb`名称结尾的单个文件。 +* 删除`prefer_int8_kernel`的输入参数,`opt`自动判别是否是量化模型,进行相应的优化操作。 ### 功能二:统计模型算子信息、判断是否支持 @@ -121,14 +132,14 @@ opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支 **背景**:如果想用Paddle-Lite运行第三方来源(tensorflow、caffe、onnx)模型,一般需要经过两次转化。即使用x2paddle工具将第三方模型转化为PaddlePaddle格式,再使用opt将PaddlePaddle模型转化为Padde-Lite可支持格式。 为了简化这一过程,我们提供一键脚本,将x2paddle转化和opt转化合并: -**一键转化脚本**:[auto_transform.sh](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/auto_transform.sh) +**一键转化脚本**:[auto_transform.sh](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.3/lite/tools/auto_transform.sh) -**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和其环境依赖项。 +**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和x2paddle依赖项(tensorflow、caffe等)。 **使用方法**: -(1)打印帮助帮助信息:` ./auto_transform.sh` 
+(1)打印帮助信息:` sh ./auto_transform.sh` (2)转化模型方法 @@ -138,7 +149,7 @@ USAGE: tranform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form. ---------------------------------------- example: - ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result + sh ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result ---------------------------------------- Arguments about x2paddle: --framework=(tensorflow|caffe|onnx); diff --git a/docs/advanced_user_guides/model_quantization.md b/docs/user_guides/model_quantization.md similarity index 78% rename from docs/advanced_user_guides/model_quantization.md rename to docs/user_guides/model_quantization.md index 7d781ba9904400c26b64aed5f5dc764ecc5b24fa..cf506cfa61e3942452ddaf1218d9d55c2fffa3fc 100644 --- a/docs/advanced_user_guides/model_quantization.md +++ b/docs/user_guides/model_quantization.md @@ -1,21 +1,38 @@ -# 模型量化 +# 模型量化-量化训练 -本文主要介绍使用Paddle-Lite加载PaddlePaddle产出的量化模型,并进行推理执行。我们以MobileNetV1模型为示例,首先介绍准备量化模型,然后介绍部署执行。 +本文主要介绍使用Paddle-Lite加载PaddlePaddle产出的量化模型,并进行推理执行。我们以MobileNetV1模型为示例,首先说明产出量化模型,然后说明预测部署。 -## 准备量化模型 +## 1 简介 -PaddlePaddle使用量化训练和训练后量化两种方法将FP32模型量化成Int8模型,下面分别介绍两种方法如何产出量化模型。 +量化训练是基于大量训练数据,对训练好的预测模型进行量化。该方法使用模拟量化的思想,在训练阶段更新权重,实现减小量化误差。 -### 量化训练 +使用条件: +* 有预训练模型 +* 有较多训练数据 + +使用步骤: +* 产出量化模型:使用PaddlePaddle调用量化训练接口,产出量化模型 +* 量化模型预测:使用PaddleLite加载量化模型进行预测推理 + +优点: +* 减小计算量、降低计算内存、减小模型大小 +* 模型精度受量化影响小 + +缺点: +* 使用条件较苛刻,使用门槛稍高 + +建议首先使用“有校准数据训练后量化”对模型进行量化,然后使用量化模型进行预测。如果该量化模型的精度达不到要求,再使用“量化训练”。 + + +## 2 产出量化模型 目前,PaddlePaddle框架的量化训练主要针对卷积层(包括二维卷积和Depthwise卷积)、和全连接层,对应算子是conv2d、depthwise_conv2d和mul,更多量化训练的原理请参考[文档](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D)。Paddle-Lite支持运行PaddlePaddle框架量化训练产出的模型,可以进一步加快模型在移动端的执行速度。
温馨提示:如果您是初次接触PaddlePaddle框架,建议首先学习[新人入门](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)和[使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html)。 - 您可以选择下载训练好的量化模型,或者使用PaddleSlim模型压缩工具训练得到量化模型。 -#### 下载量化模型 +### 下载量化模型 官方发布了[MobileNetV1量化模型](https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip),直接下载到本地。 @@ -23,9 +40,9 @@ PaddlePaddle使用量化训练和训练后量化两种方法将FP32模型量化 wget https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip ``` -#### 使用PaddleSlim模型压缩工具训练量化模型 +### 使用PaddleSlim模型压缩工具训练量化模型 -##### 安装PaddlePaddle +#### 安装PaddlePaddle 根据操作系统、安装方式、Python版本和CUDA版本,按照[官方说明](https://paddlepaddle.org.cn/start)安装PaddlePaddle。例如: @@ -39,7 +56,7 @@ Ubuntu 16.04.4 LTS操作系统,CPU版本安装: pip install paddlepaddle==1.6.0 -i https://mirrors.aliyun.com/pypi/simple/ ``` -##### 克隆量化训练所需的代码库 +#### 克隆量化训练所需的代码库 克隆[PaddlePaddle/models](https://github.com/PaddlePaddle/models)到本地,并进入models/PaddleSlim路径。 @@ -48,12 +65,13 @@ git clone https://github.com/PaddlePaddle/models.git cd models/PaddleSlim ``` -##### 数据准备 -###### 训练数据准备 +#### 准备数据和模型 + +##### 训练数据准备 参考[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#data-preparation)中的数据准备教程,下载训练数据,并且保存到PaddleSlim/data路径下。 -###### 预训练模型准备 +##### 预训练模型准备 参考/models/PaddleSlim/run.sh脚本, 从[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances)下载MobileNetV1的预训练模型,并保存到PaddleSlim/pretrain路径下。 @@ -84,8 +102,7 @@ cd models/PaddleSlim 在`compress.py`中定义了执行压缩任务需要的所有模型相关的信息,这里对几个关键的步骤进行简要介绍: -###### 目标网络的定义 - +**目标网络的定义** compress.py的以下代码片段定义了train program, 这里train program只有前向计算操作。 ```python out = model.net(input=image, class_dim=args.class_dim) @@ -103,7 +120,7 @@ val_program = fluid.default_main_program().clone() 定义完目标网络结构,需要对其初始化,并根据需要加载预训练模型。 
-###### 定义feed_list和fetch_list +**定义feed_list和fetch_list** 对于train program, 定义train_feed_list用于指定从train data reader中取的数据feed给哪些variable。定义train_fetch_list用于指定在训练时,需要在log中展示的结果。如果需要在训练过程中在log中打印accuracy信心,则将('acc_top1', acc_top1.name)添加到train_fetch_list中即可。 ```python train_feed_list = [('image', image.name), ('label', label.name)] @@ -119,7 +136,7 @@ val_feed_list = [('image', image.name), ('label', label.name)] val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)] ``` -###### Compressor和量化配置文件 +**Compressor和量化配置文件** `compress.py`主要使用Compressor和yaml文件完成对模型的量化训练工作。Compressor类的定义如下: ```python class Compressor(object): @@ -192,7 +209,7 @@ compressor: > > 3)**目前,Paddle-Lite仅支持运行weight量化方式使用`abs_max`且activation量化方式使用`moving_average_abs_max`或`range_abs_max`产出的量化模型**。 -##### 执行int8量化训练 +#### 执行量化训练 修改run.sh,即注释掉`# enable GC strategy`与`# for sensitivity filter pruning`之间的内容并打开`#for quantization`相关的脚本命令(所需打开注释的命令如下所示)。 @@ -214,56 +231,13 @@ python compress.py \ * int8目录: 参数范围为int8范围且参数数据类型为int8的量化模型。 * mobile目录:参数特点与int8目录相同且兼容paddle-mobile的量化模型(目前paddle-mobile已升级为Paddle-Lite)。 -### 训练后量化 - -下面以MobileNetV1为例,介绍使用训练后量化方法产出量化模型。关于训练后量化的原理和详细使用方法,请参考[文档](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)。 - -> 该示例的代码放在[models/PaddleSlim/quant_low_level_api/](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)目录下。如果需要执行该示例,首先clone下来[models](https://github.com/PaddlePaddle/models.git),安装具有训练后量化功能的PaddlePaddle。因为目前Lite支持支持对conv2d、depthwise_conv2d和mul量化,所以修改[run_post_training_quanzation.sh](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/run_post_training_quanzation.sh) 脚本,设置is_full_quantize=False,然后执行该脚本;执行结束后,量化模型保存在`mobilenetv1_int8_model`目录下。下面介绍详细步骤。 - -1)**准备模型和校准数据** - -安装PaddlePaddle的develop分支编译的whl包,准备已经训练好的FP32预测模型。 - -准备校准数据,文件结构如下。val文件夹中有100张图片,val_list.txt文件中包含图片的label。 -```bash -samples_100 -└──val -└──val_list.txt -``` - -2)**配置校准数据生成器** - 
-MobileNetV1的输入是图片和标签,所以配置读取校准数据的sample_generator,每次返回一张图片和一个标签。详细代码在[models/PaddleSlim/reader.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/reader.py)。 - -3)**调用训练后量化** - -调用训练后量化的核心代码如下,详细代码在[post_training_quantization.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/post_training_quantization.py)。 -``` python -place = fluid.CUDAPlace(0) if args.use_gpu == "True" else fluid.CPUPlace() -exe = fluid.Executor(place) -sample_generator = reader.val(data_dir=args.data_path) - -ptq = PostTrainingQuantization( - executor=exe, - sample_generator=sample_generator, - model_dir=args.model_dir, - model_filename=args.model_filename, - params_filename=args.params_filename, - batch_size=args.batch_size, - batch_nums=args.batch_nums, - algo=args.algo, - is_full_quantize=args.is_full_quantize == "True") -quantized_program = ptq.quantize() -ptq.save_quantized_model(args.save_model_path) -``` - -## 使用Paddle-Lite运行量化模型推理 +## 3 使用Paddle-Lite运行量化模型推理 -#### 使用模型优化工具对量化模型进行优化 +### 使用模型优化工具对量化模型进行优化 接下来,使用原始的量化模型生成适合在移动端直接部署的模型。 -参考[源码编译](../source_compile)配置编译环境,确保可以编译成功。参考[模型转化方法](../model_optimize_tool),首先编译model_optimize_tool工具,然后执行下面命令对量化训练的模型进行优化(注意,需要自行修改model_file、param_file和optimize_out)。 +参考[源码编译](source_compile)配置编译环境,确保可以编译成功。参考[模型转化方法](model_optimize_tool),首先编译model_optimize_tool工具,然后执行下面命令对量化训练的模型进行优化(注意,需要自行修改model_file、param_file和optimize_out)。 ```bash ./model_optimize_tool \ --model_file=mobilenet_v1_quant/float/model \ @@ -271,12 +245,11 @@ ptq.save_quantized_model(args.save_model_path) --optimize_out_type=naive_buffer \ --optimize_out=mobilenet_v1_quant_opt \ --valid_targets=arm \ ---prefer_int8_kernel=true ``` 如前所述,量化训练后,float目录下的模型参数范围为int8,但参数数据类型仍为float32类型,这样确实没有起到模型参数压缩的效果。但是,经过model\_optimize\_tool工具优化后对应的量化参数均会以int8类型重新存储达到参数压缩的效果,且模型结构也被优化(如进行了各种operator fuse操作)。 -#### 在手机端准备量化模型文件 +### 在手机端准备量化模型文件 使用如下命令将mobilenet_v1_quant_opt目录下的量化模型文件导入到手机端: @@ -284,9 +257,9 @@ 
ptq.save_quantized_model(args.save_model_path) adb push mobilenet_v1_quant_opt /data/local/tmp ``` -#### 使用mobilenetv1\_light\_api运行优化后的量化模型 +### 使用mobilenetv1\_light\_api运行优化后的量化模型 -参考[源码编译](../source_compile)配置编译环境后,在Paddle-Lite执行如下命令获取轻量级API的demo: +参考[源码编译](source_compile)配置编译环境后,在Paddle-Lite执行如下命令获取轻量级API的demo: ```bash cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light @@ -314,9 +287,9 @@ Output[700]: 0.002509 Output[800]: 0.000538 Output[900]: 0.000969 ``` -在C++中使用Paddle-Lite API的方法请猛戳[此处](../cpp_demo),用户也可参考[mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc)的代码示例。 +在C++中使用Paddle-Lite API的方法请猛戳[此处](../demo_guides/cpp_demo),用户也可参考[mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc)的代码示例。 -### FAQ +## FAQ **问题**:Compiled with WITH_GPU, but no GPU found in runtime diff --git a/docs/user_guides/paddle_mobile.md b/docs/user_guides/paddle_mobile.md new file mode 100644 index 0000000000000000000000000000000000000000..43d17db7be4935b11ff0101e06e1f06998e9f532 --- /dev/null +++ b/docs/user_guides/paddle_mobile.md @@ -0,0 +1,7 @@ +# paddle-mobile 编译 + +详情可以参考 [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/mobile) + +要切换 paddle-mobile 编译,cmake 需要加上 **-DWITH_PADDLE_MOBILE=ON** 开关,其余 flag 请参考上面文档添加到后面 + +所有其他选项跟 paddle-mobile 原始操作完全一致 diff --git a/docs/user_guides/post_quant_no_data.md b/docs/user_guides/post_quant_no_data.md new file mode 100644 index 0000000000000000000000000000000000000000..4068249ff7544f42c5f2643c971eb003836b1f59 --- /dev/null +++ b/docs/user_guides/post_quant_no_data.md @@ -0,0 +1,99 @@ +# 模型量化-无校准数据训练后量化 + +本文首先简单介绍无校准数据训练后量化,然后说明产出量化模型,最后阐述量化模型预测。 + +## 1 简介 + +无校准数据训练后量化,将模型中特定OP的权重从FP32类型量化成INT8/16类型,可以减小预测模型的大小。使用该量化模型预测,首先将INT8/16类型的权重反量化成FP32类型,然后再进行预测。 + +使用条件: +* 有训练好的预测模型 + +使用步骤: +* 
产出量化模型:使用PaddlePaddle调用无校准数据训练后量化接口,产出量化模型 +* 量化模型预测:使用PaddleLite加载量化模型进行预测推理 + +优点: +* 权重量化成INT16类型,模型精度不受影响,模型大小为原始的1/2 +* 权重量化成INT8类型,模型精度会受到影响,模型大小为原始的1/4 + +缺点: +* 暂无 + +## 2 产出量化模型 + +大家可以使用PaddlePaddle调用无校准数据训练后量化接口,得到量化模型。 + +### 2.1 安装PaddlePaddle + +参考PaddlePaddle[官网](https://www.paddlepaddle.org.cn/install/quick),安装PaddlePaddle CPU/GPU 1.7版本。 + +### 2.2 准备模型 + +准备已经训练好的FP32预测模型,即 `save_inference_model()` 保存的模型。 + +### 2.3 调用无校准数据训练后量化 + +对于调用无校准数据训练后量化,首先给出一个例子。 + +```python +from paddle.fluid.contrib.slim.quantization import WeightQuantization + +model_dir = path/to/fp32_model_params +save_model_dir = path/to/save_model_path +weight_quant = WeightQuantization(model_dir=model_dir) +weight_quant.quantize_weight_to_int(save_model_dir=save_model_dir, + weight_bits=16, + quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) +``` + +对于调用无校准数据训练后量化,以下对api接口进行详细介绍。 + +```python +class WeightQuantization(model_dir, model_filename=None, params_filename=None) +``` +参数说明如下: +* model_dir(str):待量化模型的路径,其中保存模型文件和权重文件。 +* model_filename(str, optional):待量化模型的模型文件名,如果模型文件名不是`__model__`,则需要使用model_filename设置模型文件名。 +* params_filename(str, optional):待量化模型的权重文件名,如果所有权重保存成一个文件,则需要使用params_filename设置权重文件名。 + +```python +WeightQuantization.quantize_weight_to_int(save_model_dir, + save_model_filename=None, + save_params_filename=None, + quantizable_op_type=['conv2d', 'mul'], + weight_bits=8, + threshold_rate=0.0) +``` +参数说明如下: +* save_model_dir(str):保存量化模型的路径。 +* save_model_filename(str, optional):如果save_model_filename等于None,则模型的网络结构保存到__model__文件,如果save_model_filename不等于None,则模型的网络结构保存到特定的文件。默认为None。 +* save_params_filename(str, optional):如果save_params_filename等于None,则模型的参数分别保存到一系列文件中,如果save_params_filename不等于None,则模型的参数会保存到一个文件中,文件名为设置的save_params_filename。默认为None。 +* quantizable_op_type(list[str]): 需要量化的op类型,默认是`['conv2d', 'mul']`,列表中的值可以是任意支持量化的op类型 `['conv2d', 'depthwise_conv2d', 'mul']`。 +* weight_bits(int, optional):权重量化保存的比特数,可以是8~16,一般设置为8/16。默认为8。 + + +## 3 量化模型预测 
+ +目前,对于无校准数据训练后量化产出的量化模型,不支持PaddlePaddle加载执行,只能使用PaddleLite进行预测部署。 + +很简单,首先使用PaddleLite提供的模型转换工具(opt)将量化模型转换成移动端预测的模型,然后加载转换后的模型进行预测部署。 + +注意,PaddleLite 2.3版本才支持无校准数据训练后量化产出的量化,所以转换工具和预测库必须是2.3及之后的版本。 + +### 3.1 模型转换 + +参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 + +参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具。 +比如在安卓手机ARM端进行预测,模型转换的命令为: +```bash +./opt --model_dir=./mobilenet_v1_quant \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_quant_opt \ + --valid_targets=arm +``` + +### 3.2 量化模型预测 + +和FP32模型一样,转换后的量化模型可以在Android/IOS APP中加载预测,建议参考[C++ Demo](../demo_guides/cpp_demo)、[Java Demo](../demo_guides/java_demo)、[Android/IOS Demo](../demo_guides/android_app_demo)。 diff --git a/docs/user_guides/post_quant_with_data.md b/docs/user_guides/post_quant_with_data.md new file mode 100644 index 0000000000000000000000000000000000000000..0044b47610a2a211859bdc42f83f1921a681d50b --- /dev/null +++ b/docs/user_guides/post_quant_with_data.md @@ -0,0 +1,202 @@ +# 模型量化-有校准数据训练后量化 + +本文首先简单介绍有校准数据训练后量化,然后说明产出量化模型、量化模型预测,最后给出一个使用示例。 +如果想快速上手,大家可以先参考使用示例,再查看详细使用方法。 + +## 1 简介 + +有校准数据训练后量化,使用少量校准数据计算量化因子,可以快速得到量化模型。使用该量化模型进行预测,可以减少计算量、降低计算内存、减小模型大小。 + +有校准数据训练后量化中,有两种计算量化因子的方法,非饱和量化方法和饱和量化方法。非饱和量化方法计算整个Tensor的绝对值最大值`abs_max`,将其映射为127。饱和量化方法使用KL散度计算一个合适的阈值`T` (`0 | --arm_abi |必选,选择编译的arm版本,其中`armv7hf`为ARMLinux编译时选用| `armv8`、`armv7`、`armv7hf`(仅`armlinux`支持) | | --arm_lang |arm_os=android时必选,选择编译器 | `gcc`、`clang`(`clang`当前暂不支持) | | --android_stl |arm_os=android时必选,选择静态链接STL或动态链接STL | `c++_static`、`c++_shared`| -| --build_java | 可选,是否编译java预测库(默认为OFF) | `ON`、`OFF` | +| --build_java | 可选,是否编译java预测库(默认为ON) | `ON`、`OFF` | | --build_extra | 可选,是否编译全量预测库(默认为OFF)。详情可参考[预测库说明](./library.html)。 | `ON`、`OFF` | | target |必选,选择编译模式,`tiny_publish`为编译移动端部署库、`full_publish`为带依赖的移动端部署库、`test`为移动端单元测试、`ios`为编译ios端`tiny_publish` | `tiny_publish`、`full_publish`、`test`、 `ios` | @@ -278,7 +282,6 @@ git checkout --build_extra=OFF \ --arm_lang=gcc \ 
--android_stl=c++_static \ - --build_extra=OFF \ tiny_publish ``` ##### IOS @@ -306,7 +309,6 @@ sudo xcode-select -s /Applications/Xcode.app/Contents/Developer --arm_os=armlinux \ --arm_abi=armv7hf \ --arm_lang=gcc \ - --build_extra=OFF \ tiny_publish ``` - `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 @@ -321,7 +323,6 @@ sudo xcode-select -s /Applications/Xcode.app/Contents/Developer --build_extra=OFF \ --arm_lang=gcc \ --android_stl=c++_static \ - --build_extra=OFF \ full_publish ``` ##### ARMLinux diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..8f8aeb6af124bc4805c281e22e39cca51b507651 --- /dev/null +++ b/docs/user_guides/tutorial.md @@ -0,0 +1,54 @@ +# 使用流程 + +Lite是一种轻量级、灵活性强、易于扩展的高性能的深度学习预测框架,它可以支持诸如ARM、OpenCL、NPU等等多种终端,同时拥有强大的图优化及预测加速能力。如果您希望将Lite框架集成到自己的项目中,那么只需要如下几步简单操作即可。 + +## 一. 准备模型 + +Lite框架目前支持的模型结构为[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)深度学习框架产出的模型格式。因此,在您开始使用 Lite 框架前您需要准备一个由PaddlePaddle框架保存的模型。 +如果您手中的模型是由诸如Caffe2、Tensorflow等框架产出的,那么我们推荐您使用 [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) 工具进行模型格式转换。 + +## 二. 模型优化 + +Lite框架拥有强大的加速、优化策略及实现,其中包含诸如量化、子图融合、Kernel优选等等优化手段,为了方便您使用这些优化策略,我们提供了[opt](model_optimize_tool)帮助您轻松进行模型优化。优化后的模型更轻量级,耗费资源更少,并且执行速度也更快。 + +opt的详细介绍,请您参考 [模型优化方法](model_optimize_tool) 。 + +使用opt,您只需编译后在开发机上执行以下代码: + +``` shell +$ cd +$ cd build.opt/lite/api/ +$ ./opt \ + --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=(arm|opencl|x86) +``` + +其中,optimize_out为您希望的优化模型的输出路径。optimize_out_type则可以指定输出模型的序列化方式,其目前支持Protobuf与Naive Buffer两种方式,其中Naive Buffer是一种更轻量级的序列化/反序列化实现。如果你需要使用Lite在mobile端进行预测,那么您需要设置optimize_out_type=naive_buffer。 + +## 三. 使用Lite框架执行预测 + +在上一节中,我们已经通过`opt`获取到了优化后的模型,使用优化模型进行预测也十分的简单。为了方便您的使用,Lite进行了良好的API设计,隐藏了大量您不需要投入时间研究的细节。您只需要简单的五步即可使用Lite在移动端完成预测(以C++ API进行说明): + + +1. 
声明MobileConfig。在config中可以设置**从文件加载模型**也可以设置**从memory加载模型**。从文件加载模型需要声明模型文件路径,如 `config.set_model_from_file(FLAGS_model_file)` ;从memory加载模型方法现只支持加载优化后模型的naive buffer,实现方法为: +`void set_model_from_buffer(model_buffer) ` + +2. 创建Predictor。Predictor即为Lite框架的预测引擎,为了方便您的使用我们提供了 `CreatePaddlePredictor` 接口,你只需要简单的执行一行代码即可完成预测引擎的初始化,`std::shared_ptr predictor = CreatePaddlePredictor(config)` 。 +3. 准备输入。执行predictor->GetInput(0)您将会获得输入的第0个field,同样的,如果您的模型有多个输入,那您可以执行 `predictor->GetInput(i)` 来获取相应的输入变量。得到输入变量后您可以使用Resize方法指定其具体大小,并填入输入值。 +4. 执行预测。您只需要执行 `predictor->Run()` 即可使用Lite框架完成预测。 +5. 获取输出。与输入类似,您可以使用 `predictor->GetOutput(i)` 来获得输出的第i个变量。您可以通过其shape()方法获取输出变量的维度,通过 `data()` 模板方法获取其输出值。 + + + + +## 四. Lite API + +为了方便您的使用,我们提供了C++、Java、Python三种API,并且提供了相应的api的完整使用示例:[C++完整示例](../demo_guides/cpp_demo)、[Java完整示例](../demo_guides/java_demo)、[Python完整示例](../demo_guides/cuda),您可以参考示例中的说明快速了解C++/Java/Python的API使用方法,并集成到您自己的项目中去。需要说明的是,为了减少第三方库的依赖、提高Lite预测框架的通用性,在移动端使用Lite API您需要准备Naive Buffer存储格式的模型,具体方法可参考第2节`模型优化`。 + +## 五. 
测试工具 + +为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug#debug) 和 [Profile工具](debug#profiler)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 diff --git a/docs/user_guides/x2paddle.md b/docs/user_guides/x2paddle.md new file mode 100644 index 0000000000000000000000000000000000000000..7e44ba980cc6836189d3f1a03bbbf29c8d7bd5c1 --- /dev/null +++ b/docs/user_guides/x2paddle.md @@ -0,0 +1,69 @@ +# 模型转换工具 X2Paddle + +X2Paddle可以将caffe、tensorflow、onnx模型转换成Paddle支持的模型。 + +[X2Paddle](https://github.com/PaddlePaddle/X2Paddle)支持将Caffe/TensorFlow模型转换为PaddlePaddle模型。目前X2Paddle支持的模型参考[x2paddle_model_zoo](https://github.com/PaddlePaddle/X2Paddle/blob/develop/x2paddle_model_zoo.md)。 + + +## 多框架支持 + +|模型 | caffe | tensorflow | onnx | +|---|---|---|---| +|mobilenetv1 | Y | Y | | +|mobilenetv2 | Y | Y | Y | +|resnet18 | Y | Y | | +|resnet50 | Y | Y | Y | +|mnasnet | Y | Y | | +|efficientnet | Y | Y | Y | +|squeezenetv1.1 | Y | Y | Y | +|shufflenet | Y | Y | | +|mobilenet_ssd | Y | Y | | +|mobilenet_yolov3 | | Y | | +|inceptionv4 | | | | +|mtcnn | Y | Y | | +|facedetection | Y | | | +|unet | Y | Y | | +|ocr_attention | | | | +|vgg16 | | | | + + +## 安装 + +``` +pip install x2paddle +``` + +安装最新版本,可使用如下安装方式 + +``` +pip install git+https://github.com/PaddlePaddle/X2Paddle.git@develop +``` + +## 使用 + +### Caffe + +``` +x2paddle --framework caffe \ + --prototxt model.proto \ + --weight model.caffemodel \ + --save_dir paddle_model +``` + +### TensorFlow + +``` +x2paddle --framework tensorflow \ + --model model.pb \ + --save_dir paddle_model +``` + +## 转换结果说明 + +在指定的`save_dir`下生成两个目录 +1. inference_model : 模型结构和参数均序列化保存的模型格式 +2. 
model_with_code : 保存了模型参数文件和模型的python代码 + +## 问题反馈 + +X2Paddle使用时存在问题时,欢迎您将问题或Bug报告以[Github Issues](https://github.com/PaddlePaddle/X2Paddle/issues)的形式提交给我们,我们会实时跟进。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index bac6f80c4721e0c5de201eebfe7e6a39a0bdc73a..a39c0a02681f16578ae81c74d83979fe0c57e6c6 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -12,6 +12,7 @@ message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") +message(STATUS "LITE_WITH_ARM_LANG:\t${LITE_WITH_ARM_LANG}") set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install") set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) @@ -64,6 +65,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_NPU) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.npu") endif(LITE_WITH_NPU) + if (LITE_WITH_XPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") + endif(LITE_WITH_XPU) if (LITE_WITH_FPGA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") endif(LITE_WITH_FPGA) @@ -79,7 +83,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") if (LITE_WITH_PYTHON) add_custom_target(publish_inference_python_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so") + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + 
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND python setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/") @@ -91,6 +104,7 @@ if (LITE_WITH_PYTHON) endif() add_dependencies(publish_inference_python_lib lite_pybind) add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) endif() @@ -123,7 +137,29 @@ if (LITE_WITH_X86) endif() if(LITE_WITH_CUDA) - add_dependencies(publish_inference paddle_full_api_shared) + add_custom_target(publish_inference_cuda_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cuda_cxx_lib) + + 
add_custom_target(publish_inference_cuda_cxx_demos ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos) + add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared) endif(LITE_WITH_CUDA) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) @@ -135,22 +171,23 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - #COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) if(NOT IOS) - #add_dependencies(publish_inference_cxx_lib model_optimize_tool) add_dependencies(publish_inference_cxx_lib paddle_code_generator) add_dependencies(publish_inference_cxx_lib bundle_full_api) add_dependencies(publish_inference_cxx_lib bundle_light_api) add_dependencies(publish_inference_cxx_lib test_model_bin) + add_dependencies(publish_inference_cxx_lib benchmark_bin) if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux") add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) add_dependencies(publish_inference 
paddle_light_api_shared) add_custom_command(TARGET publish_inference_cxx_lib - COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib) + COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/benchmark_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" + ) endif() add_dependencies(publish_inference publish_inference_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -185,6 +222,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference tiny_publish_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() @@ -281,6 +319,10 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/opencl" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel" "${INFER_LITE_PUBLISH_ROOT}/opencl" ) + if (NOT LITE_ON_TINY_PUBLISH) add_dependencies(publish_inference_cxx_lib publish_inference_opencl) + else() + add_dependencies(tiny_publish_cxx_lib publish_inference_opencl) + endif() endif() endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index f7f74ab5822a1305e3e8d24cf36a0a458a6494ff..b360b476e0c99a62ea39a70241b548bddf5a872a 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -45,7 +45,11 @@ else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + set(TARGET_COMIPILE_FLAGS "-fdata-sections") + if (NOT (ARM_TARGET_LANG STREQUAL 
"clang")) #gcc + set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") + endif() + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}") add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency @@ -78,6 +82,7 @@ message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") +message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") @@ -143,38 +148,40 @@ if(WITH_TESTING) --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz) if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) - add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) - lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) - add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) - lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) - add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz) - 
lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) - add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) - lite_cc_test(test_resnet50_lite_x86 SRCS test_resnet50_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) - add_dependencies(test_resnet50_lite_x86 extern_lite_download_resnet50_tar_gz) - lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) - add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + if(LITE_WITH_X86) + lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) + add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) + lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) + add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) + lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) + add_dependencies(test_mobilenetv2_lite_x86 
extern_lite_download_mobilenet_v2_relu_tar_gz) + lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) + add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) + lite_cc_test(test_resnet50_lite_x86 SRCS test_resnet50_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + add_dependencies(test_resnet50_lite_x86 extern_lite_download_resnet50_tar_gz) + lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) + add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + endif() if(LITE_WITH_BM) - lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc + lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) @@ -229,6 +236,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) + # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # DEPS ${lite_model_test_DEPS}) @@ -295,6 +303,11 @@ if (LITE_ON_TINY_PUBLISH) return() endif() + +# add library for opt_base +lite_cc_library(opt_base SRCS opt_base.cc 
cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) +add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) + if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc @@ -330,6 +343,30 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -341,6 +378,7 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} @@ -352,6 +390,16 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS 
${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index c1766772f8aaa417c3da1d72f2692c10c10194b4..d46e9f7cdec1cf422340ff11165ee166c7520bab 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -25,7 +25,11 @@ if (NOT LITE_ON_TINY_PUBLISH) endif() else() add_library(paddle_lite_jni SHARED "") - set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + set(TARGET_COMIPILE_FLAGS "-fdata-sections") + if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc + set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") + endif() + set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) add_dependencies(paddle_lite_jni op_list_h kernel_list_h) if (LITE_WITH_NPU) diff --git a/lite/api/android/jni/native/paddle_lite_jni.h b/lite/api/android/jni/native/paddle_lite_jni.h index f447ce105a1ca7b2d94a00287d2b699f920a09af..983f108a869db91c7cfeb9eb539286e2a3f0bf99 100644 --- a/lite/api/android/jni/native/paddle_lite_jni.h +++ b/lite/api/android/jni/native/paddle_lite_jni.h @@ -17,11 +17,6 @@ #include /* Header for class com_baidu_paddle_lite_PaddlePredictor */ #include "lite/api/paddle_lite_factory_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif #ifdef __cplusplus extern "C" { #endif diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java index e150f98f22113ef6bcedd5e9882e0bd2a6378c97..fe05c4302c71b439ae125e165244146726b3bf3d 100644 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java 
+++ b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java @@ -78,7 +78,7 @@ public class MobileConfig extends ConfigBase { * * @return liteModelFile */ - public String getModelFile() { + public String getModelFromFile() { return liteModelFile; } @@ -96,7 +96,7 @@ public class MobileConfig extends ConfigBase { * * @return liteModelBuffer */ - public String getModelBuffer() { + public String getModelFromBuffer() { return liteModelBuffer; } diff --git a/lite/api/apis_test.cc b/lite/api/apis_test.cc index bb852297d11a8862460ed6f12e007d727aca9428..917f2a73a95c3fbd7464fd40824b833993a2a18c 100644 --- a/lite/api/apis_test.cc +++ b/lite/api/apis_test.cc @@ -21,9 +21,6 @@ #include #include "lite/api/cxx_api.h" #include "lite/api/light_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/core/mir/pass_registry.h" DEFINE_string(model_dir, "", ""); diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 718dbe44296f2d197efc5b567cf0cc211835d176..d53de7bf2ed00fed70bbd1f70729a051e5d7203b 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -23,31 +23,28 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/core/device_info.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" DEFINE_string(model_dir, "", - "the path of the model, set model_dir when the model is no " - "combined formate. This option will be ignored if model_file " - "and param_file are exist."); -DEFINE_string(model_file, + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, "", - "the path of model file, set model_file when the model is " - "combined formate."); -DEFINE_string(param_file, + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, "", - "the path of param file, set param_file when the model is " + "the filename of param file, set param_file when the model is " "combined formate."); DEFINE_string(input_shape, "1,3,224,224", "set input shapes according to the model, " "separated by colon and comma, " - "such as 1,3,244,244:1,3,300,300."); + "such as 1,3,244,244"); +DEFINE_string(input_img_path, "", "the path of input image"); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(power_mode, @@ -80,12 +77,13 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } -void OutputOptModel(const std::string& save_optimized_model_dir, - const std::vector>& input_shapes) { +void OutputOptModel(const std::string& save_optimized_model_dir) { lite_api::CxxConfig config; config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); + if (!FLAGS_model_filename.empty() && !FLAGS_param_filename.empty()) { + config.set_model_file(FLAGS_model_dir + "/" + FLAGS_model_filename); + config.set_param_file(FLAGS_model_dir + "/" + FLAGS_param_filename); + } std::vector vaild_places = { Place{TARGET(kARM), PRECISION(kFloat)}, }; @@ -109,7 +107,7 @@ void OutputOptModel(const std::string& save_optimized_model_dir, } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::vector>& input_shapes, +void Run(const std::vector& input_shape, const std::string& model_dir, const std::string model_name) { // set config and create predictor @@ -121,17 +119,27 @@ void Run(const std::vector>& input_shapes, auto predictor = lite_api::CreatePaddlePredictor(config); // set input - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < 
input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; - } + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize(input_shape); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (size_t i = 0; i < input_shape.size(); ++i) { + input_num *= input_shape[i]; + } + if (FLAGS_input_img_path.empty()) { for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; } + } else { + std::fstream fs(FLAGS_input_img_path); + if (!fs.is_open()) { + LOG(FATAL) << "open input image " << FLAGS_input_img_path << " error."; + } + for (int i = 0; i < input_num; i++) { + fs >> input_data[i]; + } + // LOG(INFO) << "input data:" << input_data[0] << " " << + // input_data[input_num-1]; } // warmup @@ -178,25 +186,12 @@ int main(int argc, char** argv) { exit(0); } + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } std::size_t found = FLAGS_model_dir.find_last_of("/"); std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "opt2"; - - auto split_string = - [](const std::string& str_in) -> std::vector { - std::vector str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return str_out; - }; + std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2"; auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; @@ -214,22 +209,18 @@ int main(int argc, char** argv) { return shape; }; - std::vector str_input_shapes = split_string(FLAGS_input_shape); - std::vector> input_shapes; - for (size_t i = 0; i < str_input_shapes.size(); ++i) { - input_shapes.push_back(get_shape(str_input_shapes[i])); - } + std::vector input_shape = get_shape(FLAGS_input_shape); // Output optimized model if needed if 
(FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir, input_shapes); + paddle::lite_api::OutputOptModel(save_optimized_model_dir); } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK // Run inference using optimized model std::string run_model_dir = FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shapes, run_model_dir, model_name); + paddle::lite_api::Run(input_shape, run_model_dir, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index f6f7ec75e65ff54e3f3642822e51057d3522ae3a..556a9e0af01854ff5c57a14dade72b81ed255964 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc, inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + + const std::vector quant_dequant_op = { + "fake_quantize_abs_max", + "fake_quantize_range_abs_max", + "fake_quantize_moving_average_abs_max", + "fake_quantize_dequantize_moving_average_abs_max", + "fake_dequantize_max_abs", + "fake_channel_wise_dequantize_max_abs"}; + bool is_quantized_model = false; + for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model; + ++i) { + auto *block_desc = program_desc_.GetBlock(i); + for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) { + auto *op_desc = block_desc->GetOp(j); + std::string op_type = op_desc->Type(); + if (std::find(quant_dequant_op.begin(), + quant_dequant_op.end(), + op_type) != quant_dequant_op.end()) { + is_quantized_model = true; + } + } + } + if (is_quantized_model) { + inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)}); + } + Program program(desc, scope_, inner_places); core::KernelPickFactor factor; @@ -333,16 +359,16 @@ lite::Tensor *Predictor::GetInputByName(const std::string &name) { } } -#ifdef LITE_WITH_TRAIN -void 
Predictor::FeedVars(const std::vector &tensors) { - auto var = scope_->FindVar("feed"); - auto &feed_list = *(var->GetMutable>()); - feed_list.resize(tensors.size()); +// #ifdef LITE_WITH_TRAIN +// void Predictor::FeedVars(const std::vector &tensors) { +// auto var = scope_->FindVar("feed"); +// auto &feed_list = *(var->GetMutable>()); +// feed_list.resize(tensors.size()); - for (size_t i = 0; i < tensors.size(); ++i) - feed_list[i].ShareDataWith(tensors[i]); -} -#endif +// for (size_t i = 0; i < tensors.size(); ++i) +// feed_list[i].ShareDataWith(tensors[i]); +// } +// #endif } // namespace lite } // namespace paddle diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 504710d9fa29420b8762f31e0c675b59c6c626bd..e63893cb91e112beb6be50bd661a57b9738e5fb1 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -101,14 +101,14 @@ class LITE_API Predictor { bool record_info = false); void SaveOpKernelInfo(const std::string& model_dir); -#ifdef LITE_WITH_TRAIN - void Run(const std::vector& tensors) { - FeedVars(tensors); - program_->Run(); - } - - void FeedVars(const std::vector& tensors); -#endif + // #ifdef LITE_WITH_TRAIN + // void Run(const std::vector& tensors) { + // FeedVars(tensors); + // program_->Run(); + // } + + // void FeedVars(const std::vector& tensors); + // #endif private: Optimizer optimizer_; diff --git a/lite/api/cxx_api_bin.cc b/lite/api/cxx_api_bin.cc index 8c929e9c8700a65c868e2facd763b0ec36719e23..eec17cc30e308e7169b7d8c394c0e47eee0c1c3e 100644 --- a/lite/api/cxx_api_bin.cc +++ b/lite/api/cxx_api_bin.cc @@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) { int main(int argc, char** argv) { CHECK_EQ(argc, 3) << "usage: ./cmd "; - paddle::lite::Run(argv[1], std::stoi(argv[2])); + paddle::lite::Run(argv[1], atoi(argv[2])); return 0; } diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 81ea60eac66849f8ce42fb8cb210226d18bbfa9b..972210c8f9ea05ba1b041382c43efad64aeacc1b 100644 --- a/lite/api/cxx_api_impl.cc +++ 
b/lite/api/cxx_api_impl.cc @@ -35,8 +35,16 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { Env::Init(); #endif auto places = config.valid_places(); - raw_predictor_.Build(config, places); - + std::vector passes{}; + auto use_layout_preprocess_pass = + config.model_dir().find("OPENCL_PRE_PRECESS"); + VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; + if (places[0].target == TARGET(kOpenCL) && + use_layout_preprocess_pass != std::string::npos) { + passes = {"type_layout_cast_preprocess_pass"}; + VLOG(1) << "add pass:" << passes[0]; + } + raw_predictor_.Build(config, places, passes); mode_ = config.power_mode(); threads_ = config.threads(); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index 29d8f4f29ab822f8c9601bbd63a3626abbbf1818..b641973a15b2e6abc1cf4c999d759271f7522638 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -13,6 +13,12 @@ // limitations under the License. #include "lite/api/light_api.h" +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#ifndef LITE_ON_TINY_PUBLISH +#include "lite/api/paddle_use_passes.h" +#endif + #include namespace paddle { @@ -25,6 +31,8 @@ void LightPredictor::Build(const std::string& lite_model_file, } else { LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + + DequantizeWeight(); BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 3965843250abe45c43490bdbb4aaed58915e0908..cdf5b7fb06df35b2e7fb72fc4e33ccb721a0f7f7 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -58,6 +58,7 @@ void LightPredictorImpl::Run() { std::shared_ptr LightPredictorImpl::Clone() { LOG(FATAL) << "The Clone API is not supported in LigthPredictor"; + return nullptr; } std::string LightPredictorImpl::GetVersion() const { return lite::version(); } diff --git a/lite/api/light_api_shared.cc b/lite/api/light_api_shared.cc 
index 557804bfa56787fa8a83bfbfc3046df08be010f8..cfe3d9de09a646e33c4a116bb3cd087d28aa24c2 100644 --- a/lite/api/light_api_shared.cc +++ b/lite/api/light_api_shared.cc @@ -12,11 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif namespace paddle { namespace lite_api { diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc index 7d322530f624c43737018d8ece98fb24d48bc16a..b49ff8b80c936b93acd630c6e0cde03df8b22ee4 100644 --- a/lite/api/light_api_test.cc +++ b/lite/api/light_api_test.cc @@ -15,9 +15,6 @@ #include "lite/api/light_api.h" #include #include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" DEFINE_string(optimized_model, "", ""); diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc index addd512eb0039c43edeca562b8f568528aab76f9..12559d171ff3df808cf252e8e09c652246902abf 100644 --- a/lite/api/lite_multithread_test.cc +++ b/lite/api/lite_multithread_test.cc @@ -16,9 +16,6 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" #include "lite/core/profile/timer.h" diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index bcc9644f81542ab6fb8a0badf8ecaea89fc8dedb..5342a36ec154b2bdde44fa72bc21e9d430ad4efe 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -53,9 +53,13 @@ void TestModel(const std::vector& valid_places, predictor.Run(); } - auto start = GetCurrentUS(); + double sum_duration = 0.0; // millisecond; for (int i = 0; i < FLAGS_repeats; ++i) { + auto start = 
GetCurrentUS(); predictor.Run(); + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + VLOG(1) << "run_idx:" << i << " " << duration << " ms"; } if (save_model) { @@ -68,8 +72,7 @@ void TestModel(const std::vector& valid_places, LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; + << ", spend " << sum_duration / FLAGS_repeats << " ms in average."; std::vector> ref; ref.emplace_back(std::vector( @@ -81,29 +84,63 @@ void TestModel(const std::vector& valid_places, auto* out = predictor.GetOutput(0); const auto* pdata = out->data(); int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); + + // Get target and check result + VLOG(1) << "valid_places.size():" << valid_places.size(); + for (int i = 0; i < valid_places.size(); ++i) { + auto p = valid_places[i]; + VLOG(1) << "valid_places[" << i << "]:" << p.DebugString(); + } + auto first_target = valid_places[0].target; + + if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { + ASSERT_EQ(out->dims().production(), 1000); + double eps = first_target == TARGET(kOpenCL) ? 
0.12 : 0.1; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); + VLOG(3) << diff; + EXPECT_LT(diff, eps); + } + } + } else { + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + double eps = 1e-6; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + EXPECT_NEAR(result, ref[i][j], eps); + } } } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - double eps = 1e-6; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - EXPECT_NEAR(result, ref[i][j], eps); + + // Get detailed result + size_t output_tensor_num = predictor.GetOutputNames().size(); + VLOG(1) << "output tensor num:" << output_tensor_num; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + auto* output_tensor = predictor.GetOutput(tidx); + VLOG(1) << "============= output tensor " << tidx << " =============\n"; + auto out_dims = output_tensor->dims(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; + + // print result + for (int i = 0; i < out_dims.production(); ++i) { + VLOG(2) << "output_tensor->data()[" << i + << "]:" << output_tensor->data()[i]; } } -#endif } #ifdef LITE_WITH_NPU @@ -130,7 +167,7 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, 
test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 012d6d48d9e6d3747f83a7f1089944bbaf359f71..465f82056c6bb80b706cfb7d875773d75735911b 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -54,9 +54,13 @@ void TestModel(const std::vector& valid_places, predictor.Run(); } - auto start = GetCurrentUS(); + double sum_duration = 0.0; // millisecond; for (int i = 0; i < FLAGS_repeats; ++i) { + auto start = GetCurrentUS(); predictor.Run(); + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + VLOG(1) << "run_idx:" << i << " " << duration << " ms"; } if (save_model) { @@ -69,8 +73,7 @@ void TestModel(const std::vector& valid_places, LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; + << ", spend " << sum_duration / FLAGS_repeats << " ms in average."; std::vector> ref; // i = 1 @@ -83,27 +86,63 @@ void TestModel(const std::vector& valid_places, auto* out = predictor.GetOutput(0); const auto* pdata = out->data(); int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); + + // Get target and check 
result + VLOG(1) << "valid_places.size():" << valid_places.size(); + for (int i = 0; i < valid_places.size(); ++i) { + auto p = valid_places[i]; + VLOG(1) << "valid_places[" << i << "]:" << p.DebugString(); + } + auto first_target = valid_places[0].target; + + if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { + ASSERT_EQ(out->dims().production(), 1000); + double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); + VLOG(3) << diff; + EXPECT_LT(diff, eps); + } + } + } else { + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + double eps = 1e-6; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + EXPECT_NEAR(result, ref[i][j], eps); + } } } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); + + // Get detailed result + size_t output_tensor_num = predictor.GetOutputNames().size(); + VLOG(1) << "output tensor num:" << output_tensor_num; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + auto* output_tensor = predictor.GetOutput(tidx); + VLOG(1) << "============= output tensor " << tidx << " =============\n"; + auto out_dims = output_tensor->dims(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << 
"output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; + + // print result + for (int i = 0; i < out_dims.production(); ++i) { + VLOG(2) << "output_tensor->data()[" << i + << "]:" << output_tensor->data()[i]; } } -#endif } #ifdef LITE_WITH_NPU @@ -130,7 +169,7 @@ TEST(MobileNetV2, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV2, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 190890da4c109f39cc52ca5209cd952f8937f780..b0f7a0479f0db91b816838f9d0ee1cc31b9b232a 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -17,9 +17,6 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" #include "lite/core/profile/timer.h" @@ -141,7 +138,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. / arg_num; diff --git a/lite/api/model_test_classify.cc b/lite/api/model_test_classify.cc new file mode 100644 index 0000000000000000000000000000000000000000..375d249476bf5323d69ea41c3f11d07e9c8bc711 --- /dev/null +++ b/lite/api/model_test_classify.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); +DEFINE_string(arg_name, "", "the arg name"); + +DEFINE_string(threshold, "0.5", "threshold value default 0.5f"); +DEFINE_string(in_txt, "", "input text"); +DEFINE_string(out_txt, "", "output text"); +DEFINE_string(label_file, "", "label file path"); +DEFINE_int32(topk, 1, "topk num"); + +namespace paddle { +namespace lite_api { + +void OutputOptModel(const std::string& load_model_dir, + const std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + 
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +std::vector load_labels(std::string label_path) { + FILE* fp = fopen(label_path.c_str(), "r"); + if (fp == nullptr) { + LOG(FATAL) << "load label file failed! " << label_path; + } + std::vector labels; + while (!feof(fp)) { + char str[1024]; + fgets(str, 1024, fp); + std::string str_s(str); + + if (str_s.length() > 0) { + for (int i = 0; i < str_s.length(); i++) { + if (str_s[i] == ' ') { + std::string strr = str_s.substr(i, str_s.length() - i - 1); + labels.push_back(strr); + i = str_s.length(); + } + } + } + } + fclose(fp); + return labels; +} + +void print_topk(const float* scores, + const int size, + const int topk, + const std::vector labels) { + std::vector> vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(scores[i], i); + } + std::partial_sort(vec.begin(), + vec.begin() + topk, + vec.end(), + std::greater>()); + + // print topk and score + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + fprintf(fp, "%d \n", topk); + for (int i = 0; i < topk; i++) { + float score = vec[i].first; + int index = vec[i].second; + fprintf(fp, "%d ", index); + fprintf(fp, "%f \n", score); + LOG(INFO) << i << ": " << index << " " << labels[index] << " " << score; + } + fclose(fp); +} + +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 0) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + bool flag_in 
= true; + bool flag_out = true; + if (FLAGS_in_txt == "") { + flag_in = false; + } + if (FLAGS_out_txt == "") { + flag_out = false; + } + printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out); + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + + FILE* fp_r = nullptr; + if (flag_in) { + fp_r = fopen(FLAGS_in_txt.c_str(), "r"); + } + for (int i = 0; i < input_num; ++i) { + if (flag_in) { + fscanf(fp_r, "%f\n", &input_data[i]); + } else { + input_data[i] = 1.f; + } + } + if (flag_in) { + fclose(fp_r); + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup_times + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() + << " ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; + + auto output = predictor->GetOutput(0); + auto out = output->data(); + auto output_shape = output->shape(); + int output_num = 1; + for (int i = 0; i < output_shape.size(); ++i) { + output_num *= output_shape[i]; + } + // classify + printf("load_labels \n"); + std::vector labels = load_labels(FLAGS_label_file); + printf("print_topk \n"); + print_topk(out, output_num, FLAGS_topk, labels); + LOG(INFO) << "output_num: " << output_num; + LOG(INFO) << "out " << out[0]; + LOG(INFO) << "out " << out[1]; + FILE* fp = nullptr; + if (flag_out) { + fp = 
fopen(FLAGS_out_txt.c_str(), "w"); + } + double sum1 = 0.f; + for (int i = 0; i < output_num; ++i) { + if (flag_out) { + fprintf(fp, "%f\n", out[i]); + } + sum1 += out[i]; + } + if (flag_out) { + fclose(fp); + } + printf("out mean: %f \n", sum1 / output_num); + + FILE* fp_w = fopen("time.txt", "a+"); + if (!fp_w) { + printf("open file failed \n"); + return; + } + fprintf(fp_w, + "model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n", + model_dir.c_str(), + thread_num, + ti.LapTimes().Avg(), + ti.LapTimes().Min(), + ti.LapTimes().Max()); + fclose(fp_w); + + // please turn off memory_optimize_pass to use this feature. + if (FLAGS_arg_name != "") { + auto arg_tensor = predictor->GetTensor(FLAGS_arg_name); + auto arg_shape = arg_tensor->shape(); + int arg_num = 1; + std::ostringstream os; + os << "{"; + for (int i = 0; i < arg_shape.size(); ++i) { + arg_num *= arg_shape[i]; + os << arg_shape[i] << ","; + } + os << "}"; + float sum = 0.; + std::ofstream out(FLAGS_arg_name + ".txt"); + for (size_t i = 0; i < arg_num; ++i) { + sum += arg_tensor->data()[i]; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; + } + LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() + << ", mean value is " << sum * 1. 
/ arg_num; + } +} +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + LOG(INFO) << "input shapes: " << FLAGS_input_shape; + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> input_shapes; + for (int i = 0; i < str_input_shapes.size(); ++i) { + LOG(INFO) << "input shape: " << str_input_shapes[i]; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + paddle::lite_api::Run( + input_shapes, + save_optimized_model_dir, + static_cast(FLAGS_power_mode), + FLAGS_threads, + FLAGS_repeats, + FLAGS_warmup); +#endif + return 0; +} diff 
--git a/lite/api/model_test_detection.cc b/lite/api/model_test_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9be12b2c78c623a2b2c9852850576cc11815bd3 --- /dev/null +++ b/lite/api/model_test_detection.cc @@ -0,0 +1,349 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); +DEFINE_string(arg_name, "", "the arg name"); + +DEFINE_string(threshold, "0.5", "threshold value default 0.5f"); +DEFINE_string(in_txt, "", "input text"); +DEFINE_string(out_txt, "", "output text"); +DEFINE_int32(orih, 1920, "input image height"); +DEFINE_int32(oriw, 1080, "input image width"); + +namespace paddle { +namespace lite_api { + +struct Object { + float x; + float y; + float width; + float height; + float class_id; + float prob; +}; + +void OutputOptModel(const std::string& load_model_dir, + const 
std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +void detect_choose(const float* dout, + std::vector dims, + const float thresh) { + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + for (int iw = 0; iw < dims[0]; iw++) { + const float* values = dout + iw * dims[1]; + if (values[1] > thresh) { // pro > 0.01 + fprintf(fp, "%f \n", values[0]); + fprintf(fp, "%f \n", values[1]); + fprintf(fp, "%f \n", values[2]); + fprintf(fp, "%f \n", values[3]); + fprintf(fp, "%f \n", values[4]); + fprintf(fp, "%f \n", values[5]); + } + } + fclose(fp); +} +void detect_object(const float* dout, + std::vector dims, + const float thresh, + int orih, + int oriw) { + std::vector objects; + for (int iw = 0; iw < dims[0]; iw++) { + Object object; + const float* values = dout + iw * dims[1]; + object.class_id = values[0]; + object.prob = values[1]; + object.x = values[2] * oriw; + object.y = values[3] * orih; + object.width = values[4] * oriw - object.x; + object.height = values[5] * orih - object.y; + objects.push_back(object); + } + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + for (int i = 0; i < objects.size(); ++i) { + Object object = objects.at(i); + if (object.prob > thresh && object.x > 0 && object.y > 0 && + object.width 
> 0 && object.height > 0) { + if (object.x >= oriw || object.width >= oriw || object.y >= orih || + object.height >= orih) + continue; + fprintf(fp, "%f \n", object.x); + fprintf(fp, "%f \n", object.y); + fprintf(fp, "%f \n", object.width); + fprintf(fp, "%f \n", object.height); + fprintf(fp, "%f \n", object.prob); + fprintf(fp, "%f \n", object.class_id); + LOG(INFO) << "object id: " << object.class_id << ", image size: " << oriw + << ", " << orih << ", detect object: " << object.prob + << ", location: x=" << object.x << ", y=" << object.y + << ", width=" << object.width << ", height=" << object.height; + } + } + fclose(fp); +} +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 0) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + bool flag_in = true; + bool flag_out = true; + if (FLAGS_in_txt == "") { + flag_in = false; + } + if (FLAGS_out_txt == "") { + flag_out = false; + } + printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out); + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + + FILE* fp_r = nullptr; + if (flag_in) { + fp_r = fopen(FLAGS_in_txt.c_str(), "r"); + } + for (int i = 0; i < input_num; ++i) { + if (flag_in) { + fscanf(fp_r, "%f\n", &input_data[i]); + } else { + input_data[i] = 1.f; + } + } + if (flag_in) { + fclose(fp_r); + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + 
float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup_times + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() + << " ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; + + auto output = predictor->GetOutput(0); + auto out = output->data(); + auto output_shape = output->shape(); + // detect + detect_object( + out, output_shape, atof(FLAGS_threshold.data()), FLAGS_orih, FLAGS_oriw); + // detect_choose(out, output_shape, atof(FLAGS_threshold.data())); + LOG(INFO) << "out " << out[0]; + LOG(INFO) << "out " << out[1]; + int output_num = 1; + for (int i = 0; i < output_shape.size(); ++i) { + output_num *= output_shape[i]; + } + LOG(INFO) << "output_num: " << output_num; + FILE* fp = nullptr; + if (flag_out) { + fp = fopen(FLAGS_out_txt.c_str(), "w"); + } + double sum1 = 0.f; + for (int i = 0; i < output_num; ++i) { + if (flag_out) { + fprintf(fp, "%f\n", out[i]); + } + sum1 += out[i]; + } + if (flag_out) { + fclose(fp); + } + + printf("out mean: %f \n", sum1 / output_num); + + FILE* fp_w = fopen("time.txt", "a+"); + if (!fp_w) { + printf("open file failed \n"); + return; + } + fprintf(fp_w, + "model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n", + model_dir.c_str(), + thread_num, + ti.LapTimes().Avg(), + ti.LapTimes().Min(), + ti.LapTimes().Max()); + fclose(fp_w); + + // please turn off memory_optimize_pass to use this feature. 
+ if (FLAGS_arg_name != "") { + auto arg_tensor = predictor->GetTensor(FLAGS_arg_name); + auto arg_shape = arg_tensor->shape(); + int arg_num = 1; + std::ostringstream os; + os << "{"; + for (int i = 0; i < arg_shape.size(); ++i) { + arg_num *= arg_shape[i]; + os << arg_shape[i] << ","; + } + os << "}"; + float sum = 0.; + std::ofstream out(FLAGS_arg_name + ".txt"); + for (size_t i = 0; i < arg_num; ++i) { + sum += arg_tensor->data()[i]; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; + } + LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() + << ", mean value is " << sum * 1. / arg_num; + } +} +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + LOG(INFO) << "input shapes: " << FLAGS_input_shape; + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> 
input_shapes; + for (int i = 0; i < str_input_shapes.size(); ++i) { + LOG(INFO) << "input shape: " << str_input_shapes[i]; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + paddle::lite_api::Run( + input_shapes, + save_optimized_model_dir, + static_cast(FLAGS_power_mode), + FLAGS_threads, + FLAGS_repeats, + FLAGS_warmup); +#endif + return 0; +} diff --git a/lite/api/ocr_attention_test.cc b/lite/api/ocr_attention_test.cc index 5e39c5437c18990be9c6414695a94c6f2c9fcf20..ae45b8e2282d0946019d83a76298c0b0a61f9832 100644 --- a/lite/api/ocr_attention_test.cc +++ b/lite/api/ocr_attention_test.cc @@ -32,18 +32,10 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { predictor.Build(FLAGS_model_dir, "", "", valid_places); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - auto* init_scores = predictor.GetInput(2); init_scores->Resize(DDim(std::vector({1, 1}))); auto* data_scores = init_scores->mutable_data(); - auto scores_size = input_tensor->dims().production(); + auto scores_size = init_scores->dims().production(); for (int i = 0; i < scores_size; i++) { data_scores[i] = 0; } @@ -53,7 +45,7 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { auto* init_ids = predictor.GetInput(1); init_ids->Resize(DDim(std::vector({1, 1}))); - auto* data_ids = init_ids->mutable_data(); + auto* data_ids = init_ids->mutable_data(); auto ids_size = init_ids->dims().production(); for (int i = 0; i < ids_size; i++) { data_ids[i] = 0; @@ -62,6 +54,13 @@ void TestModel(const std::vector& 
valid_places, bool use_npu = false) { std::vector> lod_i{{0, 1}, {0, 1}}; *lod_ids = lod_i; + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); } @@ -102,6 +101,7 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { TEST(OcrAttention, test_arm) { std::vector valid_places({ + Place{TARGET(kARM), PRECISION(kInt64)}, Place{TARGET(kARM), PRECISION(kFloat)}, }); diff --git a/lite/api/opt.cc b/lite/api/opt.cc index a00646f4e11b68f0233a8b6009fbf847e9d50d63..b8497199684cb4f6d4cc602291be5762eb93f7f9 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -30,6 +30,7 @@ #include "lite/model_parser/compatible_pb.h" #include "lite/model_parser/pb/program_desc.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/io.h" #include "lite/utils/string.h" #include "supported_kernel_op_info.h" // NOLINT @@ -66,7 +67,6 @@ DEFINE_string(valid_targets, "arm", "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); DEFINE_bool(print_supported_ops, false, "Print supported operators on the inputed target"); @@ -87,10 +87,13 @@ std::vector ParserValidPlaces() { auto target_reprs = lite::Split(FLAGS_valid_targets, ","); for (auto& target_repr : target_reprs) { if (target_repr == "arm") { - valid_places.emplace_back(TARGET(kARM)); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kInt32), DATALAYOUT(kNCHW)}); } else if (target_repr == "opencl") { valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}); + Place{TARGET(kOpenCL), 
PRECISION(kFP16), DATALAYOUT(kImageDefault)}); valid_places.emplace_back( Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); valid_places.emplace_back( @@ -117,11 +120,6 @@ std::vector ParserValidPlaces() { << "At least one target should be set, should set the " "command argument 'valid_targets'"; - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only support by ARM target"; - valid_places.insert(valid_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } return valid_places; } @@ -251,7 +249,6 @@ void PrintHelpInfo() { " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" - " `--prefer_int8_kernel=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " @@ -400,6 +397,7 @@ void Main() { return; } + lite::MkDirRecur(FLAGS_optimize_out); auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true); if (model_dirs.size() == 0) { LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model"; @@ -454,7 +452,9 @@ int main(int argc, char** argv) { } google::ParseCommandLineFlags(&argc, &argv, false); paddle::lite_api::ParseInputCommand(); - paddle::lite_api::CheckIfModelSupported(); + if (FLAGS_model_set_dir == "") { + paddle::lite_api::CheckIfModelSupported(); + } paddle::lite_api::Main(); return 0; } diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd86f486248a2daccde13da078ae3860d8e31169 --- /dev/null +++ b/lite/api/opt_base.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/api/opt_base.h" +#include "all_kernel_faked.cc" // NOLINT + +namespace paddle { +namespace lite_api { + +void OptBase::SetModelDir(const std::string& model_path) { + opt_config_.set_model_dir(model_path); +} + +void OptBase::SetModelFile(const std::string& model_path) { + opt_config_.set_model_file(model_path); +} + +void OptBase::SetParamFile(const std::string& param_path) { + opt_config_.set_param_file(param_path); +} + +void OptBase::SetModelType(std::string optimize_out_type) { + if (optimize_out_type == "protobuf") { + model_type_ = LiteModelType::kProtobuf; + } else if (optimize_out_type == "naive_buffer") { + model_type_ = LiteModelType::kNaiveBuffer; + } else { + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; + } +} + +void OptBase::SetValidPlaces(const std::string& valid_places) { + valid_places_.clear(); + auto target_reprs = lite::Split(valid_places, ","); + for (auto& target_repr : target_reprs) { + if (target_repr == "arm") { + valid_places_.emplace_back(TARGET(kARM)); + } else if (target_repr == "opencl") { + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel + } else if 
(target_repr == "x86") { + valid_places_.emplace_back(TARGET(kX86)); + } else if (target_repr == "npu") { + valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "xpu") { + valid_places_.emplace_back(TARGET(kXPU)); + } else { + LOG(FATAL) << lite::string_format( + "Wrong target '%s' found, please check the command flag " + "'valid_targets'", + target_repr.c_str()); + } + } + CHECK(!valid_places_.empty()) + << "At least one target should be set, should set the " + "command argument 'valid_targets'"; +} + +void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { + optimize_out_path_ = optimized_out_path; +} + +void OptBase::RunOptimize(bool record_strip_info) { + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + auto resulted_model_name = + record_strip_info ? 
"information of stripped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into: " << optimize_out_path_ << " successfully"; + } +} + +// collect ops info of modelset +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void OptBase::SetModelSetDir(const std::string& model_set_path) { + model_set_dir_ = model_set_path; +} +void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { + // 1. mkdir of output optimized model set. + lite::MkDirRecur(optimize_out_path_); + auto model_dirs = lite::ListDir(model_set_dir_, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; + } + + // 2. optimize each model in input model set dir. 
+ std::string model_file = opt_config_.model_file(); + std::string param_file = opt_config_.param_file(); + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({model_set_dir_, name}, "/"); + std::string output_model_dir = + lite::Join({optimize_out_path_, name}, "/"); + + if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { + auto model_file_path = + lite::Join({input_model_dir, model_file}, "/"); + auto param_file_path = + lite::Join({input_model_dir, param_file}, "/"); + } + + std::cout << "Start optimize model: " << input_model_dir; + + opt_config_.set_model_dir(input_model_dir); + opt_config_.set_model_file(model_file); + opt_config_.set_param_file(param_file); + + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + + std::cout << "Optimize done. "; + } + + // 3. if record_strip_info = true, we will record striping info + if (record_strip_info) { + // Collect all models information + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(optimize_out_path_, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); + std::cout << "Record the information of stripped models into :" + << optimize_out_path_ << "successfully"; + } +} + +void OptBase::PrintHelpInfo() { + const std::string opt_version = lite::version(); + const char help_info[] = + "At least one argument should be inputed. 
Valid arguments are listed " + "below:\n" + " Arguments of help information:\n" + " `help()` Print help information\n" + " Arguments of model optimization:\n" + " `set_model_dir(model_dir)`\n" + " `set_model_file(model_file_path)`\n" + " `set_param_file(param_file_path)`\n" + " `set_model_type(protobuf|naive_buffer)`\n" + " `set_optimize_out(output_optimize_model_dir)`\n" + " `set_valid_places(arm|opencl|x86|npu|xpu)`\n" + " `run_optimize(false|true)`\n" + " ` ----false&true refer to whether to record ops info for " + "tailoring lib, false by default`\n" + " Arguments of model checking and ops information:\n" + " `print_all_ops()` Display all the valid operators of " + "Paddle-Lite\n" + " `print_supported_ops` Display supported operators of valid " + "places\n" + " `check_if_model_supported()` Check if the input model is " + "supported\n"; + + std::cout << "opt version:" << opt_version << std::endl + << help_info << std::endl; +} +// 2. Print supported info of input ops +void OptBase::PrintOpsInfo(const std::set& valid_ops) { + std::vector lite_supported_targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + // Get the length of the first column: maximum length of the op_type + size_t maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + // Print the first row: OP_name target1 target2 ... 
+ std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + std::cout << std::setw(10) << lite_supported_targets[i].substr(1); + } + std::cout << std::endl; + // Print the name of supported ops and mark if it's supported by each target + // print the support info of inputed ops: valid_ops + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + lite_supported_targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } +} + +void OptBase::DisplayKernelsInfo() { // Display kernel information + std::cout << ::paddle::lite::KernelRegistry::Global().DebugString(); +} +void OptBase::PrintAllOps() { + // 1. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < supported_ops_target.size(); i++) { + auto ops = supported_ops_target[i]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 2. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +void OptBase::PrintSupportedOps() { + // 1. 
Get the valid hardware targets + std::vector target_types = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + target_types.push_back(valid_places_[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (size_t i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + // 2. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 3. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +// test whether this model is supported +void OptBase::CheckIfModelSupported(bool print_ops_info) { + // 1. parse valid places and valid targets + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (size_t i = 0; i < valid_places_.size(); i++) { + auto target = valid_places_[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = opt_config_.model_dir() + "/__model__"; + if (!(opt_config_.model_file()).empty() && + !(opt_config_.param_file()).empty()) { + prog_path = opt_config_.model_file(); + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, 
&cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (print_ops_info) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + targets.push_back(valid_places_[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (size_t i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (print_ops_info) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} +} // namespace lite_api +} // namespace paddle diff --git a/lite/api/opt_base.h b/lite/api/opt_base.h new file mode 100644 index 0000000000000000000000000000000000000000..a8d6d0390ccd3f1c9b0291b1bcf6eb1ecc47a248 --- /dev/null +++ b/lite/api/opt_base.h @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines Opt and basic functions about model transformation. + */ + +#ifndef PADDLE_LITE_OPT_H_ // NOLINT +#define PADDLE_LITE_OPT_H_ +#include +#include +#include +#include +#include +// stores the map that records the source_file path of each kernel. +#include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" +// version of Paddle-lite +#include "lite/core/version.h" +// model parser functions to pre-load model to verify if this model is supported +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" +#include "lite/utils/string.h" +// recorded all the ops supported by paddle-lite +#include "supported_kernel_op_info.h" // NOLINT + +namespace paddle { +namespace lite_api { + +/// The PaddlePredictor defines the basic interfaces for different kinds of +/// predictors. 
+class LITE_API OptBase { + public: + OptBase() = default; + void SetModelSetDir(const std::string &model_set_path); + void SetModelDir(const std::string &model_path); + void SetModelFile(const std::string &model_path); + void SetParamFile(const std::string ¶m_path); + void SetValidPlaces(const std::string &valid_places); + void SetOptimizeOut(const std::string &optimized_out_path); + // set optimized_model type + void SetModelType(std::string model_type); + // transform and save the optimized model + void RunOptimize(bool record_strip_info = false); + + // fuctions of printing info + // 1. help info + void PrintHelpInfo(); + // 2. PrintOpsInfo + void PrintOpsInfo(const std::set &valid_ops = + {}); // print supported ops on target_types + void PrintAllOps(); // print all ops + void PrintSupportedOps(); // print ops supported on valid_places_ + void DisplayKernelsInfo(); // Display kernel information + // 3. Check if this model is supported + void CheckIfModelSupported(bool print_ops_info = true); + + private: + CxxConfig opt_config_; + // valid places for the optimized_model + std::vector valid_places_; + // filename of the optimized_model + std::string optimize_out_path_; + // type of the optimized_model, kNaiveBuffer default. 
+ LiteModelType model_type_{LiteModelType::kNaiveBuffer}; + // Dir path of a set of models, this should be combined with model + std::string model_set_dir_; + + void RunOptimizeFromModelSet(bool record_strip_info = false); +}; + +} // namespace lite_api +} // namespace paddle + +#endif // NOLINT diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 9f071cf7780e27defdd1fcd6be02844618165fb6..2cb2064da518bca442e882d0733c5c6966c4fac0 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -38,6 +38,7 @@ void Tensor::Resize(const shape_t &shape) { tensor(raw_tensor_)->Resize(shape); } +// Tensor::data template <> const float *Tensor::data() const { return ctensor(raw_tensor_)->data(); @@ -47,15 +48,19 @@ const int8_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } template <> +const uint8_t *Tensor::data() const { + return ctensor(raw_tensor_)->data(); +} +template <> const int64_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } - template <> const int32_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } +// Tensor::mutable_data template <> int *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); @@ -69,6 +74,10 @@ int8_t *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); } template <> +uint8_t *Tensor::mutable_data(TargetType type) const { + return tensor(raw_tensor_)->mutable_data(type); +} +template <> int64_t *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); } @@ -116,18 +125,22 @@ void Tensor::CopyToCpu(T *data) const { template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const uint8_t *); template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); 
+template void Tensor::CopyFromCpu(const uint8_t *); + template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); -template void Tensor::CopyToCpu(int8_t *) const; template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(int *) const; +template void Tensor::CopyToCpu(int8_t *) const; +template void Tensor::CopyToCpu(uint8_t *) const; shape_t Tensor::shape() const { return ctensor(raw_tensor_)->dims().Vectorize(); diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 307eeb74e8b4cdc3b2d6188eb18490e4dcf89b8f..c445ef641b96d9fbbc5b4123be794976c0cf03c4 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -206,7 +206,7 @@ class LITE_API MobileConfig : public ConfigBase { }; template -std::shared_ptr CreatePaddlePredictor(const ConfigT&); +LITE_API std::shared_ptr CreatePaddlePredictor(const ConfigT&); } // namespace lite_api } // namespace paddle diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 9213a24e5c0614550a098c4de8d97b6cf6695177..9b8384f2823ee121aa8bb505dd135735d9f96774 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -15,9 +15,6 @@ #include "lite/api/paddle_api.h" #include #include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/utils/cp_logging.h" #include "lite/utils/io.h" DEFINE_string(model_dir, "", ""); diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 2cced919e601f8ecb79ce262a2b083d5b6862da9..dba65656cbcffb00319c8f6083909e487e3df7a2 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -45,6 +45,21 @@ std::string Place::DebugString() const { return os.str(); } +const std::string& ActivationTypeToStr(ActivationType act) { + static const std::string act2string[] = {"unk", + "Relu", + "Relu6", + 
"PRelu", + "LeakyRelu", + "Sigmoid", + "Tanh", + "Swish", + "Exp"}; + auto x = static_cast(act); + CHECK_LT(x, static_cast(ActivationType::NUM)); + return act2string[x]; +} + const std::string& TargetToStr(TargetType target) { static const std::string target2string[] = {"unk", "host", diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 7da52adc7fb6fdd70de3b098508e4622496bed7d..1de46a39467af125e705cfcb7a9eeae64a0be133 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -54,7 +54,8 @@ enum class TargetType : int { kXPU = 9, kBM = 10, kAny = 6, // any target - NUM = 11, // number of fields. + kMLU = 11, + NUM = 12, // number of fields. }; enum class PrecisionType : int { kUnk = 0, @@ -96,7 +97,9 @@ enum class ActivationType : int { kLeakyRelu = 4, kSigmoid = 5, kTanh = 6, - kSwish = 7 + kSwish = 7, + kExp = 8, + NUM = 9, }; static size_t PrecisionTypeLength(PrecisionType type) { @@ -148,6 +151,8 @@ _ForEachPrecisionType(DefinePrecisionTypeTrait); #define PRECISION(item__) paddle::lite_api::PrecisionType::item__ #define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__ +const std::string& ActivationTypeToStr(ActivationType act); + const std::string& TargetToStr(TargetType target); const std::string& PrecisionToStr(PrecisionType precision); diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 943760d30742b74a0fe9150e4c2d8c8bb5dbc52a..41eca021a9ded40134122cb7b68604d9cd8f9fc2 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -24,7 +24,7 @@ USE_MIR_PASS(generate_program_pass); USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); USE_MIR_PASS(runtime_context_assign_pass); -USE_MIR_PASS(graph_visualze); +USE_MIR_PASS(graph_visualize_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass); @@ -40,8 +40,10 @@ USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); USE_MIR_PASS(lite_quant_dequant_fuse_pass); 
USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_layout_cast_pass); +USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); +USE_MIR_PASS(quantized_op_attributes_inference_pass); diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index 43178a37c663bb09acb7c025e021cbc91bf0cc5d..ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON) return() endif() +# to create setup.py for packeting whl for Paddle-Lite and opt + +execute_process( + COMMAND git describe --tags --exact-match + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_TAG + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/lite/api/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index eabb6b150b93a722282118c3932676cd1aee5da8..b1de18d50c1582b0f872ad38d24939665ab1d3b0 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python paddle_api_light paddle_api) if (NOT LITE_ON_TINY_PUBLISH) - set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 2dfe0c49490ecd13e8a3ce480807bdf3875348b7..e86d570e18b50bdc3d8943ecdd3732f8475ad56c 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -26,13 +26,11 @@ #ifndef LITE_ON_TINY_PUBLISH #include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" +#include "lite/api/opt_base.h" #endif #include "lite/api/light_api.h" #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" #include "lite/core/tensor.h" namespace py = pybind11; @@ -50,10 +48,27 @@ using lite_api::PrecisionType; using lite_api::DataLayoutType; using lite_api::Place; using lite::LightPredictorImpl; +using lite_api::OptBase; #ifndef LITE_ON_TINY_PUBLISH using lite::CxxPaddleApiImpl; static void BindLiteCxxPredictor(py::module *m); +void BindLiteOpt(py::module *m) { + py::class_ opt_base(*m, "Opt"); + opt_base.def(py::init<>()) + .def("set_model_dir", 
&OptBase::SetModelDir) + .def("set_modelset_dir", &OptBase::SetModelSetDir) + .def("set_model_file", &OptBase::SetModelFile) + .def("set_param_file", &OptBase::SetParamFile) + .def("set_valid_places", &OptBase::SetValidPlaces) + .def("set_optimize_out", &OptBase::SetOptimizeOut) + .def("set_model_type", &OptBase::SetModelType) + .def("run_optimize", &OptBase::RunOptimize) + .def("help", &OptBase::PrintHelpInfo) + .def("print_supported_ops", &OptBase::PrintSupportedOps) + .def("display_kernels_info", &OptBase::DisplayKernelsInfo) + .def("print_all_ops", &OptBase::PrintAllOps); +} #endif static void BindLiteLightPredictor(py::module *m); static void BindLiteCxxConfig(py::module *m); diff --git a/lite/api/python/pybind/pybind.h b/lite/api/python/pybind/pybind.h index ca05f24b32fd0b0418d9cf595fe6134b34fa725f..15609957e05391be54466262f962e151594ef383 100644 --- a/lite/api/python/pybind/pybind.h +++ b/lite/api/python/pybind/pybind.h @@ -22,11 +22,15 @@ namespace lite { namespace pybind { void BindLiteApi(pybind11::module *m); +void BindLiteOpt(pybind11::module *m); -PYBIND11_MODULE(lite_core, m) { +PYBIND11_MODULE(lite, m) { m.doc() = "C++ core of Paddle-Lite"; BindLiteApi(&m); +#ifndef LITE_ON_TINY_PUBLISH + BindLiteOpt(&m); +#endif } } // namespace pybind diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in new file mode 100644 index 0000000000000000000000000000000000000000..79028fb7493bf55eab74aa76ee51ac79f418ba0a --- /dev/null +++ b/lite/api/python/setup.py.in @@ -0,0 +1,72 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + +# link lite.so to paddlelite.libs +COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. 
+ # So that package points to other directory. + 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_resnet50_lite_bm.cc b/lite/api/test_classify_lite_bm.cc similarity index 97% rename from lite/api/test_resnet50_lite_bm.cc rename to lite/api/test_classify_lite_bm.cc index 62a58704f4245b8618540ea7109447dd99d0bfea..7da7dc03745aa623e35dec5b344e16de03cf5aca 100644 --- a/lite/api/test_resnet50_lite_bm.cc +++ b/lite/api/test_classify_lite_bm.cc @@ -33,7 +33,6 @@ namespace lite { void TestModel(const std::vector& valid_places) { lite::Predictor predictor; std::vector passes; - passes.push_back("bm_subgraph_pass"); predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); auto* input_tensor = predictor.GetInput(0); @@ -81,7 +80,7 @@ void TestModel(const std::vector& valid_places) { fclose(fp); } -TEST(ResNet50, test_bm) { +TEST(Classify, test_bm) { std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index 71752c942bb53e7f2ed289ac0d965ae1d1007c55..a17fc331310cfe17ec36be504b94ddacc724e90f 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -17,6 +17,7 @@ #include #include #include +#include // for eval DEFINE_string(model_dir, "", "model dir"); @@ -43,5 +44,31 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = 
compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + } // namespace lite } // namespace paddle diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index 8e51f3778d30ba9fcfde493c3e27ecc973e66a59..896b47a97fb20e6935764e12fbe9ebd646a4f816 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -28,11 +28,10 @@ DEFINE_int32(batch, 1, "batch"); namespace paddle { namespace lite { -namespace test_transformer { +namespace test_transformer { std::vector inputed_lines; - -void LoadInputLines(const char* filename) { +void load_input_lines(const char* filename) { static const int max_line_buf_size = 100 * 1024 * 1024; char* line_buffer = (char*)calloc(max_line_buf_size, sizeof(char)); // NOLINT FILE* input_file = fopen(filename, "r"); @@ -49,7 +48,7 @@ void LoadInputLines(const char* filename) { line_buffer = NULL; fclose(input_file); } -void Split2(const std::string& main_str, +void split2(const std::string& main_str, std::vector& str_list, // NOLINT const std::string& delimiter) { size_t pre_pos = 0; @@ -75,19 +74,19 @@ void Split2(const std::string& main_str, } } // NOLINT -void PadBatchInput(std::vector& input_lines, // NOLINT - int pad_idx, - int n_head, - Tensor* src_word, - Tensor* src_pos, - Tensor* src_attn_bias, - Tensor* trg_word, - Tensor* init_scores, - Tensor* init_idx, - Tensor* trg_bias, - int line_start, - int batch_size, - int bos_idx) { +void pad_batch_input(std::vector& input_lines, // NOLINT + int pad_idx, + int n_head, + Tensor* src_word, + Tensor* src_pos, + Tensor* src_attn_bias, + Tensor* trg_word, + Tensor* init_scores, + Tensor* init_idx, + Tensor* trg_bias, + int line_start, + int batch_size, + int bos_idx) { int max_len = 0; int max_line = input_lines.size(); @@ -98,27 +97,27 @@ void PadBatchInput(std::vector& input_lines, // NOLINT std::vector split_str; - 
test_transformer::Split2(cur_line, split_str, " "); + test_transformer::split2(cur_line, split_str, " "); batch_lines.push_back(split_str); max_len = max_len >= split_str.size() ? max_len : split_str.size(); } - src_word->Resize(std::vector({batch_size, max_len, 1})); - src_pos->Resize(std::vector({batch_size, max_len, 1})); + src_word->Resize(std::vector({batch_size, max_len})); + src_pos->Resize(std::vector({batch_size, max_len})); src_attn_bias->Resize( std::vector({batch_size, n_head, max_len, max_len})); trg_bias->Resize( - std::vector({batch_size, n_head, 1, max_len})); - float* src_word_data = src_word->mutable_data(); - float* src_pos_data = src_pos->mutable_data(); + std::vector({batch_size, n_head, max_len, max_len})); + auto* src_word_data = src_word->mutable_data(); + auto* src_pos_data = src_pos->mutable_data(); float* src_bias_data = src_attn_bias->mutable_data(); float* trg_bias_data = trg_bias->mutable_data(); for (int i = 0; i < batch_size; ++i) { std::vector cur_words = batch_lines[i]; int fill_len = cur_words.size(); int src_bias_start = i * n_head * max_len * max_len; - int trg_bias_start = i * n_head * max_len; + int trg_bias_start = i * n_head * max_len * max_len; for (int j = 0; j < fill_len; ++j) { src_word_data[i * max_len + j] = (atoi(cur_words[j].c_str())); src_pos_data[i * max_len + j] = j; @@ -137,22 +136,24 @@ void PadBatchInput(std::vector& input_lines, // NOLINT int value_ind = j % max_len + src_bias_start; src_bias_data[j] = src_bias_data[value_ind]; } - for (int j = trg_bias_start; j < trg_bias_start + n_head * max_len; ++j) { + for (int j = trg_bias_start; + j < trg_bias_start + n_head * max_len * max_len; + ++j) { int value_ind = j % max_len + trg_bias_start; trg_bias_data[j] = trg_bias_data[value_ind]; } } - trg_word->Resize(std::vector({batch_size, 1, 1})); - auto* trg_word_data = trg_word->mutable_data(); - for (int i = 0; i < batch_size; ++i) { + trg_word->Resize(std::vector({batch_size, max_len})); + auto* trg_word_data = 
trg_word->mutable_data(); + for (int i = 0; i < batch_size * max_len; ++i) { trg_word_data[i] = bos_idx; } init_scores->Resize(std::vector({batch_size, 1})); init_idx->Resize(std::vector({batch_size})); float* score_data = init_scores->mutable_data(); - float* idx_data = init_idx->mutable_data(); + auto* idx_data = init_idx->mutable_data(); for (int i = 0; i < init_scores->numel(); ++i) { score_data[i] = 0; } @@ -175,21 +176,25 @@ void PadBatchInput(std::vector& input_lines, // NOLINT void TestModel(const std::vector& valid_places, const Place& preferred_place, bool use_npu = false) { +#ifdef LITE_WITH_ARM DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); +#endif lite::Predictor predictor; std::string test_data_path = FLAGS_input; - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); + predictor.Build("", + FLAGS_model_dir + "/__model__", + FLAGS_model_dir + "/weights", + valid_places); + // predictor.Build(FLAGS_model_dir, "", "", valid_places); int n_head = 8; int batch_size = FLAGS_batch; int bos_idx = 0; int eos_idx = 1; - LOG(INFO) << "reading"; - test_transformer::LoadInputLines(test_data_path.c_str()); - LOG(INFO) << "reading finished"; + test_transformer::load_input_lines(test_data_path.c_str()); auto* trg_bias = predictor.GetInput(6); auto* src_word = predictor.GetInput(0); @@ -205,28 +210,31 @@ void TestModel(const std::vector& valid_places, auto start = GetCurrentUS(); for (int i = 0; i < FLAGS_repeats; ++i) { - auto start_i = GetCurrentUS(); - PadBatchInput(test_transformer::inputed_lines, - eos_idx, - n_head, - src_word, // src_word - src_pos, // src_pos - src_bias, // src_bias - trg_word, // trg_word - init_score, // init_score - init_idx, // init_idx - trg_bias, // trg_bias - i * batch_size, - batch_size, - bos_idx); - LOG(INFO) << "src_word:" << src_word->dims(); - auto start_ii = GetCurrentUS(); - LOG(INFO) << i << "->ii:" << (start_ii - start_i) / 1000.0; + 
pad_batch_input(test_transformer::inputed_lines, + eos_idx, + n_head, + src_word, // src_word + src_pos, // src_pos + src_bias, // src_bias + trg_word, // trg_word + init_score, // init_score + init_idx, // init_idx + trg_bias, // trg_bias + i * batch_size, + batch_size, + bos_idx); predictor.Run(); - auto start_iii = GetCurrentUS(); - LOG(INFO) << i << "->iii:" << (start_iii - start_ii) / 1000.0; - auto* outs = predictor.GetOutputs(); - LOG(INFO) << "out:" << (*outs)[0].dims(); + auto* outs = predictor.GetOutput(0); + auto o_data = outs->data(); + auto lod = outs->lod(); + for (int i = 0; i < outs->numel(); ++i) { + LOG(INFO) << o_data[i]; + } + for (int i = 0; i < lod.size(); ++i) { + for (int j = 0; j < lod[i].size(); ++j) { + LOG(INFO) << lod[i][j]; + } + } } LOG(INFO) << "================== Speed Report ==================="; @@ -234,25 +242,18 @@ void TestModel(const std::vector& valid_places, << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; - - auto* outs = predictor.GetOutputs(); - for (auto out : *outs) { - LOG(INFO) << "======" - << "here"; - LOG(INFO) << out; - } - LOG(INFO) << "======" - << "hereggg"; } -TEST(OcrAttention, test_arm) { +} // namespace lite +} // namespace paddle +using namespace paddle::lite; // NOLINT +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kInt64)}, Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)}, }); TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); } - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index 6f6f7e7aa71ba5067d831a2bcc2b7b933205fbe0..aecec295ae0269fb34a3c4fa38e396bdf98d4418 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ 
b/lite/backends/arm/math/CMakeLists.txt @@ -68,6 +68,8 @@ if (NOT HAS_ARM_MATH_LIB_DIR) gemv_arm_int8.cc conv3x3s1_direct_fp32.cc conv3x3s2_direct_fp32.cc + conv3x3s1p01_depthwise_fp32_relu.cc + conv3x3s2p01_depthwise_fp32_relu.cc conv3x3s1p01_depthwise_fp32.cc conv3x3s2p01_depthwise_fp32.cc conv3x3s1px_depthwise_fp32.cc @@ -123,5 +125,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) anchor_generator.cc split_merge_lod_tenosr.cc reduce_prod.cc + lstm.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 634021cc3ce82bbb5fba72123b38457ab0c7ac06..9f478eab60538eeca38415afea4e0989eff5a04e 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -700,6 +700,35 @@ void act_rsqrt(const float* din, float* dout, int size, int threads) { } } +template <> +void act_square(const float* din, float* dout, int size, int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = ptr_in[0] * ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + +#ifdef LITE_WITH_TRAIN +template <> +void act_square_grad(const float* din, + const float* dout_grad, + float* din_grad, + int size, + int threads) { + const float* ptr_out_grad = dout_grad; + float* ptr_in_grad = din_grad; + for (int i = 0; i < size; ++i) { + ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; + ptr_out_grad++; + ptr_in_grad++; + din++; + } +} +#endif + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index bb8189eef0d81a92caf2aaf73e401e20d9c80155..63f4418d70db25f98dea2a405de1f4bb6b0b9111 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -69,6 +69,15 @@ void act_hard_sigmoid(const T* din, template void act_rsqrt(const T* din, T* dout, int size, int threads); +template +void act_square(const T* din, T* dout, int size, int 
threads); + +#ifdef LITE_WITH_TRAIN +template +void act_square_grad( + const T* din, const T* dout_grad, T* din_grad, int size, int threads); +#endif + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/argmax.cc b/lite/backends/arm/math/argmax.cc index 3ca6d97c4d8ab97ca58e9859bfd753f7bf7f05ad..4177ad0ae05a5f29be56e9e277c0161841ba6124 100644 --- a/lite/backends/arm/math/argmax.cc +++ b/lite/backends/arm/math/argmax.cc @@ -53,7 +53,7 @@ void argmax_func(const lite::Tensor *input, std::greater>()); // out - float *out_ptr = output->mutable_data() + n * out_channel + k; + int64_t *out_ptr = output->mutable_data() + n * out_channel + k; *out_ptr = vec[0].second; } } diff --git a/lite/backends/arm/math/beam_search.cc b/lite/backends/arm/math/beam_search.cc index f93fcc0d601cc076163e4d6fb1e31fc58e7035a8..32b7d3bfeba6107493d62a0c9be14a3c15ce7692 100644 --- a/lite/backends/arm/math/beam_search.cc +++ b/lite/backends/arm/math/beam_search.cc @@ -70,7 +70,7 @@ void PruneEndBeams(const Tensor *pre_ids, std::vector> *items, size_t lod_level, int end_id) { - auto *pre_ids_data = pre_ids->data(); + auto *pre_ids_data = pre_ids->data(); auto &high_level = abs_lod[lod_level]; for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { size_t src_prefix_start = high_level[src_idx]; @@ -152,10 +152,10 @@ std::vector> SelectTopBeamSizeItems(const Tensor *pre_ids, // find the current candidates // auto abs_lod = framework::ToAbsOffset(scores->lod()); auto abs_lod = scores->lod(); - auto *pre_ids_data = pre_ids->data(); + auto *pre_ids_data = pre_ids->data(); auto *pre_scores_data = pre_scores->data(); - auto *ids_data = ids ? ids->data() : nullptr; + auto *ids_data = ids ? 
ids->data() : nullptr; auto *scores_data = scores->data(); size_t num_seqs = abs_lod[lod_level].size() - 1; @@ -236,7 +236,7 @@ void beam_search(const Tensor *pre_ids, if (parent_idx) { parent_idx->Resize(dims); } - auto *selected_ids_data = selected_ids->mutable_data(); + auto *selected_ids_data = selected_ids->mutable_data(); auto *selected_scores_data = selected_scores->mutable_data(); auto *parent_idx_data = parent_idx ? parent_idx->mutable_data() : nullptr; diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc index 66d61413fc43fd518e0b34c7bc8d7b7bf5cc72a7..b024d69507101e902dc45fb83668e00dc718a6b0 100644 --- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -91,23 +91,20 @@ void conv_depthwise_3x3s1_fp32(const float *din, bool flag_bias, const operators::ActivationParam act_param, ARMContext *ctx) { + bool has_active = act_param.has_active; + bool flag_relu = false; + bool relu6 = false; + if (has_active) { + if (act_param.active_type == lite_api::ActivationType::kRelu) { + flag_relu = true; + } else { + relu6 = true; + } + } if (pad == 0) { if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s1p0_bias(dout, din, weights, bias, @@ -120,25 +117,57 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s1p0_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s1p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s1p0_bias_s_relu(dout, + din, + weights, + bias, + 
flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } if (pad == 1) { if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s1p1_bias(dout, din, weights, bias, @@ -151,6 +180,51 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s1p1_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s1p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s1p1_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } } @@ -1924,223 +1998,169 @@ void act_switch_3x3s1p1(const float *din_ptr0, float *vbias, int cnt, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", 
- "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vsix] "w"(vsix), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vscale] "w"(vscale), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case 
lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #else @@ -2159,153 +2179,117 @@ void act_switch_3x3s1p1(const float *din_ptr0, float bias_val, int cnt, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - 
"q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [six_ptr] "r"(vsix), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [scale_ptr] "r"(vscale), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] 
"+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [six_ptr] "r"(vsix), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [scale_ptr] "r"(vscale), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -2575,278 +2559,214 @@ void act_switch_3x3s1p1_s(const float *din_ptr0, float32x4_t vzero, float32x4_t wbias, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { #ifdef __aarch64__ - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); #else - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; #endif - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] 
"w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kRelu6: + case lite_api::ActivationType::kRelu6: /* 0 <= din <= 6 */ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] 
"w"(vmask_rp), - [bias] "w"(wbias), - [vsix] "w"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [six_ptr] "r"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [six_ptr] "r"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kLeakyRelu: + case lite_api::ActivationType::kLeakyRelu: /*din = din >= 0 ? 
din : din * scale*/ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [vscale] "w"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); - break; -#else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [scale_ptr] "r"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; -#endif - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { -#ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) 
+ : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [scale_ptr] "r"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } /** @@ -2987,262 +2907,198 @@ void act_switch_3x3s1p0(const float *din_ptr0, int cnt, int remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - 
MID_COMPUTE_S1 MID_RESULT_S1_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU6 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU6 "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vsix] "w"(vsix), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - 
"v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_LEAKY_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vscale] "w"(vscale), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1 - "cmp 
%w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + 
"v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU6 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #else @@ -3262,191 +3118,146 @@ void act_switch_3x3s1p0(const float *din_ptr0, int cnt, int remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data 
overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU6 "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [six_ptr] "r"(vsix), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_LEAKY_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_LEAKY_RELU - "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [scale_ptr] "r"(vscale), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", 
- "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] 
"+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_LEAKY_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -3694,287 +3505,220 @@ void act_switch_3x3s1p0_s(const float *din_ptr0, unsigned int *vmask_ptr, float bias_val, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { #ifdef __aarch64__ - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); 
#else - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; #endif - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm 
volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kRelu6: + case lite_api::ActivationType::kRelu6: /* 0 <= din <= 6 */ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [vsix] "w"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [six_ptr] "r"(vsix), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - 
: "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kLeakyRelu: + case lite_api::ActivationType::kLeakyRelu: /*din = din >= 0 ? din : din * scale*/ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [vscale] "w"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; -#else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [scale_ptr] "r"(vscale), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; -#endif - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { -#ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 
RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif + default: + LOG(FATAL) << "this act_type: " 
<< static_cast(act_param.active_type) + << " fuse not support"; } } /** diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9dd4d2fd1e30d9b82a8db64a4872095af3f9768 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc @@ -0,0 +1,2418 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#ifdef __aarch64__ +#define INIT_S1 \ + "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ + "movi v21.4s, #0x0\n" /* out0 = 0 */ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ + +#define LEFT_COMPUTE_S1 \ + "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], 
#4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" 
/* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + +#define LEFT_RESULT_S1 \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 
00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, 
v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define MID_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, 
v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ + "3: \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, %[vzero].16b, v18.16b \n" \ + "bif v1.16b, %[vzero].16b, v19.16b \n" \ + "bif 
v2.16b, %[vzero].16b, v18.16b \n" \ + "bif v3.16b, %[vzero].16b, v19.16b \n" \ + \ + "bif v4.16b, %[vzero].16b, v18.16b \n" \ + "bif v5.16b, %[vzero].16b, v19.16b \n" \ + "bif v6.16b, %[vzero].16b, v18.16b \n" \ + "bif v7.16b, %[vzero].16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, %[vzero].16b, v18.16b \n" \ + "bif v9.16b, %[vzero].16b, v19.16b \n" \ + "bif v10.16b, %[vzero].16b, v18.16b \n" \ + "bif v11.16b, %[vzero].16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , 
v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + 
"bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + 
"ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 
{v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask].16b\n" \ + "bif v2.16b, %[zero].16b, %[mask].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[zero].16b, 
v1.16b, #12\n" \ + "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[zero].4s\n" \ + "fmax v13.4s, v13.4s, %[zero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, 
[%[din3]]\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v7.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld 
[%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld 
[%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + 
"vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p1_bias_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + int cnt_col = tile_w - 1; + + unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in); + const unsigned int remian_idx[4] = {0, 1, 2, 3}; + + if (remain == 0 && size_pad_right == 5) { + size_pad_right = 1; + cnt_col -= 1; + remain = 4; + } else if (remain == 0 && size_pad_right == 6) { + size_pad_right = 2; + cnt_col -= 1; + remain = 4; + } + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? 
bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 > h_in) { + switch (i + 5 - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + //! 
process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + // unsigned int* rst_mask = rmask; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + 
"q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + int hs = -1; + int he = 3; + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + int h_cnt = (h_out + 1) >> 1; + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_cnt; ++j) { + const float *dr0 = din_channel + hs * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + if (hs == -1) { + dr0 = zero; + } + + switch (he - h_in) { + 
case 2: + dr2 = zero; + doutr1 = trash_buf; + case 1: + dr3 = zero; + default: + break; + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = 
doutr1; + doutr1 += w_out; + hs += 2; + he += 2; + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p0_bias_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + + unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); + const int remian_idx[4] = {0, 1, 2, 3}; + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? 
bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 >= h_in) { + switch (i + 5 - h_in) { + case 4: + din_ptr1 = zero_ptr; + case 3: + din_ptr2 = zero_ptr; + case 2: + din_ptr3 = zero_ptr; + case 1: + din_ptr4 = zero_ptr; + case 0: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] 
"+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 3 >= h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + case 0: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + 
"q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#endif // __aarch64__ + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j 
< h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 3 >= h_in) { + switch (j + 3 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + doutr1 = trash_buf; + case 0: + dr3 = zero_ptr; + if (j + 2 > h_out) { + doutr1 = trash_buf; + } + default: + break; + } + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc index 55ea94949ba93396c97be5e3ea66d6e29ce95429..c998ddc3a34c2f6194a5156b7d04b7a9db3fbcef 100644 --- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -508,6 +508,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE RELU STORE : [r0] "+r"(inr0), @@ -541,6 +543,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kRelu6: @@ -593,6 +596,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm 
volatile(COMPUTE RELU RELU6 STORE : [r0] "+r"(inr0), @@ -626,6 +631,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -678,6 +684,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE LEAKY_RELU STORE : [r0] "+r"(inr0), @@ -711,6 +719,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; default: @@ -768,6 +777,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE STORE : [r0] "+r"(inr0), @@ -801,6 +812,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif } } @@ -988,6 +1000,8 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, w8, vbias, act_param); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else act_switch_3x3s1(inr0, inr1, @@ -1008,6 +1022,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, vbias, vbias, act_param); +#endif #endif outl[0] += 4; outl[1] += 4; diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc index 3e5569365119b97397c6d42f48bacd2552b248e5..d2e8f66a609d44d2c69228f3b9a343fdf91296a8 100644 --- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -91,23 +91,20 @@ void conv_depthwise_3x3s2_fp32(const float* din, bool flag_bias, const operators::ActivationParam act_param, ARMContext* ctx) { - if (pad == 0) { - if (w_in > 7) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); + bool has_active = act_param.has_active; + bool flag_relu = false; + bool relu6 = false; + if (has_active) { + if (act_param.active_type == lite_api::ActivationType::kRelu) { + flag_relu = true; } else { - conv_depthwise_3x3s2p0_bias_s(dout, + relu6 
= true; + } + } + if (pad == 0) { + if (w_in > 8) { + if (relu6) { + conv_depthwise_3x3s2p0_bias(dout, din, weights, bias, @@ -120,25 +117,57 @@ void conv_depthwise_3x3s2_fp32(const float* din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s2p0_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s2p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s2p0_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } if (pad == 1) { if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, @@ -151,6 +180,51 @@ void conv_depthwise_3x3s2_fp32(const float* din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s2p1_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s2p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s2p1_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } } @@ -476,7 +550,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "st1 {v16.4s}, [%[outptr0]], #16 \n" \ "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ - "fmul v12.4s, v16.4s, v22.4s \n" \ + "fmul v12.4s, v17.4s, v22.4s \n" \ \ "ld1 {v20.4s}, [%[inptr3]] \n" \ "ld1 {v21.4s}, [%[inptr4]] \n" \ @@ -552,6 +626,7 @@ void 
conv_depthwise_3x3s2_fp32(const float* din, "ld1 {v20.4s}, [%[inptr3]] \n" \ "ld1 {v21.4s}, [%[inptr4]] \n" \ \ + "fadd v17.4s, v17.4s, v14.4s \n" \ "bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \ "ext v10.16b, v0.16b, v15.16b, #4 \n" \ "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ @@ -977,207 +1052,158 @@ void act_switch_3x3s2p1(const float* din0_ptr, int cnt, int cnt_remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2 - MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), 
- [remain] "r"(cnt_remain), - [six_ptr] "r"(vsix), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU - MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU - RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [scale_ptr] "r"(vscale), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - 
"v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", 
+ "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU + MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_LEAKY_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -1569,249 +1595,191 @@ void act_switch_3x3s2p0(const float* din0_ptr, int cnt, int cnt_remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] 
"+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2 - MID_RESULT_S2_RELU6 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU6 - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [six_ptr] "r"(vsix), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2 - MID_RESULT_S2_LEAKY_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_LEAKY_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [scale_ptr] "r"(vscale), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - 
[vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU6 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] 
"+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_LEAKY_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc 
b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2f0243279fd1be27349bfeb97a3a61eed3eff4d --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc @@ -0,0 +1,1735 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#ifdef __aarch64__ +#define INIT_S2 \ + "prfm pldl1keep, [%[inptr0]] \n" \ + "prfm pldl1keep, [%[inptr1]] \n" \ + "prfm pldl1keep, [%[inptr2]] \n" \ + "prfm pldl1keep, [%[inptr3]] \n" \ + "prfm pldl1keep, [%[inptr4]] \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" + +#define LEFT_COMPUTE_S2 \ + "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ + "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ + "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ + \ + "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ + \ + "sub %[inptr0], %[inptr0], #4 \n" \ + "sub 
%[inptr1], %[inptr1], #4 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ + \ + "sub %[inptr2], %[inptr2], #4 \n" \ + "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ + \ + "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define LEFT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, 
v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, #4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define MID_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %w[remain], #1 \n" \ + "blt 4f \n" \ + "3: \n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b 
\n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ + "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ + "ld1 {v0.4s}, [%[outptr0]] \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" \ + "ld1 {v1.4s}, [%[outptr1]] \n" + +#define RIGHT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define LEFT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] 
\n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + 
"fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define COMPUTE_S_S2 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v9.16b, v11.16b, #12 \n" \ + "ext v7.16b, v9.16b, v13.16b, #12 \n" \ + "ext v8.16b, v9.16b, v15.16b, #12 \n" \ + \ + "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ + "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ + "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ + "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" + +#define RESULT_S_S2 \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + "and v4.16b, %[bias].16b, %[bias].16b \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + 
"bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v10.16b, v9.16b, #4 \n" \ + "ext v7.16b, v12.16b, v9.16b, #4 \n" \ + "ext v8.16b, v14.16b, v9.16b, #4 \n" \ + \ + "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ + "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ + "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ + "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v16.4s \n" + +#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + "st1 {v4.4s}, [%[out]] \n" + +#else +#define INIT_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + \ + "vdup.32 q3, %[bias] @ and \n" + +#define LEFT_COMPUTE_S2 \ + "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ + "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ + "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ + \ + "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ + "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ + "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define LEFT_RESULT_S2 \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" \ + "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define MID_RESULT_S2 \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! 
\n" \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %[remain], #1 \n" \ + "blt 3f \n" \ + \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ + "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RIGHT_RESULT_S2 \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define LEFT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! 
\n" \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define COMPUTE_S_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ + "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ + "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu\n" \ + \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#endif + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + * w_in > 7 + */ +void conv_depthwise_3x3s2p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 2; + int 
size_right_remain = w_in - (7 + cnt_col * 8); + if (size_right_remain >= 9) { + cnt_col++; + size_right_remain -= 8; + } + int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + 
+ float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + din3_ptr = dr2; + din4_ptr = dr3; + dr0 = dr3; + dr1 = dr4; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i / 2 + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] 
"+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! process bottom pad + if (i + 2 > h_in) { + switch (i + 2 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = cnt_col; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", 
+ "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + int hs = -1; + int he = 2; + float out_buf[4]; + for (int j = 0; j < h_out; ++j) { + const float* dr0 = din_channel + hs * w_in; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + if (hs == -1) { + dr0 = zeros; + } + if (he > h_in) { + dr2 = zeros; + } + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + + unsigned 
int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + hs += 2; + he += 2; + } + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +// w_in > 7 +void conv_depthwise_3x3s2p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const 
int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + + unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in); + size_right_remain = 8 - size_right_remain; + + if (cnt_remain == 0 && size_right_remain == 0) { + cnt_remain = 4; + tile_w -= 1; + size_right_remain = 8; + } + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + 
w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + dr0 = dr4; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i * 2 + 5 > h_in) { + switch (i * 2 + 5 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + case 0: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + 
"v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_out; i++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + //! 
process bottom pad + if (i * 2 + 3 > h_in) { + switch (i * 2 + 3 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = tile_w; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), 
vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + float out_buf[4]; + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + for (int j = 0; j < h_out; j++) { + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + if (j * 2 + 2 >= h_in) { + switch (j + 2 - h_in) { + case 1: + din1_ptr = zero_ptr; + case 0: + din2_ptr = zero_ptr; + default: + break; + } + } + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] 
"w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + } + } + } +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc index 4617d40f4372f6589f20b50205fb307cdc705808..4bb8554202b8feeea48b07e2057ea5d20606ab8e 100644 --- a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -113,9 +113,9 @@ namespace math { "fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \ "fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \ "bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \ - "bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \ - "bif v19.16b, v6.16b, v5.16b \n" /* choose*/ \ - "bif v19.16b, v8.16b, v7.16b \n" /* choose*/ + "bif v20.16b, v4.16b, v3.16b \n" /* choose*/ \ + "bif v21.16b, v6.16b, v5.16b \n" /* choose*/ \ + "bif v22.16b, v8.16b, v7.16b \n" /* 
choose*/ #define STORE /* save result */ \ "str q19, [%[outc0]], #16\n" \ "str q20, [%[outc1]], #16\n" \ diff --git a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc index c778896550de73f888979c8337731a0b9967b5dd..0ac1705de76102c92c9e63d64721aa2467baaf04 100644 --- a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc +++ b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc @@ -102,7 +102,7 @@ void conv_depthwise_5x5s2_int8(Dtype* dout, if (h + hout_r_block > hout) { h_kernel = hout - h; } - int hs = h - padh; + int hs = h * 2 - padh; int he = hs + h_kernel * 2 + 3; #pragma omp parallel for num_threads(threads) diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index 85404d6a6e2e6246677857be8231e15afa86210d..c4fb51021e5b0288a4bc1fd476764348fdc7e450 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -703,7 +703,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -722,7 +724,7 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -734,7 +736,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -753,7 +757,7 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -765,7 +769,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) 
: [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -785,7 +791,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -812,14 +820,14 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", "v1", "v2", "v3", "v20"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v20"); #else asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1006,7 +1014,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1026,7 +1036,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1039,7 +1049,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1059,7 +1071,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -1072,7 +1084,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1092,7 +1106,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] 
"w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -1120,7 +1136,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20"); #else asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), @@ -1128,7 +1144,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1373,7 +1389,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1403,7 +1421,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1418,7 +1436,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1448,7 +1468,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -1463,7 +1483,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1493,7 +1515,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -1523,7 +1547,9 @@ inline void 
act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1544,7 +1570,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1929,7 +1955,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1963,7 +1991,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1982,7 +2020,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -2012,7 +2052,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -2031,7 +2081,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -2076,7 +2128,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -2112,7 +2166,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", 
"v2", "v3", @@ -2146,7 +2202,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif } } @@ -2744,8 +2810,18 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT float32x4_t bias, bool is_relu) { #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.f); asm volatile(NCHWC4_TRANS_INT32 "subs %w[cnt], %w[cnt], #1\n" + /* data >= -127 */ + "fcmge v4.4s, v16.4s, %[vmax].4s \n" + "fcmge v5.4s, v18.4s, %[vmax].4s \n" + "fcmge v6.4s, v17.4s, %[vmax].4s \n" + "fcmge v7.4s, v19.4s, %[vmax].4s \n" + "bif v16.16b, %[vmax].16b, v4.16b \n" + "bif v18.16b, %[vmax].16b, v5.16b \n" + "bif v17.16b, %[vmax].16b, v6.16b \n" + "bif v19.16b, %[vmax].16b, v7.16b \n" /* fp32-int32 */ "fcvtas v4.4s, v16.4s\n" "fcvtas v5.4s, v18.4s\n" @@ -2773,7 +2849,10 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT [doutc3r0] "+r"(dout3), [ptr_din] "+r"(din), [cnt] "+r"(cnt) - : [scale] "w"(scale), [bias] "w"(bias), [relu] "r"(is_relu) + : [scale] "w"(scale), + [vmax] "w"(vmax), + [bias] "w"(bias), + [relu] "r"(is_relu) : "cc", "memory", "v0", @@ -2799,6 +2878,7 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "v20", "v31"); #else + float vmax[4] = {-127.f, -127.f, -127.f, -127.f}; asm volatile(NCHWC4_TRANS_INT32 /* set 0.5 offset */ "vmov.f32 q2, #0.5\n" @@ -2815,11 +2895,21 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "vbif.f32 q3, q14, q7 @ get right offset\n" "vbif.f32 q4, q14, q8 @ get right offset\n" "vbif.f32 q5, q14, q9 @ get right offset\n" + "vld1.32 {d28-d29}, [%[vmax]] \n" /* add offset */ "vadd.f32 q10, q2, q10\n" "vadd.f32 q11, q3, q11\n" "vadd.f32 q12, q4, q12\n" "vadd.f32 q13, q5, q13\n" + /* data >= -127 */ + "vcge.f32 q6, q10, q14 @ q10 >= vmax \n" + "vcge.f32 q7, q11, q14 @ q11 >= vmax \n" + "vcge.f32 q8, q12, q14 @ q12 >= 
vmax \n" + "vcge.f32 q9, q13, q14 @ q13 >= vmax \n" + "vbif q10, q14, q6 @ choose \n" + "vbif q11, q14, q7 @ choose \n" + "vbif q12, q14, q8 @ choose \n" + "vbif q13, q14, q9 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q6, q10 @ cvt to int32\n" "vcvt.s32.f32 q7, q11 @ cvt to int32\n" @@ -2836,7 +2926,7 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "vqmovn.s16 d14, q12 @ cnt to int8\n" "vqmovn.s16 d15, q13 @ cnt to int8\n" "subs %[cnt], %[cnt], #1\n" - /* store */ + /* store data*/ "vld1.32 {d4-d7}, [%[ptr_din]]!\n" "vst1.32 {d12[0]}, [%[doutc0r0]]!\n" "vst1.32 {d13[0]}, [%[doutc1r0]]!\n" @@ -2850,7 +2940,10 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT [doutc3r0] "+r"(dout3), [ptr_din] "+r"(din), [cnt] "+r"(cnt) - : [scale] "w"(scale), [bias] "w"(bias), [relu] "r"(is_relu) + : [scale] "w"(scale), + [bias] "w"(bias), + [relu] "r"(is_relu), + [vmax] "r"(vmax) : "cc", "memory", "q2", @@ -2989,8 +3082,10 @@ template <> inline int8_t cvt_kernel(int din, float scale, float bias, bool flag_relu) { if (flag_relu) { return saturate_cast(round(LITEMAX(din * scale + bias, 0))); + } else { + auto tmp = saturate_cast(round(din * scale + bias)); + return tmp < -127 ? 
-127 : tmp; } - return saturate_cast(round(din * scale + bias)); } template <> @@ -3362,7 +3457,27 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT float32x4_t bias1, bool is_relu) { #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.f); asm volatile(INT32_NCHWC8_TO_NCHW_FP32 /* fp32-int32 */ + /* data >= -127 */ + "fcmge v10.4s, v16.4s, %[vmax].4s \n" + "fcmge v11.4s, v17.4s, %[vmax].4s \n" + "fcmge v14.4s, v18.4s, %[vmax].4s \n" + "fcmge v15.4s, v19.4s, %[vmax].4s \n" + "fcmge v20.4s, v8.4s, %[vmax].4s \n" + "fcmge v21.4s, v9.4s, %[vmax].4s \n" + "fcmge v22.4s, v12.4s, %[vmax].4s \n" + "fcmge v23.4s, v13.4s, %[vmax].4s \n" + /* choose data */ + "bif v16.16b, %[vmax].16b, v10.16b \n" + "bif v17.16b, %[vmax].16b, v11.16b \n" + "bif v18.16b, %[vmax].16b, v14.16b \n" + "bif v19.16b, %[vmax].16b, v15.16b \n" + "bif v8.16b, %[vmax].16b, v20.16b \n" + "bif v9.16b, %[vmax].16b, v21.16b \n" + "bif v12.16b, %[vmax].16b, v22.16b \n" + "bif v13.16b, %[vmax].16b, v23.16b \n" + /* fp32 - int32 */ "fcvtas v10.4s, v16.4s\n" "fcvtas v11.4s, v17.4s\n" "fcvtas v14.4s, v18.4s\n" @@ -3413,6 +3528,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT [scale1] "w"(scale1), [bias0] "w"(bias0), [bias1] "w"(bias1), + [vmax] "w"(vmax), [relu] "r"(is_relu) : "cc", "memory", @@ -3442,6 +3558,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "v23", "v31"); #else + float vmax[4] = {-127.f, -127.f, -127.f, -127.f}; asm volatile(INT32_NCHWC8_TO_NCHW_FP32 /* set +-0.5 offset */ "vmov.f32 q10, #-0.5\n" "vmov.f32 q9, #0.5\n" @@ -3475,7 +3592,18 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "vmov.f32 q9, #0.5\n" "vcgt.f32 q11, q7, q8 @ get mask > 0, in0\n" "vbif.f32 q9, q10, q11 @ get right offset\n" + "vld1.32 {d22-d23}, [%[vmax]] \n" "vadd.f32 q7, q7, q9\n" + /* data >= -127 */ + "vcge.f32 q8, q0, q11 @ q10 >= vmax \n" + "vcge.f32 q9, q2, q11 @ q10 >= vmax \n" + "vcge.f32 q10, q4, q11 @ q10 >= vmax \n" + /* choose data */ + "vbif q0, q11, q8 @ 
choose \n" + "vcge.f32 q8, q6, q11 @ q10 >= vmax \n" + "vbif q2, q11, q9 @ choose \n" + "vbif q4, q11, q10 @ choose \n" + "vbif q6, q11, q8 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q8, q0 @ cvt to int32\n" "vcvt.s32.f32 q9, q2 @ cvt to int32\n" @@ -3486,6 +3614,17 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "vqmovn.s32 d4, q9 @ cnt to int16\n" "vqmovn.s32 d8, q10 @ cnt to int16\n" "vqmovn.s32 d12, q11 @ cnt to int16\n" + /* data >= -127 */ + "vld1.32 {d22-d23}, [%[vmax]] \n" + "vcge.f32 q8, q1, q11 @ q10 >= vmax \n" + "vcge.f32 q9, q3, q11 @ q10 >= vmax \n" + "vcge.f32 q10, q5, q11 @ q10 >= vmax \n" + /* choose data */ + "vbif q1, q11, q8 @ choose \n" + "vcge.f32 q8, q7, q11 @ q10 >= vmax \n" + "vbif q3, q11, q9 @ choose \n" + "vbif q5, q11, q10 @ choose \n" + "vbif q7, q11, q8 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q8, q1 @ cvt to int32\n" "vcvt.s32.f32 q9, q3 @ cvt to int32\n" @@ -3529,6 +3668,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT [scale1] "w"(scale1), [bias0] "w"(bias0), [bias1] "w"(bias1), + [vmax] "r"(vmax), [relu] "r"(is_relu) : "cc", "memory", diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 4c5f284a19f615382ea04904184427f569f95ff3..72d887ce4e630057286d98c86970def4a9efdb04 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -207,6 +207,118 @@ void conv_depthwise_5x5s2_int8(Dtype* dout, int padh, ARMContext* ctx); +void conv_depthwise_3x3s1p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + 
const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 
96d0893bc0f0a1c145f4e58dd2caecfba78786ab..4fcef3813b792808414415fa874e14f5ef253fcd 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -573,6 +573,22 @@ template void conv_im2col_gemm_int8(const int8_t* i_data, ARMContext* ctx, const float* scale); +template void im2col(const float* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + float* data_col); + void conv_depthwise_3x3_fp32(const void* din, void* dout, int num, @@ -613,6 +629,26 @@ void conv_depthwise_3x3_fp32(const void* din, act_param, ctx); } else { +#ifdef __aarch64__ + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + act_param, + ctx); +#else +#ifdef LITE_WITH_ARM_CLANG + LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, " + "this can run in basic"; +#else conv_3x3s1_depthwise_fp32(reinterpret_cast(din), reinterpret_cast(dout), num, @@ -627,6 +663,8 @@ void conv_depthwise_3x3_fp32(const void* din, param, act_param, ctx); +#endif +#endif } } else if (stride == 2) { if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index 60f74b7feecc91a2fe8262a1fea4dce26430031d..28a2fb7e2a42a27e9ecd3d42b25f9942b481004e 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -359,6 +359,24 @@ void conv_compute_2x2_3x3_small(const float* input, const float* bias, const operators::ConvParam& param, ARMContext* ctx); + +template +void im2col(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int 
stride_w, + int dilation_h, + int dilation_w, + Dtype* data_col); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 186ad19735799dcb91641354af4b4f09692bfce9..4d08c1e957d43b5b748ffdb90fd14a07a61d0183 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast(const float* dinx, } } +template <> +void elementwise_add_grad(const float* dout_grad, + float* x_grad, + int num) { + int cnt = num >> 4; + int remain = num & 0x0f; +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* x_data = x_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + vst1q_f32(x_data, din0); + vst1q_f32(x_data + 4, din1); + vst1q_f32(x_data + 8, din2); + vst1q_f32(x_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* x_data = x_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + x_data[i] = out_data[i]; + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_add_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad != nullptr) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post >> 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = 
vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum += out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_sub(const float* dinx, const float* diny, @@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast(const float* dinx, } } } +// we assume the formula is x-y +template <> +void elementwise_sub_grad(const float* dout_grad, + float* x_grad, + float* y_grad, + int num) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, num); + } + if (y_grad != nullptr) { + int cnt = num >> 4; + int remain = num & 0x0f; + float32x4_t minus = vdupq_n_f32(-1); +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* y_data = y_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + din0 = vmulq_f32(din0, minus); + din1 = vmulq_f32(din1, minus); + din2 = vmulq_f32(din2, minus); + din3 = vmulq_f32(din3, minus); + vst1q_f32(y_data, din0); + vst1q_f32(y_data + 4, din1); + vst1q_f32(y_data + 8, din2); + vst1q_f32(y_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* y_data = y_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + y_data[i] = -out_data[i]; + } + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_sub_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if 
(y_grad != nullptr) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post << 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum -= out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_mul(const float* dinx, diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h index f8273a5bb39505b03e911b5699cc10c5be755619..06ecab08edcaf06614de94b99084be2ee80647aa 100644 --- a/lite/backends/arm/math/elementwise.h +++ b/lite/backends/arm/math/elementwise.h @@ -13,11 +13,161 @@ // limitations under the License. 
#pragma once - +#include +#include +#include +#include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace arm { namespace math { +template +void elementwise_broadcast_common(T const* x_data, + T const* y_data, + T* out_data, + std::vector x_real_dim, + std::vector y_real_dim, + std::vector out_real_dim, + std::string type, + bool is_xsize_large = false) { + int out_size = 1; + int max_dim = out_real_dim.size(); + std::vector index_array(max_dim, 0); + for (int i = 0; i < max_dim; ++i) { + out_size *= out_real_dim[i]; + } + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = 0; + for (int i = 0; i < max_dim; i++) { + if (x_real_dim[i] > 1) { + x_index = x_index * x_real_dim[i] + index_array[i]; + } + } + y_index = 0; + for (int i = 0; i < max_dim; i++) { + if (y_real_dim[i] > 1) { + y_index = y_index * y_real_dim[i] + index_array[i]; + } + } + + if (type == "add") { + out_data[out_index] = x_data[x_index] + y_data[y_index]; + } + if (type == "mul") { + out_data[out_index] = x_data[x_index] * y_data[y_index]; + } + } + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_real_dim[i]) { + index_array[i] -= out_real_dim[i]; + } else { + break; + } + } +} +template +void elementwise_compute_basic(const operators::ElementwiseParam& param, + const std::string elt_type, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + // do elementwise 
add/sub/max... + if (elt_type == "add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + } + } + } + } else if (elt_type == "mul") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "max") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = std::max(*din_ptr, diny_data); + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + } + // do activation relu/sigmod... + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? 
*dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Activation type: " << elt_type; + } + } +} template void elementwise_add(const T* dinx, const T* diny, T* dout, int num); @@ -33,6 +183,13 @@ template void elementwise_add_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_add_grad(const T* dout, T* dinx, int num); + +template +void elementwise_add_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_sub(const T* dinx, const T* diny, T* dout, int num); @@ -47,6 +204,13 @@ template void elementwise_sub_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num); + +template +void elementwise_sub_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_mul(const T* dinx, const T* diny, T* dout, int num); diff --git a/lite/backends/arm/math/gemm_prepacked_int8.cc b/lite/backends/arm/math/gemm_prepacked_int8.cc index d7e04bfc60b1214bd1e77738efa420d3e25e1456..08f88105e052322e13390b7482fed7d8dd15089b 100644 --- a/lite/backends/arm/math/gemm_prepacked_int8.cc +++ b/lite/backends/arm/math/gemm_prepacked_int8.cc @@ -572,6 +572,25 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, #define GEMM_INT8_INT8_OUT \ GEMM_TRANS_INT32_TO_FP32 \ GEMM_INT8_RELU \ + "ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v16.4s, v8.4s\n" \ + "fcmge v1.4s, v17.4s, v8.4s\n" \ + "fcmge v2.4s, v18.4s, v8.4s\n" \ + "fcmge v3.4s, v19.4s, v8.4s\n" \ + "fcmge v4.4s, v20.4s, v8.4s\n" \ + "fcmge v5.4s, v21.4s, v8.4s\n" \ + "fcmge v6.4s, v22.4s, v8.4s\n" \ + "fcmge v7.4s, v23.4s, v8.4s\n" \ + /* choose data */ \ + "bif v16.16b, v8.16b, v0.16b \n" \ + "bif v17.16b, v8.16b, v1.16b \n" \ + "bif v18.16b, v8.16b, v2.16b \n" \ + "bif v19.16b, v8.16b, 
v3.16b \n" \ + "bif v20.16b, v8.16b, v4.16b \n" \ + "bif v21.16b, v8.16b, v5.16b \n" \ + "bif v22.16b, v8.16b, v6.16b \n" \ + "bif v23.16b, v8.16b, v7.16b \n" \ "fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \ @@ -580,6 +599,24 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \ "fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \ "fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \ + /* data >= -127 */ \ + "fcmge v16.4s, v24.4s, v8.4s\n" \ + "fcmge v17.4s, v25.4s, v8.4s\n" \ + "fcmge v18.4s, v26.4s, v8.4s\n" \ + "fcmge v19.4s, v27.4s, v8.4s\n" \ + "fcmge v20.4s, v28.4s, v8.4s\n" \ + "fcmge v21.4s, v29.4s, v8.4s\n" \ + "fcmge v22.4s, v30.4s, v8.4s\n" \ + "fcmge v23.4s, v31.4s, v8.4s\n" \ + /* choose data */ \ + "bif v24.16b, v8.16b, v16.16b\n" \ + "bif v25.16b, v8.16b, v17.16b\n" \ + "bif v26.16b, v8.16b, v18.16b\n" \ + "bif v27.16b, v8.16b, v19.16b\n" \ + "bif v28.16b, v8.16b, v20.16b\n" \ + "bif v29.16b, v8.16b, v21.16b\n" \ + "bif v30.16b, v8.16b, v22.16b\n" \ + "bif v31.16b, v8.16b, v23.16b\n" \ "sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ "fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \ "sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \ @@ -648,7 +685,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "v9","v10","v11","v12","v13","v14", "v15","v16","v17","v18","v19","v20", "v21","v22","v23","v24","v25","v26", - "v27","v28","v29","v30","v31","cc"); + "v27","v28","v29","v30","v31","cc", "memory"); // clang-format on } @@ -665,6 +702,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, int k, int rem) { // clang-format off + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -676,13 +714,14 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), - [scale] 
"r"(scale) + [scale] "r"(scale), + [vmax] "r"(vmax) : "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12", "v13","v14","v15","v16","v17", "v18","v19","v20","v21","v22", "v23","v24","v25","v26","v27", - "v28","v29","v30","v31","cc"); + "v28","v29","v30","v31","cc", "memory"); // clang-format on } @@ -1179,6 +1218,25 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, #define GEMM_SDOT_INT8_OUT \ GEMM_SDOT_CVT_INT32_TO_FP32 \ GEMM_SDOT_RELU \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v8.4s, v6.4s\n" \ + "fcmge v1.4s, v9.4s, v6.4s\n" \ + "fcmge v2.4s, v10.4s, v6.4s\n" \ + "fcmge v3.4s, v11.4s, v6.4s\n" \ + "fcmge v4.4s, v12.4s, v6.4s\n" \ + "fcmge v5.4s, v13.4s, v6.4s\n" \ + "fcmge v7.4s, v14.4s, v6.4s\n" \ + /* choose data */ \ + "bif v8.16b, v6.16b, v0.16b\n" \ + "fcmge v0.4s, v15.4s, v6.4s\n" \ + "bif v9.16b, v6.16b, v1.16b\n" \ + "bif v10.16b, v6.16b, v2.16b\n" \ + "bif v11.16b, v6.16b, v3.16b\n" \ + "bif v12.16b, v6.16b, v4.16b\n" \ + "bif v13.16b, v6.16b, v5.16b\n" \ + "bif v14.16b, v6.16b, v7.16b\n" \ + "bif v15.16b, v6.16b, v0.16b \n" \ "fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \ "fcvtas v1.4s, v9.4s\n" /* 01, cvt to int */ \ "fcvtas v2.4s, v10.4s\n" /* 02, cvt to int */ \ @@ -1194,7 +1252,30 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "sqxtn2 v12.8h, v4.4s\n" /* 11, cvt int32 to int16 */ \ "sqxtn v13.4h, v5.4s\n" /* 12, cvt int32 to int16 */ \ "sqxtn v14.4h, v6.4s\n" /* 20, cvt int32 to int16 */ \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ "sqxtn2 v14.8h, v7.4s\n" /* 21, cvt int32 to int16 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v16.4s, v6.4s\n" \ + "fcmge v1.4s, v17.4s, v6.4s\n" \ + "fcmge v2.4s, v18.4s, v6.4s\n" \ + "fcmge v3.4s, v19.4s, v6.4s\n" \ + "fcmge v4.4s, v20.4s, v6.4s\n" \ + "fcmge v5.4s, v21.4s, v6.4s\n" \ + "fcmge v7.4s, v22.4s, v6.4s\n" \ + "fcmge v8.4s, v23.4s, v6.4s\n" \ + "fcmge v9.4s, v24.4s, v6.4s\n" \ + /* choose data */ \ + "bif v16.16b, 
v6.16b, v0.16b\n" \ + "fcmge v0.4s, v25.4s, v6.4s\n" \ + "bif v17.16b, v6.16b, v1.16b\n" \ + "bif v18.16b, v6.16b, v2.16b\n" \ + "bif v19.16b, v6.16b, v3.16b\n" \ + "bif v20.16b, v6.16b, v4.16b\n" \ + "bif v21.16b, v6.16b, v5.16b\n" \ + "bif v22.16b, v6.16b, v7.16b\n" \ + "bif v23.16b, v6.16b, v8.16b\n" \ + "bif v24.16b, v6.16b, v9.16b\n" \ + "bif v25.16b, v6.16b, v0.16b\n" \ "fcvtas v0.4s, v16.4s\n" /* 22, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 30, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 31, cvt to int */ \ @@ -1214,7 +1295,22 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "sqxtn v19.4h, v6.4s\n" /* 42, cvt int32 to int16 */ \ "sqxtn v20.4h, v7.4s\n" /* 50, cvt int32 to int16 */ \ "sqxtn2 v20.8h, v8.4s\n" /* 51, cvt int32 to int16 */ \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ "sqxtn v21.4h, v9.4s\n" /* 52, cvt int32 to int16 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v26.4s, v6.4s\n" \ + "fcmge v1.4s, v27.4s, v6.4s\n" \ + "fcmge v2.4s, v28.4s, v6.4s\n" \ + "fcmge v3.4s, v29.4s, v6.4s\n" \ + "fcmge v4.4s, v30.4s, v6.4s\n" \ + "fcmge v5.4s, v31.4s, v6.4s\n" \ + /* choose data */ \ + "bif v26.16b, v6.16b, v0.16b\n" \ + "bif v27.16b, v6.16b, v1.16b\n" \ + "bif v28.16b, v6.16b, v2.16b\n" \ + "bif v29.16b, v6.16b, v3.16b\n" \ + "bif v30.16b, v6.16b, v4.16b\n" \ + "bif v31.16b, v6.16b, v5.16b\n" \ "fcvtas v0.4s, v26.4s\n" /* 60, cvt to int */ \ "fcvtas v1.4s, v27.4s\n" /* 61, cvt to int */ \ "fcvtas v2.4s, v28.4s\n" /* 62, cvt to int */ \ @@ -1318,6 +1414,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, int k, int tail) { // clang-format off + float32_t vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -1331,7 +1428,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, [c_ptr5] "+r"(c_ptr5), [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) + : [bias_ptr] "r"(bias), 
[scale] "r"(scale), [relu] "r"(is_relu), [vmax] "r"(vmax) : "cc","memory","v0","v1","v2","v3", "v4","v5","v6","v7","v8","v9","v10", "v11","v12","v13","v14","v15","v16","v17", @@ -1614,6 +1711,24 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "vadd.f32 q3, q11, q3\n" /* r21, add offset */ \ "vadd.f32 q4, q12, q4\n" /* r30, add offset */ \ "vadd.f32 q5, q13, q5\n" /* r31, add offset */ \ + "vld1.32 {d12-d13}, [%[vmax]]\n" /* set q4 = -127 \n"*/ \ + "vcge.f32 q7, q8, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q10, q9, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q11, q0, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q12, q1, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q13, q2, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q14, q3, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q15, q4, q6\n" /* @ q8 >= -127 \n */ \ + /* choose data */ \ + "vbif q8, q6, q7\n" /* @ choose */ \ + "vcge.f32 q7, q5, q6\n" /* @ q8 >= -127 \n */ \ + "vbif q9, q6, q10\n" /* @ choose */ \ + "vbif q0, q6, q11\n" /* @ choose */ \ + "vbif q1, q6, q12\n" /* @ choose */ \ + "vbif q2, q6, q13\n" /* @ choose */ \ + "vbif q3, q6, q14\n" /* @ choose */ \ + "vbif q4, q6, q15\n" /* @ choose */ \ + "vbif q5, q6, q7\n" /* @ choose */ \ "vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \ "vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \ "vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \ @@ -1682,7 +1797,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "q14", "q15", "r0", - "cc"); + "cc", + "memory"); } template <> @@ -1697,6 +1813,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, bool is_relu, int k, int rem) { + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -1708,6 +1825,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), + [vmax] "r"(vmax), [scale] "r"(scale) : "q0", "q1", @@ -1726,7 +1844,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, 
"q14", "q15", "r0", - "cc"); + "cc", + "memory"); } #endif // __aarch64__ // NOLINT diff --git a/lite/backends/arm/math/gemv_arm_int8.cc b/lite/backends/arm/math/gemv_arm_int8.cc index dab42cdeca28d40622590632985603ce8eab1fb9..98c50de9e370fbe39c35156bf631b35362ff21b4 100644 --- a/lite/backends/arm/math/gemv_arm_int8.cc +++ b/lite/backends/arm/math/gemv_arm_int8.cc @@ -79,6 +79,7 @@ inline void write_gemv_out(const int* in, for (int i = 0; i < size; ++i) { out[0] = saturate_cast(roundf(*(in++) * *(scale++) + *(bias++))); + out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127 if (flag_relu) { out[0] = out[0] > 0 ? out[0] : 0; } @@ -87,6 +88,7 @@ inline void write_gemv_out(const int* in, } else { for (int i = 0; i < size; ++i) { out[0] = saturate_cast(roundf(*(in++) * *(scale++))); + out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127 if (flag_relu) { out[0] = out[0] > 0 ? out[0] : 0; } diff --git a/lite/backends/arm/math/increment.cc b/lite/backends/arm/math/increment.cc index 583ff52077e720510e66fcdb9604d1dc8992a90d..62c4f41eacda0356ca3967af877244856b3156d7 100644 --- a/lite/backends/arm/math/increment.cc +++ b/lite/backends/arm/math/increment.cc @@ -20,18 +20,7 @@ namespace paddle { namespace lite { namespace arm { -namespace math { -void increment(const float* input, - const int n, - const float step, - float* out, - Context* ctx) { - for (int i = 0; i < n; i++) { - out[i] = input[i] + step; - } -} - -} // namespace math +namespace math {} // namespace math } // namespace arm } // namespace lite } // namespace paddle diff --git a/lite/backends/arm/math/increment.h b/lite/backends/arm/math/increment.h index 028db0fd55e9507aa4f161339e4a8b0cd2e59ffe..ec6217d105bb73b5ab230518876471af91880d2d 100644 --- a/lite/backends/arm/math/increment.h +++ b/lite/backends/arm/math/increment.h @@ -21,11 +21,16 @@ namespace paddle { namespace lite { namespace arm { namespace math { -void increment(const float* input, +template +void increment(const T* input, const int n, const 
float step, - float* out, - Context* ctx); + T* out, + Context* ctx) { + for (int i = 0; i < n; i++) { + out[i] = input[i] + static_cast(step); + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc index fd9126ab48c8f829c82d0c78a338074c695f0b9c..214c386d553e3d5548bb4750c3130191a650830f 100644 --- a/lite/backends/arm/math/layout.cc +++ b/lite/backends/arm/math/layout.cc @@ -358,6 +358,8 @@ void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { "v14", "v15"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C8 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -375,6 +377,7 @@ void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { [stride_w] "+r"(stride_w) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel @@ -478,6 +481,8 @@ void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { "v10", "v11"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C4 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -491,6 +496,7 @@ void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { [stride] "+r"(stride) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } for (int i = 0; i < remain; i++) { @@ -593,6 +599,8 @@ void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { "v14", "v15"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C8 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -610,6 +618,7 @@ void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { [stride_w] "+r"(stride_w) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } for (int i = 0; i < remain; i++) { diff --git 
a/lite/backends/arm/math/lstm.cc b/lite/backends/arm/math/lstm.cc new file mode 100644 index 0000000000000000000000000000000000000000..5a2a263bb4fa2dc7b4ec54d84c698651a058f933 --- /dev/null +++ b/lite/backends/arm/math/lstm.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/lstm.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void add_bias_rowwise(Tensor* input, + const Tensor* bias, + int start_w, + int end_w) { + auto in_dim = input->dims(); + int width = input->numel() / in_dim[0]; + int w_adds = width < end_w ? 
width : end_w; + float* i_data = input->mutable_data(); + const float* b_data = bias->data(); + for (int i = 0; i < in_dim[0]; ++i) { + for (int w = start_w; w < w_adds; ++w) { + i_data[w] += b_data[w]; + } + } +} +void vector_dot( + float* out, const float* in, const float* v1, int size, const float* v2) { + int loop = size >> 2; + int remain = size & 3; + const float* in_ptr = in; + float* out_ptr = out; + const float* v1_ptr = v1; + const float* v2_ptr = v2; + for (int i = 0; i < loop; ++i) { + float32x4_t in = vld1q_f32(in_ptr); + float32x4_t data1 = vld1q_f32(v1_ptr); + if (!v2) { + // in_out * v1 + float32x4_t out = vmulq_f32(in, data1); + vst1q_f32(out_ptr, out); + in_ptr += 4; + v1_ptr += 4; + out_ptr += 4; + } else { + // in_out + v1 * v2 + float32x4_t data2 = vld1q_f32(v2_ptr); + float32x4_t out = vmlaq_f32(in, data1, data2); + vst1q_f32(out_ptr, out); + in_ptr += 4; + v1_ptr += 4; + out_ptr += 4; + v2_ptr += 4; + } + } + for (int i = 0; i < remain; ++i) { + if (!v2) { + out_ptr[i] = in_ptr[i] * v1_ptr[i]; + ++out_ptr; + ++in_ptr; + ++v1_ptr; + } else { + out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; + ++out_ptr; + ++in_ptr; + ++v1_ptr; + ++v2_ptr; + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/lstm.h b/lite/backends/arm/math/lstm.h new file mode 100644 index 0000000000000000000000000000000000000000..e04581b055a93ac09da5ec6d5d57263fa2ad6261 --- /dev/null +++ b/lite/backends/arm/math/lstm.h @@ -0,0 +1,137 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/backends/arm/math/activation.h" +#include "lite/core/tensor.h" +#include "lite/utils/logging.h" +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void add_bias_rowwise(Tensor* input, + const Tensor* bias, + int start_w, + int end_w); + +inline float* row_offset(Tensor& input, int start) { // NOLINT + auto in_dim = input.dims(); + int width = input.numel() / in_dim[0]; + int offset = start < in_dim[0] ? start * width : input.numel(); + return input.mutable_data() + offset; +} +template +struct LstmMetaValue { + T* gate_value; + T* prev_state_value; + T* state_value; + T* state_active_value; + T* output_value; + T* check_ig; + T* check_fg; + T* check_og; +}; + +template +void activation( + const T* din, T* dout, int size, std::string act_str, int threads) { + if (act_str == "sigmoid") { + act_sigmoid(din, dout, size, threads); + } else if (act_str == "tanh") { + act_tanh(din, dout, size, threads); + } else if (act_str == "relu") { + act_relu(din, dout, size, threads); + } else { + LOG(FATAL) << "unsupport activation " << act_str; + } +} + +void vector_dot(float* out, + const float* in, + const float* v1, + int size, + const float* v2 = nullptr); + +template +struct LstmUnitFunctor { + static void compute(LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + std::string gate_act, + std::string cell_act, + std::string cand_act, + int threads) { + for (int b = 0; b < batch_size; ++b) { + const int temp_len = frame_size; + float zero_ptr[temp_len]; // 
NOLINT + memset(zero_ptr, 0, sizeof(float) * temp_len); + + T* value_in = value.gate_value; + T* value_ig = value_in + frame_size; + T* value_fg = value_ig + frame_size; + T* value_og = value_fg + frame_size; + T* state = value.state_value; + T* state_act = value.state_active_value; + T* output = value.output_value; + + T* check_i = value.check_ig ? value.check_ig : zero_ptr; + T* check_f = value.check_fg ? value.check_fg : zero_ptr; + T* check_o = value.check_og ? value.check_og : zero_ptr; + T* prev_state = + value.prev_state_value ? value.prev_state_value : zero_ptr; + + activation(value_in, value_in, frame_size, gate_act, threads); + vector_dot(value_ig, value_ig, prev_state, frame_size, check_i); + vector_dot(value_fg, value_fg, prev_state, frame_size, check_f); + activation(value_ig, value_ig, frame_size, cell_act, threads); + activation(value_fg, value_fg, frame_size, cell_act, threads); + vector_dot(state, value_in, value_ig, frame_size); + vector_dot(state, state, prev_state, frame_size, value_fg); + + for (int i = 0; i < frame_size; ++i) { + if (cell_clip > 0.0) { + if (state[i] < -1.0 * cell_clip) { + state[i] = -1.0 * cell_clip; + } + if (state[i] > cell_clip) { + state[i] = cell_clip; + } + } + } + + vector_dot(value_og, value_og, state, frame_size, check_o); + activation(value_og, value_og, frame_size, cell_act, threads); + activation(state, state_act, frame_size, cand_act, threads); + vector_dot(value.output_value, value_og, state_act, frame_size); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index cb9c049d81aee73b65bacd27a64138779d1532cc..b41afc1c29e121f905b0abc48bae98705bc0ee16 
100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -2289,6 +2289,29 @@ void sgemm_prepacked_8x12(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); int threads = ctx->threads(); + + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! 
MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK)); x_block /= NBLOCK; @@ -2837,7 +2860,172 @@ void sgemm_prepacked_8x12(bool is_transB, "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "11: \n" /* check if relu */ + + "11: \n" /* check activation */ + "cmp %w[flag_act], #1 \n" /* check if has relu */ + "bne 12f \n" /* jump if no relu */ + "movi v0.4s, #0 \n" /* for relu*/ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ + "fmax v12.4s, v12.4s, v0.4s \n" /* relu*/ + "fmax v13.4s, v13.4s, v0.4s \n" /* relu*/ + "fmax v14.4s, v14.4s, v0.4s \n" /* relu*/ + "fmax v15.4s, v15.4s, v0.4s \n" /* relu*/ + "fmax v16.4s, v16.4s, v0.4s \n" /* relu*/ + "fmax v17.4s, v17.4s, v0.4s \n" /* relu*/ + "fmax v18.4s, v18.4s, v0.4s \n" /* relu*/ + "fmax v19.4s, v19.4s, v0.4s \n" /* relu*/ + "fmax v20.4s, v20.4s, v0.4s \n" /* relu*/ + "fmax v21.4s, v21.4s, v0.4s \n" /* relu*/ + "fmax v22.4s, v22.4s, v0.4s \n" /* relu*/ + "fmax v23.4s, v23.4s, v0.4s \n" /* relu*/ + "fmax v24.4s, v24.4s, v0.4s \n" /* relu*/ + "fmax v25.4s, v25.4s, v0.4s \n" /* relu*/ + "fmax v26.4s, v26.4s, v0.4s \n" /* relu*/ + "fmax v27.4s, v27.4s, v0.4s \n" /* relu*/ + "fmax v28.4s, v28.4s, v0.4s \n" /* relu*/ + "fmax v29.4s, v29.4s, v0.4s \n" /* relu*/ + "fmax v30.4s, v30.4s, v0.4s \n" /* relu*/ + "fmax v31.4s, v31.4s, v0.4s \n" /* relu*/ + "b 20f \n" /* relu end */ + //! no act + "12: \n" /* no relu */ + "cmp %w[flag_act], #0 \n" /* check no act */ + "beq 20f \n" /* no act end */ + //! 
relu6 + "cmp %w[flag_act], #2 \n" /* check if has relu6 */ + "bne 13f \n" /* jump if no relu6 */ + "movi v0.4s, #0 \n" /* for relu6 */ + "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ + "fmax v12.4s, v12.4s, v0.4s \n" /* relu6 */ + "fmax v13.4s, v13.4s, v0.4s \n" /* relu6 */ + "fmax v14.4s, v14.4s, v0.4s \n" /* relu6 */ + "fmax v15.4s, v15.4s, v0.4s \n" /* relu6 */ + "fmax v16.4s, v16.4s, v0.4s \n" /* relu6 */ + "fmax v17.4s, v17.4s, v0.4s \n" /* relu6 */ + "fmax v18.4s, v18.4s, v0.4s \n" /* relu6 */ + "fmax v19.4s, v19.4s, v0.4s \n" /* relu6 */ + "fmax v20.4s, v20.4s, v0.4s \n" /* relu6 */ + "fmax v21.4s, v21.4s, v0.4s \n" /* relu6 */ + "fmax v22.4s, v22.4s, v0.4s \n" /* relu6 */ + "fmax v23.4s, v23.4s, v0.4s \n" /* relu6 */ + "fmax v24.4s, v24.4s, v0.4s \n" /* relu6 */ + "fmax v25.4s, v25.4s, v0.4s \n" /* relu6 */ + "fmax v26.4s, v26.4s, v0.4s \n" /* relu6 */ + "fmax v27.4s, v27.4s, v0.4s \n" /* relu6 */ + "fmax v28.4s, v28.4s, v0.4s \n" /* relu6 */ + "fmax v29.4s, v29.4s, v0.4s \n" /* relu6 */ + "fmax v30.4s, v30.4s, v0.4s \n" /* relu6 */ + "fmax v31.4s, v31.4s, v0.4s \n" /* relu6 */ + "fmin v8.4s, v8.4s, v1.4s \n" /* relu6 */ + "fmin v9.4s, v9.4s, v1.4s \n" /* relu6 */ + "fmin v10.4s, v10.4s, v1.4s \n" /* relu6 */ + "fmin v11.4s, v11.4s, v1.4s \n" /* relu6 */ + "fmin v12.4s, v12.4s, v1.4s \n" /* relu6 */ + "fmin v13.4s, v13.4s, v1.4s \n" /* relu6 */ + "fmin v14.4s, v14.4s, v1.4s \n" /* relu6 */ + "fmin v15.4s, v15.4s, v1.4s \n" /* relu6 */ + "fmin v16.4s, v16.4s, v1.4s \n" /* relu6 */ + "fmin v17.4s, v17.4s, v1.4s \n" /* relu6 */ + "fmin v18.4s, v18.4s, v1.4s \n" /* relu6 */ + "fmin v19.4s, v19.4s, v1.4s \n" /* relu6 */ + "fmin v20.4s, v20.4s, v1.4s \n" /* relu6 */ + "fmin v21.4s, v21.4s, v1.4s \n" /* relu6 */ + "fmin v22.4s, v22.4s, v1.4s \n" /* relu6 */ + "fmin v23.4s, v23.4s, v1.4s 
\n" /* relu6 */ + "fmin v24.4s, v24.4s, v1.4s \n" /* relu6 */ + "fmin v25.4s, v25.4s, v1.4s \n" /* relu6 */ + "fmin v26.4s, v26.4s, v1.4s \n" /* relu6 */ + "fmin v27.4s, v27.4s, v1.4s \n" /* relu6 */ + "fmin v28.4s, v28.4s, v1.4s \n" /* relu6 */ + "fmin v29.4s, v29.4s, v1.4s \n" /* relu6 */ + "fmin v30.4s, v30.4s, v1.4s \n" /* relu6 */ + "fmin v31.4s, v31.4s, v1.4s \n" /* relu6 */ + "b 20f \n" /* relu6 end */ + //! leakey relu + "13: \n" /* otherwise is leakey relu */ + "movi v0.4s, #0 \n" /* for leakey relu */ + "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ + "bif v11.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v12.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v12.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v13.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v13.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v14.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v14.4s, v1.4s \n" /* vmulq_f32 */ + "bif v12.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v13.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v14.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v15.4s, v1.4s \n" /* vmulq_f32 */ + "bif v15.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, 
v18.4s, v1.4s \n" /* vmulq_f32 */ + "bif v16.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v17.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v18.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ + "bif v19.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ + "bif v24.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v25.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v26.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ + "bif v27.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ + "bif v28.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v29.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v30.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v31.4s, 
v1.4s \n" /* vmulq_f32 */ + "bif v31.16b, v3.16b, v2.16b \n" /* choose*/ + "20: \n" /* act end */ + "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ @@ -2861,7 +3049,9 @@ void sgemm_prepacked_8x12(bool is_transB, [c_ptr7] "+r"(c_ptr7) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13", @@ -2884,13 +3074,6 @@ void sgemm_prepacked_8x12(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float *dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } void sgemm_prepacked_4x4(bool is_transB, @@ -2911,6 +3094,28 @@ void sgemm_prepacked_4x4(bool is_transB, auto workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } const int n_block = 4; const int m_block = 4; //! 
MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 @@ -3137,7 +3342,51 @@ void sgemm_prepacked_4x4(bool is_transB, "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ - "11: \n" /* check if relu */ + "11: \n" /* check activation */ + "cmp %w[flag_act], #1 \n" /* check if has relu */ + "bne 12f \n" /* jump if no relu */ + "movi v0.4s, #0 \n" /* for relu*/ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ + "b 20f \n" /* relu end */ + //! no act + "12: \n" /* no relu */ + "cmp %w[flag_act], #0 \n" /* check no act */ + "beq 20f \n" /* no act end */ + //! relu6 + "cmp %w[flag_act], #2 \n" /* check if has relu6 */ + "bne 13f \n" /* jump if no relu6 */ + "movi v0.4s, #0 \n" /* for relu6 */ + "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ + + "fmin v8.4s, v8.4s, v1.4s \n" /* relu6*/ + "fmin v9.4s, v9.4s, v1.4s \n" /* relu6*/ + "fmin v10.4s, v10.4s, v1.4s \n" /* relu6*/ + "fmin v11.4s, v11.4s, v1.4s \n" /* relu6*/ + "b 20f \n" /* relu6 end */ + //! 
leakey relu + "13: \n" /* otherwise is leakey relu */ + "movi v0.4s, #0 \n" /* for leakey relu */ + "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v12.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v13.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ + "bif v11.16b, v13.16b, v12.16b \n" /* choose*/ + "20: \n" /* act end */ "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ @@ -3153,7 +3402,9 @@ void sgemm_prepacked_4x4(bool is_transB, [c_ptr3] "+r"(c_ptr3) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11"); @@ -3169,13 +3420,6 @@ void sgemm_prepacked_4x4(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float *dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } #else // __aarch64__ /** @@ -3206,6 +3450,28 @@ void sgemm_prepacked_6x8(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); @@ -3223,6 +3489,8 @@ void sgemm_prepacked_6x8(bool is_transB, tail_pre = KBLOCK; } + //! merge tail_pre and flag_act + tail_pre = (tail_pre << 2 | flag_act); bool flag_p_remain = false; int remain = 0; @@ -3456,13 +3724,14 @@ void sgemm_prepacked_6x8(bool is_transB, "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" + "subs %[k], %[k], #1 @ k--\n" "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" + "bne 1b @ jump to main loop\n" + "0: @ process tail\n" + "sub %[tails], %[tails], #4 @ tail--\n" + "cmp %[tails], #4 @ cmp with act bits\n" + "blt 3f @ jump to tail = 1\n" /* Unroll 0*/ "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
@ load b2\n" "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" @@ -3471,9 +3740,10 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "subs %[tails], %[tails], #1 @ tail--\n" + "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" @@ -3482,16 +3752,17 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 4f @ jump to tail==2\n" + "blt 4f @ jump to tail==2\n" /* Unroll 1*/ "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" + "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" @@ -3500,8 +3771,9 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 5f @ jump to tail==3\n" + "blt 5f @ jump to tail==3\n" /* Unroll 2 */ + "sub %[tails], %[tails], #4 @ tail--\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! 
@ load a4,a5, a0,a1\n" "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" @@ -3579,7 +3851,99 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "2: @ check relu\n" + "2: @ check activation\n" + //! relu + "cmp %[tails], #1 @ check if has relu\n" + "bne 6f @ jump if not relu \n" + "vmov.u32 q0, #0 @ for relu\n" + "vmax.f32 q4, q4, q0 @ for relu\n" + "vmax.f32 q5, q5, q0 @ for relu\n" + "vmax.f32 q6, q6, q0 @ for relu\n" + "vmax.f32 q7, q7, q0 @ for relu\n" + "vmax.f32 q8, q8, q0 @ for relu\n" + "vmax.f32 q9, q9, q0 @ for relu\n" + "vmax.f32 q10, q10, q0 @ for relu\n" + "vmax.f32 q11, q11, q0 @ for relu\n" + "vmax.f32 q12, q12, q0 @ for relu\n" + "vmax.f32 q13, q13, q0 @ for relu\n" + "vmax.f32 q14, q14, q0 @ for relu\n" + "vmax.f32 q15, q15, q0 @ for relu\n" + "b 10f @ relu end\n" + "6: @ no relu \n" + "cmp %[tails], #0 @ check no act\n" + "beq 10f @ no act end \n" + //! 
relu6 + "cmp %[tails], #2 @ check if has relu6\n" + "bne 7f @ jump if no relu6 \n" + "vmov.u32 q0, #0 @ for relu6\n" + "vmax.f32 q4, q4, q0 @ for relu6\n" + "vmax.f32 q5, q5, q0 @ for relu6\n" + "vmax.f32 q6, q6, q0 @ for relu6\n" + "vmax.f32 q7, q7, q0 @ for relu6\n" + "vmax.f32 q8, q8, q0 @ for relu6\n" + "vmax.f32 q9, q9, q0 @ for relu6\n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" + "vmax.f32 q10, q10, q0 @ for relu6\n" + "vmax.f32 q11, q11, q0 @ for relu6\n" + "vmax.f32 q12, q12, q0 @ for relu6\n" + "vmax.f32 q13, q13, q0 @ for relu6\n" + "vmax.f32 q14, q14, q0 @ for relu6\n" + "vmax.f32 q15, q15, q0 @ for relu6\n" + + "vmin.f32 q4, q4, q1 @ for relu6\n" + "vmin.f32 q5, q5, q1 @ for relu6\n" + "vmin.f32 q6, q6, q1 @ for relu6\n" + "vmin.f32 q7, q7, q1 @ for relu6\n" + "vmin.f32 q8, q8, q1 @ for relu6\n" + "vmin.f32 q9, q9, q1 @ for relu6\n" + "vmin.f32 q10, q10, q1 @ for relu6\n" + "vmin.f32 q11, q11, q1 @ for relu6\n" + "vmin.f32 q12, q12, q1 @ for relu6\n" + "vmin.f32 q13, q13, q1 @ for relu6\n" + "vmin.f32 q14, q14, q1 @ for relu6\n" + "vmin.f32 q15, q15, q1 @ for relu6\n" + "b 10f @ relu6 end \n" + //! 
leakey relu + "7: @ otherwise is leakey relu\n" + "vmov.u32 q0, #0 @ for leakey relu \n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" + "vcge.f32 q2, q4, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q4, q1 @ vmulq_f32 \n" + "vbif q4, q3, q2 @ choose \n" + "vcge.f32 q2, q5, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q5, q1 @ vmulq_f32 \n" + "vbif q5, q3, q2 @ choose \n" + "vcge.f32 q2, q6, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q6, q1 @ vmulq_f32 \n" + "vbif q6, q3, q2 @ choose \n" + "vcge.f32 q2, q7, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q7, q1 @ vmulq_f32 \n" + "vbif q7, q3, q2 @ choose \n" + "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" + "vbif q8, q3, q2 @ choose \n" + "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" + "vbif q9, q3, q2 @ choose \n" + "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" + "vbif q10, q3, q2 @ choose \n" + "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" + "vbif q11, q3, q2 @ choose \n" + "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" + "vbif q12, q3, q2 @ choose \n" + "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" + "vbif q13, q3, q2 @ choose \n" + "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" + "vbif q14, q3, q2 @ choose \n" + "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" + "vbif q15, q3, q2 @ choose \n" + "10: @ act end \n" "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d16-d19}, [%[c_ptr2]]! 
@ store r2\n" @@ -3597,7 +3961,8 @@ void sgemm_prepacked_6x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r" (alpha) : "q0","q1","q2","q3","q4", "q5","q6","q7","q8","q9","q10","q11", "q12","q13","q14","q15","cc","memory"); @@ -3616,13 +3981,6 @@ void sgemm_prepacked_6x8(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float* dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } void sgemm_prepacked_4x8(bool is_transB, @@ -3642,6 +4000,28 @@ void sgemm_prepacked_4x8(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); @@ -3786,13 +4166,13 @@ void sgemm_prepacked_4x8(bool is_transB, "vmla.f32 q15, q3, q4\n" /* cr31 += beta * c_r31 */ "11: \n" /* check loop count */ "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load a0~a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! 
@ load b1\n" - "cmp %[k], #0 @ check weather k is bigger than " + "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[k], #0 @ check weather k is bigger than " "0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" + "beq 0f @ jump to tail\n" + "1: @ main loop for k\n" /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" + "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" @@ -3920,8 +4300,76 @@ void sgemm_prepacked_4x8(bool is_transB, "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" /*aptr - 16*/ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "2: @ check relu\n" + "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" + "2: @ check relu\n" + //! relu + "cmp %[flag_act], #1 @ check if has relu\n" + "bne 6f @ jump if not relu \n" + "vmov.u32 q0, #0 @ for relu\n" + "vmax.f32 q8, q8, q0 @ for relu\n" + "vmax.f32 q9, q9, q0 @ for relu\n" + "vmax.f32 q10, q10, q0 @ for relu\n" + "vmax.f32 q11, q11, q0 @ for relu\n" + "vmax.f32 q12, q12, q0 @ for relu\n" + "vmax.f32 q13, q13, q0 @ for relu\n" + "vmax.f32 q14, q14, q0 @ for relu\n" + "vmax.f32 q15, q15, q0 @ for relu\n" + "b 10f @ relu end\n" + "6: @ no relu \n" + "cmp %[flag_act], #0 @ check no act\n" + "beq 10f @ no act end \n" + //! 
relu6 + "cmp %[flag_act], #2 @ check if has relu6\n" + "bne 7f @ jump if no relu6 \n" + "vmov.u32 q0, #0 @ for relu6\n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" + "vmax.f32 q8, q8, q0 @ for relu6\n" + "vmax.f32 q9, q9, q0 @ for relu6\n" + "vmax.f32 q10, q10, q0 @ for relu6\n" + "vmax.f32 q11, q11, q0 @ for relu6\n" + "vmax.f32 q12, q12, q0 @ for relu6\n" + "vmax.f32 q13, q13, q0 @ for relu6\n" + "vmax.f32 q14, q14, q0 @ for relu6\n" + "vmax.f32 q15, q15, q0 @ for relu6\n" + + "vmin.f32 q8, q8, q1 @ for relu6\n" + "vmin.f32 q9, q9, q1 @ for relu6\n" + "vmin.f32 q10, q10, q1 @ for relu6\n" + "vmin.f32 q11, q11, q1 @ for relu6\n" + "vmin.f32 q12, q12, q1 @ for relu6\n" + "vmin.f32 q13, q13, q1 @ for relu6\n" + "vmin.f32 q14, q14, q1 @ for relu6\n" + "vmin.f32 q15, q15, q1 @ for relu6\n" + "b 10f @ relu6 end \n" + //! leakey relu + "7: @ otherwise is leakey relu\n" + "vmov.u32 q0, #0 @ for leakey relu \n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" + "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" + "vbif q8, q3, q2 @ choose \n" + "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" + "vbif q9, q3, q2 @ choose \n" + "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" + "vbif q10, q3, q2 @ choose \n" + "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" + "vbif q11, q3, q2 @ choose \n" + "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" + "vbif q12, q3, q2 @ choose \n" + "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" + "vbif q13, q3, q2 @ choose \n" + "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" + "vbif q14, q3, q2 @ choose \n" + "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" + "vbif q15, q3, q2 @ choose \n" + "10: @ act end \n" "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d20-d23}, [%[c_ptr1]]! 
@ store r1\n" "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" @@ -3935,7 +4383,9 @@ void sgemm_prepacked_4x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "q0","q1","q2","q3", "q4","q5","q6","q7","q8","q9","q10", "q11","q12","q13","q14","q15","cc","memory"); @@ -3951,13 +4401,6 @@ void sgemm_prepacked_4x8(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float* dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } #endif // __aarch64__ diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 07cbd00378c082e311e194c7b22b6d3cb195a63a..0955b09d92f64066000b03c4487f359880f1c2a5 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -67,7 +67,6 @@ void pooling_basic(const float* din, } } else if (pooling_type == "avg") { // Pooling_average_include_padding - // Pooling_average_exclude_padding for (int n = 0; n < num; ++n) { float* dout_batch = dout + n * chout * size_channel_out; const float* din_batch = din + n * chin * size_channel_in; @@ -906,7 +905,9 @@ void pooling1x1s2p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1021,7 +1022,9 @@ void pooling2x2s2_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1104,7 +1107,9 @@ void pooling2x2s2_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto 
data_out = static_cast(dout); @@ -1117,6 +1122,9 @@ void pooling2x2s2_avg(const float* din, int w_unroll_size = wout / 4; int w_unroll_remian = wout - w_unroll_size * 4; float32x4_t vcoef = vdupq_n_f32(0.25f); // divided by 4 + auto zero_ptr = + static_cast(TargetMalloc(TARGET(kARM), win * sizeof(float))); + memset(zero_ptr, 0, win * sizeof(float)); for (int n = 0; n < num; ++n) { float* data_out_batch = data_out + n * chout * size_channel_out; @@ -1132,7 +1140,7 @@ void pooling2x2s2_avg(const float* din, auto dr0 = r0; auto dr1 = r1; if (h * S + K - P > hin) { - dr1 = r0; + dr1 = zero_ptr; } int cnt_num = w_unroll_size; if (w_unroll_size > 0) { @@ -1178,6 +1186,7 @@ void pooling2x2s2_avg(const float* din, } } } + TargetFree(TARGET(kARM), zero_ptr); } void pooling3x3s1p1_max(const float* din, @@ -1188,7 +1197,9 @@ void pooling3x3s1p1_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1331,7 +1342,9 @@ void pooling3x3s1p1_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1389,7 +1402,13 @@ void pooling3x3s1p1_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -1401,7 +1420,11 @@ void pooling3x3s1p1_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom >= 1) { + coef_h = 1.f / 3; + } else { + coef_h = 0.5f; + } } default: break; @@ -1477,8 +1500,12 @@ void pooling3x3s1p1_avg(const float* din, int st = wstart > 0 ? 
wstart : 0; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } } } if (exclusive) { @@ -1509,7 +1536,9 @@ void pooling3x3s1p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1646,7 +1675,9 @@ void pooling3x3s1p0_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1692,7 +1723,13 @@ void pooling3x3s1p0_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom = 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -1704,7 +1741,11 @@ void pooling3x3s1p0_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom >= 1) { + coef_h = 1.f / 3; + } else { + coef_h = 0.5f; + } } default: break; @@ -1776,8 +1817,12 @@ void pooling3x3s1p0_avg(const float* din, int st = wstart > 0 ? 
wstart : 0; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } } } if (exclusive) { @@ -1811,7 +1856,9 @@ void pooling3x3s2p1_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1955,7 +2002,9 @@ void pooling3x3s2p1_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -2015,7 +2064,13 @@ void pooling3x3s2p1_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -2027,7 +2082,11 @@ void pooling3x3s2p1_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom == 0) { + coef_h = 1.f / 2; + } else { + coef_h = 1.f / 3; + } } default: break; @@ -2102,8 +2161,12 @@ void pooling3x3s2p1_avg(const float* din, float coef = coef_h / 3.f; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } } } int st = wstart > 0 ? 
wstart : 0; @@ -2135,7 +2198,9 @@ void pooling3x3s2p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { const int K = 3; const int P = 0; const int S = 2; @@ -2261,7 +2326,9 @@ void pooling3x3s2p0_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { const int K = 3; const int P = 0; const int S = 2; @@ -2303,11 +2370,33 @@ void pooling3x3s2p0_avg(const float* din, case 2: dr1 = zero_ptr; dr2 = zero_ptr; - coef_h = 1.f; + if (exclusive) { + coef_h = 1.f; + } else { + if (pad_bottom >= 2) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.0f; + } + } break; case 1: dr2 = zero_ptr; - coef_h = 0.5f; + if (exclusive) { + if (fabsf(coef_h - 0.5f) < 1e-6f) { + coef_h = 1.f; + } else { + coef_h = 0.5f; + } + } else { + if (pad_bottom >= 1) { + coef_h = 1.0f / 3; + } else { + coef_h = 0.5f; + } + } break; default: break; @@ -2366,22 +2455,34 @@ void pooling3x3s2p0_avg(const float* din, dr2 -= 8; } // deal with right pad - int rem = win - (w_unroll_size * 4) * S; - int wstart = 0; + int wstart = w_unroll_size * 4 * S - P; for (int j = 0; j < w_unroll_remian; ++j) { - int wend = std::min(wstart + K, rem); - float coef = coef_h / (wend - wstart); + int wend = wstart + K; // std::min(wstart + K, win); + float coef = coef_h / 3.f; + if (wstart + K > win) { + wend = win; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } + } + } + int st = wstart > 0 ? 
wstart : 0; + if (exclusive) { + coef = coef_h / (wend - st); + } float tmp = 0.f; - for (int i = wstart; i < wend; i++) { - tmp += dr0[i]; - tmp += dr1[i]; - tmp += dr2[i]; + for (int i = 0; i < wend - st; i++) { + tmp += dr0[i] + dr1[i] + dr2[i]; } - tmp *= coef; - *(dr_out++) = tmp; + *(dr_out++) = tmp * coef; + dr0 += S - (st - wstart); + dr1 += S - (st - wstart); + dr2 += S - (st - wstart); wstart += S; } - r0 = r2; r1 = r0 + win; r2 = r1 + win; diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h index 701732cb453bfc9f2e970c83c8d713e70a205434..7bbffa8e2f4594da4be589569efc0ef18b8dd0da 100644 --- a/lite/backends/arm/math/pooling.h +++ b/lite/backends/arm/math/pooling.h @@ -72,7 +72,9 @@ void pooling1x1s2p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling2x2s2_max(const float* din, float* dout, @@ -82,7 +84,9 @@ void pooling2x2s2_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling2x2s2_avg(const float* din, float* dout, @@ -93,7 +97,9 @@ void pooling2x2s2_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s1p1_max(const float* din, float* dout, @@ -103,7 +109,9 @@ void pooling3x3s1p1_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p1_avg(const float* din, float* dout, @@ -114,7 +122,9 @@ void pooling3x3s1p1_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p1_max(const float* din, float* dout, @@ -124,7 +134,9 @@ void pooling3x3s2p1_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p0_max(const float* din, float* dout, @@ -134,7 +146,9 @@ void 
pooling3x3s1p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p0_avg(const float* din, float* dout, @@ -145,7 +159,9 @@ void pooling3x3s1p0_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p1_avg(const float* din, float* dout, @@ -156,7 +172,9 @@ void pooling3x3s2p1_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p0_max(const float* din, float* dout, @@ -166,7 +184,9 @@ void pooling3x3s2p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s2p0_avg(const float* din, float* dout, @@ -177,7 +197,9 @@ void pooling3x3s2p0_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc index 7f2169a6456bb04bda228cf62b89a125e4e2bb2f..5aad98c05c56f85931b7a0276d0a85b426573c4c 100644 --- a/lite/backends/arm/math/scale.cc +++ b/lite/backends/arm/math/scale.cc @@ -58,6 +58,43 @@ void scale( } } +template <> +void scale(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t 
vsum4 = vmlaq_s32(vbias, din3, vscale); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + dout_ptr++; + din_ptr++; + } + } +} + template <> void scale(const float* din, float* dout, diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h index a86528c9df18cd6ef807bc116686b766ad905d82..910bea5613997c05e9257507f8f84792e0071a53 100644 --- a/lite/backends/arm/math/scale.h +++ b/lite/backends/arm/math/scale.h @@ -13,14 +13,32 @@ // limitations under the License. #pragma once - +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace arm { namespace math { +template +void scale_compute_basic(const operators::ScaleParam& param) { + const dtype* x_data = param.x->data(); + dtype* output_data = param.output->mutable_data(); + DDim x_dims = param.x->dims(); + DDim output_dims = param.output->dims(); + bool bias_after_scale = param.bias_after_scale; + float scale = param.scale; + float bias = param.bias; + if (!bias_after_scale) { + bias *= scale; + } + for (int i = 0; i < output_dims.production(); i++) { + output_data[i] = static_cast(x_data[i] * scale + bias); + } +} + template -void scale(const T* din, T* dout, int num, float scale, float bias); +void scale(const T* din, T* dout, int num, T scale, T bias); template void scale(const T* din, diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 98404fe60fdb1384d390458e10dac8c967fd2b21..a7d4322326c9413878264400ba8118b510fade10 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -922,7 +922,7 @@ void sgemv_trans(const int M, /* end */ \ "4: \n" /* end */ \ "fmov s1, %w[alpha] \n" /* mov alpha to s1 */ \ - "fcmp s8, #0 \n" /* 
cmp with zero*/ \ + "fcmp s8, #0.0 \n" /* cmp with zero*/ \ "bge 5f \n" /* if ge zero */ \ "fmul s8, s8, s1 \n" /* out * alpha */ \ "5: \n" /* leakey relu label */ \ @@ -983,10 +983,12 @@ void sgemv_trans(const int M, "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ - "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ - "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \ + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \ + "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ + /*"vmla.f32 q0, q4, q6 @ mul add\n" */ \ + /*"vmla.f32 q1, q4, q8 @ mul add\n" */ \ "vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \ "subs %[cnt], #1 @ sub loop count \n" \ diff --git a/lite/backends/arm/math/topk.cc b/lite/backends/arm/math/topk.cc index c9239134e1c3988f5f9c39af6a69fec52fa0904f..83986dc1505098b0a23cdff31297e325fcb109a1 100644 --- a/lite/backends/arm/math/topk.cc +++ b/lite/backends/arm/math/topk.cc @@ -26,7 +26,7 @@ bool comp_func(std::pair a, std::pair b) { void topk(const float* in_data, float* out_val, - int* out_ind, + int64_t* out_ind, int m, int n, int k, @@ -34,7 +34,7 @@ void topk(const float* in_data, for (int i = 0; i < m; i++) { const float* in_tmp = in_data + i * n; float* out_val_tmp = out_val + i * k; - int* out_ind_tmp = out_ind + i * k; + int64_t* out_ind_tmp = out_ind + i * k; std::vector> vec; for (int j = 0; j < n; j++) { vec.push_back(std::make_pair(in_tmp[j], j)); diff --git a/lite/backends/arm/math/topk.h b/lite/backends/arm/math/topk.h index 5bf472e1af497398309689151f0d5354b3a48f27..a6716623228e6df0598410f52de56db58be7a8dc 100644 --- a/lite/backends/arm/math/topk.h +++ b/lite/backends/arm/math/topk.h @@ -22,7 +22,7 @@ namespace math { void topk(const float* din, float* out_val, - int* out_ind, + 
int64_t* out_ind, int m, int n, int k, diff --git a/lite/backends/arm/math/type_trans.cc b/lite/backends/arm/math/type_trans.cc index c50abb741ded487efa03d7d46baf2c6f13a8791d..c7c2da678bf55c45c2a2702ed413cf6bfc135c6a 100644 --- a/lite/backends/arm/math/type_trans.cc +++ b/lite/backends/arm/math/type_trans.cc @@ -40,13 +40,11 @@ void fp32_to_int8(const float* din, int cnt = inner_size / 16; int remain = inner_size & 15; int64_t loop_size = outer_size * axis_size; - #pragma omp parallel for for (int j = 0; j < loop_size; ++j) { float inv_scale = 1.f / scale[j % axis_size]; float32x4_t vzero = vdupq_n_f32(0.f); float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vmax = vdupq_n_f32(-127.f); float32x4_t vpoff = vdupq_n_f32(0.5f); float32x4_t vnoff = vdupq_n_f32(-0.5f); const float* din_c = din + j * inner_size; @@ -56,6 +54,7 @@ void fp32_to_int8(const float* din, const float* din_ptr = din_c; signed char* dout_ptr = dout_c; #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.0); asm volatile( "ldp q0, q1, [%[in]], #32 \n" "ldp q2, q3, [%[in]], #32 \n" @@ -64,16 +63,19 @@ void fp32_to_int8(const float* din, "fmul v5.4s, v1.4s, %[scale].4s \n" "fmul v6.4s, v2.4s, %[scale].4s \n" "fmul v7.4s, v3.4s, %[scale].4s \n" + /* data >= -127 */ "fcmge v8.4s, v4.4s, %[vmax].4s \n" "fcmge v9.4s, v5.4s, %[vmax].4s \n" "fcmge v10.4s, v6.4s, %[vmax].4s \n" "fcmge v11.4s, v7.4s, %[vmax].4s \n" + /* choose data */ "bif v4.16b, %[vmax].16b, v8.16b \n" "bif v5.16b, %[vmax].16b, v9.16b \n" "bif v6.16b, %[vmax].16b, v10.16b \n" "bif v7.16b, %[vmax].16b, v11.16b \n" "ldp q0, q1, [%[in]], #32 \n" "subs %[cnt], %[cnt], #1 \n" + /* fp32 - int32 */ "FCVTAS v8.4s, v4.4s \n" "FCVTAS v9.4s, v5.4s \n" "FCVTAS v10.4s, v6.4s \n" @@ -89,7 +91,9 @@ void fp32_to_int8(const float* din, "bne 0b \n" : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(vscale), [vmax] "w"(vmax) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -102,6 +106,7 @@ void 
fp32_to_int8(const float* din, "v10", "v11"); #else + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" @@ -113,23 +118,27 @@ void fp32_to_int8(const float* din, "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vld1.32 {d0-d1}, [%[vmax]] @ set q0 = -127 \n" "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" - "vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n" - "vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n" - "vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n" - "vbif q4, %q[vmax], q8 @ choose \n" - "vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n" - "vbif q5, %q[vmax], q9 @ choose \n" - "vbif q6, %q[vmax], q10 @ choose \n" - "vbif q7, %q[vmax], q8 @ choose \n" + /* data >= -127 */ + "vcge.f32 q8, q4, q0 @ q4 >= -127 \n" + "vcge.f32 q9, q5, q0 @ q4 >= -127 \n" + "vcge.f32 q10, q6, q0 @ q4 >= -127 \n" + "vcge.f32 q11, q7, q0 @ q4 >= -127 \n" + /* choose data */ + "vbif q4, q0, q8 @ choose \n" + "vbif q5, q0, q9 @ choose \n" + "vbif q6, q0, q10 @ choose \n" + "vbif q7, q0, q11 @ choose \n" + /* fp32 - int32 */ "vcvt.s32.f32 q0, q4 @ cvt to int32\n" "vcvt.s32.f32 q1, q5 @ cvt to int32\n" "vcvt.s32.f32 q2, q6 @ cvt to int32\n" @@ -150,9 +159,22 @@ void fp32_to_int8(const float* din, : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), - [vzero] "w"(vzero), - [vmax] "w"(vmax) - : "q0", "q1", 
"q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10"); + [vmax] "r"(vmax), + [vzero] "w"(vzero) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11"); #endif } const float* din_r = din_c + 16 * cnt; @@ -203,7 +225,7 @@ void fp32_to_int16(const float* din, "bne 0b \n" : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v8", "v9"); + : "cc", "memory", "v0", "v1", "v4", "v5", "v8", "v9"); #else asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" @@ -232,7 +254,7 @@ void fp32_to_int16(const float* din, [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), [vzero] "w"(vzero) - : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); + : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); #endif } const float* din_r = din_c + 8 * cnt; @@ -294,7 +316,9 @@ void int8_to_fp32(const int8_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -335,7 +359,7 @@ void int8_to_fp32(const int8_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif // __aarch64__ } const signed char* din_r = din_c + 16 * cnt; @@ -394,7 +418,18 @@ void int16_to_fp32(const int16_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); + : "cc", + "memory", + "v0", + "v1", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); #else asm volatile( "vld1.32 {d0-d3}, [%[in]]! 
@ load 16 int16\n" @@ -422,7 +457,7 @@ void int16_to_fp32(const int16_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif // __aarch64__ } const int16_t* din_r = din_c + 16 * cnt; @@ -473,7 +508,9 @@ void int32_to_fp32(const int* din, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -506,7 +543,9 @@ void int32_to_fp32(const int* din, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -551,41 +590,53 @@ void int32_to_int8(const int* din, const int* din_ptr = din_c; int8_t* dout_ptr = dout_c; #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.0); asm volatile( "0: \n" "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - + /* int32 - fp32 */ "scvtf v4.4s, v0.4s \n" "scvtf v5.4s, v1.4s \n" "scvtf v6.4s, v2.4s \n" "scvtf v7.4s, v3.4s \n" - + /* mul scale */ "fmul v0.4s, v4.4s, %[scale].4s \n" "fmul v1.4s, v5.4s, %[scale].4s \n" "fmul v2.4s, v6.4s, %[scale].4s \n" "fmul v3.4s, v7.4s, %[scale].4s \n" - + /* data >= -127 */ + "fcmge v4.4s, v0.4s, %[vmax].4s \n" + "fcmge v5.4s, v1.4s, %[vmax].4s \n" + "fcmge v6.4s, v2.4s, %[vmax].4s \n" + "fcmge v7.4s, v3.4s, %[vmax].4s \n" + /* choose data */ + "bif v0.16b, %[vmax].16b, v4.16b \n" + "bif v1.16b, %[vmax].16b, v5.16b \n" + "bif v2.16b, %[vmax].16b, v6.16b \n" + "bif v3.16b, %[vmax].16b, v7.16b \n" + /* fp32 - int32 */ "fcvtas v4.4s, v0.4s \n" "fcvtas v5.4s, v1.4s \n" "fcvtas v6.4s, v2.4s \n" "fcvtas v7.4s, v3.4s \n" - + /* int32 - int16 */ "sqxtn v0.4h, v4.4s \n" "sqxtn2 v0.8h, v5.4s \n" "sqxtn v1.4h, v6.4s \n" "sqxtn2 v1.8h, v7.4s \n" - + /* int16 - int8 */ "sqxtn v2.8b, v0.8h \n" "sqxtn2 v2.16b, v1.8h \n" 
- + /* store */ "st1 {v2.16b}, [%[out]], #16 \n" "subs %[loop], %[loop], #1 \n" "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + : [scale] "w"(vscale), [vmax] "w"(vmax) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); #else + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" @@ -607,9 +658,21 @@ void int32_to_int8(const int* din, "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" + "vld1.32 {d8-d9}, [%[vmax]] @ set q4 = -127 \n" "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" + /* data >= -127 */ + "vcge.f32 q8, q0, q4 @ q0 >= -127 \n" + "vcge.f32 q9, q1, q4 @ q1 >= -127 \n" + "vcge.f32 q10, q2, q4 @ q2 >= -127 \n" + "vcge.f32 q11, q3, q4 @ q3 >= -127 \n" + /* choose data */ + "vbif q0, q4, q8 @ choose \n" + "vbif q1, q4, q9 @ choose \n" + "vbif q2, q4, q10 @ choose \n" + "vbif q3, q4, q11 @ choose \n" + /* fp32 - int32 */ "vcvt.s32.f32 q4, q0 @ cvt to int32\n" "vcvt.s32.f32 q5, q1 @ cvt to int32\n" "vcvt.s32.f32 q6, q2 @ cvt to int32\n" @@ -628,9 +691,12 @@ void int32_to_int8(const int* din, : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) : [vscale] "w"(vscale), [vzero] "w"(vzero), + [vmax] "r"(vmax), [vnoff] "w"(vnoff), [vpoff] "w"(vpoff) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -648,6 +714,7 @@ void int32_to_int8(const int* din, int8_t* dout_r = dout_c + 16 * cnt; for (int i = 0; i < remain; ++i) { dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); + dout_r[i] = dout_r[i] < -127 ? 
-127 : dout_r[i]; } } } @@ -682,7 +749,7 @@ float compute_max_kernel(const float* din, int64_t size) { "bne 0b \n" : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); #else asm volatile( "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" @@ -703,7 +770,7 @@ float compute_max_kernel(const float* din, int64_t size) { : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif float32x2_t vmax_p = vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val)); diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index fafd74ae7a43d1a769456edfe408c71593d21201..d26b1188c0878916986575b72cc978926ba5a1f6 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,7 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h index 5b57ddf0043c59219aded9836cc0b1ad982eec2d..3eeee84c1c46a65782e38b998bcd8142e08cbec1 100644 --- a/lite/backends/cuda/target_wrapper.h +++ b/lite/backends/cuda/target_wrapper.h @@ -39,13 +39,26 @@ class TargetWrapper { static void CreateStream(stream_t* stream) {} static void DestroyStream(const stream_t& stream) {} - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} + static void CreateEvent(event_t* event) { cudaEventCreate(event); } + static void CreateEventWithFlags( + event_t* event, unsigned int flags = 
cudaEventDisableTiming) { + cudaEventCreateWithFlags(event, flags); + } + static void DestroyEvent(const event_t& event) { cudaEventDestroy(event); } static void RecordEvent(const event_t& event) {} + static void RecordEvent(const event_t& event, const stream_t& stream) { + cudaEventRecord(event, stream); + } static void SyncEvent(const event_t& event) {} - static void StreamSync(const stream_t& stream) {} + static void StreamSync(const stream_t& stream) { + cudaStreamSynchronize(stream); + } + static void StreamSync(const stream_t& stream, const event_t& event) { + cudaStreamWaitEvent(stream, event, 0); + } + static void DeviceSync() { cudaDeviceSynchronize(); } static void* Malloc(size_t size); static void Free(void* ptr); diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index 9b1189c407d6d601bb3e5ba8172b1455f04710fd..83b8dff70eb8de7cf1d117585d47118fed539a15 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) { inline void save_float(float* data, const std::string& name, int len) { static int counter = 0; - std::string old_string = std::to_string(counter); + std::string old_string = paddle::lite::to_string(counter); std::string new_string = std::string(3 - old_string.length(), '0') + old_string; diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 988bc1bb507036de8f13a6c6549c549718bd1256..12a60bd27da832b338dc6b1ca11b1c7d6aa192e4 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -351,10 +351,10 @@ class Tensor { void printScale(std::string type) { printScale(); } std::string dimsFileName() { - return std::to_string(shape_->num()) + "_" + - std::to_string(shape_->channel()) + "_" + - std::to_string(shape_->height()) + "_" + - std::to_string(shape_->width()) + ".txt"; + return paddle::lite::to_string(shape_->num()) + "_" + + 
paddle::lite::to_string(shape_->channel()) + "_" + + paddle::lite::to_string(shape_->height()) + "_" + + paddle::lite::to_string(shape_->width()) + ".txt"; } void saveToFile() { std::string path = dimsFileName(); } @@ -374,7 +374,7 @@ class Tensor { invalidate(); std::ofstream ofs; static int counter = 0; - std::string npath = std::to_string(counter) + "_" + path; + std::string npath = paddle::lite::to_string(counter) + "_" + path; counter++; save_file_with_name(npath); } diff --git a/lite/backends/mlu/CMakeLists.txt b/lite/backends/mlu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..29c90b422044be4e6a7aa9f4a8da45018a41f11a --- /dev/null +++ b/lite/backends/mlu/CMakeLists.txt @@ -0,0 +1,7 @@ +if(NOT LITE_WITH_MLU) + return() +endif() + +message (STATUS "Lite with mlu backend") + +lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib) diff --git a/lite/backends/mlu/mlu_utils.h b/lite/backends/mlu/mlu_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..08dd355e8100a48363704168d264f6116ae58a79 --- /dev/null +++ b/lite/backends/mlu/mlu_utils.h @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +/* + * This file contains some MLU specific utils. 
+ */ + +#define CNRT_CALL(msg) \ + CHECK_EQ(static_cast(msg), CNRT_RET_SUCCESS) \ + << (msg) \ + << " MLU CNRT: " << cnrtGetErrorStr(static_cast(msg)) + +#define CNML_CALL(msg) \ + CHECK_EQ(static_cast(msg), CNML_STATUS_SUCCESS) \ + << (msg) << " MLU CNML: " \ + << ::paddle::lite::mlu::CnmlErrorInfo(static_cast(msg)) + +namespace paddle { +namespace lite { +namespace mlu { + +static const char* CnmlErrorInfo(int error) { + switch (error) { +#define LITE_CNML_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_CNML_ERROR_INFO(CNML_STATUS_NODEVICE); + LITE_CNML_ERROR_INFO(CNML_STATUS_SUCCESS); + LITE_CNML_ERROR_INFO(CNML_STATUS_DOMAINERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDARG); + LITE_CNML_ERROR_INFO(CNML_STATUS_LENGTHERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_OUTOFRANGE); + LITE_CNML_ERROR_INFO(CNML_STATUS_RANGEERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_OVERFLOWERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_UNDERFLOWERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDPARAM); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADALLOC); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADTYPEID); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADCAST); + LITE_CNML_ERROR_INFO(CNML_STATUS_UNSUPPORT); +#undef LITE_CNML_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +} // namespace mlu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..2385f69246a163830e0df855082d728da2743e02 --- /dev/null +++ b/lite/backends/mlu/target_wrapper.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/mlu/target_wrapper.h" + +#include + +#include "lite/backends/mlu/mlu_utils.h" + +namespace paddle { +namespace lite { +namespace mlu { + +void cnrtMemcpyHtoD(void* dst, const void* src, size_t size) { + CNRT_CALL(cnrtMemcpy( + dst, const_cast(src), size, CNRT_MEM_TRANS_DIR_HOST2DEV)) + << " cnrt memcpy htod failed"; +} + +void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) { + CNRT_CALL(cnrtMemcpy( + dst, const_cast(src), size, CNRT_MEM_TRANS_DIR_DEV2HOST)) + << " cnrt memcpy dtoh failed"; +} + +} // namespace mlu + +size_t TargetWrapperMlu::num_devices() { + uint32_t dev_count = 0; + CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed"; + LOG(INFO) << "Current MLU device count: " << dev_count; + return dev_count; +} + +void* TargetWrapperMlu::Malloc(size_t size) { + void* ptr{}; + CNRT_CALL(cnrtMalloc(&ptr, size)) << " cnrt malloc failed"; + // LOG(INFO) << "Malloc mlu ptr: " << ptr << " with size: " << size; + return ptr; +} + +void TargetWrapperMlu::Free(void* ptr) { + CNRT_CALL(cnrtFree(ptr)) << " cnrt free failed"; +} + +void TargetWrapperMlu::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + // LOG(INFO) << "dst: " << dst << " src: " << src << " size: " << size + //<< " dir: " << (int)dir; + switch (dir) { + case IoDirection::DtoD: { + std::unique_ptr cpu_tmp_ptr(new char[size]); + mlu::cnrtMemcpyDtoH(cpu_tmp_ptr.get(), src, size); + mlu::cnrtMemcpyHtoD(dst, cpu_tmp_ptr.get(), size); + break; + } + case IoDirection::HtoD: + mlu::cnrtMemcpyHtoD(dst, src, 
size); + break; + case IoDirection::DtoH: + mlu::cnrtMemcpyDtoH(dst, src, size); + break; + default: + LOG(FATAL) << "Unsupported IoDirection" << static_cast(dir); + } +} + +// void TargetWrapperMlu::MemcpyAsync(void* dst, +// const void* src, +// size_t size, +// IoDirection dir, +// const stream_t& stream) { +// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync."; +// MemcpySync(dst, src, size, dir); +// } + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..2d9e10806f78e56f50b04d408dab219c923456fc --- /dev/null +++ b/lite/backends/mlu/target_wrapper.h @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/backends/mlu/mlu_utils.h" +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperMlu = TargetWrapper; + +template <> +class TargetWrapper { + public: + using queue_t = cnrtQueue_t; + + static size_t num_devices(); + static size_t maxinum_queue() { return 0; } // TODO(zhangshijin): fix out it. 
+ + static size_t GetCurDevice() { return 0; } + + static void CreateQueue(queue_t* queue) {} + static void DestroyQueue(const queue_t& queue) {} + + static void QueueSync(const queue_t& queue) {} + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); + // static void MemcpyAsync(void* dst, + // const void* src, + // size_t size, + // IoDirection dir, + // const queue_t& queue); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index d62ac9cad3e5ab4e6f63e3b667e3fa93e244fec1..345b239c320f04eba8426483a23a352e77a71036 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -19,8 +19,8 @@ namespace paddle { namespace lite { namespace npu { -std::unique_ptr Device::Build( - std::string& model_name, // NOLINT +std::shared_ptr Device::Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ) { @@ -41,15 +41,15 @@ std::unique_ptr Device::Build( ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } + // Create a HiAI model manager client to load the HiAI om model - std::unique_ptr model_client( + std::shared_ptr model_client( new hiai::AiModelMngerClient()); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - model_name = "model_" + std::to_string(model_count_++) + ".om"; auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index 411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a..6733a7f6dfa085d2c64274a81ba2a028ebe88f3f 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -40,8 +40,8 @@ 
class Device { // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. - std::unique_ptr Build( - std::string& model_name, // NOLINT + std::shared_ptr Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ); // NOLINT @@ -51,7 +51,6 @@ class Device { int framework_type_{0}; int model_type_{0}; int device_type_{0}; - int model_count_{0}; }; } // namespace npu diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt index dd7f6b417e0d6416eec9bb3e60ef088432776112..0ac8cf310370f34ae5743113efe1d71579979daf 100644 --- a/lite/backends/opencl/CMakeLists.txt +++ b/lite/backends/opencl/CMakeLists.txt @@ -2,17 +2,16 @@ if (NOT LITE_WITH_OPENCL) return() endif() +lite_cc_library(opencl_kernels_source_cc SRCS opencl_kernels_source.cc) lite_cc_library(cl_wrapper SRCS cl_wrapper.cc) lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper) -lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility) +lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility opencl_kernels_source_cc) lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime) -lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor) +lite_cc_library(cl_half SRCS cl_half.cc) +lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor cl_half) lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) -lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) 
+lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) add_dependencies(cl_wrapper opencl_clhpp) diff --git a/lite/backends/opencl/cl_caller.cc b/lite/backends/opencl/cl_caller.cc index 6b9cab1056beaa6f516a0d3a202a7816c911f1b2..8421c784d5da224eacaaa9461b737eed1b4bdd4e 100644 --- a/lite/backends/opencl/cl_caller.cc +++ b/lite/backends/opencl/cl_caller.cc @@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context, int width = cl_image.image_dims()[0]; int height = cl_image.image_dims()[1]; - float* image_data = new float[height * width * 4]; + uint16_t* image_data = new uint16_t[height * width * 4]; cl::Image* image = cl_image.cl_image(); cl::array origin = {0, 0, 0}; cl::array region = { @@ -46,9 +46,8 @@ static void CopyImageData(CLContext* context, delete[] image_data; } -bool InitOpenCLRuntime(std::string cl_path) { +bool InitOpenCLRuntime() { auto* runtime = CLRuntime::Global(); - runtime->set_cl_path(cl_path); return runtime->IsInitSuccess(); } diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h index 1817db9f6bd6d9ecf21978b8293bd9534328de0f..d1f1429e44f8872852797dadcbf2f82c1c9c0269 100644 --- a/lite/backends/opencl/cl_caller.h +++ b/lite/backends/opencl/cl_caller.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace lite { -bool InitOpenCLRuntime(std::string cl_path); +bool InitOpenCLRuntime(); } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 0fcb99486eac57e36ee548b809f8f141e0807db8..f0105e060f03df3e4d49c358cf314730cdd16393 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -41,8 +41,7 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, return *(it->second); } - auto program = CLRuntime::Global()->CreateProgram( - GetContext(), CLRuntime::Global()->cl_path() + "/cl_kernel/" + file_name); + auto program = CLRuntime::Global()->CreateProgram(GetContext(), file_name); VLOG(3) << " --- begin build program -> " << program_key << " --- "; CLRuntime::Global()->BuildProgram(program.get(), options); @@ -122,5 +121,34 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } +cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, + size_t max_work_size) { + int preferred_lws = 0; + int divisor = 2; + + auto tmp0 = global_work_size[0]; + auto tmp1 = global_work_size[1]; + auto tmp2 = global_work_size[2]; + + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (tmp1 > max_work_size && max_work_size > 0) { + tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + } + while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { + tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + } + while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { + tmp0 = tmp0 % 2 == 0 ? 
tmp0 / 2 : 1; + } + return cl::NDRange{static_cast(tmp0), + static_cast(tmp1), + static_cast(tmp2)}; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index a28f82f40ecd70a38fcd179e3c7dedfb02a6bcd1..1964c4bf56b55841ba735c79b2f7a17dc1ed451e 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -44,6 +44,8 @@ class CLContext { cl::NDRange DefaultWorkSize(const CLImage &image); + cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); + private: std::unordered_map> programs_; std::vector> kernels_; diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc index 70f47b47946641edf4d023437b48d46cae93ca6e..ba32d8c803bfd832289a936fe9150ba8d14cd984 100644 --- a/lite/backends/opencl/cl_functions_test.cc +++ b/lite/backends/opencl/cl_functions_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include @@ -26,22 +25,18 @@ limitations under the License. 
*/ #include "lite/core/tensor.h" #include "lite/utils/cp_logging.h" -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); - namespace paddle { namespace lite { TEST(cl_test, runtime_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); runtime->platform(); runtime->device(); runtime->command_queue(); auto &context = runtime->context(); - auto program = runtime->CreateProgram( - context, - runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl"); + auto program = + runtime->CreateProgram(context, "buffer/elementwise_add_kernel.cl"); auto event = runtime->CreateEvent(context); const std::string build_option("-DCL_DTYPE_float"); CHECK(runtime->BuildProgram(program.get(), build_option)); @@ -50,7 +45,6 @@ TEST(cl_test, runtime_test) { TEST(cl_test, context_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); CLContext context; context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float"); context.AddKernel( @@ -62,7 +56,6 @@ TEST(cl_test, context_test) { TEST(cl_test, kernel_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); std::unique_ptr context(new CLContext); context->AddKernel( "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); @@ -121,7 +114,7 @@ TEST(cl_test, kernel_test) { } TEST(cl_test, target_wrapper_buffer_test) { - bool inited = InitOpenCLRuntime(FLAGS_cl_path); + bool inited = InitOpenCLRuntime(); CHECK(inited) << "Fail to initialize OpenCL runtime."; std::unique_ptr context(new CLContext); std::string kernel_name = "elementwise_add"; diff --git a/lite/backends/opencl/cl_half.cc b/lite/backends/opencl/cl_half.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f27cae549c30eb7295a7c9490d9fb106883dda7 --- /dev/null +++ b/lite/backends/opencl/cl_half.cc @@ -0,0 +1,518 @@ 
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/opencl/cl_half.h" + +namespace paddle { +namespace lite { + +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + +static const uint32_t mantissatable[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, + 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, + 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, + 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, + 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, + 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, + 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, + 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, + 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, + 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, + 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, + 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, + 0x36c00000, 
0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, + 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, + 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, + 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, + 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, + 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, + 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, + 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, + 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, + 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, + 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, + 0x37810000, 
0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, + 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, + 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, + 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, + 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, + 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, + 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, + 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, + 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, + 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, + 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, + 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, + 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, + 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, + 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, + 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, + 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, + 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, + 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, + 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, + 0x37d20000, 
0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, + 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, + 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, + 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, + 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, + 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, + 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, + 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, + 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, + 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, + 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, + 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, + 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, + 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, + 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, + 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, + 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, + 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, + 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, + 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, + 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, + 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, + 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, + 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, + 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, + 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, + 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, + 0x38118000, 
0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, + 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, + 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, + 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, + 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, + 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, + 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, + 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, + 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, + 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, + 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, + 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, + 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, + 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, + 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, + 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, + 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, + 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, + 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, + 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, + 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, + 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, + 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, + 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, + 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, + 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, + 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, + 0x383a0000, 
0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, + 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, + 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, + 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, + 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, + 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, + 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, + 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, + 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, + 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, + 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, + 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, + 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, + 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, + 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, + 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, + 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, + 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, + 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, + 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, + 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, + 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, + 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, + 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, + 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, + 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, + 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, + 0x38628000, 
0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, + 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, + 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, + 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, + 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, + 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, + 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, + 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, + 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, + 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, + 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, + 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, + 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, + 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, + 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, + 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, + 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, + 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, + 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, + 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, + 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, + 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 
0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, + 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, + 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, + 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, + 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, + 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, + 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, + 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, + 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, + 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, + 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, + 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, + 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, + 0x3819c000, 
0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, + 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, + 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, + 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, + 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, + 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, + 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, + 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, + 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, + 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, + 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, + 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, + 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, + 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, + 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, + 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, + 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, + 0x382e0000, 
0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, + 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, + 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, + 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, + 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, + 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, + 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, + 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, + 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, + 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, + 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, + 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, + 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, + 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, + 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, + 0x38424000, 
0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, + 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, + 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, + 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, + 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, + 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, + 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, + 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, + 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, + 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, + 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, + 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, + 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, + 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, + 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 
0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, + 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, + 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, + 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, + 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, + 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, + 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, + 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, + 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, + 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, + 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, + 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, + 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, + 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, + 0x386ac000, 
0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, + 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, + 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, + 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, + 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, + 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, + 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, + 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, + 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, + 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, + 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, + 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, + 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, + 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, + 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, + 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, + 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, + 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, + 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, + 0x387f0000, 
0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, + 0x387fc000, 0x387fe000}; + +static const uint16_t offsettable[64] = { + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; + +static const uint32_t exponenttable[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, + 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, + 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, + 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, + 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, + 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, + 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, + 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; + +static const uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, + 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, + 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, + 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, + 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, + 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static const uint8_t shifttable[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; + +half_t Float2Half(float f) { + uint32_t v = *reinterpret_cast(&f); + return basetable[(v >> 23) & 0x1ff] + + ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); +} + +float Half2Float(half_t h) { + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + + exponenttable[h >> 10]; + return *reinterpret_cast(&v); +} + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { + for (int i = 0; i < count; ++i) { + h_array[i] = Float2Half(f_array[i]); + } +} + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { + for (int i = 0; i < count; ++i) { + f_array[i] = Half2Float(h_array[i]); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl b/lite/backends/opencl/cl_half.h similarity index 52% rename from lite/backends/opencl/cl_kernel/image/relu_kernel.cl rename to lite/backends/opencl/cl_half.h index 43a27067c2f2c418d314f9bce95bccbbb51a9be0..0dcf325db2bc13b8fff68f1e777d4680d937abce 100644 --- 
a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl +++ b/lite/backends/opencl/cl_half.h @@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#pragma once +#include -__kernel void relu(__read_only image2d_t input, - __write_only image2d_t output) { +namespace paddle { +namespace lite { - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height +typedef uint16_t half_t; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; +half_t Float2Half(float f); - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = max((CL_DTYPE4)(0.0f), in); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); -} +float Half2Float(half_t h); + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_image.cc b/lite/backends/opencl/cl_image.cc index b67f4040bff4cac15624c1440ca741d2b9dfa6ba..1e21b3d03a4a231f4bb171e83f4038e7922fe19a 100644 --- a/lite/backends/opencl/cl_image.cc +++ b/lite/backends/opencl/cl_image.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "lite/backends/opencl/cl_image.h" +#include +#include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_utility.h" #include "lite/utils/cp_logging.h" @@ -24,7 +26,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { int width = cl_image.image_dims_[0]; int height = cl_image.image_dims_[1]; - float* image_data = new float[height * width * 4]; + uint16_t* image_data = new uint16_t[height * width * 4]; cl::Image* image = cl_image.cl_image(); cl::array origin = {0, 0, 0}; @@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { int stride = cl_image.numel() / 20; stride = stride > 0 ? stride : 1; - os << " dims: " << cl_image.tensor_dims_ << "\n"; + os << " dims: "; // << cl_image.tensor_dims_ << "\n"; for (int i = 0; i < cl_image.numel(); i += stride) { os << tensor_data[i] << " "; } @@ -123,7 +125,7 @@ void CLImage::InitCLImage(const cl::Context& context, VLOG(3) << " begin init cl image "; image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); - float* image_data = new float[image_dims_.production() * 4]; + uint16_t* image_data = new uint16_t[image_dims_.production() * 4]; VLOG(3) << " convert to image "; converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc index 402f710d7a226de089134b4abc41dc41027e0da1..7e6f83a4d12f82c780b8e2a8ba582d6a13d8dc07 100644 --- a/lite/backends/opencl/cl_image_converter.cc +++ b/lite/backends/opencl/cl_image_converter.cc @@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterDefault::NCHWToImage(float *nchw, - float *image, + half_t *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, if (c < C) { 
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = *p; + image[i2] = Float2Half(*p); i2 += 4; p++; } else { @@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, } } -void CLImageConverterDefault::ImageToNCHW(float *image, +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = image[i2]; + *p = Half2Float(image[i2]); i2 += 4; p++; } @@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterFolder::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, for (size_t h = 0; h < tdim[0]; h++) { for (size_t w = 0; w < tdim[1]; w++) { - image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * tdim[1] + w]); } } } } -void CLImageConverterFolder::ImageToNCHW(float *image, +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { for (size_t w = 0; w < W; w++) { - p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); } } } @@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterNWBlock::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; auto 
image_dim = InitImageDimInfoWith(tensor_dim); @@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; if (n < N) { - image[index] = *p; + image[index] = Float2Half(*p); p++; } else { image[index] = 0.0; @@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, VLOG(3) << " init done"; } -void CLImageConverterNWBlock::ImageToNCHW(float *image, +void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image, for (size_t w = 0; w < W; ++w) { size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; - *p = image[index]; + *p = Half2Float(image[index]); p++; if (index >= (width * height * 4)) { LOG(INFO) << " index out of range "; @@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterDWBlock::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, if (c < C) { // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = *p; + image[i2] = Float2Half(*p); i2 += 4; p++; } else { @@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, } } -void CLImageConverterDWBlock::ImageToNCHW(float *image, +void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = image[i2]; + *p = Half2Float(image[i2]); i2 += 4; p++; } @@ -418,7 +419,7 @@ DDim 
CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterNormal::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, default_converter.NCHWToImage(tensor, image, tensor_dim); } -void CLImageConverterNormal::ImageToNCHW(float *image, +void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( } void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) {} -void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, +void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) {} diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h index 962eb8d3ef35bdb603aa4a56181b1124885d5506..bb8602f6adae377f21c8fe92448e8feae64a773f 100644 --- a/lite/backends/opencl/cl_image_converter.h +++ b/lite/backends/opencl/cl_image_converter.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "lite/backends/opencl/cl_half.h" #include "lite/core/tensor.h" namespace paddle { @@ -24,10 +25,10 @@ class CLImageConverterBase { virtual ~CLImageConverterBase() {} virtual void NCHWToImage(float *nchw, - float *image, + half_t *image, const DDim &tensor_dim) = 0; - virtual void ImageToNCHW(float *image, + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, const DDim &tensor_dim) = 0; @@ -37,8 +38,8 @@ class CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim) override; + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t 
*image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index b8dbf62c06f85ef6237378d8ceab37f8fa2cd69f..a14748c69f3eafce515c90f2b8a226703fe5883d 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -91,11 +91,7 @@ void gemm_batch_naive(__global const CL_DTYPE* a, c0 += a0 * b0; } -#ifdef RELU cur_c[row * N + col] = activation(c0); -#else - cur_c[row * N + col] = c0; -#endif } @@ -103,7 +99,7 @@ void gemm_batch_naive(__global const CL_DTYPE* a, // a: filter_d // b: x_d // c: output_d - +#if 0 // TODO(ysh239): cause CL_OUT_OF_HOST_MEMORY on some devices(such as snapdragon 855) //#define PRINT_KERNEL __kernel void gemm_batch(__global const CL_DTYPE* Aptr, @@ -213,7 +209,7 @@ void gemm_batch(__global const CL_DTYPE* Aptr, } } } - +#endif // fc_gemv_naive: keep for check // used for fc with M = 1 @@ -259,7 +255,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, const int col = get_global_id(0) << 2; // gws[0]: [0, N >> 2) height of B == N if (col + 3 < N) { - CL_DTYPE4 c0 = 0.0f; + half4 c0 = 
0.0f; if (bias) { c0.x = bias[col]; c0.y = bias[col+1]; @@ -270,11 +266,12 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, // main loop of K int p = 0; for (; p < K - 3; p += 4) { - CL_DTYPE4 a0 = vload4(0, a + p); - CL_DTYPE4 b0 = vload4(0, b + p * N + col); - CL_DTYPE4 b1 = vload4(0, b + (p+1) * N + col); - CL_DTYPE4 b2 = vload4(0, b + (p+2) * N + col); - CL_DTYPE4 b3 = vload4(0, b + (p+3) * N + col); + half4 a0 = convert_half4(vload4(0, a + p)); + + half4 b0 = convert_half4(vload4(0, b + p * N + col)); + half4 b1 = convert_half4(vload4(0, b + (p+1) * N + col)); + half4 b2 = convert_half4(vload4(0, b + (p+2) * N + col)); + half4 b3 = convert_half4(vload4(0, b + (p+3) * N + col)); c0 += a0.x * b0; c0 += a0.y * b1; @@ -283,21 +280,21 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } // compute left K - CL_DTYPE4 b2 = 0.0f, - b1 = 0.0f, - b0 = 0.0f, - a0 = 0.0f; + half4 b2 = 0.0f, + b1 = 0.0f, + b0 = 0.0f, + a0 = 0.0f; switch (K - p) { case 3: { - b2 = vload4(0, b + (p+2) * N + col); + b2 = convert_half4(vload4(0, b + (p+2) * N + col)); a0.z = a[p + 2]; } case 2: { - b1 = vload4(0, b + (p+1) * N + col); + b1 = convert_half4(vload4(0, b + (p+1) * N + col)); a0.y = a[p + 1]; } case 1: { - b0 = vload4(0, b + (p) * N + col); + b0 = convert_half4(vload4(0, b + (p) * N + col)); a0.x = a[p]; } } @@ -308,7 +305,8 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, // store res #ifdef RELU if (col % 4 == 0) { - vstore4(fmax(c0, (CL_DTYPE4)0.f), 0, c + col); + float4 act_res = convert_float4(fmax(c0, (half4)0.f)); + vstore4(act_res, 0, c + col); } else { switch (col % 4) { case 3: @@ -321,7 +319,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } #else if (col % 4 == 0) { - vstore4(c0, 0, c + col); + vstore4(convert_float4(c0), 0, c + col); } else { switch (col % 4) { case 3: @@ -336,10 +334,10 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } else { const int left_col = N - col; for (int col_offset = 0; col_offset < left_col; ++col_offset) { - CL_DTYPE c0 = bias ? 
bias[col] : 0; + half c0 = bias ? bias[col] : 0; for (int p = 0; p < K; ++p) { - CL_DTYPE b0 = *(b + p * N + col + col_offset); - CL_DTYPE a0 = *(a + p); + half b0 = *(b + p * N + col + col_offset); + half a0 = *(a + p); c0 += a0 * b0; } #ifdef RELU @@ -366,18 +364,18 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, const int col = get_global_id(1) << 2; // id: [0, N>>2) width of out == N if (row+3 < M && col+3 < N) { - CL_DTYPE bias0 = bias ? bias[col] : 0, - bias1 = bias ? bias[col+1] : 0, - bias2 = bias ? bias[col+2] : 0, - bias3 = bias ? bias[col+3] : 0; + CL_COMPUTE_DTYPE bias0 = bias ? bias[col] : 0, + bias1 = bias ? bias[col+1] : 0, + bias2 = bias ? bias[col+2] : 0, + bias3 = bias ? bias[col+3] : 0; - CL_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, - c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, - c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, - c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; + CL_COMPUTE_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, + c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, + c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, + c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; for (int p = 0; p < K; ++p) { - CL_DTYPE + CL_COMPUTE_DTYPE a00 = *(a + row * K + p), a10 = *(a + (row + 1) * K + p), a20 = *(a + (row + 2) * K + p), @@ -407,7 +405,7 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0, b0, c0 = bias ? 
bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl index fe71f4c6ff8856ca679f2e6b29fc20a0d64da9ac..8d3456fa66973b04eaf24a04a42615790a133ddb 100644 --- a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl @@ -15,6 +15,8 @@ limitations under the License. */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define CL_DTYPE float +#include + __kernel void im2col(__global const CL_DTYPE* data_im, const int img_offset, const int col_chw, diff --git a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl deleted file mode 100644 index 532f947dd342b1ee4db69a084111a97ec014237f..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -// buffer -> image2d -__kernel void buffer_to_image2d(__global CL_DTYPE *in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_C, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c0 = out_c * 4 + 0; - const int in_c1 = out_c * 4 + 1; - const int in_c2 = out_c * 4 + 2; - const int in_c3 = out_c * 4 + 3; - const int in_h = out_h; - const int in_w = out_w; - - int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - CL_DTYPE4 output = (CL_DTYPE4)0.0f; - output.x = convert_float(in[input_pos0]); - if(out_C - 4 * out_c >= 2){ - output.y = convert_float(in[input_pos1]); - } - if(out_C - 4 * out_c >= 3){ - output.z = convert_float(in[input_pos2]); - } - if(out_C - 4 * out_c >= 4){ - output.w = convert_float(in[input_pos3]); - } - write_imagef(output_image, output_pos, output); -} - -// buffer -> image2d_nw -__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_N, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2) { - const int out_n = get_global_id(0); - const int out_w = get_global_id(1); - const int out_ch = get_global_id(2); - - const int out_c = out_ch / out_H; - const int out_h = out_ch % out_H; - - const 
int in_c = out_c; // index of c in h direction - - const int in_n0 = out_n * 4 + 0; - const int in_n1 = out_n * 4 + 1; - const int in_n2 = out_n * 4 + 2; - const int in_n3 = out_n * 4 + 3; - - const int in_h = out_h; - const int in_w = out_w; - - int input_pos0 = in_n0 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_n * out_W + out_w; - output_pos.y = out_ch; - - CL_DTYPE4 output = (CL_DTYPE4)0.0f; - output.x = convert_float(in[input_pos0]); - if (out_N - 4 * out_n >= 2) { - output.y = convert_float(in[input_pos1]); - } - if (out_N - 4 * out_n >= 3) { - output.z = convert_float(in[input_pos2]); - } - if (out_N - 4 * out_n >= 4) { - output.w = convert_float(in[input_pos3]); - } - write_imagef(output_image, output_pos, output); -} - - - -// image2d -> buffer -__kernel void image2d_to_buffer(__read_only image2d_t input, - __private const int in_width, - __private const int in_height, - __global CL_DTYPE* out, - __private const int size_ch, - __private const int size_block, - __private const int size_batch, - __private const int C) { - const int in_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = get_global_id(2); - const int in_n = in_nh / in_height; - const int in_h = in_nh % in_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const int pos_x = mad24(in_c, in_width, in_w); - CL_DTYPE4 in = read_imagef(input, sampler, (int2)(pos_x, in_nh)); - - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; - out[index] = convert_float(in.x); - if (C - 4 * in_c >= 2) { - out[index + size_ch] = convert_float(in.y); - } - if(C - 4 * in_c >= 3) { - out[index + size_ch * 2] = 
convert_float(in.z); - } - if(C - 4 * in_c >= 4) { - out[index + size_ch * 3] = convert_float(in.w); - } -} - -// image2d -> buffer -__kernel void image2d_to_buffer_2d(__private const int in_height, - __private const int in_width, - __read_only image2d_t input, - __global CL_DTYPE* out) { - const int in_w = get_global_id(1); - const int in_h = get_global_id(2); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - CL_DTYPE4 in = read_imagef(input, sampler, (int2)(in_w, in_h)); - - const int index = (in_h * in_width + in_w) * 4; - out[index] = convert_float(in.x); - out[index + 1] = convert_float(in.y); - out[index + 2] = convert_float(in.z); - out[index + 3] = convert_float(in.w); -} diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index c127c6cec79cb2eb8d82ce6aa6190b23d373ff64..582e6a08b16ea7b5b8edd5850b1c9af04db56aad 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -29,11 +29,15 @@ limitations under the License. */ #ifdef CL_DTYPE_float #define CL_DTYPE float #define CL_DTYPE_CHAR f +#define CL_COMPUTE_DTYPE half +#define CL_COMPUTE_DTYPE_CHAR h #endif #ifdef CL_DTYPE_half #define CL_DTYPE half #define CL_DTYPE_CHAR h +#define CL_COMPUTE_DTYPE half +#define CL_COMPUTE_DTYPE_CHAR h #endif ///////////////////////////////// @@ -43,6 +47,7 @@ limitations under the License. 
*/ #define GET_VEC_TYPE(type__, size__) type__##size__ #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) +#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4) ///////////////////////////////// // CONVERT_TYPE_TO diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..cb29860dc7556bdaea3c09589a8c6120c5ef2a1a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + + +__kernel void relu(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = max((CL_DTYPE4)(0.0f), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + + +__kernel void relu6(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale){ + + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); + in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + + +__kernel void sigmoid(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = 1 / (1 + exp(-in)); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void leaky_relu(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + 
CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; + if (in.x < 0.0f){ + in.x = s_val.x; + } + if (in.y < 0.0f){ + in.y = s_val.y; + } + if (in.z < 0.0f){ + in.z = s_val.z; + } + if (in.w < 0.0f){ + in.w = s_val.w; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + +__kernel void tanh_act(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void exp_act(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = exp(in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void swish(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); + 
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + diff --git a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..9427692f1267d363222295b33b6834e28517d0a4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
#include <cl_common.h>

// Bilinear interpolation (resize) over an image2d tensor laid out as
// x = (channel_block * in_w + column), y = (batch * in_h + row).
// One work-item produces one output pixel (a CL_DTYPE4 = 4 channels).
//
// scale_h / scale_w : in_dim / out_dim ratios precomputed on the host.
// align_delta       : 0.5f for the "align to pixel centers" mapping,
//                     0.0f otherwise (set by the host-side kernel).
__kernel void bilinear_interp(__read_only image2d_t input,
                              __write_only image2d_t output,
                              __private const float scale_h,
                              __private const float scale_w,
                              __private const float align_delta,
                              __private const int in_dims_h,
                              __private const int in_dims_w,
                              __private const int out_dims_h,
                              __private const int out_dims_w) {
  const int c = get_global_id(0);   // channel block index (4 channels/block)
  const int w = get_global_id(1);   // output column
  const int nh = get_global_id(2);  // fused batch*height row

  int2 output_pos;
  output_pos.x = c * out_dims_w + w;
  output_pos.y = nh;

  // Map the output pixel back to a (possibly fractional) source position.
  int out_n = nh / out_dims_h;
  int out_h = nh % out_dims_h;
  float center_w = (w + align_delta) * scale_w - align_delta;
  float center_h = (out_h + align_delta) * scale_h - align_delta;

  // Truncation toward zero; for center >= 0 this is floor().
  int floor_w = (int)center_w;
  int floor_h = (int)center_h;
  int ceil_w = floor_w + 1;
  int ceil_h = floor_h + 1;
  // Clamp the 2x2 neighbourhood into the valid input range.
  if (floor_w < 0) {
    floor_w = 0;
  }
  if (floor_h < 0) {
    floor_h = 0;
  }
  if (ceil_w > in_dims_w - 1) {
    ceil_w = in_dims_w - 1;
  }
  if (ceil_h > in_dims_h - 1) {
    ceil_h = in_dims_h - 1;
  }
  // Interpolation weights, computed from the clamped floor coordinates.
  float weight0_w = center_w - floor_w;
  float weight0_h = center_h - floor_h;
  float weight1_w = 1.0f - weight0_w;
  float weight1_h = 1.0f - weight0_h;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // Fetch the four neighbouring texels ("up" = larger row index here).
  int2 left_up;
  left_up.x = c * in_dims_w + floor_w;
  left_up.y = out_n * in_dims_h + ceil_h;
  CL_DTYPE4 left_up_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_up);

  int2 left_down;
  left_down.x = c * in_dims_w + floor_w;
  left_down.y = out_n * in_dims_h + floor_h;
  CL_DTYPE4 left_down_data =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_down);

  int2 right_up;
  right_up.x = c * in_dims_w + ceil_w;
  right_up.y = out_n * in_dims_h + ceil_h;
  CL_DTYPE4 right_up_data =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_up);

  int2 right_down;
  right_down.x = c * in_dims_w + ceil_w;
  right_down.y = out_n * in_dims_h + floor_h;
  CL_DTYPE4 right_down_data =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_down);

  // Blend along w first, then along h.
  CL_DTYPE4 out =
      (left_down_data * weight1_w + right_down_data * weight0_w) * weight1_h +
      (left_up_data * weight1_w + right_up_data * weight0_w) * weight0_h;

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out);
}
#include <cl_common.h>

// Concatenate exactly two image2d tensors into `output`.
// Layout: x = (channel_block * out_W + column), y = (batch*H + row).
//   flag == 1: concat along channel, C_0 = channel count of input0
//   flag == 2: concat along height ("width == n" per original note)
//   flag == 3: concat along width  ("width == C" per original note)
__kernel void concat2(__read_only image2d_t input0,
                      __read_only image2d_t input1,
                      __write_only image2d_t output,
                      int flag, int C_0, int out_C, int out_W, int width) {
  const int out_w = get_global_id(0);   // output column
  const int out_c = get_global_id(1);   // output channel block (4 per block)
  const int out_nh = get_global_id(2);  // fused batch*height row
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  if (flag == 1) {  // by channel
    int2 output_pos;
    output_pos.x = out_c * out_W + out_w;
    output_pos.y = out_nh;
    CL_DTYPE4 output_data;
    // Gather the 4 real channels of this output block one lane at a time,
    // since a block boundary may straddle the input0/input1 split.
    for (int i = 0; i < 4; i++) {
      int c = out_c * 4 + i;  // un-blocked output channel index
      if (c >= out_C) {
        break;
      }
      int c_in;
      CL_DTYPE4 input_data;
      if (c < C_0) {
        c_in = c;
        int2 input_pos;
        input_pos.x = (c_in / 4) * out_W + out_w;
        input_pos.y = out_nh;
        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
      } else {
        c_in = c - C_0;
        int2 input_pos;
        input_pos.x = (c_in / 4) * out_W + out_w;
        input_pos.y = out_nh;
        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
      }
      // Select source lane c_in % 4 ...
      int value_offset = c_in % 4;
      CL_DTYPE value;
      if (value_offset == 0) {
        value = input_data.x;
      } else if (value_offset == 1) {
        value = input_data.y;
      } else if (value_offset == 2) {
        value = input_data.z;
      } else if (value_offset == 3) {
        value = input_data.w;
      }
      // ... and store it into destination lane i.
      if (i == 0) {
        output_data.x = value;
      } else if (i == 1) {
        output_data.y = value;
      } else if (i == 2) {
        output_data.z = value;
      } else if (i == 3) {
        output_data.w = value;
      }
    }
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
  } else if (flag == 2) {  // by height, width == n
    int2 input_pos;
    input_pos.x = out_c * out_W + out_w;
    int h = out_nh / width;
    CL_DTYPE4 input;
    if (h < C_0) {
      input_pos.y = out_nh;
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
    } else {
      // NOTE(review): this drops out_nh % width when addressing input1 —
      // looks intentional only if `width` is the per-image row count of
      // input1; verify against the host-side global-work-size setup.
      input_pos.y = (h - C_0) * width;
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
    }
    int2 output_pos;
    output_pos.x = out_c * out_W + out_w;
    output_pos.y = out_nh;
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
  } else if (flag == 3) {  // by width, width == C
    int2 input_pos;
    input_pos.y = out_nh;
    CL_DTYPE4 input;
    if (out_w < C_0) {
      input_pos.x = out_c * out_W + out_w;
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
    } else {
      // NOTE(review): input1 is addressed with the OUTPUT row stride out_W;
      // presumably input1's image width equals out_W here — confirm.
      input_pos.x = out_c * out_W + (out_w - C_0);
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
    }
    int2 output_pos;
    output_pos.x = out_c * out_W + out_w;
    output_pos.y = out_nh;
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
  }
}

// Copy one input tensor (of a multi-input concat) into its slice of
// `output`; the host launches this once per input with its offset C_0.
// Work-items are indexed over the INPUT (in_w, in_c, in_nh).
//   flag == 1: concat along channel, C_0 = channel offset of this input
//   flag == 2: concat along height
//   flag == 3: concat along width
__kernel void concat_mul(__read_only image2d_t input,
                         __write_only image2d_t output,
                         int flag, int C_0, int out_C, int out_W, int in_W,
                         int width) {
  const int in_w = get_global_id(0);   // input column
  const int in_c = get_global_id(1);   // input channel block
  const int in_nh = get_global_id(2);  // fused batch*height row
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  int2 input_pos;
  input_pos.x = in_c * in_W + in_w;
  input_pos.y = in_nh;
  CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
  if (flag == 1) {  // by channel
    // Scatter each valid lane of the input pixel to its (possibly shifted)
    // output channel block.
    // NOTE(review): when C_0 % 4 != 0 each write carries lanes of
    // output_data that were not set this iteration — safe only if the
    // host guarantees 4-aligned channel offsets; verify.
    CL_DTYPE4 output_data;
    for (int i = 0; i < 4; i++) {
      int c_out = C_0 + in_c * 4 + i;
      if (c_out >= out_C) {
        break;
      }
      int2 output_pos;
      output_pos.x = (c_out / 4) * in_W + in_w;
      output_pos.y = in_nh;
      CL_DTYPE val;
      if (i == 0) {
        val = input_data.x;
      } else if (i == 1) {
        val = input_data.y;
      } else if (i == 2) {
        val = input_data.z;
      } else if (i == 3) {
        val = input_data.w;
      }
      if (c_out % 4 == 0) {
        output_data.x = val;
      } else if (c_out % 4 == 1) {
        output_data.y = val;
      } else if (c_out % 4 == 2) {
        output_data.z = val;
      } else if (c_out % 4 == 3) {
        output_data.w = val;
      }
      WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
    }
  } else if (flag == 2) {  // by height, width == n
    int2 output_pos;
    output_pos.x = in_c * in_W + in_w;
    output_pos.y = in_nh + C_0 * width;  // shift rows by this input's offset
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
  } else if (flag == 3) {  // by width, width == C
    int2 output_pos;
    output_pos.y = in_nh;
    output_pos.x = in_c * out_W + (in_w + C_0);  // shift columns by offset
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
  }
}
37e03e802c56d3de9ba08e97c9dfb62f8cd76e9a..4b2d5ba32072e7eb31adbf347360e0bbcee7bc5b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,21 +1,21 @@ #include -__kernel void conv2d_1x1(__private const int global_size_dim0, +__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, +__read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, __private const int offset, - __private const int input_c, + __private const int input_c_block, __private const int input_c_origin, __private const int dilation, __private const int input_width, /* of one block */ @@ -23,7 +23,7 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, __private const int output_width, __private const int output_height, __private const int old_w) { - CL_DTYPE zero = 0.0f; + const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -79,14 +79,9 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, CL_DTYPE4 output3 = 0.0f; #endif - int max_w_bound = input_c * input_width; - int burndary_index = input_c * 4 - input_c_origin; - bool burndary_index_w = - burndary_index == 1 || burndary_index == 2 || burndary_index == 3; - bool burndary_index_z = burndary_index == 2 || burndary_index == 3; - bool burndary_index_y = burndary_index == 3; - - for (int i = 0; i < input_c; ++i) { + int max_w_bound = input_c_block * input_width; + int burndary_index = input_c_block * 4 - input_c_origin; + for (int i = 0; i < input_c_block; ++i) { // ------------0--------------- 
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); @@ -101,34 +96,73 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2)); CL_DTYPE4 weight3 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3)); - int bound_gap = max_w_bound - pos_in.x - 1; - bool outof_bound = bound_gap < input_width && bound_gap >= 0; - input0.w = select(input0.w, zero, outof_bound && burndary_index_w); - input0.z = select(input0.z, zero, outof_bound && burndary_index_z); - input0.y = select(input0.y, zero, outof_bound && burndary_index_y); + if ((max_w_bound - pos_in.x - 1) < input_width && + (max_w_bound - pos_in.x - 1) >= 0) { + if (burndary_index == 0) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } else if (burndary_index == 1) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(0.0f, weight3, output0); + + } else if (burndary_index == 2) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } else if (burndary_index == 3) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(0.0f, weight1, output0); + output0 = mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } + } else { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); // -------------1-------------- 
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input1.w = select(input1.w, zero, outof_bound && burndary_index_w); - input1.z = select(input1.z, zero, outof_bound && burndary_index_z); - input1.y = select(input1.y, zero, outof_bound && burndary_index_y); - - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } else if (burndary_index == 1) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(0.0f, weight3, output1); + + } else if (burndary_index == 2) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } else if (burndary_index == 3) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(0.0f, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } + } else { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } // -------------2-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, @@ -136,41 +170,71 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - 
bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input2.w = select(input2.w, zero, outof_bound && burndary_index_w); - input2.z = select(input2.z, zero, outof_bound && burndary_index_z); - input2.y = select(input2.y, zero, outof_bound && burndary_index_y); - - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } else if (burndary_index == 1) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(0.0f, weight3, output2); + + } else if (burndary_index == 2) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } else if (burndary_index == 3) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(0.0f, weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } + } else { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } // -------------3-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input3.w = - select(input3.w, - zero, - outof_bound && (burndary_index == 1 || burndary_index == 2 || - burndary_index == 
3)); - input3.z = - select(input3.z, - zero, - outof_bound && (burndary_index == 2 || burndary_index == 3)); - input3.y = select(input3.y, zero, outof_bound && burndary_index == 3); - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } else if (burndary_index == 1) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(0.0f, weight3, output3); + + } else if (burndary_index == 2) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } else if (burndary_index == 3) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(0.0f, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } + } else { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } } #ifdef BATCH_NORM @@ -191,12 +255,10 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); #endif -#ifdef RELU output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); -#endif if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); @@ -215,29 +277,30 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, } } -__kernel void 
conv2d_1x1_simple(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv2d_1x1_simple( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, #endif #ifdef BATCH_NORM __read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int input_c_origin, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w) { + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -360,13 +423,11 @@ __read_only image2d_t new_scale, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); #endif - output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); - if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); } diff --git 
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cl_common.h>

// 3x3 convolution, output-tiled: each work-item computes FIVE output
// pixels of one output-channel block, spaced `item_w` columns apart.
// Filter image layout: x = ch_block * 3 + kw, y = out_ch * 3 + kh.
// `dilation` and `batch` are accepted but unused here (this variant
// assumes dilation == 1; see conv2d_3x3_multi_batch below for batch > 1).
__kernel void conv2d_3x3_opt(__private const int item_ch,
                             __private const int item_w,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
                             __private const int dilation,
                             __private const int batch,
                             __private const int in_ch,
                             __private const int in_w,
                             __private const int in_h,
                             __private const int out_w,
                             __private const int out_h) {

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // item_id
  const int item_ch_id = get_global_id(0);
  const int item_w_id = get_global_id(1);
  const int item_h_id = get_global_id(2);

  // out_width_id_per_blk and out_batch_id
  int out_batch_id = item_h_id / in_h;
  int out_w_base_id = item_ch_id * out_w;
  // The five output columns handled by this work-item, item_w apart.
  int out_w_id0 = item_w_id;
  int out_w_id1 = out_w_id0 + item_w;
  int out_w_id2 = out_w_id1 + item_w;
  int out_w_id3 = out_w_id2 + item_w;
  int out_w_id4 = out_w_id3 + item_w;

  // in_width_id_per_blk and in_height_id_per_batch
  int in_h_id = (item_h_id % out_h) * stride - pad;
  int in_w_id0 = item_w_id * stride - pad;
  int in_w_id1 = in_w_id0 + item_w * stride;
  int in_w_id2 = in_w_id1 + item_w * stride;
  int in_w_id3 = in_w_id2 + item_w * stride;
  int in_w_id4 = in_w_id3 + item_w * stride;

#ifdef BIASE_CH

  // Per-output-channel bias: one value broadcast to all five outputs.
  CL_DTYPE4 output[5];
  output[0] =
      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
  output[1] = output[0];
  output[2] = output[0];
  output[3] = output[0];
  output[4] = output[0];

#elif defined(BIASE_ELE)

  // Element-wise bias: one bias pixel per output pixel.
  CL_DTYPE4 output[5];
  output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                            bias,
                            sampler,
                            (int2)(out_w_base_id + out_w_id0, item_h_id));
  if (out_w_id1 < out_w) {
    output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id1, item_h_id));
  }
  if (out_w_id2 < out_w) {
    output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id2, item_h_id));
  }
  if (out_w_id3 < out_w) {
    output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id3, item_h_id));
  }
  if (out_w_id4 < out_w) {
    output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id4, item_h_id));
  }
#else
  CL_DTYPE4 output[5] = {0.0f};
#endif

  CL_DTYPE4 filter[4] = {0.0f};
  CL_DTYPE4 filter_trans[4] = {0.0f};
  CL_DTYPE4 input[5] = {0.0f};

  // Filter rows for the 4 output channels of this block.
  int filter_h_val0 = item_ch_id * 4 * 3;
  int filter_h_val1 = filter_h_val0 + 3;
  int filter_h_val2 = filter_h_val1 + 3;
  int filter_h_val3 = filter_h_val2 + 3;

  // Loop over input-channel blocks of 4.
  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
    // Number of padding channels in the last (partial) block.
    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;

    const int in_w_base_id = mul24(ch, in_w);

    int filter_w_val = ch * 3;

    for (int h = 0; h < 3; h++) {
      // Out-of-range rows are forced to -1 so the CLAMP sampler returns
      // the zero border colour (implicit zero padding).
      int in_h_val = select(out_batch_id * in_h + in_h_id + h,
                            -1,
                            (out_batch_id * in_h + in_h_id + h < 0 ||
                             out_batch_id * in_h + in_h_id + h >= in_h));

      for (int w = 0; w < 3; w++) {
        // Same trick for out-of-range columns of each of the 5 taps.
        int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
                               -1,
                               (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
        int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
                               -1,
                               (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
        int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
                               -1,
                               (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
        int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
                               -1,
                               (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
        int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
                               -1,
                               (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));

        filter[0] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
        filter[1] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
        filter[2] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
        filter[3] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3

        // Transpose so each vector holds one input channel across the
        // 4 output channels (lets us use mad with a scalar input lane).
        filter_trans[0] = (CL_DTYPE4)(filter[0].x,
                                      filter[1].x,
                                      filter[2].x,
                                      filter[3].x);  // in_ch:0,out_ch:0-3
        filter_trans[1] = (CL_DTYPE4)(filter[0].y,
                                      filter[1].y,
                                      filter[2].y,
                                      filter[3].y);  // in_ch:1,out_ch:0-3
        filter_trans[2] = (CL_DTYPE4)(filter[0].z,
                                      filter[1].z,
                                      filter[2].z,
                                      filter[3].z);  // in_ch:2,out_ch:0-3
        filter_trans[3] = (CL_DTYPE4)(filter[0].w,
                                      filter[1].w,
                                      filter[2].w,
                                      filter[3].w);  // in_ch:3,out_ch:0-3

        input[0] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
        input[1] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
        input[2] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
        input[3] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
        input[4] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));

        output[0] = mad(input[0].x, filter_trans[0], output[0]);
        output[1] = mad(input[1].x, filter_trans[0], output[1]);
        output[2] = mad(input[2].x, filter_trans[0], output[2]);
        output[3] = mad(input[3].x, filter_trans[0], output[3]);
        output[4] = mad(input[4].x, filter_trans[0], output[4]);

        // Skip padded channels of the last partial input-channel block.
        if (ch_surplus < 3) {
          output[0] = mad(input[0].y, filter_trans[1], output[0]);
          output[1] = mad(input[1].y, filter_trans[1], output[1]);
          output[2] = mad(input[2].y, filter_trans[1], output[2]);
          output[3] = mad(input[3].y, filter_trans[1], output[3]);
          output[4] = mad(input[4].y, filter_trans[1], output[4]);
        }
        if (ch_surplus < 2) {
          output[0] = mad(input[0].z, filter_trans[2], output[0]);
          output[1] = mad(input[1].z, filter_trans[2], output[1]);
          output[2] = mad(input[2].z, filter_trans[2], output[2]);
          output[3] = mad(input[3].z, filter_trans[2], output[3]);
          output[4] = mad(input[4].z, filter_trans[2], output[4]);
        }
        if (ch_surplus < 1) {
          output[0] = mad(input[0].w, filter_trans[3], output[0]);
          output[1] = mad(input[1].w, filter_trans[3], output[1]);
          output[2] = mad(input[2].w, filter_trans[3], output[2]);
          output[3] = mad(input[3].w, filter_trans[3], output[3]);
          output[4] = mad(input[4].w, filter_trans[3], output[4]);
        }
      }
    }
  }

  // Fused activation (identity/relu/... selected at compile time).
  output[0] = activation_type4(output[0]);
  output[1] = activation_type4(output[1]);
  output[2] = activation_type4(output[2]);
  output[3] = activation_type4(output[3]);
  output[4] = activation_type4(output[4]);

  // Write back, guarding the tail columns of a partial tile.
  WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                 output_image,
                 (int2)(out_w_base_id + out_w_id0, item_h_id),
                 output[0]);
  if (out_w_id1 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id1, item_h_id),
                   output[1]);
  }
  if (out_w_id2 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id2, item_h_id),
                   output[2]);
  }
  if (out_w_id3 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id3, item_h_id),
                   output[3]);
  }
  if (out_w_id4 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id4, item_h_id),
                   output[4]);
  }
}

// support batch > 1
__kernel void conv2d_3x3_multi_batch(__private const int item_ch,
                                     __private const int item_w,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,
                                     __private const int dilation,
                                     __private const int batch,
                                     __private const int in_ch,
                                     __private const int in_w,
                                     __private const int in_h,
                                     __private const int out_w,
                                     __private const int out_h) {

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // item_id
  const int item_ch_id = get_global_id(0);
  const int item_w_id = get_global_id(1);
  const int item_h_id = get_global_id(2);

  // out_width_id_per_blk and out_batch_id
  int out_batch_id = item_h_id / in_h;
  int out_w_base_id = item_ch_id * out_w;
  int out_w_id0 = item_w_id;
  int out_w_id1 = out_w_id0 + item_w;
  int out_w_id2 = out_w_id1 + item_w;
  int out_w_id3 = out_w_id2 + item_w;
  int out_w_id4 = out_w_id3 + item_w;

  // in_width_id_per_blk and in_height_id_per_batch
  int in_h_id = (item_h_id % out_h) * stride - pad;
  int in_w_id0 = item_w_id * stride - pad;
  int in_w_id1 = in_w_id0 + item_w * stride;
  int in_w_id2 = in_w_id1 + item_w * stride;
  int in_w_id3 = in_w_id2 + item_w *
stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, 
sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + 
out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..4ed2e072022dc4b457a86d634bf4bc21ab62bc45 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// opt version of conv5x5 +__kernel void conv2d_5x5_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + 
sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + 
filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = 
mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_5x5_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int 
in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = 
{0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + 
filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], 
output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 1f99322812c13287af92b52aee6c346309ee006c..4998dc99279fffad8750ef3b6495597e9fc4ad65 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -36,10 +36,10 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, const int batch_index = out_nh / output_height; const int out_nh_in_one_batch = out_nh % output_height; - const filter_n0 = 4 * out_c + 0; - const filter_n1 = 4 * out_c + 1; - const filter_n2 = 4 * out_c + 2; - const filter_n3 = 4 * out_c + 3; + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; int2 stride_xy; stride_xy.x = stride; diff --git 
a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d82f4b4c96b586b6ecf948827402afd0766dcea4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +// opt version of con7x7 +__kernel void conv2d_7x7_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 
= out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, 
input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + 
WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_7x7_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; 
+ int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + 
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + 
output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 14086dcd16bd1a8770f444bdcd0b6bea78e23b7e..6ab2b59343f09c1284ec21a7913f67c26707301c 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -22,10 +22,6 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, @@ -137,13 +133,8 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, for(int i = 0 ;i < 9 ; i++){ output += inputs[i] * filters[i]; } -#ifdef BATCH_NORM - output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); -#endif -#ifdef RELU output = activation_type4(output); -#endif /* @@ -179,10 +170,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, 
-#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, @@ -299,19 +286,9 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, output[0] = mad(inputs[10], filters[8], output[0]); output[1] = mad(inputs[11], filters[8], output[1]); -#ifdef BATCH_NORM - CL_DTYPE4 scale = READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(ou_ch_blk_id, 0)); - CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(ou_ch_blk_id, 0)); - output[0] = mad(scale, output[0], biase); - if (ou_col_id + 1 < ou_w) { - output[1] = mad(scale, output[1], biase); - } -#endif -#ifdef RELU output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); -#endif WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); if (ou_col_id + 1 < ou_w) { diff --git a/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl b/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..116b4452dd17e800da20238ad688daf5630d55fb --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl @@ -0,0 +1,43 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void dropout(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const float dropoutPro) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + half4 output; + + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,output_pos); + half4 dropout = (half4)(1 - dropoutPro); + output = dropout * input; + + write_imageh(output_image, output_pos, output); +} + + diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl index 17b6e8c72a82718a541841ff3c69c175649d7056..73a089d7591b98486daac2d4aaa29fe4f2192134 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl @@ -14,7 +14,8 @@ limitations under the License. 
*/ #include -__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, __write_only image2d_t outputImage) { int x = get_global_id(0); int y = get_global_id(1); @@ -29,8 +30,148 @@ __kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w) { +__kernel void channel_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// etc : 1 1 1 72 +// run time Y [value,0,0,0] * 72 +__kernel void channel_mul_d2(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias0; + int2 coords_bias1; + int2 coords_bias2; + int2 coords_bias3; + /* if (x == 0 && y == 0) { + CL_DTYPE4 b = (CL_DTYPE4){0, 0, 0, 0}; + #define PPI(j, k) \ + b = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2){j, k}); \ + printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ + convert_float(b.y), convert_float(b.z), convert_float(b.w)); + for (int i = 0; 
i < 73; ++i) { + PPI(i, 0); + } + #undef PPI + }*/ + coords_bias0.x = x / w * 4; + coords_bias0.y = 0; + coords_bias1.x = x / w * 4 + 1; + coords_bias1.y = 0; + coords_bias2.x = x / w * 4 + 2; + coords_bias2.y = 0; + coords_bias3.x = x / w * 4 + 3; + coords_bias3.y = 0; + CL_DTYPE4 biase0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 biase1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 biase2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 biase3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + /* if (x == 0 && y == 0) { + printf("bias0={ %f , %f , %f , %f }\n ", + convert_float(biase0.x), convert_float(biase0.y), + convert_float(biase0.z), convert_float(biase0.w)); + printf("bias1={ %f , %f , %f , %f }\n ", + convert_float(biase1.x), convert_float(biase1.y), + convert_float(biase1.z), convert_float(biase1.w)); + printf("bias2={ %f , %f , %f , %f }\n ", + convert_float(biase2.x), convert_float(biase2.y), + convert_float(biase2.z), convert_float(biase2.w)); + printf("bias3={ %f , %f , %f , %f }\n ", + convert_float(biase3.x), convert_float(biase3.y), + convert_float(biase3.z), convert_float(biase3.w)); + }*/ + CL_DTYPE4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 output = mad(in, biase, 0); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// c 1 1 +__kernel void channel_mul_d3(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, 
sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, +__global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +#if 0 // TODO(ysh329): comment code below +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + + +__kernel void channel_mul_d1(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { int x = get_global_id(0); int y = get_global_id(1); @@ -52,8 +193,88 @@ __kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w, int h) { + +// #define DEBUG +__kernel void channel_mul_d2_nc(__read_only image2d_t input, + __read_only image2d_t bias, + 
__write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + +#ifdef DEBUG + printf("x:%d y:%d\n", x, y); +#endif + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + + int2 coords_bias0 = (int2)(x / w * 4, 0); + int2 coords_bias1 = (int2)(x / w * 4 + 1, 0); + int2 coords_bias2 = (int2)(x / w * 4 + 2, 0); + int2 coords_bias3 = (int2)(x / w * 4 + 3, 0); + + CL_DTYPE4 b0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 b1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 b2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 b3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + + CL_DTYPE4 biase = {b0.x, b1.x, b2.x, b3.x}; + CL_DTYPE4 output = mad(in, biase, 0); + +#ifdef DEBUG + if (x == 0 && y == 0) { + printf("w:%d\n", w); + + printf("biase:%.1f %.1f %.1f %.1f\n", biase.x, biase.y, biase.z, biase.w); + printf("output:%.1f %.1f %.1f %.1f\n", output.x, output.y, output.z, output.w); + + coords.x = 0; + coords.y = 0; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 0; + coords.y = 1; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 1; + coords.y = 0; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 1; + coords.y = 1; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + + coords_bias.x = 0; + coords_bias.y = 0; + biase = 
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + coords_bias.x = 1; + coords_bias.y = 0; + biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + coords_bias.x = 2; + coords_bias.y = 0; + biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + } +#endif + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + + +__kernel void channel_mul_d2_hw(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w, + int h) { int x = get_global_id(0); int y = get_global_id(1); @@ -75,8 +296,11 @@ __kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w) { + +__kernel void channel_mul_d4(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { int x = get_global_id(0); int y = get_global_id(1); @@ -97,4 +321,4 @@ __kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } - +#endif diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..6ed6af298f23bcfb396aefe7593ccfd52c732937 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void elementwise_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = activation_type4(in - biase); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage,coords,output); + } + +__kernel void channel_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in - (CL_DTYPE4)(biase.x); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); + } + +__kernel void width_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = 
get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output; + + output.x = in.x - biase.x; + output.y = in.y - biase.x; + output.z = in.z - biase.x; + output.w = in.w - biase.x; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..360d8c753ef64b1da2ff2aeebddd94ff0f41db96 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl @@ -0,0 +1,168 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +__kernel void grid_sampler(__read_only image2d_t input, + __read_only image2d_t grid, + __write_only image2d_t output, + __private const int out_height, + __private const int out_width) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2) * 4; + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords1, coords2, outpoints; + coords1.x = out_h / 4 * 2; + coords1.y = out_n * out_width + out_w; + coords2.x = coords1.x + 1; + coords2.y = coords1.y; + outpoints.x = out_c * out_width + out_w; + outpoints.x = out_n * out_height + out_h; + + CL_DTYPE4 g1 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords1); + CL_DTYPE4 g2 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords2); + + // x + float x = (g1.x + 1) * (out_width - 1) * 0.5; + float y = (g2.x + 1) * (out_height - 1) * 0.5; + int x0 = floor(x); + int y0 = floor(y); + int x_p = out_c * out_width + x0; + int y_p = out_n * out_height + y0; + + float xs = x - x0; + float xe = x0 + 1 - x; + float ys = y - y0; + float ye = y0 + 1 - y; + + CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = 
(CL_DTYPE4)(0.0); + } + CL_DTYPE4 out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val); + + // y + x = (g1.y + 1) * (out_width - 1) / 2; + y = (g2.y + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val); + + // z + x = (g1.z + 1) * (out_width - 1) / 2; + y = (g2.z + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 
< 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val); + + // w + x = (g1.w + 1) * (out_width - 1) / 2; + y = (g2.w + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val); +} diff --git a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl new file mode 100644 index 
0000000000000000000000000000000000000000..3e3d65394f9924edac735084c2fe5ce550f20684 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +// onnx/pytorch instancenorm by lijian +__kernel void instance_norm_onnx(__private const int in_width, + __private const int in_height, + __private const int in_c_group, + __private const int local_work_size_x, + __private const int local_work_size_y, + __private const float epsilon, + __read_only image2d_t input, + __write_only image2d_t output) { + const int out_cn = get_global_id(0); + const int n = out_cn / in_c_group; + const int c = out_cn % in_c_group; + const int w = get_local_id(1); + const int h = get_local_id(2); + const int local_id = w * local_work_size_y + h; + const int local_total_size = local_work_size_x * local_work_size_y; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef LOCAL_MEM_128 + __local float4 shared_mem[128]; +#elif defined(LOCAL_MEM_64) + __local float4 shared_mem[64]; +#else + __local float4 shared_mem[256]; +#endif + int xOffset = c * in_width; + int yOffset = n * in_height; + float4 sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + sum += read_imagef(input, sampler, (int2)(xOffset 
+ xIndex, yOffset + yIndex)); + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 mean_val = shared_mem[0]; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val; + sum += temp * temp; + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon)); + + float4 s = 1 / sigma; + + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex); + float4 in_val = read_imagef(input, sampler, intout_pos); + half4 out_val = convert_half4((in_val - mean_val) * s); +#ifdef RELU + out_val = activation(out_val); +#endif + write_imageh(output, intout_pos, out_val); + } + } +} + + +// paddle instancenorm by zhangxi +__kernel void 
instance_norm_paddle(__read_only image2d_t input, + __write_only image2d_t output, + __read_only image2d_t scale, + __read_only image2d_t bias, + const float epsilon, + const int in_h, + const int in_w){ + __local CL_DTYPE4 saved_mean[1024]; + __local CL_DTYPE4 saved_variance[1024]; + const int lid = get_local_id(0); + const int lsize = get_local_size(0); + const int gidx = get_group_id(0); + const int gidy = get_group_id(1); + const int spatial_size = in_h * in_w; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 mean = (CL_DTYPE4)(0.f, 0.f, 0.f, 0.f); + CL_DTYPE4 variance = (CL_DTYPE4)(0.f, 0.f, 0.f, 0.f); + CL_DTYPE4 vepsilon = (CL_DTYPE4)(epsilon, epsilon, epsilon, epsilon); + const int x_offset = gidx * in_w; + const int y_offset = gidy * in_h; + int2 coor; + for (int i = lid; i < spatial_size; i += lsize) { + coor.x = i % in_w + x_offset; + coor.y = i / in_w + y_offset; + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + mean += pixel; + variance += pixel * pixel; + } + saved_mean[lid] = mean; + saved_variance[lid] = variance; + barrier(CLK_LOCAL_MEM_FENCE); + + //! do reduction + int dynamic_size = lsize >> 1; + for (; dynamic_size > 0; dynamic_size >>= 1){ + if (lid < dynamic_size) { + saved_mean[lid] += saved_mean[lid + dynamic_size]; + saved_variance[lid] += saved_variance[lid + dynamic_size]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + mean = saved_mean[0] / spatial_size; + variance = saved_variance[0] / spatial_size - mean * mean; + variance = rsqrt(variance + vepsilon); + + //! 
do instance norm + coor.x = gidx; + coor.y = gidy; + CL_DTYPE4 vscale = READ_IMG_TYPE(CL_DTYPE_CHAR, scale, sampler, coor); + vscale *= variance; + CL_DTYPE4 vbias = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coor); + for (int i = lid; i < spatial_size; i += lsize) { + coor.x = i % in_w + x_offset; + coor.y = i / in_w + y_offset; + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + pixel = (pixel - mean) * vscale + vbias; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, coor, pixel); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..6c419fe3c134614d28b3bcee3eabac5e8f7bdf6e --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl @@ -0,0 +1,298 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// #define DEBUG +//////////////////////////////////////////////////////// +// buffer -> image2d +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d(__global CL_DTYPE *in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f); + output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE); + + if (out_C - 4 * out_c >= 2) { + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE); + } + if (out_C - 4 * out_c >= 3) { + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE); + } + if (out_C - 4 * out_c >= 4) { + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE); + } + +#ifdef DEBUG + if (out_w > 2045) { + printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", + out_w, + out_C - 4 * out_c, + (float)(in[input_pos0]), + (float)(in[input_pos1]), + (float)(in[input_pos2]), + (float)(in[input_pos3])); + printf("buffer2image ===> %d,%d,%d, 
out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, + output_pos.x, output_pos.y, + (float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); + } +#endif + + WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); +} + +//////////////////////////////////////////////////////// +// image2d -> buffer +//////////////////////////////////////////////////////// +__kernel void image2d_to_buffer(__read_only image2d_t input, + __private const int in_width, + __private const int in_height, + __global CL_DTYPE* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); + +#ifdef DEBUG + if (in_w > 2045) { + printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, + pos_x, in_nh, + (float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); + } +#endif + + const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); + if (C - 4 * in_c >= 2) { + out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); + } + if(C - 4 * in_c >= 3) { + out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); + } + if(C - 4 * in_c >= 4) { + out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); + } +} + + +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +//////////////////////////////////////////////////////// +// buffer -> image2d_nw +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d_nw(__global 
CL_DTYPE* in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_N, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + const int out_n = get_global_id(0); + const int out_w = get_global_id(1); + const int out_ch = get_global_id(2); + + const int out_c = out_ch / out_H; + const int out_h = out_ch % out_H; + + const int in_c = out_c; // index of c in h direction + + const int in_n0 = out_n * 4 + 0; + const int in_n1 = out_n * 4 + 1; + const int in_n2 = out_n * 4 + 2; + const int in_n3 = out_n * 4 + 3; + + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n0 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_n * out_W + out_w; + output_pos.y = out_ch; + + CL_DTYPE4 output = (CL_DTYPE4)0.0f; + output.x = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos0]); + if (out_N - 4 * out_n >= 2) { + output.y = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos1]); + } + if (out_N - 4 * out_n >= 3) { + output.z = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos2]); + } + if (out_N - 4 * out_n >= 4) { + output.w = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos3]); + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} +#endif + + +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +// image2d -> buffer +__kernel void image2d_to_buffer_2d(__private const int in_height, + __private const int in_width, + __read_only image2d_t input, + __global CL_DTYPE* out) { + const int in_w = get_global_id(1); + const int in_h = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, 
sampler, (int2)(in_w, in_h)); + + const int index = (in_h * in_width + in_w) * 4; + out[index] = CONVERT_TYPE_TO(CL_DTYPE, in.x); + out[index + 1] = CONVERT_TYPE_TO(CL_DTYPE, in.y); + out[index + 2] = CONVERT_TYPE_TO(CL_DTYPE, in.z); + out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w); +} +#endif + +//////////////////////////////////////////////////////// +// buffer -> image2d (divide by 255 to normalize) +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d_with_pre255(__global uchar *in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2){ + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; + output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; + if(out_C - 4 * out_c>=2){ + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; + } + if(out_C - 4 * out_c>=3){ + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; + } + if(out_C - 4 * out_c>=4){ + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; + } + 
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); +} + + +//////////////////////////////////////////////////////// +// image2d -> buffer (multiply by 255 to de-normalize) +//////////////////////////////////////////////////////// +__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, + __private const int in_width, + __private const int in_height, + __global uchar* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; + +#ifdef DEBUG + printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", + in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); +#endif + + const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = convert_uchar_sat(in.x); + if(C - 4 * in_c>=2){ + out[index + size_ch] = convert_uchar_sat(in.y); + } + if(C - 4 * in_c>=3){ + out[index + size_ch * 2] = convert_uchar_sat(in.z); + } + if(C - 4 * in_c>=4){ + out[index + size_ch * 3] = convert_uchar_sat(in.w); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..655a2657e07c419d4e50aed0e78cb8c37afa4b2a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl @@ -0,0 +1,159 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void lrn(__read_only image2d_t input, + __write_only image2d_t output, + __private const int out_C, + __private const int out_W, + __private const int local_size, + __private const float k, + __private const float alpha, + __private const float beta){ + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const int out_c0 = out_c * 4; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int out_c1 = out_c0 + 1; + const int out_c2 = out_c0 + 2; + const int out_c3 = out_c0 + 3; + + const int pad = (local_size - 1) / 2; + int start = out_c0 - pad; + int end = out_c0 + pad; + start = start > 0 ? start : 0; + end = end < out_C - 1 ? 
end : out_C - 1; + float square0 = 0.0; + float square1 = 0.0; + float square2 = 0.0; + float square3 = 0.0; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square0 += input_data.x * input_data.x; + break; + case 1: + square0 += input_data.y * input_data.y; + break; + case 2: + square0 += input_data.z * input_data.z; + break; + case 3: + square0 += input_data.w * input_data.w; + break; + } + } + start = max(out_c1 - pad, 0); + end = min(out_c1 + pad, out_C - 1); + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square1 += input_data.x * input_data.x; + break; + case 1: + square1 += input_data.y * input_data.y; + break; + case 2: + square1 += input_data.z * input_data.z; + break; + case 3: + square1 += input_data.w * input_data.w; + break; + } + } + start = max(out_c2 - pad, 0); + end = min(out_c2 + pad, out_C - 1); + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square2 += input_data.x * input_data.x; + break; + case 1: + square2 += input_data.y * input_data.y; + break; + case 2: + square2 += input_data.z * input_data.z; + break; + case 3: + square2 += input_data.w * input_data.w; + break; + } + } + start = max(out_c3 - pad, 0); + end = min(out_c3 + pad, out_C - 1); + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, 
sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square3 += input_data.x * input_data.x; + break; + case 1: + square3 += input_data.y * input_data.y; + break; + case 2: + square3 += input_data.z * input_data.z; + break; + case 3: + square3 += input_data.w * input_data.w; + break; + } + } + int2 out_pos; + out_pos.x = out_c * out_W + out_w; + out_pos.y = out_nh; + CL_DTYPE4 in_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, out_pos); + + float4 out_val; + out_val.x = in_data.x / (pow(k + alpha * (square0), beta)); + if (out_c1 < out_C){ + out_val.y = in_data.y / (pow(k + alpha * (square1), beta)); + } + if (out_c2 < out_C){ + out_val.z = in_data.z / (pow(k + alpha * (square2), beta)); + } + if (out_c3 < out_C){ + out_val.w = in_data.w / (pow(k + alpha * (square3), beta)); + } + CL_DTYPE4 out_data = CONVERT_TYPE_TO(out_val, CL_DTYPE4); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, out_pos, out_data); +} diff --git a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl index b74449d9c8a02551cd74d366849768b4a91a4dce..1df1f0c18b7abb7e715716856dbec7c7d4d5108a 100644 --- a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl @@ -12,26 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output, - __private const float scale_h, __private const float scale_w, - __private const int in_dims_h, __private const int out_dims_h, - __private const int in_dims_w, __private const int out_dims_w) { - const int c = get_global_id(0); - const int w = get_global_id(1); - const int nh = get_global_id(2); - int2 output_pos; - output_pos.x = c * out_dims_w + w; - output_pos.y = nh; - int out_n = nh / out_dims_h; - int out_h = nh % out_dims_h; - int2 input_pos; - input_pos.x = c * in_dims_w + w / scale_w; - input_pos.y = out_n * in_dims_h + out_h / scale_h; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); - write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data); +#include + + +__kernel void nearest_interp(__read_only image2d_t input, + __write_only image2d_t output, + __private const float scale_h, + __private const float scale_w, + __private const int in_dims_h, + __private const int out_dims_h, + __private const int in_dims_w, + __private const int out_dims_w) { + + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + + int2 output_pos; + output_pos.x = c * out_dims_w + w; + output_pos.y = nh; + + int out_n = nh / out_dims_h; + int out_h = nh % out_dims_h; + + int2 input_pos; + input_pos.x = c * in_dims_w + w / scale_w; + input_pos.y = out_n * in_dims_h + out_h / scale_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(input_pos.x, input_pos.y)); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_pos.x , output_pos.y), input_data); } diff --git a/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl 
b/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..e65aad3d19bc674aff2f71d2403e611cd247abf1 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void pad2d_constant( + __read_only image2d_t input, __write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + if (x < 0 || y < 0 || x >= in_width || y >= in_height) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, (CL_DTYPE4)(pad_value)); + } else { + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); + } +} + +__kernel void pad2d_reflect( + __read_only image2d_t input, 
__write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + x = abs(x); + y = abs(y); + x = x < in_width ? x : 2 * in_width - 2 - x; + y = y < in_height ? y : 2 * in_height - 2 - y; + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); +} + +__kernel void pad2d_edge( + __read_only image2d_t input, __write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + x = x > 0 ? x : 0; + x = x < in_width ? x : in_width - 1; + y = y > 0 ? y : 0; + y = y < in_height ? 
y : in_height - 1; + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); +} diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl index 775166261d01dc639cd5af8cee49f7e7fb30cb19..f64c2b5e7b21d81a50acd485938ca4f74c3f013b 100644 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl @@ -15,17 +15,17 @@ limitations under the License. */ #include __kernel void pool_max(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -37,18 +37,19 @@ __kernel void pool_max(__read_only image2d_t input, int start_h = out_h * stride_h - pad_top; int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h,0); + start_h = max(start_h, 0); int start_w = out_w * stride_w - pad_left; int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w,0); + start_w = max(start_w, 0); const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); for (int y = start_h; 
y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + CL_DTYPE4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); max_value = max(max_value, tmp); } } @@ -58,17 +59,17 @@ __kernel void pool_max(__read_only image2d_t input, } __kernel void pool_avg(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -90,10 +91,121 @@ __kernel void pool_avg(__read_only image2d_t input, for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + sum += READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); } } CL_DTYPE4 avg = sum / (ksize_h * ksize_w); const int pos_out_x = mad24(out_c, out_width, out_w); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg); } + +__kernel void pool_avg_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private 
const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // =1 + const int out_nh = get_global_id(2); // = n*1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // do not use dtype4 here + // skip issue for half 2048 + float4 sum = (float4)(0.0f); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y = 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + CL_DTYPE4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + + sum.x = convert_float(tmp.x) + sum.x; + sum.y = convert_float(tmp.y) + sum.y; + sum.z = convert_float(tmp.z) + sum.z; + sum.w = convert_float(tmp.w) + sum.w; + } + } + const float global_size_div = 1.0f / (in_height * in_width); + CL_DTYPE4 avg; + avg.x = CONVERT_TYPE_TO(sum.x * global_size_div, CL_DTYPE); + avg.y = CONVERT_TYPE_TO(sum.y * global_size_div, CL_DTYPE); + avg.z = CONVERT_TYPE_TO(sum.z * global_size_div, CL_DTYPE); + avg.w = CONVERT_TYPE_TO(sum.w * global_size_div, CL_DTYPE); + +#ifdef DEBUG + if (out_c == 0) { + printf("\033[31msum.x= %f \033 \n ", sum.x); + printf("sum.y=%f \n ", sum.y); + printf("sum.z=%f \n ", sum.z); + printf("sum.w=%f \n ", sum.w); + printf("one4.x=%f \n ", convert_float(one4.x)); + + printf("in_height=%d \n ", in_height); + printf("in_width=%d \n ", in_width); + printf("ksize_h=%d \n ", ksize_h); + printf("ksize_w=%d \n ", ksize_w); + printf("stride_h=%d \n ", stride_h); + printf("stride_w=%d \n ", stride_w); + printf("pad_top=%d \n ", pad_top); + printf("pad_left=%d \n ", pad_left); + printf("out_width=%d \n ", out_width); + printf("out_height=%d \n ", out_height); + printf("i++=%d \n ", i++); + printf("avg.x=%f \n ", convert_float(avg.x)); 
+ printf("avg.y=%f \n ", convert_float(avg.y)); + printf("avg.z=%f \n ", convert_float(avg.z)); + printf("avg.w=%f \n ", convert_float(avg.w)); + } +#endif + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), avg); +} +__kernel void pool_max_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // =1 + const int out_nh = get_global_id(2); // = n*1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y = 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + max_value = max(max_value, + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_x + x, pos_in_y + y))); + } + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), max_value); +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl b/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl deleted file mode 100644 index 7750bd98a29151ba2428bdafd462420393fe7433..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -__kernel void relu6(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); - in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); -} diff --git a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl index 739ff1338582b65d87dbd9c92f1ea86e0c49f0ff..dfc25063cc2e36d768f1bc4d7ff992c87fe17592 100644 --- a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl @@ -27,6 +27,6 @@ __kernel void scale(__read_only image2d_t input, CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = convert_float(scale) * in + convert_float(bias); + in = CONVERT_TYPE_TO(scale, CL_DTYPE) * in + CONVERT_TYPE_TO(bias, CL_DTYPE); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } diff --git a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl b/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl deleted file mode 100644 index d2cb8fa36e21167979172fba634a7862c932b74c..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl +++ /dev/null @@ -1,30 +0,0 @@ 
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -__kernel void sigmoid(__read_only image2d_t input, - __write_only image2d_t output) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out = 1 / (1 + exp(-in)); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); -} diff --git a/lite/backends/opencl/cl_kernel/image/slice_kernel.cl b/lite/backends/opencl/cl_kernel/image/slice_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..1ef74bb14213beaa0e83e28b99b592ac1dcc667d --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/slice_kernel.cl @@ -0,0 +1,78 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void slice(__read_only image2d_t input, __write_only image2d_t output, + __private const int start, __private const int end, + __private const int dims_w){ + + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = c * dims_w + w; + output_pos.y = nh; + + int2 input_pos; + half4 input_data; + half4 output_data; + + if (start % 4 == 0) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data = input_data; + } else if (start % 4 == 1) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.y; + output_data.y = input_data.z; + output_data.z = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.w = input_data.x; + } else if (start % 4 == 2) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.z; + output_data.y = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.z = input_data.x; + output_data.w = input_data.y; + } else if (start % 4 == 3) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, 
sampler,input_pos); + output_data.y = input_data.x; + output_data.z = input_data.y; + output_data.w = input_data.z; + } + write_imageh(output, output_pos, output_data); + +} + diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 0c7b2f8575a88082f6d79a5392c4468715a701b9..63c9954f9181e9252c4d14f57b6ed29107965fe3 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -75,13 +75,8 @@ cl::CommandQueue& CLRuntime::command_queue() { std::unique_ptr CLRuntime::CreateProgram( const cl::Context& context, std::string file_name) { - std::ifstream file{file_name, std::ios::binary | std::ios::ate}; - CHECK(file.is_open()) << "Can't open file from " << file_name; - auto size = file.tellg(); - CHECK(size > 0) << "size is too small."; - std::string content(size, '\0'); - file.seekg(0); - file.read(&content[0], size); + auto cl_file = opencl_kernels_files.find(file_name); + std::string content(cl_file->second.begin(), cl_file->second.end()); cl::Program::Sources sources; sources.push_back(content); auto prog = @@ -101,8 +96,8 @@ std::unique_ptr CLRuntime::CreateEvent( } bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { - std::string build_option = options + " -cl-fast-relaxed-math -I " + - CLRuntime::Global()->cl_path() + "/cl_kernel"; + /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ + std::string build_option = options + " -cl-fast-relaxed-math "; VLOG(4) << "OpenCL build_option: " << build_option; status_ = program->build({*device_}, build_option.c_str()); CL_CHECK_ERROR(status_); @@ -133,6 +128,12 @@ bool CLRuntime::InitializePlatform() { } bool CLRuntime::InitializeDevice() { + // ===================== BASIC ===================== + // CL_DEVICE_TYPE_GPU + // CL_DEVICE_NAME + // CL_DEVICE_SUPPORT + // CL_DEVICE_MAX_COMPUTE_UNITS + // CL_DEVICE_MAX_CLOCK_FREQUENCY std::vector all_devices; status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); 
CL_CHECK_ERROR(status_); @@ -145,27 +146,153 @@ bool CLRuntime::InitializeDevice() { auto device_name = device_->getInfo(); LOG(INFO) << "Using device: " << device_name; + + cl_device_type device_type = device_->getInfo(); + auto device_type_to_str = [](cl_device_type t) -> std::string { + std::string t_str{""}; + switch (t) { + case CL_DEVICE_TYPE_CPU: + t_str = "CPU"; + break; + case CL_DEVICE_TYPE_GPU: + t_str = "GPU"; + break; + case CL_DEVICE_TYPE_ACCELERATOR: + t_str = "Accelerator"; + break; + case CL_DEVICE_TYPE_DEFAULT: + t_str = "Default"; + break; + default: + t_str = "Unknown"; + } + return t_str; + }; + LOG(INFO) << "device_type:" << device_type_to_str(device_type); + device_info_["CL_DEVICE_TYPE"] = device_type; + + auto max_units = device_->getInfo(); + LOG(INFO) << "The chosen device has " << max_units << " compute units."; + device_info_["CL_DEVICE_MAX_COMPUTE_UNITS"] = max_units; + + auto max_clock_freq = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_CLOCK_FREQUENCY:" << max_clock_freq; + device_info_["CL_DEVICE_MAX_CLOCK_FREQUENCY"] = max_clock_freq; + + // ===================== MEMORY ===================== + // CL_DEVICE_LOCAL_MEM_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHE_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + // CL_DEVICE_GLOBAL_MEM_SIZE + auto local_mem_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "The local memory size of the chosen device is " << local_mem_kb + << " KB."; + device_info_["CL_DEVICE_LOCAL_MEM_SIZE_KB"] = local_mem_kb; + + auto global_mem_cache_size_kb = + static_cast(device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE(KB):" + << global_mem_cache_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHE_SIZE_KB"] = global_mem_cache_size_kb; + + auto global_mem_cacheline_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE(KB):" + << global_mem_cacheline_size_kb << " KB."; + 
device_info_["CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE_KB"] = + global_mem_cacheline_size_kb; + + auto global_mem_size_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_SIZE(KB):" << global_mem_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_SIZE_KB"] = global_mem_size_kb; + + // ===================== WORK_GROUP ===================== + // CL_DEVICE_MAX_WORK_GROUP_SIZE + // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS + // CL_DEVICE_MAX_WORK_ITEM_SIZES + auto max_work_group_size = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_GROUP_SIZE:" << max_work_group_size; + device_info_["CL_DEVICE_MAX_WORK_GROUP_SIZE"] = max_work_group_size; + + auto max_dims_num = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:" << max_dims_num; + device_info_["CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"] = max_dims_num; + + auto max_work_item_sizes = device_->getInfo(); + for (size_t i = 0; i < max_work_item_sizes.size(); ++i) { + LOG(INFO) << "max_work_item_sizes[" << i << "]:" << max_work_item_sizes[i]; + std::string dim_key = "CL_DEVICE_MAX_WORK_ITEM_SIZES_" + std::to_string(i); + device_info_[dim_key] = max_work_item_sizes[i]; + } + + // ===================== BUFFER ===================== + // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE + auto max_constant_buffer_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:" + << max_constant_buffer_size_kb; + device_info_["CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"] = + max_constant_buffer_size_kb; + + // ===================== IMAGE ===================== + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_IMAGE2D_MAX_HEIGHT + // CL_DEVICE_IMAGE2D_MAX_WIDTH auto image_support = device_->getInfo(); if (image_support) { LOG(INFO) << "The chosen device supports image processing."; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 1; } else { LOG(INFO) << "The chosen device doesn't support image processing!"; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 0; return false; } + 
+ auto image2d_max_height = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_HEIGHT:" << image2d_max_height; + device_info_["CL_DEVICE_IMAGE2D_MAX_HEIGHT"] = image2d_max_height; + + auto image2d_max_width = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_WIDTH:" << image2d_max_width; + device_info_["CL_DEVICE_IMAGE2D_MAX_WIDTH"] = image2d_max_width; + + // ===================== OTHERS / EXTENSION / VERSION ===================== + // CL_DEVICE_EXTENSIONS + // CL_DEVICE_ADDRESS_BITS auto ext_data = device_->getInfo(); VLOG(4) << "The extensions supported by this device: " << ext_data; if (ext_data.find("cl_khr_fp16") != std::string::npos) { LOG(INFO) << "The chosen device supports the half data type."; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 1; } else { LOG(INFO) << "The chosen device doesn't support the half data type!"; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 0; } - auto max_units = device_->getInfo(); - LOG(INFO) << "The chosen device has " << max_units << " compute units."; - auto local_mem = device_->getInfo(); - LOG(INFO) << "The local memory size of the chosen device is " - << static_cast(local_mem) / 1024 << " KB."; + + auto address_bits = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_ADDRESS_BITS:" << address_bits; + device_info_["CL_DEVICE_ADDRESS_BITS"] = address_bits; + + auto driver_version = device_->getInfo(); + LOG(INFO) << "CL_DRIVER_VERSION:" << driver_version; + return true; } +std::map& CLRuntime::GetDeviceInfo() { + if (0 != device_info_.size()) { + return device_info_; + } + InitializeDevice(); + return device_info_; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 0859780c69cc8647c1fd54bf1ab12be29217c9e1..1a5ededeff37d9f6820af6a49dc22c669620734b 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -24,6 +25,9 @@ limitations under the License. */ namespace paddle { namespace lite { +extern const std::map> + opencl_kernels_files; + class CLRuntime { public: static CLRuntime* Global(); @@ -51,6 +55,8 @@ class CLRuntime { void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } + std::map& GetDeviceInfo(); + private: CLRuntime() = default; @@ -80,6 +86,8 @@ class CLRuntime { return queue; } + std::map device_info_; + std::string cl_path_; std::shared_ptr platform_{nullptr}; diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index 310567baa539697f6a67b59f6c0e5f29ce46a80e..9cf07dfc0c474b0b5c57b8355c099eba15610a91 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -81,10 +81,10 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, return cl_image; } -template <> // use int16_t represents half float -void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height, - void *host_ptr) { +template <> // use uint16_t represents half float +void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16))); cl_int status; cl::Image2D *cl_image = diff --git a/lite/backends/x86/jit/gen/blas.h b/lite/backends/x86/jit/gen/blas.h index 39920195b245e1c44ff68ab91af94d25c949bd02..4317d558c6252e9163bc545cba4859fbcb89f804 100644 --- a/lite/backends/x86/jit/gen/blas.h +++ b/lite/backends/x86/jit/gen/blas.h @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -64,7 +65,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? 
"_Relu" : ""); - base += "_D" + std::to_string(num_); + base += "_D" + paddle::lite::to_string(num_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7cae76f9dd99cf904e831b196bd493623ff7eb1d..999960ece4170d561419ad24bd94c512ce167eb0 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(tbl_w_)); + base += ("_W" + paddle::lite::to_string(tbl_w_)); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index b1b302b7904a5d92952f4385c483eccdc5df3592..e7be6750cf0d232b41d3be61001eb0af4c52a129 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode { std::string name() const override { std::string base = "MatMulJitCode"; - base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + - std::to_string(k_); + base = base + "_M" + paddle::lite::to_string(m_) + "_N" + + paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index 346179cfbbd0e8291dc17b266366c5df07114b7f..60e27993057b58eb8a4a07fcd0a368fc0a9441fc 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(w_)); + base += ("_W" + paddle::lite::to_string(w_)); return base; } void genCode() override; diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 
1d0558451fce67433d966d1f4bff82af26459e33..db8bc29d70d4764f14f24915fcbc254ba2af91df 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -34,9 +34,9 @@ lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) -lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ @@ -67,6 +67,13 @@ message(STATUS "commit: ${PADDLE_LITE_COMMIT}") configure_file(version.h.in version.h) #----------------------------------------------- NOT CHANGE ----------------------------------------------- +# A trick to generate the opencl_kernels_source.cc +#add_custom_command( +# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/gen_opencl_code.py +# ${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel +# ${CMAKE_BINARY_DIR}/lite/backends/opencl/opencl_kernels_source.cc +# OUTPUT opencl_kernels_source.cc # not a real path to the output to force it execute every time. +# ) # A trick to generate the paddle_use_kernels.h add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py @@ -86,9 +93,13 @@ add_custom_command( OUTPUT ops.h # not a real path to the output to force it execute every time. 
) # generate fake kernels for memory_optimize_tool + +#-------------------------------opt---------------------------------------------------------------- +# tricks to create headfiles for opt add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py ${kernels_src_list} + ${fake_kernels_src_list} ${CMAKE_BINARY_DIR}/all_kernel_faked.cc ${CMAKE_BINARY_DIR}/kernel_src_map.h OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time. @@ -96,10 +107,12 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) + # create headfile to restore ops info sorted by suppported platforms add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py ${kernels_src_list} + ${fake_kernels_src_list} ${ops_src_list} ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. 
diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index fe36f1e1ba16ad85c44136b09a0d2e5d3fadf688..614ee990a9811ab74ceedb4fa000fa385698d679 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -59,6 +59,8 @@ void TestCase::CreateInstruction() { CHECK(it != kernels.end()) << "failed to create the kernel in " << place_.DebugString() << " with alias: " << alias_; + // reset final place + place_ = (*it)->place(); // prepare context (*it)->SetContext(std::move(ctx_)); instruction_.reset(new Instruction(op, std::move(*it))); @@ -74,25 +76,164 @@ void TestCase::PrepareInputsForInstruction() { const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument( place_, kernel_key, arg); - const auto* inst_type = Type::GetTensorTy(TARGET(kHost)); + const Type* inst_type = nullptr; + if (param_type->type->IsTensor()) { + inst_type = Type::GetTensorTy(TARGET(kHost)); + } else if (param_type->type->IsTensorList()) { + inst_type = Type::GetTensorListTy(TARGET(kHost)); + } else { + LOG(FATAL) << "unsupported param_type"; + } + CHECK(scope_->FindVar(var)); - const auto* shared_tensor = scope_->FindTensor((var)); if (!TargetCompatibleTo(*inst_type, *param_type->type)) { - /// Create a tensor in the instruction's scope, alloc memory and then - /// copy data there. - auto* target_tensor = inst_scope_->NewTensor(var); - CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; - target_tensor->Resize(shared_tensor->dims()); - TargetCopy(param_type->type->target(), - target_tensor->mutable_data(param_type->type->target(), - shared_tensor->memory_size()), - shared_tensor->raw_data(), - shared_tensor->memory_size()); + /// Create a tensor or tensor_array in the instruction's scope, + /// alloc memory and then copy data there. 
+ if (param_type->type->IsTensor()) { + const auto* shared_tensor = scope_->FindTensor(var); + auto* target_tensor = inst_scope_->NewTensor(var); + CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; + target_tensor->Resize(shared_tensor->dims()); + TargetCopy(param_type->type->target(), + target_tensor->mutable_data(param_type->type->target(), + shared_tensor->memory_size()), + shared_tensor->raw_data(), + shared_tensor->memory_size()); + } else if (param_type->type->IsTensorList()) { + const auto* shared_tensor_array = + scope_->FindVar(var)->GetMutable>(); + auto* target_tensor_array = + inst_scope_->Var(var)->GetMutable>(); + CHECK(!shared_tensor_array->empty()) + << "shared_tensor_array is empty yet"; + target_tensor_array->resize(shared_tensor_array->size()); + for (int i = 0; i < shared_tensor_array->size(); i++) { + target_tensor_array->at(i).Resize( + shared_tensor_array->at(i).dims()); + TargetCopy(param_type->type->target(), + target_tensor_array->at(i).mutable_data( + param_type->type->target(), + shared_tensor_array->at(i).memory_size()), + shared_tensor_array->at(i).raw_data(), + shared_tensor_array->at(i).memory_size()); + } + } else { + LOG(FATAL) << "not support"; + } } } } } +template +bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error) { + CHECK(a_tensor); + CHECK(b_tensor); + + CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); + + CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; + + // The baseline should output in host devices. + CHECK(b_tensor->target() == TARGET(kHost) || + b_tensor->target() == TARGET(kX86) || + b_tensor->target() == TARGET(kARM)); + + const T* a_data{}; + switch (a_tensor->target()) { + case TARGET(kX86): + case TARGET(kHost): + case TARGET(kARM): + a_data = static_cast(a_tensor->raw_data()); + break; + + default: + // Before compare, need to copy data from `target` device to host. 
+ LOG(FATAL) << "Not supported"; + } + + CHECK(a_data); + + const T* b_data = static_cast(b_tensor->raw_data()); + + bool success = true; + for (int i = 0; i < a_tensor->dims().production(); i++) { + EXPECT_NEAR(a_data[i], b_data[i], abs_error); + if (fabsf(a_data[i] - b_data[i]) > abs_error) { + success = false; + } + } + return success; +} + +bool TestCase::CheckPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error, + PrecisionType precision_type) { + PrecisionType precision_type_t = precision_type; + if (precision_type == PRECISION(kAny)) { + precision_type_t = b_tensor->precision(); + } + CHECK(precision_type_t == b_tensor->precision()) + << "arg precision type and base tensor precision type are not matched! " + "arg precision type is: " + << PrecisionToStr(precision_type) << ", base tensor precision type is: " + << PrecisionToStr(b_tensor->precision()); + CHECK(a_tensor->precision() == b_tensor->precision()) + << "real tensor precision type and base tensor precision type are not " + "matched! 
real tensor precision type is: " + << PrecisionToStr(a_tensor->precision()) + << ", base tensor precision type is: " + << PrecisionToStr(b_tensor->precision()); + switch (precision_type_t) { + case PRECISION(kFloat): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt8): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt32): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt64): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kBool): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + default: + LOG(FATAL) << "not support type: " << PrecisionToStr(precision_type); + return false; + } +} + +bool TestCase::CheckPrecision(const std::string& var_name, + float abs_error, + PrecisionType precision_type) { + bool success = true; + if (inst_scope_->FindVar(var_name)->IsType()) { + auto a_tensor = inst_scope_->FindTensor(var_name); + auto b_tensor = base_scope_->FindTensor(var_name); + success = success && + CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + } else if (inst_scope_->FindVar(var_name)->IsType>()) { + auto a_tensor_array = + inst_scope_->FindVar(var_name)->GetMutable>(); + auto b_tensor_array = + base_scope_->FindVar(var_name)->GetMutable>(); + CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); + for (int i = 0; i < a_tensor_array->size(); i++) { + Tensor* a_tensor = &(a_tensor_array->at(i)); + Tensor* b_tensor = &(b_tensor_array->at(i)); + if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { + continue; + } + success = success && + CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + } + } else { + LOG(FATAL) << "unsupported var type"; + } + return success; +} + TestCase::~TestCase() { if (op_desc_->Type() == "subgraph") { // Release the subblock desc of Subgraph op diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index 
85edda26e6591bada967165317de00b169a2d0cd..7050355fbfae55b9ba626119cd95f8e952c27430 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -66,11 +66,24 @@ class TestCase { /// output. virtual void RunBaseline(Scope* scope) = 0; - /// Check the precision of the output tensors. It will compare the same tensor - /// in two scopes, one of the instruction execution, and the other for the - /// baseline. + // checkout the precision of the two tensors with type T. b_tensor is baseline template - bool CheckPrecision(const std::string& var_name, float abs_error); + bool CheckTensorPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error); + + // checkout the precision of the two tensors. b_tensor is baseline + bool CheckPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error, + PrecisionType precision_type); + + /// Check the precision of the output variables. It will compare the same + /// tensor (or all tensors of the tensor_array) in two scopes, one of the + /// instruction execution, and the other for the baseline. + bool CheckPrecision(const std::string& var_name, + float abs_error, + PrecisionType precision_type); const cpp::OpDesc& op_desc() { return *op_desc_; } @@ -78,20 +91,6 @@ class TestCase { // kernel registry. void CheckKernelConsistWithDefinition() {} - // Get the real precision of the output for check precision. When the declare - // precision obtained from the kernel is any, we should set the precision of - // the output in test case. 
- bool GetPrecisonType(const std::string& var_name, - PrecisionType* precision_type) { - auto res = precision_type_map_.find(var_name); - if (res == precision_type_map_.end()) { - return false; - } else { - *precision_type = precision_type_map_.at(var_name); - return true; - } - } - Scope& scope() { return *scope_; } Scope* baseline_scope() { return base_scope_; } @@ -120,22 +119,37 @@ class TestCase { tensor->set_persistable(is_persistable); } - // Prepare for the operator. - virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0; + /// Prepare a tensor_array in host. The tensors will be created in scope_. + /// Need to specify the targets other than X86 or ARM. + template + void SetCommonTensorList(const std::string& var_name, + const std::vector& array_tensor_dims, + const std::vector>& datas, + const std::vector& lods = {}) { + CHECK_EQ(array_tensor_dims.size(), datas.size()); + if (!lods.empty()) { + CHECK_EQ(array_tensor_dims.size(), lods.size()); + } - // Set the real precision of the output for check precision. When the declare - // precision obtained from the kernel is any, we should set the precision of - // the output in test case. - void SetPrecisionType(const std::string& var_name, - const PrecisionType& precision_type) { - auto res = precision_type_map_.find(var_name); - if (res == precision_type_map_.end()) { - precision_type_map_.insert({var_name, precision_type}); - } else { - precision_type_map_.at(var_name) = precision_type; + auto* tensor_array = + scope_->Var(var_name)->GetMutable>(); + for (int i = 0; i < array_tensor_dims.size(); i++) { + Tensor tmp; + tmp.Resize(array_tensor_dims[i]); + auto* tmp_data = tmp.mutable_data(); + memcpy(tmp_data, + datas[i].data(), + array_tensor_dims[i].production() * sizeof(T)); + if (!lods.empty()) { + tmp.set_lod(lods[i]); + } + tensor_array->push_back(tmp); } } + // Prepare for the operator. 
+ virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0; + public: const Instruction& instruction() { return *instruction_; } @@ -179,7 +193,6 @@ class TestCase { Scope* base_scope_{}; std::unique_ptr op_desc_; std::unique_ptr instruction_; - std::unordered_map precision_type_map_; }; class Arena { @@ -236,22 +249,7 @@ class Arena { const Type* type = tester_->instruction().kernel()->GetOutputDeclType(arg_name); auto precision_type = type->precision(); - if (precision_type == PRECISION(kAny)) { - CHECK(tester_->GetPrecisonType(var_name, &precision_type)); - } - switch (precision_type) { - case PRECISION(kFloat): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kInt8): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kInt32): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kBool): - return tester_->CheckPrecision(var_name, abs_error_); - default: - LOG(FATAL) << "not support type " << PrecisionToStr(type->precision()); - return false; - } + return tester_->CheckPrecision(var_name, abs_error_, precision_type); } private: @@ -260,49 +258,6 @@ class Arena { float abs_error_; }; -template -bool TestCase::CheckPrecision(const std::string& var_name, float abs_error) { - auto a_tensor = inst_scope_->FindTensor(var_name); - auto b_tensor = base_scope_->FindTensor(var_name); - CHECK(a_tensor); - CHECK(b_tensor); - - CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); - - CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; - - // The baseline should output in host devices. - CHECK(b_tensor->target() == TARGET(kHost) || - b_tensor->target() == TARGET(kX86) || - b_tensor->target() == TARGET(kARM)); - - const T* a_data{}; - switch (a_tensor->target()) { - case TARGET(kX86): - case TARGET(kHost): - case TARGET(kARM): - a_data = static_cast(a_tensor->raw_data()); - break; - - default: - // Before compare, need to copy data from `target` device to host. 
- LOG(FATAL) << "Not supported"; - } - - CHECK(a_data); - - const T* b_data = static_cast(b_tensor->raw_data()); - - bool success = true; - for (int i = 0; i < a_tensor->dims().production(); i++) { - EXPECT_NEAR(a_data[i], b_data[i], abs_error); - if (fabsf(a_data[i] - b_data[i]) > abs_error) { - success = false; - } - } - return success; -} - } // namespace arena } // namespace lite } // namespace paddle diff --git a/lite/core/context.cc b/lite/core/context.cc index 948aac0c794969304b585520bfb7229410555578..be886168e02e21d192305d701110ce5075ffba63 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -14,10 +14,6 @@ #include "lite/core/context.h" -#ifdef LITE_WITH_OPENCL -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); -#endif - namespace paddle { namespace lite {} // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index 653329e4f24b1f391ea41ed39819b60c8a598a3b..88fe00d0f2aab41cfd3e5562d29f0a8a82598428 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -20,7 +20,6 @@ #include "lite/backends/cuda/cuda_utils.h" #endif #ifdef LITE_WITH_OPENCL -#include #include #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" @@ -36,10 +35,7 @@ #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" - -#ifdef LITE_WITH_OPENCL -DECLARE_string(cl_path); -#endif +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -56,6 +52,7 @@ using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; +using MLUContext = Context; template <> class Context { @@ -304,7 +301,6 @@ class Context { void InitOnce() { // Init cl runtime. 
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; - CLRuntime::Global()->set_cl_path(FLAGS_cl_path); cl_context_ = std::make_shared(); cl_wait_list_ = std::make_shared(); @@ -400,7 +396,7 @@ class ContextScheduler { break; #endif default: -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL +#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) LOG(FATAL) << "unsupported target " << TargetToStr(target); #endif break; diff --git a/lite/core/lite.map b/lite/core/lite.map index 9cfd272eb6d3017a75b40481d25527d7c14478bf..406f578fab545709b90939cdfe475a8620be6841 100644 --- a/lite/core/lite.map +++ b/lite/core/lite.map @@ -1,6 +1,6 @@ { global: - *paddle*; + *paddle*lite*; *touch_*; *mir_pass_*; local: diff --git a/lite/core/lite_tensor_test.cc b/lite/core/lite_tensor_test.cc index d667a9f8852d49bd850274bbb3c895e14d233f77..500dae3e283084ff8218fc758e1a7c5119eff16b 100644 --- a/lite/core/lite_tensor_test.cc +++ b/lite/core/lite_tensor_test.cc @@ -13,19 +13,49 @@ // limitations under the License. 
#include +#include #include "lite/core/tensor.h" namespace paddle { namespace lite { -TEST(tensor, test) { - TensorLite tensor; - DDimLite ddim({1, 8}); - tensor.Resize(ddim); +template +void test_shared_memory_tensor() { + const std::vector data({0, 1, 2, 3}); + const std::vector shape({2, 2}); + const size_t size = data.size() * sizeof(Dtype); + TensorLite init_tensor; + init_tensor.Assign(data.data(), + static_cast(shape)); + Dtype* init_raw_data = init_tensor.mutable_data(); - for (int i = 0; i < 8; i++) { - tensor.mutable_data()[i] = i; + TensorLite shared_tensor( + std::make_shared(Buffer(init_raw_data, Target, size))); + Buffer host_buffer; + host_buffer.ResetLazy(TargetType::kHost, size); + if (Target == TargetType::kHost) { + CopySync( + host_buffer.data(), init_raw_data, size, IoDirection::HtoH); + } else { + CopySync( + host_buffer.data(), init_raw_data, size, IoDirection::DtoH); } + EXPECT_EQ(std::memcmp(host_buffer.data(), data.data(), size), 0); + + shared_tensor.Resize({1, 5}); + ASSERT_DEATH(shared_tensor.mutable_data(), ""); +} + +TEST(tensor, shared_memory) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_shared_memory_tensor(); + test_shared_memory_tensor(); + test_shared_memory_tensor(); +#ifdef LITE_WITH_CUDA + test_shared_memory_tensor(); + test_shared_memory_tensor(); + test_shared_memory_tensor(); +#endif } } // namespace lite diff --git a/lite/core/memory.cc b/lite/core/memory.cc index cfb0b3ae1765864200ecf2d70107a3aa0046899c..0ee973a8b6412a2fd20e33745b7b86561696efae 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -51,7 +51,7 @@ void* TargetMalloc(TargetType target, size_t size) { return data; } -void TargetFree(TargetType target, void* data) { +void TargetFree(TargetType target, void* data, std::string free_flag) { switch (target) { case TargetType::kHost: case TargetType::kX86: @@ -66,7 +66,11 @@ void TargetFree(TargetType target, void* data) { #endif // LITE_WITH_CUDA #ifdef LITE_WITH_OPENCL case 
TargetType::kOpenCL: - TargetWrapperCL::Free(data); + if (free_flag == "cl_use_image2d_") { + TargetWrapperCL::FreeImage(data); + } else { + TargetWrapperCL::Free(data); + } break; #endif // LITE_WITH_OPENCL #ifdef LITE_WITH_FPGA diff --git a/lite/core/memory.h b/lite/core/memory.h index 051d47bdde102f5fe058163d0c746fe3c4acf26e..691415aecb53bf7f48faf5fbb4dbca448da04a10 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once +#include #include "lite/api/paddle_place.h" #include "lite/core/target_wrapper.h" +#include "lite/utils/logging.h" #include "lite/utils/macros.h" #ifdef LITE_WITH_OPENCL @@ -38,7 +40,9 @@ LITE_API void* TargetMalloc(TargetType target, size_t size); // Free memory for a specific Target. All the targets should be an element in // the `switch` here. -void LITE_API TargetFree(TargetType target, void* data); +void LITE_API TargetFree(TargetType target, + void* data, + std::string free_flag = ""); // Copy a buffer from host to another target. 
void TargetCopy(TargetType target, void* dst, const void* src, size_t size); @@ -81,6 +85,9 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { TargetWrapper::MemcpySync(dst, src, size, dir); break; #endif + default: + LOG(FATAL) + << "The copy function of this target has not been implemented yet."; } } @@ -89,17 +96,24 @@ class Buffer { public: Buffer() = default; Buffer(TargetType target, size_t size) : space_(size), target_(target) {} + Buffer(void* data, TargetType target, size_t size) + : space_(size), data_(data), own_data_(false), target_(target) {} void* data() const { return data_; } TargetType target() const { return target_; } size_t space() const { return space_; } + bool own_data() const { return own_data_; } void ResetLazy(TargetType target, size_t size) { if (target != target_ || space_ < size) { + CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetMalloc(target, size); target_ = target; space_ = size; +#ifdef LITE_WITH_OPENCL + cl_use_image2d_ = false; +#endif } } @@ -111,14 +125,15 @@ class Buffer { const size_t img_w, const size_t img_h, void* host_ptr = nullptr) { - size_t size = sizeof(T) * img_w * img_h * - 4; // 4 for RGBA, un-used for opencl Image2D if (target != target_ || cl_image2d_width_ < img_w || cl_image2d_height_ < img_h) { + CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); target_ = target; - space_ = size; // un-used for opencl Image2D + space_ = sizeof(T) * img_w * img_h * + 4; // un-used for opencl Image2D, 4 for RGBA, + cl_use_image2d_ = true; cl_image2d_width_ = img_w; cl_image2d_height_ = img_h; } @@ -126,8 +141,12 @@ class Buffer { #endif void Free() { - if (space_ > 0) { - TargetFree(target_, data_); + if (space_ > 0 && own_data_) { + if (!cl_use_image2d_) { + TargetFree(target_, data_); + } else { + TargetFree(target_, data_, "cl_use_image2d_"); + } } data_ = nullptr; target_ = 
TargetType::kHost; @@ -146,9 +165,11 @@ class Buffer { private: // memory it actually malloced. size_t space_{0}; + bool cl_use_image2d_{false}; // only used for OpenCL Image2D size_t cl_image2d_width_{0}; // only used for OpenCL Image2D size_t cl_image2d_height_{0}; // only used for OpenCL Image2D void* data_{nullptr}; + bool own_data_{true}; TargetType target_{TargetType::kHost}; }; diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 379ef67f2996519d0c8007d8f191efbd2166a9e3..82b19b030c35e69ad2a666f93475c556cc51fd23 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -36,6 +36,7 @@ lite_cc_library(mir_passes runtime_context_assign_pass.cc memory_optimize_pass.cc weight_quantization_preprocess_pass.cc + quantized_op_attributes_inference_pass.cc DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) # lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index b688bbc1083a6ab0f521381c4a988a12badc3141..68c07c0ffd0694aec0ff073082e1192213a0ef4a 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -24,17 +24,28 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; + bool has_int8 = false; + bool has_arm_float = false; + bool has_cuda = false; for (auto& place : graph->valid_places()) { - if (place.target == TARGET(kCUDA)) { - act_types.push_back("leaky_relu"); - break; + if (place.precision == PRECISION(kInt8)) { + has_int8 = true; } if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { - act_types.push_back("relu6"); - act_types.push_back("leaky_relu"); - break; + has_arm_float = true; + } + if (place.target == TARGET(kCUDA)) { + has_cuda = true; } } + + if (!has_int8 && has_arm_float) { + act_types.push_back("relu6"); + 
act_types.push_back("leaky_relu"); + } + if (!has_int8 && has_cuda) { + act_types.push_back("leaky_relu"); + } for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { for (auto act_type : act_types) { for (auto has_bias : {true, false}) { diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index ff5a7a1f25239d9dbfc79491bd137804b16b6cfa..ab81f3d809507dd340056c97a39998c908a75dc7 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -45,7 +45,7 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { } // delete quant_dequant_node - for (auto op_type : {"pool2d", "elementwise_add"}) { + for (auto op_type : {"pool2d", "softmax", "elementwise_add"}) { fusion::DeleteQuantDequantOpFuser fuser(op_type); fuser(graph.get()); } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index da611e4490f4ba7268d9011b3dbb391a63a88305..7797864a2e4b75f52fd7da93ea81613a2175f423 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -297,7 +297,7 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void DeleteQuantDequantOpFuser::BuildPattern() { std::string quant_dequant_op_type = "fake_quantize_dequantize_moving_average_abs_max"; - if (quantized_op_type_ == "pool2d") { + if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { auto* input_scale_node = VarNode("input_scale_node") ->assert_is_op_input(quant_dequant_op_type, "InScale"); @@ -374,7 +374,7 @@ void DeleteQuantDequantOpFuser::BuildPattern() { void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - if (quantized_op_type_ == "pool2d") { + if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { auto* input_scale_node = matched.at("input_scale_node"); auto* 
input_act_node = matched.at("input_act_node"); auto* quant_dequant_node = matched.at("quant_dequant_node"); diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 3a27360f94d7d828e1c19214d621f1dfe4e048ca..28ec814fa85451b5292bfde6bddc6b64b57b2f08 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "lite/core/mir/pass_registry.h" #include "lite/utils/string.h" @@ -28,56 +29,101 @@ namespace mir { using inference::analysis::Dot; void GraphVisualizePass::Apply(const std::unique_ptr& graph) { - Visualize(graph.get()); + VLOG(5) << "\n" << Visualize(graph.get()); } std::string Visualize(mir::SSAGraph* graph) { + std::ostringstream os; inference::analysis::Dot dot; - - int id = 0; - std::set exists_args; - for (auto& node : graph->mutable_nodes()) { - std::string key; - if (node.IsArg()) { - key = node.AsArg().name; - } else { - key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++); + auto string_trunc = [](const std::string& str) -> std::string { + const int max_disp_size = 100; + if (str.length() > max_disp_size) + return str.substr(0, max_disp_size) + "..."; + return str; + }; + auto attr_repr = [&](const OpInfo* op_info, + const std::string& attr_name) -> std::string { + std::ostringstream os; + using AttrType = cpp::OpDesc::AttrType; + auto attr_type = op_info->GetAttrType(attr_name); + switch (attr_type) { + case AttrType::INT: + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::FLOAT: + os << ":float:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::BOOLEAN: + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::STRING: + os << ":string: \"" + << string_trunc(op_info->GetAttr(attr_name)) << "\""; + break; + case AttrType::FLOATS: { + auto vals = op_info->GetAttr>(attr_name); + 
os << ":floats: {" + Join(vals, ",") << "}"; + } break; + case AttrType::INTS: { + auto vals = op_info->GetAttr>(attr_name); + os << ":ints: {" + Join(vals, ",") + "}"; + } break; + case AttrType::STRINGS: { + auto vals = op_info->GetAttr>(attr_name); + os << ":strings: {" + string_trunc(Join(vals, ",")) << "}"; + } break; + default: + os << ":Unknow type(" << static_cast(attr_type) << ")"; + break; } - if (node.IsStmt()) { - dot.AddNode(key, - {Dot::Attr("shape", "box"), - Dot::Attr("style", "filled"), - Dot::Attr("color", "black"), - Dot::Attr("fillcolor", "yellow")}); - for (auto& x : node.inlinks) { - auto name = x->AsArg().name; - if (!exists_args.count(name)) { - dot.AddNode(name, {}); - } - dot.AddEdge(name, key, {}); - exists_args.insert(name); + return os.str(); + }; + int op_idx = 0; + std::set exists_var_names; + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + auto op_info = node->AsStmt().op_info(); + auto op_type = op_info->Type(); + std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); + // Add its input&output variables as the Dot nodes + dot.AddNode(op_name, + {Dot::Attr("shape", "box"), + Dot::Attr("style", "filled"), + Dot::Attr("color", "black"), + Dot::Attr("fillcolor", "yellow")}); + for (auto& x : node->inlinks) { + auto var_name = x->AsArg().name; + if (!exists_var_names.count(var_name)) { + dot.AddNode(var_name, {}); + exists_var_names.insert(var_name); } - for (auto& x : node.outlinks) { - auto name = x->AsArg().name; - if (!exists_args.count(name)) { - dot.AddNode(name, {}); - } - dot.AddEdge(key, name, {}); - exists_args.insert(name); + dot.AddEdge(var_name, op_name, {}); + } + for (auto& x : node->outlinks) { + auto var_name = x->AsArg().name; + if (!exists_var_names.count(var_name)) { + dot.AddNode(var_name, {}); + exists_var_names.insert(var_name); } + dot.AddEdge(op_name, var_name, {}); + } + // Output its all of attributes(name and values) + os << "* " << op_name << "\n"; + 
const auto& attr_names = op_info->AttrNames(); + for (auto& attr_name : attr_names) { + os << " - " << attr_name << attr_repr(op_info, attr_name) << "\n"; } } - - auto res = dot.Build(); - // If we use VLOG here, we can not type all graph out. - // So we change VLOG to std::cout. - std::cout << "dot:\n" << res << std::endl; - return res; + os << dot.Build(); + return os.str(); } } // namespace mir } // namespace lite } // namespace paddle -REGISTER_MIR_PASS(graph_visualze, paddle::lite::mir::GraphVisualizePass) +REGISTER_MIR_PASS(graph_visualize_pass, paddle::lite::mir::GraphVisualizePass) .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 6256a49a99b9097664c192d40502daf506437a31..38293ede76ed35bf05767ce1333947b7dfdbc4ac 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -39,52 +39,109 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( auto is_host = [](TargetType x) -> bool { return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); }; - // The vars which inputs or outputs are invalid op will not be reused. - auto valid_var = [&](Node* node) -> bool { - std::set invalid_op = {"while", - "conditional_block", - "conditional_block_infer", - "merge_lod_tensor_infer", - "merge_lod_tensor", - "equal", - "lod_reset", - "concat", - "yolo_box", - "subgraph", - "feed", - "fetch"}; - for (auto* tmp : node->inlinks) { - CHECK(tmp->IsStmt()); - std::string op_type = tmp->AsStmt().op_info()->Type(); - if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != - invalid_op.end()) { - return false; + + // The all of input and output variables of the Ops will not be reused. 
+ std::unordered_set invalid_op_nodes = {"while", + "conditional_block", + "conditional_block_infer", + "merge_lod_tensor_infer", + "merge_lod_tensor", + "equal", + "lod_reset", + "concat", + "yolo_box", + "subgraph", + "feed", + "fetch"}; + + auto insert_invalid_op_nodes_for_specific_target = [&]( + std::unordered_set op_node_set, TargetType specific_target) { + std::unordered_set invalid_op_nodes_opencl = {"layout", "fc"}; + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + TargetType op_target_type = op_node->AsStmt().place().target; + if (op_target_type == specific_target && + specific_target == TARGET(kOpenCL)) { + invalid_op_nodes.insert(invalid_op_nodes_opencl.begin(), + invalid_op_nodes_opencl.end()); + break; } + // else if // you can add more targets } - for (auto* tmp : node->outlinks) { - CHECK(tmp->IsStmt()); - std::string op_type = tmp->AsStmt().op_info()->Type(); - if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != - invalid_op.end()) { - return false; + }; + + VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size(); + insert_invalid_op_nodes_for_specific_target(invalid_op_nodes, + TARGET(kOpenCL)); + VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size(); + + // Collect the invalid input and output variables that will not be reused. 
+ std::unordered_set invalid_var_names; + for (auto& op_node : graph->StmtTopologicalOrder()) { + // variables of invalid_op_nodes wil not be reused + if (!op_node->IsStmt()) continue; + auto op_info = op_node->AsStmt().op_info(); + auto op_type = op_info->Type(); + auto invalid_op_node = invalid_op_nodes.find(op_type); + if (invalid_op_node != invalid_op_nodes.end()) { + for (auto in_var_node : op_node->inlinks) { + CHECK(in_var_node->IsArg()); + invalid_var_names.insert(in_var_node->AsArg().name); } + for (auto out_var_node : op_node->outlinks) { + CHECK(out_var_node->IsArg()); + invalid_var_names.insert(out_var_node->AsArg().name); + } + continue; } - return true; - }; + // The specified input and output variables of the Ops whose 'inplace' attr + // is true will not be reused, such as reshape/reshape2's X and Out + // variables + std::unordered_map, + std::unordered_set>> + inplace_op_nodes = {{"reshape", {{"X"}, {"Out"}}}, + {"reshape2", {{"X"}, {"Out"}}}}; + auto inplace_op_node = inplace_op_nodes.find(op_type); + if (inplace_op_node != inplace_op_nodes.end()) { + bool inplace = false; + if (op_info->HasAttr("inplace")) { + inplace = op_info->GetAttr("inplace"); + } + if (inplace) { + for (auto& in_param_name : inplace_op_node->second.first) { + const auto& in_arg_names = op_info->Input(in_param_name); + invalid_var_names.insert(in_arg_names.begin(), in_arg_names.end()); + } + for (auto& out_param_name : inplace_op_node->second.second) { + const auto& out_arg_names = op_info->Output(out_param_name); + invalid_var_names.insert(out_arg_names.begin(), out_arg_names.end()); + } + } + } + } + + // non-tensor(like tensor_array) variables will not be reused + for (auto& node : graph->nodes()) { + if (node.IsArg() && (node.arg()->type != nullptr) && + !node.arg()->type->IsTensor()) { + invalid_var_names.insert(node.arg()->name); + } + } for (auto& op_node : graph->StmtTopologicalOrder()) { if (op_node->IsStmt()) { - auto inputs = op_node->inlinks; - auto outputs = 
op_node->outlinks; - std::vector requires(inputs.begin(), inputs.end()); - requires.insert(requires.end(), outputs.begin(), outputs.end()); - for (Node* node : requires) { - CHECK(node->IsArg()); - auto& arg = node->AsArg(); + std::vector var_nodes(op_node->inlinks.begin(), + op_node->inlinks.end()); + var_nodes.insert( + var_nodes.end(), op_node->outlinks.begin(), op_node->outlinks.end()); + for (auto* var_node : var_nodes) { + CHECK(var_node->IsArg()); + auto& arg = var_node->AsArg(); if (arg.is_weight || arg.is_persist) continue; - if (!valid_var(node)) continue; std::string var_name = arg.name; - TargetType target_type = node->AsArg().type->target(); + if (invalid_var_names.count(var_name)) continue; + TargetType target_type = arg.type->target(); if (is_host(target_type)) target_type = TARGET(kHost); if (!(*lifecycles)[TargetToStr(target_type)].count(var_name)) { @@ -181,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); input_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } @@ -205,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); out_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } @@ -255,5 +312,5 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}) - .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) + .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); diff --git 
a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6240888d0806486f478511ef81ba8179b46ab43 --- /dev/null +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -0,0 +1,499 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/mlu_postprocess_pass.h" +#include +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type) { + // create the arg node + auto* cast_arg = graph->NewArgumentNode(cast_arg_name); + cast_arg->AsArg().type = cast_type; + inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + + // create the stmt node + auto* cast_inst = graph->NewInstructNode(); + // create op + auto cast_op = LiteOpRegistry::Global().Create(op_type); + CHECK(cast_op) << "create op [" << op_type << "] failed"; + cpp::OpDesc op_desc; + op_desc.SetType(op_type); + if (op_type == "cast") { + op_desc.SetAttr("in_dtype", 5); // FP32 + op_desc.SetAttr("out_dtype", 4); // FP16 + op_desc.SetInput("X", {cur_node->AsArg().name}); + 
op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "transpose") { + // NCHW -> NHWC + op_desc.SetAttr>("axis", {0, 2, 3, 1}); + op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "io_copy") { + op_desc.SetInput("Input", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else { + CHECK(0) << "Unsupport cast type"; + } + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + // create kernels + auto kernels = cast_op->CreateKernels(graph->valid_places()); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + if (op_type == "cast") { + const Type* in_arg_ty = kernel->GetInputDeclType("X"); + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else if (op_type == "transpose") { + is_found = true; + } else if (op_type == "io_copy") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && + TargetCompatibleTo(*out_arg_ty, *cast_type)) { + is_found = true; + } + } else { + CHECK(0) << "Unsupport cast type"; + } + if (is_found) { + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); + auto& stmt = cast_inst->AsStmt(); + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + break; + } + } + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " + << cur_node->AsArg().name << "->" << op_type; + // modify links + DirectedLink(cur_node, cast_inst); + DirectedLink(cast_inst, cast_arg); + return cast_arg; +} + +Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type) { + // create 
the arg node + auto* cast_arg = graph->NewArgumentNode(cast_arg_name); + cast_arg->AsArg().type = cast_type; + auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + // for CastAfter manully set the tensor's type + var->GetMutable<::paddle::lite::Tensor>(); + + // create the stmt node + auto* cast_inst = graph->NewInstructNode(); + // create op + auto cast_op = LiteOpRegistry::Global().Create(op_type); + CHECK(cast_op) << "create op [" << op_type << "] failed"; + cpp::OpDesc op_desc; + op_desc.SetType(op_type); + if (op_type == "cast") { + op_desc.SetAttr("in_dtype", 4); // FP32 + op_desc.SetAttr("out_dtype", 5); // FP16 + op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else if (op_type == "transpose") { + // NHWC -> NCHW + op_desc.SetAttr>("axis", {0, 3, 1, 2}); + op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else if (op_type == "io_copy") { + op_desc.SetInput("Input", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else { + CHECK(0) << "Unsupport cast type"; + } + + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + + // create kernels + auto kernels = cast_op->CreateKernels(graph->valid_places()); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + if (op_type == "cast") { + const Type* in_arg_ty = kernel->GetInputDeclType("X"); + if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { + is_found = true; + } + } else if (op_type == "transpose") { + is_found = true; + } else if (op_type == "io_copy") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TargetCompatibleTo(*in_arg_ty, *cast_type) && + TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else { + CHECK(0) << "Unsupport cast type"; + } + if (is_found) { + 
selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); + auto& stmt = cast_inst->AsStmt(); + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + break; + } + } + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " + << cur_node->AsArg().name << "->" << op_type; + // modify links + DirectedLink(cast_arg, cast_inst); + DirectedLink(cast_inst, cur_node); + return cast_arg; +} + +void MLUPostprocessPass::InsertBefore(SSAGraph* graph, + Node* head_node, + Node* inst_node, + const Type* inst_type) { + const auto* head_type = head_node->AsArg().type; + + // break original link + RemoveDirectedLink(head_node, inst_node); + + auto* cur_node = head_node; + const auto name_prefix = + head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore( + "transpose", + name_prefix + "transpose", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), head_type->precision(), inst_type->layout())); + } + + // precision cast node + if (head_type->precision() != inst_type->precision()) { + cur_node = InsertCastBefore( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = InsertCastBefore( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(cur_node, inst_node); + + // reset opdesc and update kernel information + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), + head_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = 
dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateInputTo( + sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name); + } + + // recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph) { + CHECK(inst_node->IsStmt()); + constexpr auto subgraph_target = TARGET(kMLU); + constexpr auto subgraph_layout = DATALAYOUT(kNHWC); + + // get subgraph's valid precision + const auto& places = graph->valid_places(); + std::set<::paddle::lite_api::PrecisionType> prec_set; + for (const auto& place : places) { + if (place.target == TARGET(kMLU)) { + prec_set.insert(place.precision); + } + } + + // get subgraph op's type info + size_t kernel_size = inst_node->AsStmt().kernels().size(); + CHECK_GT(kernel_size, 0); + VLOG(4) << "subgraph kernel size: " << kernel_size; + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = inst_node->AsStmt().kernels()[i].get(); + VLOG(4) << i << "th kernel: " << TargetToStr(kernel->target()) << ", " + << PrecisionToStr(kernel->precision()) << ", " + << DataLayoutToStr(kernel->layout()); + } + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = inst_node->AsStmt().kernels()[i].get(); + CHECK(kernel->target() == subgraph_target); + CHECK(kernel->layout() == subgraph_layout); + if (prec_set.count(kernel->precision()) == 1) { + const auto subgraph_precision = kernel->precision(); + CHECK(subgraph_precision == PRECISION(kFloat) || + subgraph_precision == PRECISION(kFP16)) + << "Mlu node has unsupport precision"; + VLOG(4) << "picked kernel precision: " + << PrecisionToStr(subgraph_precision); + *arg_type = LiteType::GetTensorTy( + subgraph_target, subgraph_precision, subgraph_layout); + break; + } + } +} + +bool MLUPostprocessPass::NeedInsert(Node* node, const 
Type* inst_type) { + CHECK(node->IsArg()); + + // some op, for example batch_norm, has output nodes useless + if (node->outlinks.size() == 0) { + return false; + } + + // check if node is weight or persistent + bool is_persist = node->AsArg().is_weight || node->AsArg().is_persist; + if (is_persist) { + VLOG(4) << "Persistent arg name: " << node->AsArg().name + << " is_weight: " << node->AsArg().is_weight + << " is_persist: " << node->AsArg().is_persist; + return false; + } + + const auto target = node->AsArg().type->target(); + const auto precision = node->AsArg().type->precision(); + const auto layout = node->AsArg().type->layout(); + VLOG(4) << "arg name: " << node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + + // do not insert nodes if previous node is on mlu already + if (target == inst_type->target()) { + CHECK(layout == inst_type->layout()) << "Mlu node has wrong layout"; + return false; + } + + return true; +} + +void MLUPostprocessPass::InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* inst_type) { + const auto* tail_type = tail_node->AsArg().type; + + // break original link + RemoveDirectedLink(inst_node, tail_node); + + auto* cur_node = tail_node; + const auto name_prefix = + tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter( + "transpose", + name_prefix + "transpose", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), tail_type->precision(), inst_type->layout())); + } + + // precision cast node + if (tail_type->precision() != inst_type->precision()) { + cur_node = InsertCastAfter( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = 
InsertCastAfter( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(inst_node, cur_node); + + // reset opdesc and update kernel information + UpdateOutputTo(inst_node->AsStmt().op()->mutable_op_info(), + tail_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateOutputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + } + + // recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { + auto original_selected_kernel = + std::move(inst_node->AsStmt().kernels().front()); + auto updated_op_info = *inst_node->AsStmt().mutable_op_info(); + + inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places()); + inst_node->AsStmt().kernels().clear(); + inst_node->AsStmt().kernels().emplace_back( + std::move(original_selected_kernel)); + for (auto& kernel : inst_node->AsStmt().kernels()) { + VLOG(4) << "kernel info: " << kernel->name(); + inst_node->AsStmt().op()->AttachKernel(kernel.get()); + } +} + +void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + bool change = true; + for (auto& inst : out->outlinks) { + if (inst->AsStmt().op_type() != "subgraph") { + change = false; + break; + } + } + if (change) { + const auto* old_type = out->AsArg().type; + out->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + old_type->precision(), + ::paddle::lite_api::DataLayoutType::kNHWC, + 
old_type->device()); + } + } + } + if (node.AsStmt().op_type() == "fetch") { + for (auto& inp : node.inlinks) { + bool change = true; + for (auto& inst : inp->inlinks) { + if (inst->AsStmt().op_type() != "subgraph") { + change = false; + break; + } + } + if (change) { + const auto* old_type = inp->AsArg().type; + inp->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + old_type->precision(), + ::paddle::lite_api::DataLayoutType::kNHWC, + old_type->device()); + } + } + } + } +} + +void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { + // currently for non-persistent input and output args, mlu subgraph op + // only support float16/float32 data type + + // in two situations as folllows: + // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; + // arg_in and arg_out are assumed to be NHWC which user should be aware of. + // Thus here we change these args' layout to NHWC + ModifyLayout(graph.get()); + + // insert io_copy, layout and precision cast of subgraph's inputs and outputs + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + const Type* subgraph_arg_type = nullptr; + GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get()); + + auto links_tmp = node.inlinks; + for (auto p_in : links_tmp) { + if (NeedInsert(p_in, subgraph_arg_type)) { + InsertBefore(graph.get(), p_in, &node, subgraph_arg_type); + } + } + links_tmp.assign(node.outlinks.begin(), node.outlinks.end()); + for (auto p_out : links_tmp) { + if (NeedInsert(p_out, subgraph_arg_type)) { + InsertAfter(graph.get(), p_out, &node, subgraph_arg_type); + } + } + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(mlu_postprocess_pass, paddle::lite::mir::MLUPostprocessPass) + .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h new file mode 100644 index 
0000000000000000000000000000000000000000..8ffcbc952a44abea272bdd22467d86cd04baa207 --- /dev/null +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +static void UpdateInputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_inputs()) { + for (auto& input : item.second) { + if (input == from) { + input = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto input_names = + desc->GetAttr>("input_data_names"); + for (size_t i = 0; i < input_names.size(); ++i) { + if (input_names[i] == from) { + input_names[i] = to; + } + } + desc->SetAttr>("input_data_names", input_names); +} + +static void UpdateOutputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_outputs()) { + for (auto& output : item.second) { + if (output == from) { + output = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto output_names = + desc->GetAttr>("output_data_names"); + for (size_t i = 0; i < output_names.size(); ++i) { + if (output_names[i] == from) { + output_names[i] = to; + } + } + desc->SetAttr>("output_data_names", output_names); +} + +/* + * The 
pass changes the node's target to mlu which follows a mlu subgraph op + * */ +class MLUPostprocessPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + void GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph); + + void ModifyLayout(SSAGraph* graph); + + bool NeedInsert(Node* node, const Type* inst_type); + + void InsertBefore(SSAGraph* graph, + Node* head_node, + Node* inst_node, + const Type* type); + + void InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* type); + + Node* InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + Node* InsertCastAfter(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + void RecreateOp(Node* inst_node, SSAGraph* graph); +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index e7c44d2be689a9d890158c097e198314413d1ba3..45b15812fadb0789edea3f89fb00b4612bdb010f 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -85,7 +85,7 @@ class Node { struct Arg { std::string name; int id{0}; - const Type* type{}; + const Type* type{nullptr}; // Weight is a special kind of argument, it is marked as weight explicitly // so that some weight related optimization can take place. bool is_weight{false}; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..40cad8f6af75300ab85753b16e391daeeadc6c2f --- /dev/null +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/quantized_op_attributes_inference_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void QuantizedOpAttributesInferencePass::Apply( + const std::unique_ptr& graph) { + // Only for fully quantized model which is only supported by MTK and RK NPU. + // Replace the output_scale with the input_scale of the adjacent quantized + // ops, and fix the missing of the attribute 'enable_int8'. 
+ for (auto& op_node : graph->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + auto& inst = op_node->AsStmt(); + auto op_info = inst.op_info(); + auto op_type = op_info->Type(); + if (!op_info->HasAttr("input_scale")) continue; + bool found = false; + float output_scale; + for (auto out_var_node : op_node->outlinks) { + CHECK(out_var_node->IsArg()); + for (auto out_op_node : out_var_node->outlinks) { + CHECK(out_op_node->IsStmt()); + auto& out_inst = out_op_node->AsStmt(); + auto out_op_info = out_inst.op_info(); + if (!out_op_info->HasAttr("input_scale")) continue; + auto input_scale = out_op_info->GetAttr("input_scale"); + if (!found) { + found = true; + output_scale = input_scale; + } else { + CHECK_EQ(output_scale, input_scale); + } + } + } + if (found) { + inst.mutable_op_info()->SetAttr("output_scale", output_scale); + } else if (op_info->HasAttr("output_scale")) { + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = op_info->GetAttr("output_scale"); + inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); + } + if (op_info->HasAttr("output_scale")) { + inst.mutable_op_info()->SetAttr("enable_int8", true); + } + } + VLOG(5) << "\n" << Visualize(graph.get()); +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, + paddle::lite::mir::QuantizedOpAttributesInferencePass) + .BindTargets({TARGET(kNPU)}); diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.h b/lite/core/mir/quantized_op_attributes_inference_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2b475e0b3d662a9837b7766efb4ccc8f87037b7a --- /dev/null +++ b/lite/core/mir/quantized_op_attributes_inference_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +class QuantizedOpAttributesInferencePass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 2b5b65ce5903ede41137311c585c0e87eaaa0e9d..6c45ce828249c3e236706c297db3d434c71c351a 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -140,9 +140,18 @@ void SSAGraph::Build(const Program &program, arg_node->AsArg(name, node_storage_.size() - 1); arg_update_node_map_[name] = arg_node; } - if (var_types.count(name) && !arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + if (var_types.count(name)) { + if (!arg_node->arg()->type) { + arg_node->arg()->type = LiteType::GetTensorTy( + TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + } + // Store the original data type of the output tensors for + // type_precision_cast_pass, to keep the consistency between the + // output types of original graph and optimized graph's + if (op->op_info()->Type() == "fetch") { + op->mutable_op_info()->SetAttr( + "data_type", static_cast(var_types[name])); + } } if 
(is_weights(name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index f655b298bf2d800f4adf142ad14b8ac05ca00482..6d45be3b898271f0801d289d16235d3fb5fdd706 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -58,7 +58,7 @@ class StaticKernelPickPass : public mir::StmtPass { const std::unordered_map& out_types, const std::vector& in_names, const std::vector& out_names) { - CHECK_GT(places.size(), 0) << "valid_places is empty."; + CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; float final_score{-1.}; Place winner_place{places[0]}; const int kMax = @@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass { } VLOG(4) << "[score(final)]:" << final_score; - VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + VLOG(2) << "-------- pick summary for " << instruct.op_type() + << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); - VLOG(4) << " ===> kernel.place():" + VLOG(2) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 6d48b053a1a4140252d35e85d2351644d3c216e9..6844fd96688d5086b47d66a32f770a757f56fda4 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -22,6 +22,9 @@ #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pattern_matcher.h" #include "lite/operators/subgraph_op.h" +#include "lite/utils/env.h" +#include "lite/utils/io.h" +#include "lite/utils/string.h" namespace paddle { 
namespace lite { @@ -63,11 +66,11 @@ std::string SubgraphVisualizer::operator()() { } else { exists_ops[op_type]++; } - auto op_name = op_type + std::to_string(exists_ops[op_type]); + auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]); std::string op_color = "white"; if (subgraph_indices.count(node)) { auto subgraph_idx = subgraph_indices[node]; - op_name += "_subgraph_" + std::to_string(subgraph_idx); + op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx); op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; } dot.AddNode(op_name, @@ -209,8 +212,82 @@ void SubgraphDetector::FlexibleDFS( } } +std::unordered_set SubgraphDetector::GetExcludedNodesFromConfigFile() { + // get exclude nodes from config file + std::unordered_set excluded_nodes; + std::string config_file_path = + GetStringFromEnv(SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE); + if (!IsFileExists(config_file_path)) { + return excluded_nodes; + } + std::vector lines = ReadLines(config_file_path); + + for (std::string line : lines) { + if (line.empty()) continue; + std::vector node_info = Split(line, ":"); + std::string op_type = node_info.at(0); + std::vector in_vars_name; + if (node_info.size() > 1) { + in_vars_name = Split(node_info.at(1), ","); + } + std::vector out_vars_name; + if (node_info.size() > 2) { + out_vars_name = Split(node_info.at(2), ","); + } + + for (auto &node : graph_->mutable_nodes()) { + if (node.IsArg()) continue; + auto stmt = node.stmt(); + if (op_type != stmt->op_type()) continue; + auto in_nodes = node.inlinks; + auto out_nodes = node.outlinks; + if (in_vars_name.size() > in_nodes.size() || + out_vars_name.size() > out_nodes.size()) { + continue; + } + + bool matched = true; + + for (auto in_var_name : in_vars_name) { + bool find_var = false; + for (auto *in_node : in_nodes) { + if (in_node->arg()->name == in_var_name) { + find_var = true; + break; + } + } + if (!find_var) { + matched = false; + break; + } + } + + for (auto out_var_name : 
out_vars_name) { + bool find_var = false; + for (auto *out_node : out_nodes) { + if (out_node->arg()->name == out_var_name) { + find_var = true; + break; + } + } + if (!find_var) { + matched = false; + break; + } + } + + if (matched) { + excluded_nodes.insert(&node); + } + } + } + + return excluded_nodes; +} + void SubgraphDetector::InitNodes(node_map_t *nodes) { // Initialize and mark the subgraph detector nodes based on teller. + std::unordered_set excluded_nodes = GetExcludedNodesFromConfigFile(); for (auto &it : *nodes) { for (auto &in_node : it.first->inlinks) { it.second->inlinks.push_back((*nodes)[in_node]); @@ -218,7 +295,7 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { for (auto &out_node : it.first->outlinks) { it.second->outlinks.push_back((*nodes)[out_node]); } - if (teller_(it.first)) { + if (teller_(it.first) && excluded_nodes.count(it.first) == 0) { it.second->marked = true; if (it.first->IsStmt()) { // If a function is inside the subgraph, mark all the output variables @@ -331,7 +408,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops an Vars of the target + // Create a new sub block desc for storing all of Ops and Vars of the target // subgraph and sub_block_idx is set as a attribute of subgraph op, // sub_block_idx < 0 means it's a new subgraph op int sub_block_idx = -(subgraph_idx + 1); @@ -341,9 +418,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &op_node : subgraph_nodes) { auto sub_block_op_desc = sub_block_desc->AddOp(); *sub_block_op_desc = *op_node->AsStmt().op_info(); - sub_block_op_desc->SetAttr( - kKernelTypeAttr, - op_node->AsStmt().picked_kernel().SerializedKernelType()); } subgraph_op_desc.SetAttr("sub_block", sub_block_idx); @@ -375,6 +449,37 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, subgraph_op_desc.SetAttr>("output_data_names", output_var_names); + // Set 
input/output scale values of input/output var nodes for + // type_precision_cast_pass. + std::vector input_data_scales; + std::vector output_data_scales; + for (auto &var_node : input_var_nodes) { + auto any_op_node = var_node->outlinks.front(); + CHECK(any_op_node->IsStmt()); + auto &any_inst = any_op_node->AsStmt(); + if (any_inst.op_info()->HasAttr("input_scale")) { + input_data_scales.push_back( + any_inst.op_info()->GetAttr("input_scale")); + } + } + for (auto &var_node : output_var_nodes) { + auto any_op_node = var_node->inlinks.front(); + CHECK(any_op_node->IsStmt()); + auto &any_inst = any_op_node->AsStmt(); + if (any_inst.op_info()->HasAttr("output_scale")) { + output_data_scales.push_back( + any_inst.op_info()->GetAttr("output_scale")); + } + } + if (input_data_scales.size() > 0) { + subgraph_op_desc.SetAttr>("input_data_scales", + input_data_scales); + } + if (output_data_scales.size() > 0) { + subgraph_op_desc.SetAttr>("output_data_scales", + output_data_scales); + } + // Set all of the inputs and outputs to the target subgraph op // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() for (auto &var_node : weight_var_nodes) { @@ -413,12 +518,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, IR_OP_VAR_LINK(subgraph_op_node, var_node); } - // Create and assign the context to the picked kernel of the new subgraph - // node - auto &inst = subgraph_op_node->AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); - // Remove subgraph nodes and unused var nodes auto nodes2rm = GetNodes2RM(subgraph_nodes, {input_var_nodes, diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h index b6873655e976a785383269972221f001196431f8..567f2446a2af31c739b049005d2960ffbc802ef9 100644 --- a/lite/core/mir/subgraph/subgraph_detector.h +++ b/lite/core/mir/subgraph/subgraph_detector.h @@ -63,6 +63,7 @@ class SubgraphDetector { node_dat_t* 
UnionFindAncestor(); void UnionFindCombine(node_dat_t* candidate); }; + SubgraphDetector(SSAGraph* graph, const SubgraphTeller& teller) : graph_(graph), teller_(teller) {} std::vector> operator()(); @@ -71,7 +72,11 @@ class SubgraphDetector { bool reverse, const std::function& enter, const std::function& leave); + + std::unordered_set GetExcludedNodesFromConfigFile(); + void InitNodes(node_map_t* nodes); + std::vector> ExtractSubgraphs(node_map_t* nodes); protected: diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 3b0d7c5cd5c8a0d0901750148359f430b6d49894..974772a9839c1e089359be3ae98e1833645ccd7a 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -39,7 +39,7 @@ std::vector AddFCDesc( CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(wshape.size(), 2); static int id = 0; - std::string prefix = "fc_" + std::to_string(id); + std::string prefix = "fc_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* wgt = block_desc->AddVar(); @@ -76,7 +76,7 @@ std::vector AddElementwiseAddDesc( const std::vector& input_Y_names) { // CHECK_EQ(input_var_names.size(), 2); static int id = 0; - std::string prefix = "elementwise_add_" + std::to_string(id); + std::string prefix = "elementwise_add_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -100,7 +100,7 @@ std::vector AddFeedDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "feed_" + std::to_string(id); + std::string prefix = "feed_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -123,7 +123,7 @@ std::vector AddFetchDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "fetch_" + std::to_string(id); + std::string prefix = 
"fetch_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -220,8 +220,8 @@ TEST(Subgraph, detect_custom_model) { }; std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); - ASSERT_EQ(subgraphs.size(), 1); mir::SubgraphVisualizer(graph.get(), subgraphs)(); + ASSERT_EQ(subgraphs.size(), 1); } } // namespace lite diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 247795a86ce2cbe962b161311f7845622ee3983e..7117e1b3399fe823194f7f1a4d4c239099580955 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -15,11 +15,9 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model"); @@ -34,43 +32,17 @@ namespace lite { // The helper functions for loading and running model from command line and // verifying output data std::vector TypeParsing(std::string text) { - std::vector types; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string type = text.substr(0, index); - VLOG(3) << type; - types.push_back(type); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); - } - } - return types; + return Split(text, ":"); } std::vector> ShapeParsing(std::string text) { std::vector> shapes; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string slice = text.substr(0, index); - std::vector shape; - while (!slice.empty()) { - size_t index = slice.find_first_of(","); - int d = atoi(slice.substr(0, index).c_str()); - VLOG(3) << d; - shape.push_back(d); - if 
(index == std::string::npos) { - break; - } else { - slice = slice.substr(index + 1); - } - } - shapes.push_back(shape); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); + std::vector shape_strings = Split(text, ":"); + shapes.resize(shape_strings.size()); + for (int i = 0; i < shape_strings.size(); i++) { + std::vector shape_nums = Split(shape_strings[i], ","); + for (auto shape_num : shape_nums) { + shapes[i].push_back(atoi(shape_num.c_str())); } } return shapes; diff --git a/lite/core/mir/subgraph_cast_display_pass.cc b/lite/core/mir/subgraph_cast_display_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a2c94d23298fcb607de0bf821d0dc92c95da7bb --- /dev/null +++ b/lite/core/mir/subgraph_cast_display_pass.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +class SubgraphCastDisplayPass : public DebugPass { + public: + void Apply(const std::unique_ptr& graph) override { + VLOG(3) << "== Argument types =="; + for (auto& node : graph->mutable_nodes()) { + if (!node.IsArg()) continue; + + auto* type = node.AsArg().type; + if (type) { + VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; + } else { + VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; + } + } + VLOG(3) << "---------------------"; + + // + VLOG(0) << "== SubgraphOp Debug Info =="; + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + VLOG(0) << "FOUND SUBGRAPH OP"; + display_debug_info(node, "subgraph"); + break; + } + } + VLOG(0) << "---------------------"; + } + + void display_debug_info(const Node& node, + std::string op_type, + bool display_in_nodes = true, + bool display_out_nodes = true) { + CHECK(node.IsStmt()); + VLOG(0) << node.AsStmt(); + if (display_in_nodes) { + for (auto p_in_arg_node : node.inlinks) { + CHECK(p_in_arg_node->IsArg()); + VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name + << " type: " << *p_in_arg_node->AsArg().type + << " is_weight: " << p_in_arg_node->AsArg().is_weight + << " is_persist: " << p_in_arg_node->AsArg().is_persist + << " input_count: " << p_in_arg_node->inlinks.size(); + if (p_in_arg_node->inlinks.size() == 0) { + VLOG(0) << "** END with No Op"; + } + for (auto p_in_stmt_node : p_in_arg_node->inlinks) { + CHECK(p_in_stmt_node->IsStmt()); + std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type(); + if (stmt_op_type == "cast" || stmt_op_type == "transpose" || + stmt_op_type == "io_copy") { + display_debug_info(*p_in_stmt_node, stmt_op_type, true, false); + } else { + VLOG(0) << "** END with op type: " << stmt_op_type; + } + } + } + } + if (display_out_nodes) { + for (auto p_out_arg_node : 
node.outlinks) { + CHECK(p_out_arg_node->IsArg()); + VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name + << " type: " << *p_out_arg_node->AsArg().type + << " is_weight: " << p_out_arg_node->AsArg().is_weight + << " is_persist: " << p_out_arg_node->AsArg().is_persist + << " output_count: " << p_out_arg_node->outlinks.size(); + if (p_out_arg_node->outlinks.size() == 0) { + VLOG(0) << "** END with No Op"; + } + for (auto p_out_stmt_node : p_out_arg_node->outlinks) { + CHECK(p_out_stmt_node->IsStmt()); + std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type(); + if (stmt_op_type == "cast" || stmt_op_type == "transpose" || + stmt_op_type == "io_copy") { + display_debug_info(*p_out_stmt_node, stmt_op_type, false, true); + } else { + VLOG(0) << "** END with op type: " << stmt_op_type; + } + } + } + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(subgraph_cast_display_pass, + paddle::lite::mir::SubgraphCastDisplayPass) + .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index b3b7a858f68367ac789f390c6bd3bd94873f77d5..1133e5ba8203ec9fea177844a6311c993f6b8ff7 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -20,6 +20,8 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" +#include "lite/operators/subgraph_op.h" #include "lite/utils/string.h" namespace paddle { @@ -39,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; - VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc - << " inlinks.size():" << inlinks.size(); + VLOG(4) << "============== node->AsStmt().op_type():" + << node->AsStmt().op_type() << " 
inlinks.size():" << inlinks.size() + << " ================"; for (auto* in : inlinks) { ComplementInputs(graph.get(), node, in); } @@ -66,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name)); auto decl_arg_type = inst.picked_kernel().GetInputDeclType(inst_in_tensor_name); + CHECK(in->AsArg().type); - VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name + VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name << "\n in->AsArg().name:" << in->AsArg().name << "\n *in->AsArg().type:" << *in->AsArg().type << "\n *decl_arg_type:" << *decl_arg_type << "\n inst.op()->DebugString():" << inst.op()->DebugString(); + // TODO(ysh329): conflict if tensor with kARM target but kImageDefault(OpenCL + // layout). + // not a good judge, but don't find the source of this issue from + // static_pick_kernel_pass + // to this pass. + auto* in_arg_type = const_cast(in->AsArg().type); + if (in_arg_type->target() == TARGET(kARM) && + in_arg_type->layout() == DATALAYOUT(kImageDefault)) { + return; + } + if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name << " for kernel " << inst.op()->DebugString() << " " @@ -170,9 +185,8 @@ void TypeLayoutTransformPass::AddLayoutInst( DirectedLink(layout_output_arg, inst_node); // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - layout_output_name); + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, layout_output_name); auto original_selected_kernel = std::move(inst_node->AsStmt().kernels().front()); auto update_op_info = *inst_node->AsStmt().op_info(); @@ -204,6 +218,30 @@ void TypeLayoutTransformPass::SetValidPlaces( valid_places_ = valid_places; } +void OpenCLTypeLayoutTransformPass::Apply( + const std::unique_ptr& graph) { + // Start from inputs of the graph, those should have 
place set. + VLOG(4) << "\n" << Visualize(graph.get()); + std::list nodes; + for (auto& node : graph->StmtTopologicalOrder()) { + nodes.push_back(node); + } + + VLOG(4) << "nodes.size():" << nodes.size(); + for (auto& node : nodes) { + VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); + if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; + VLOG(1) << "node->AsStmt().op_type():" << node->AsStmt().op_type(); + if (node->AsStmt().op_type() == "layout" || + node->AsStmt().op_type() == "io_copy") { + auto new_op = node->AsStmt().mutable_op_info(); + int process_type = 1; + new_op->SetAttr("process_type", process_type); + } + } + VLOG(4) << "\n" << Visualize(graph.get()); +} + } // namespace mir } // namespace lite } // namespace paddle @@ -213,3 +251,9 @@ REGISTER_MIR_PASS(type_layout_cast_pass, .BindTargets({TARGET(kAny)}) .BindKernel("layout_once") .BindKernel("layout"); + +REGISTER_MIR_PASS(type_layout_cast_preprocess_pass, + paddle::lite::mir::OpenCLTypeLayoutTransformPass) + .BindTargets({TARGET(kAny)}) + .BindKernel("layout_once") + .BindKernel("layout"); diff --git a/lite/core/mir/type_layout_cast_pass.h b/lite/core/mir/type_layout_cast_pass.h index bf36214e1dce33352468155a6817adda9039727a..4a3e4c02d1053e84dd39bee14a0e01260f0626e4 100644 --- a/lite/core/mir/type_layout_cast_pass.h +++ b/lite/core/mir/type_layout_cast_pass.h @@ -24,18 +24,6 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - class TypeLayoutTransformPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; @@ -57,6 +45,15 @@ class TypeLayoutTransformPass : public ProgramPass { std::vector valid_places_; }; +// add preprocess and postprocess attribute for layout op +class OpenCLTypeLayoutTransformPass : public 
ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + std::vector valid_places_; +}; + } // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 2f177383fc2b3a035313c0654c961c0b21a7f197..ecccf89fa76287a3f30756f7138fcce229e8f337 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -20,11 +20,116 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/operators/subgraph_op.h" namespace paddle { namespace lite { namespace mir { +// For the subgraph op, we also need to update the attr 'input_data_names' and +// the input variables names of the Ops in the subblock. +void UpdateInputsForSubgraph(OpLite* op, + const std::string& from, + const std::string& to) { + auto* op_desc = op->mutable_op_info(); + auto input_data_names = + op_desc->GetAttr>("input_data_names"); + std::replace(input_data_names.begin(), input_data_names.end(), from, to); + op_desc->SetAttr("input_data_names", input_data_names); + auto* subblock_desc = static_cast(op)->GetSubBlock(); + CHECK(subblock_desc); + for (size_t i = 0; i < subblock_desc->OpsSize(); i++) { + auto* subblock_op_desc = subblock_desc->GetOp(i); + for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) { + for (auto& subblock_var_name : subblock_op_input.second) { + if (subblock_var_name == from) { + subblock_var_name = to; + } + } + } + } +} + +// Update the input variable names from 'from' to 'to' for the target Op +void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) { + auto* op_desc = op->mutable_op_info(); + auto op_type = op_desc->Type(); + for (auto& op_input : *op_desc->mutable_inputs()) { + for (auto& var_name : op_input.second) { + if (var_name == from) { + var_name = to; + } + } + } + if (op_type == "subgraph") { + 
UpdateInputsForSubgraph(op, from, to); + } +} + +// Infer the scale value for the new calib op from the subgraph op +static bool InferScaleFromSubgraph(std::string var_name, + const OpInfo* op_info, + float* scale, + bool reverse = false) { + std::string attr_name = reverse ? "output_data_names" : "input_data_names"; + if (!op_info->HasAttr(attr_name)) return false; + auto input_or_output_names = + op_info->GetAttr>(attr_name); + attr_name = reverse ? "output_data_scales" : "input_data_scales"; + if (!op_info->HasAttr(attr_name)) return false; + auto input_or_output_scales = op_info->GetAttr>(attr_name); + auto size = input_or_output_names.size(); + CHECK(size == input_or_output_scales.size()); + for (int i = 0; i < size; i++) { + if (input_or_output_names[i] == var_name) { + *scale = input_or_output_scales[i]; + return true; + } + } + return false; +} + +// Infer the scale value for the new calib op from the input_scale of the +// current op and output_scale of the previous op. +// case 1: prev_op->var_node->op_node(int8->any op, with input_scale). +// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with +// input_data_scales). +// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, +// without input_scale). +// case 4: prev_op(any->int8, subgraph_op, with +// output_data_scales)->var_node->op_node(fp32->any, without input_scale). 
+static bool InferScale(Node* var_node, Node* op_node, float* scale) { + bool found = false; + auto& inst = op_node->AsStmt(); + auto op_info = inst.op_info(); + auto op_type = op_info->Type(); + auto var_name = var_node->AsArg().name; + if (op_type == "subgraph") { + found = InferScaleFromSubgraph(var_name, op_info, scale, false); + } else { + if (op_info->HasAttr("input_scale")) { + *scale = op_info->GetAttr("input_scale"); + found = true; + } else { + // Obtain the output_scale from one of its previous Ops + auto prev_op_node = var_node->inlinks.front(); + CHECK(prev_op_node->IsStmt()); + auto& prev_inst = prev_op_node->AsStmt(); + auto prev_op_info = prev_inst.op_info(); + auto prev_op_type = prev_op_info->Type(); + if (prev_op_type == "subgraph") { + found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true); + } else { + if (prev_op_info->HasAttr("output_scale")) { + *scale = prev_op_info->GetAttr("output_scale"); + found = true; + } + } + } + } + return found; +} + void PrecisionCastPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. 
std::list nodes; @@ -59,6 +164,14 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); CHECK(in->AsArg().type); VLOG(4) << inst.picked_kernel().name(); + if (inst.op_info()->Type() == "fetch") { + if (inst.op_info()->HasAttr("data_type")) { + auto data_type = + static_cast(inst.op_info()->GetAttr("data_type")); + decl_arg_type = LiteType::GetTensorTy( + decl_arg_type->target(), data_type, decl_arg_type->layout()); + } + } // if (!in->AsArg().is_weight && !PrecisionCompatibleTo(*in->AsArg().type, // *decl_arg_type)) { if (!PrecisionCompatibleTo(*in->AsArg().type, *decl_arg_type)) { @@ -88,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, CHECK(in->IsArg()); // auto node_id = [&] { return graph->nodes().size(); }; auto cast_op_output_name = in->AsArg().name + "/precision_trans"; - // in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); + // in->AsArg().name + "/precision_trans/" + + // paddle::lite::to_string(node_id()); auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); cast_op_output_arg->AsArg().type = LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); @@ -109,10 +223,11 @@ void PrecisionCastPass::AddCastInst(const Type& from, op_desc.SetType(cast_type); op_desc.SetInput("Input", {in->AsArg().name}); op_desc.SetOutput("Out", {cast_op_output_name}); - if (inst_node->AsStmt().op_info()->HasAttr("input_scale")) { - op_desc.SetAttr( - "scale", inst_node->AsStmt().op_info()->GetAttr("input_scale")); + float scale; + if (InferScale(in, inst_node, &scale)) { + op_desc.SetAttr("scale", scale); } + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); auto kernels = cast_op->CreateKernels(valid_places); std::vector> selected_kernels; @@ -146,9 +261,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, DirectedLink(cast_op_output_arg, inst_node); // reset opdesc and update kernel information - 
UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - cast_op_output_name); + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); // recreate the op auto original_selected_kernel = @@ -178,5 +292,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector& valid_places) { REGISTER_MIR_PASS(type_precision_cast_pass, paddle::lite::mir::PrecisionCastPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kOpenCL)}) .BindKernel("calib_once") .BindKernel("calib"); diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h index 3f55e52ef9fed1f0b456533141654d1dcadb16f7..b5f7c5d902a998e369f0b1775c59f50cbf8dc256 100644 --- a/lite/core/mir/type_precision_cast_pass.h +++ b/lite/core/mir/type_precision_cast_pass.h @@ -24,17 +24,7 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} +void UpdateInputs(OpLite* op, const std::string& from, const std::string& to); /* * The pass complement the necessary instruction to make data diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index ae74bd8d4d5647139a13509dfda0bb2b41ecc5c7..75d8022d5f5f9d8572a5e020c11ae5d8cf630c10 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -21,6 +21,7 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" #include "lite/utils/string.h" namespace paddle { @@ -240,9 +241,8 @@ void TypeTargetTransformPass::UpdateInstNode(Node* in, Node* inst_node, std::string io_copy_output_name) { // reset opdesc and update kernel information - 
UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - io_copy_output_name); + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, io_copy_output_name); auto original_selected_kernel = std::move(inst_node->AsStmt().kernels().front()); auto update_op_info = *inst_node->AsStmt().op_info(); diff --git a/lite/core/mir/type_target_cast_pass.h b/lite/core/mir/type_target_cast_pass.h index e9a275882f7c2cb813c1c0b8add5cc4ca89b0c8b..3561a0a7dd22709648450a4b8f3c8f3f11448b38 100644 --- a/lite/core/mir/type_target_cast_pass.h +++ b/lite/core/mir/type_target_cast_pass.h @@ -25,18 +25,6 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - /* * IoComplementPass complement the necessary instruction to make data * transferring or transformation between different places. diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 0936a44a66e4777633b84dadf0a1dc049213faab..a9ccd1b9ae9a5d45f8d0e5638b3aab1d73d1903c 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -22,6 +22,61 @@ namespace paddle { namespace lite { +bool OpLite::InferShape() { + // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ + // InferShapeByMemoryInternal will be applied. + if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { + return this->InferShapeWithCache(); + } else { + // otherwise, InferShapeImpl is applied directly. + return this->InferShapeImpl(); + } +} +bool OpLite::InferShapeWithCache() { + // 1. Get vector of current input tensors + auto *current_inputs = param_.input_tensor_ptrs(); + // 2. 
Get hash value of current inputs shape and lod + size_t new_hash = 0; + for (auto iter = current_inputs->begin(); iter != current_inputs->end(); + iter++) { + // combined dims value into new_hash value. + auto &element_dims = (*iter)->dims(); + for (int i = 0; i < element_dims.size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(element_dims[i])); + } + // combine lod value into new_hash valud. + auto &emement_lods = (*iter)->lod(); + for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); + lod_iter++) { + for (int i = 0; i < lod_iter->size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(lod_iter->at(i))); + } + } + } + // 3. infer shapes of output tensors + if (new_hash == io_shape_lod_hash_ && new_hash != 0) { + // if current hash value is consistent with io_shape_lod_hash_, + // previous outputs shape and lod are reused. + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + current_outputs->at(i)->Resize(last_output_shapes[i]); + current_outputs->at(i)->set_lod(last_output_lods[i]); + } + } else { + // otherwise, current hash value is changed, InferShapeImpl will apply. 
+ io_shape_lod_hash_ = new_hash; + this->InferShapeImpl(); + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + last_output_shapes[i] = current_outputs->at(i)->dims(); + last_output_lods[i] = current_outputs->at(i)->lod(); + } + } + return true; +} + std::vector> OpLite::CreateKernels( const std::vector &places, const std::string &kernel_type) { std::vector> kernels; @@ -47,18 +102,19 @@ std::vector> OpLite::CreateKernels( return kernels; } - std::set place_set; - for (auto place : places) { - place_set.insert(place); - // Pick kernels those support any Precision and any DataLayout - place.precision = PRECISION(kAny); - place_set.insert(place); - place.layout = DATALAYOUT(kAny); - place_set.insert(place); + std::set expanded_places(places.begin(), places.end()); + for (auto &place : places) { + // Pick kernels those support any Precision and any DataLayout, For example: + // kARM,kFloat,kNCHW -> kARM,kFloat,kAny; kARM,kAny,kNCHW; kARM,kAny,kAny + expanded_places.insert( + Place(place.target, place.precision, DATALAYOUT(kAny))); + expanded_places.insert(Place(place.target, PRECISION(kAny), place.layout)); + expanded_places.insert( + Place(place.target, PRECISION(kAny), DATALAYOUT(kAny))); } std::set targets; - for (auto place : place_set) { + for (auto place : expanded_places) { pick_kernel(place); targets.insert(place.target); } diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 5dec9ed7aace837e3eb085a55d7b9b5382f7dea3..4c6c66be7e41889c116aed023d863df8a4a912c8 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,7 @@ #include "lite/core/kernel.h" #include "lite/core/scope.h" #include "lite/model_parser/cpp/op_desc.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -64,7 +66,8 @@ class OpLite : public Registry { // Check the shape. 
virtual bool CheckShape() const { return true; } // Inference the outputs' shape. - virtual bool InferShape() const { return true; } + virtual bool InferShapeImpl() const { return true; } + virtual bool InferShape(); // Run this operator. virtual bool Run(); // Indicate whether the Op runs only once or not @@ -150,6 +153,16 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; + + std::vector last_output_shapes{}; + std::vector>> last_output_lods{}; + size_t io_shape_lod_hash_{}; + mutable operators::ParamBase param_; + + private: + // Infer Shape according to memory, if current input shapes are consistent + // with that of previous inputs, output shapes of last time will be reused. + bool InferShapeWithCache(); }; /* diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index b49670eefb8b2c6aae30cb041de4d055a2b9964c..fe1dff3c99c1d2413888e78c89c999caea0ab030 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -19,6 +19,10 @@ namespace paddle { namespace lite { +const std::map &GetOp2PathDict() { + return OpKernelInfoCollector::Global().GetOp2PathDict(); +} + std::list> KernelRegistry::Create( const std::string &op_type, TargetType target, @@ -103,6 +107,9 @@ std::list> KernelRegistry::Create( case TARGET(kBM): { CREATE_KERNEL(kBM); } break; + case TARGET(kMLU): { + CREATE_KERNEL(kMLU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -135,6 +142,15 @@ KernelRegistry::KernelRegistry() INIT_FOR(kCUDA, kInt64, kNCHW); INIT_FOR(kCUDA, kInt64, kNHWC); + INIT_FOR(kMLU, kFloat, kNHWC); + INIT_FOR(kMLU, kFloat, kNCHW); + INIT_FOR(kMLU, kFP16, kNHWC); + INIT_FOR(kMLU, kFP16, kNCHW); + INIT_FOR(kMLU, kInt8, kNHWC); + INIT_FOR(kMLU, kInt8, kNCHW); + INIT_FOR(kMLU, kInt16, kNHWC); + INIT_FOR(kMLU, kInt16, kNCHW); + INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); INIT_FOR(kHost, kFloat, kNHWC); @@ 
-150,10 +166,13 @@ KernelRegistry::KernelRegistry() INIT_FOR(kX86, kInt64, kNCHW); INIT_FOR(kARM, kFloat, kNCHW); + INIT_FOR(kARM, kFloat, kNHWC); INIT_FOR(kARM, kInt8, kNCHW); + INIT_FOR(kARM, kInt8, kNHWC); INIT_FOR(kARM, kAny, kNCHW); INIT_FOR(kARM, kAny, kAny); INIT_FOR(kARM, kInt32, kNCHW); + INIT_FOR(kARM, kInt64, kNCHW); INIT_FOR(kOpenCL, kFloat, kNCHW); INIT_FOR(kOpenCL, kFloat, kNHWC); @@ -175,8 +194,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kOpenCL, kAny, kImageNW); INIT_FOR(kNPU, kFloat, kNCHW); + INIT_FOR(kNPU, kFloat, kNHWC); INIT_FOR(kNPU, kInt8, kNCHW); + INIT_FOR(kNPU, kInt8, kNHWC); INIT_FOR(kNPU, kAny, kNCHW); + INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kAny); INIT_FOR(kXPU, kFloat, kNCHW); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index a49682eea68240bfa178eb3d3351b8c7fb41048d..3c41c1fd8af240401c3edf0343433f8d8d9c85db 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -72,6 +72,8 @@ class OpKernelInfoCollector { namespace paddle { namespace lite { +const std::map &GetOp2PathDict(); + using KernelFunc = std::function; using KernelFuncCreator = std::function()>; class LiteOpRegistry final : public Factory> { @@ -145,6 +147,9 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -263,7 +268,32 @@ class KernelRegistry final { DATALAYOUT(kAny)> *, // KernelRegistryForTarget * // + DATALAYOUT(kAny)> *, // + + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget * // >; KernelRegistry(); diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index ddd94484ac4bb8d96d5c55300c985d21b44f1843..ca22c86907d4f582ef9d7ca84b908711ba1b8660 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -53,7 +53,7 @@ class Optimizer { 
SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); - if (passes.empty()) { + if (passes.empty() || passes.size() == 1) { std::vector passes_local{ {"lite_quant_dequant_fuse_pass", // "weight_quantization_preprocess_pass", // @@ -75,6 +75,15 @@ class Optimizer { (defined LITE_WITH_ARM) "lite_elementwise_add_activation_fuse_pass", // #endif + "quantized_op_attributes_inference_pass", // Only for fully + // quantized model, infer + // the output scale and + // fix the attribute + // 'enable_int8' for all + // of the quantized ops. + "npu_subgraph_pass", + "xpu_subgraph_pass", + "bm_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) @@ -108,9 +117,10 @@ class Optimizer { "runtime_context_assign_pass", "argument_type_display_pass", - "memory_optimize_pass", - "npu_subgraph_pass", - "xpu_subgraph_pass"}}; + "memory_optimize_pass"}}; + if (passes.size() == 1) { + passes_local.push_back(passes[0]); + } RunPasses(passes_local); } else { RunPasses(passes); diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index d9111e5c46c9217b181e5a3e5a8c7981f46250df..39213a33cebd05d9cfa50d82cdfb09ad3f7ad637 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -22,18 +22,25 @@ #include #include "lite/core/program.h" +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/kernels/opencl/image_helper.h" +#endif + namespace paddle { namespace lite { namespace profile { template -static void write_tensorfile(const Tensor* tensor, const std::string& locate) { +static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { if (locate.find('/') != std::string::npos) { - return; + return false; } FILE* fp = fopen(locate.c_str(), "w"); if (fp == nullptr) { 
LOG(ERROR) << "file open field " << locate; + return false; } else { const dtype* data = tensor->data(); for (int i = 0; i < tensor->numel(); ++i) { @@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { } } fclose(fp); + return true; } class PrecisionProfiler { public: - explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} - ~PrecisionProfiler() { - LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() - << " on Target " << TargetToStr(inst_->kernel()->target()) << " " - << PrecisionToStr(inst_->kernel()->precision()); - auto tensor_mean = [](const Tensor* in, - PrecisionType ptype, - std::string name = "inst") -> double { - if (!in->data()) { - return -99999; - } - double sum = 0.; - switch (ptype) { + // TODO(ysh329): need to remove `explicit PrecisionProfiler` + // keep this method only for arm/math/conditional + explicit PrecisionProfiler(const Instruction* inst) { + std::string inst_precison_str = GetInstPrecision(inst); + } + + PrecisionProfiler() {} + + std::string GetSummaryHeader() { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + ss << "========================================= " + << "Detailed Precision Profiler Summary " + << "=========================================" << std::endl; + ss << setw(45) << left << "operator:(kernel_info)" + << " " << setw(70) << left << "output_tensor_name:(tensor_info)" + << " " << setw(15) << left << "dims" + << " " << setw(15) << left << "mean" + << " " << setw(15) << left << "std_deviation" + << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + + return ss.str(); + } + + template + double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; + } + + template + double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean 
= compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); + } + + template + double compute_average_grow_rate(const T* in, const size_t length) { + const double eps = 1e-5; + double ave_grow_rate = 0.0f; + for (size_t i = 1; i < length; ++i) { + ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps); + } + ave_grow_rate /= length; + return ave_grow_rate; + } + + // check if output tensor unused + bool is_unused(const Tensor* in) { + if (!in->data()) { + return true; + } + return false; + } + + void compute_tensor_precision_info(const Tensor* in, + TargetType target_type, + PrecisionType precision_type, + DataLayoutType layout_type, + double* mean, + double* std_dev, + double* ave_grow_rate, + std::string name = "inst", + bool write_result_to_file = false) { + std::string unsupported_error_log = + "Unsupported precision profile for kernel registered on" + + TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" + + DataLayoutToStr(layout_type); + + if (target_type == TARGET(kARM) || target_type == TARGET(kHost) || + target_type == TARGET(kX86)) { + switch (precision_type) { case PRECISION(kFloat): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kAny): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, 
in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt8): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt32): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } default: - LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); - return 0.; + *mean = -333333333333; + *std_dev = -33333333333; + *ave_grow_rate = -33333333333; + LOG(ERROR) << unsupported_error_log; + return; } - }; - if (inst_->op()->op_info()->Type() != "fetch") { - auto op = const_cast(inst_->op()); - auto kernel = inst_->kernel(); +#ifdef LITE_WITH_OPENCL + } else if (target_type == TARGET(kOpenCL)) { + switch (layout_type) { + case DATALAYOUT(kImageDefault): { + paddle::lite::CLImageConverterDefault default_convertor; + auto image_shape = default_convertor.InitImageDimInfoWith(in->dims()); + size_t im_w = image_shape[0]; + size_t im_h = image_shape[1]; + VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " " + << im_h; + std::vector in_data_v(im_w * im_h * 4); + std::vector real_out_v(in->numel()); + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + TargetWrapperCL::ImgcpySync(in_data_v.data(), + in->data(), + im_w, + im_h, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + 
default_convertor.ImageToNCHW( + in_data_v.data(), real_out_v.data(), image_shape, in->dims()); + CHECK(real_out_v.size() == in->numel()); + *mean = compute_mean(real_out_v.data(), real_out_v.size()); + *std_dev = compute_standard_deviation( + real_out_v.data(), in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(real_out_v.data(), + real_out_v.size()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + case DATALAYOUT(kNCHW): { + std::vector in_data_v(in->numel(), 0); + TargetWrapperCL::MemcpySync(in_data_v.data(), + in->data(), + in->numel() * sizeof(float), + IoDirection::DtoH); + VLOG(1) << name << ":" << in->numel(); + *mean = compute_mean(in_data_v.data(), in->numel()); + *std_dev = compute_standard_deviation( + in_data_v.data(), in->numel(), true, *mean); + *ave_grow_rate = + compute_average_grow_rate(in_data_v.data(), in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + default: + *mean = -222222222222; + *std_dev = -22222222222; + *ave_grow_rate = -22222222222; + LOG(ERROR) << unsupported_error_log; + return; + } +#endif + } else { + *mean = -111111111111; + *std_dev = -11111111111; + *ave_grow_rate = -11111111111; + LOG(ERROR) << unsupported_error_log; + return; + } + } + + std::string GetInstPrecision(const Instruction* inst = nullptr) { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + bool write_result_to_file = false; + + VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr() + << " registered on " << TargetToStr(inst->kernel()->target()) << "/" + << PrecisionToStr(inst->kernel()->precision()) << "/" + << DataLayoutToStr(inst->kernel()->layout()); + + std::string kernel_repr = inst->op()->op_info()->Repr(); + std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" + + PrecisionToStr(inst->kernel()->precision()) + + "/" + DataLayoutToStr(inst->kernel()->layout()); + std::string op_name = inst->op()->op_info()->Type(); + + if 
(inst->op()->op_info()->Type() != "fetch") { + auto op = const_cast(inst->op()); + auto kernel = inst->kernel(); auto op_scope = op->scope(); auto out_names = op->op_info()->output_names(); for (auto& out_name : out_names) { @@ -106,32 +277,90 @@ class PrecisionProfiler { auto type = kernel->GetOutputDeclType(out_arg_name); if (type->IsTensor()) { - auto tout = op_scope->FindVar(out_name)->GetMutable(); - double mean = tensor_mean(tout, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean << " shape:" << tout->dims(); + const Tensor* tout = + op_scope->FindVar(out_name)->GetMutable(); + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } else if (type->IsTensorList()) { - auto tout = + auto touts = op_scope->FindVar(out_name)->GetMutable>(); - for (auto& t : *tout) { - double mean = tensor_mean(&t, type->precision(), out_name); - LOG(INFO) << "output name: " << 
out_name << ", dims: " << t.dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean; + for (auto t : *touts) { + const Tensor* tout = &t; + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } } } } + return ss.str(); } - - private: - const Instruction* inst_{nullptr}; }; } // namespace profile } // namespace lite } // namespace paddle +// TODO(ysh329): need to remove. 
+// keep this method only for arm/math/conditional_block_compute #define LITE_PRECISION_PROFILE(inst) \ { auto a = paddle::lite::profile::PrecisionProfiler(&inst); } diff --git a/lite/core/program.cc b/lite/core/program.cc index 0895643a6adde0095f9d2892c41f263eedd4284f..7284c3983cb34a0db2387ece40f6d07b9d9a8511 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { +#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE + auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); + std::string precision_profiler_summary = + inst_precision_profiler.GetSummaryHeader(); +#endif +#endif + for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; @@ -144,13 +152,17 @@ void RuntimeProgram::Run() { #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA - LITE_PRECISION_PROFILE(inst) + precision_profiler_summary += + inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); +#ifdef LITE_WITH_PRECISION_PROFILE + LOG(INFO) << "\n" << precision_profiler_summary; +#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } diff --git a/lite/core/program_fake_utils.h b/lite/core/program_fake_utils.h index edcbb101aa5ddb090cc585a16597967cb5114936..fbee253872237bce08f3f67b948da79becbae21a 100644 --- a/lite/core/program_fake_utils.h +++ b/lite/core/program_fake_utils.h @@ -30,9 +30,9 @@ Program FakeProgram() { auto add_fc = [&](int id, std::string x) { // create variables - std::string w1 = "w" + std::to_string(id); - std::string b1 = "b" + std::to_string(id); - std::string out1 = "out" + std::to_string(id); + std::string w1 = "w" + paddle::lite::to_string(id); + std::string b1 = "b" + 
paddle::lite::to_string(id); + std::string out1 = "out" + paddle::lite::to_string(id); auto w1v = program.scope()->Var(w1)->GetMutable(); auto b1v = program.scope()->Var(b1)->GetMutable(); auto out1v = program.scope()->Var(out1)->GetMutable(); diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index 38a6be6767eae62f9d91c9c11811bc49639331bf..ecb9935dfd13c09cbd1a20f3833e6ab76161192a 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -75,6 +75,7 @@ void TensorLite::ShareDataWith(const TensorLite &other) { target_ = other.target_; lod_ = other.lod_; memory_size_ = other.memory_size_; + precision_ = other.precision_; } void TensorLite::CopyDataFrom(const TensorLite &other) { @@ -82,6 +83,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { target_ = other.target_; lod_ = other.lod_; memory_size_ = other.memory_size_; + precision_ = other.precision_; buffer_->CopyDataFrom(*other.buffer_, memory_size_); } @@ -96,6 +98,21 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { return mutable_data(memory_size); } +void TensorLite::ResetBuffer(std::shared_ptr buffer, + size_t memory_size) { + CHECK_EQ(offset_, 0) + << "Only the offset is supported to zero when the Buffer is reset."; + if (buffer_) { + CHECK_LE(memory_size_, buffer->space()) + << "The space of buffer is not enough to store the tensor."; + CHECK_LE(memory_size, buffer->space()) + << "The buffer is smaller than the specified minimum size."; + } + buffer_ = buffer; + memory_size_ = memory_size; + target_ = buffer->target(); +} + #ifdef LITE_WITH_OPENCL template <> const cl::Image2D *TensorLite::data() const { @@ -103,8 +120,8 @@ const cl::Image2D *TensorLite::data() const { return static_cast(buffer_->data()); } -template <> // use int16_t represent half float -const cl::Image2D *TensorLite::data() const { +template <> // use uint16_t represent half float +const cl::Image2D *TensorLite::data() const { if (nullptr == buffer_->data()) return nullptr; return 
static_cast(buffer_->data()); } diff --git a/lite/core/tensor.h b/lite/core/tensor.h index 04e540002b553a0e0f7db0144fd970bdb6a4d9ed..2209e524f413b4cedf255566bfc1b6b1f1229f8d 100644 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -102,9 +102,10 @@ using LoD = std::vector>; class TensorLite { public: TensorLite() : buffer_(std::make_shared()) {} + explicit TensorLite(std::shared_ptr buffer) : buffer_(buffer) {} template - void Assign(DType *data, const DimT &dim) { + void Assign(const DType *data, const DimT &dim) { Resize(dim); auto *dst = mutable_data(Target); CopySync( @@ -178,6 +179,11 @@ class TensorLite { (static_cast(buffer_->data()) + offset_)); } + void *raw_data() { + return static_cast( + (static_cast(buffer_->data()) + offset_)); + } + void clear() { buffer_->Free(); offset_ = 0; @@ -195,6 +201,8 @@ class TensorLite { void CopyDataFrom(const TensorLite &other); + void ResetBuffer(std::shared_ptr buffer, size_t memory_size); + TargetType target() const { return target_; } template @@ -260,8 +268,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) { template <> const cl::Image2D *TensorLite::data() const; -template <> // use int16_t represent half float -const cl::Image2D *TensorLite::data() const; +template <> // use uint16_t represent half float +const cl::Image2D *TensorLite::data() const; #endif } // namespace lite diff --git a/lite/core/type_system.h b/lite/core/type_system.h index aeddf965c3b999750c7cca3595cc9f669b32d50e..2cf8366a2a1cbb6eb0c5f4e3dff3e4ac2623ff66 100644 --- a/lite/core/type_system.h +++ b/lite/core/type_system.h @@ -177,8 +177,9 @@ static bool TargetCompatibleTo(const Type& a, const Type& b) { return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); }; if (a.IsVoid() || b.IsVoid()) return true; - if (a.IsTensor() || b.IsTensor()) { - if (a.IsTensor() && b.IsTensor()) { + if (a.IsTensor() || b.IsTensor() || a.IsTensorList() || b.IsTensorList()) { + if ((a.IsTensor() && b.IsTensor()) || + (a.IsTensorList() && 
b.IsTensorList())) { return is_host(a.target()) ? is_host(b.target()) : a.target() == b.target(); } diff --git a/lite/core/version.h.in b/lite/core/version.h.in index d34c32073b852a50b5d26984ed4812ac4f38a870..da2d5f3ed99631973d97a94741e1711391237261 100644 --- a/lite/core/version.h.in +++ b/lite/core/version.h.in @@ -53,9 +53,9 @@ static std::string version() { static int64_t int_version(const std::string& version) { const std::vector vec = Split(version, "."); if (vec.size() == 3) { - return std::stoi(vec[0]) * MAJOR_COEFF + - std::stoi(vec[1]) * MINOR_COEFF + - std::stoi(vec[2]) * PATCH_COEFF; + return atoi(vec[0].c_str()) * MAJOR_COEFF + + atoi(vec[1].c_str()) * MINOR_COEFF + + atoi(vec[2].c_str()) * PATCH_COEFF; } return -1; } diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index 447bcbaff018d15a1bc3075c1153f724672f40a8..c2bdb25f4e3b46265bcc4830b613b6d0d6d8232d 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -8,12 +8,42 @@ 2. 人脸识别和佩戴口罩判断的Demo +目前,PaddleLite提供了shell端的人脸识别和佩戴口罩判断的Demo,首先基于已经准备好的Demo进行演示,然后介绍如何基于代码编译Demo并执行。 + +**下载Demo并执行** + +下载压缩包[mask_detection_files](https://paddle-inference-dist.cdn.bcebos.com/PaddleLiteDemo/mask_detection_files.tgz),解压到本地,其中包括编译好的可执行文件、模型文件、测试图片、PaddleLite 2.3版本动态库。 + +电脑连接安卓手机,在电脑shell端执行如下命令,将mask_detection_files文件夹push到安卓手机上。 +``` +adb push mask_detection_files /data/local/tmp/ +``` + +在电脑shell端执行如下命令,进入安卓手机,执行demo。 +``` +adb shell +cd /data/local/tmp/mask_detection_files +export LD_LIBRARY_PATH=/data/local/tmp/mask_detection_files:$LD_LIBRARY_PATH +./mask_detection face_detection mask_classification test.jpg +``` + +回到电脑端,将结果图片(test_mask_detection_result.jpg)取出,查看检测结果。 +``` +exit +adb pull /data/local/tmp/mask_detection_files/test_mask_detection_result.jpg ./ +``` + + +**编译Demo并执行** + 参考[源码编译](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/source_compile/)准备编译环境。 -执行下面命令,下载PaddleLite代码。 +执行下面命令,下载PaddleLite代码,切换到2.3版本分支。 ```shell git clone 
https://github.com/PaddlePaddle/Paddle-Lite.git cd Paddle-Lite +git fetch origin release/v2.3:release/v2.3 +git checkout release/v2.3 ``` 进入PaddleLite根目录,编译预测库。 @@ -25,7 +55,7 @@ cd Paddle-Lite --android_stl=c++_static \ --build_extra=ON \ --shutdown_log=OFF \ - tiny_publish + full_publish ``` 进入编译目录,下载模型和图片的压缩包,编译可执行文件。 @@ -70,7 +100,11 @@ export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH adb pull /data/local/tmp/test_mask_detection_result.jpg ./ ``` -![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/74279176-6200cd00-4d55-11ea-9fc0-83cfc2b3b37d.jpg) +![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/75131866-bae64300-570f-11ea-9cad-17acfaea1cfc.jpg) + +注:mask_detetion.cc 中的缩放因子shrink, 检测阈值detect_threshold, 可供自由配置: + - 缩放因子越大,模型运行速度越慢,检测准确率越高。 + - 检测阈值越高,人脸筛选越严格,检测出的人脸框可能越少。 3. 编译并运行全量api的demo(注:当编译模式为tiny_pubish时将不存在该demo) ```shell diff --git a/lite/demo/cxx/cuda_demo/CMakeLists.txt b/lite/demo/cxx/cuda_demo/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e27548b4e56ce03098c5c82b3eee49add62cc0a4 --- /dev/null +++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt @@ -0,0 +1,20 @@ +project(demo CXX C) +cmake_minimum_required(VERSION 2.8) + +set(TARGET demo) +set(CMAKE_CXX_FLAGS "-std=c++11 -O3") + +set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx") +set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") + +include_directories("${LITE_LIB}/include") +link_directories("${LITE_LIB}/lib") +link_directories("${PROTOBUF_LIB}/lib") + +add_executable(${TARGET} ${TARGET}.cc) + +set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so) +set(DEPS ${DEPS} protobuf-lite) +set(DEPS ${DEPS} "-lrt -lpthread -ldl") + +target_link_libraries(${TARGET} ${DEPS}) diff --git a/lite/demo/cxx/cuda_demo/demo.cc b/lite/demo/cxx/cuda_demo/demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..593e73cf83cd491fd8e33e415d17106dc8f4ce14 --- /dev/null +++ 
b/lite/demo/cxx/cuda_demo/demo.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +void RunModel(std::string model_dir) { + // 1. Create CxxConfig + CxxConfig config; + config.set_model_file(model_dir + "/__model__"); + config.set_param_file(model_dir + "/__params__"); + config.set_valid_places({ + Place{TARGET(kCUDA), PRECISION(kFloat)}, + }); + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + int num = 1; + int channels = 3; + int height = 608; + int width = 608; + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({num, channels, height, width}); + // fake input data + std::vector data(num * channels * height * width, 0); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = i % 10 * 0.1; + } + input_tensor->CopyFromCpu(data.data()); + std::unique_ptr size_tensor(std::move(predictor->GetInput(1))); + size_tensor->Resize({1, 2}); + std::vector size_data{608, 608}; + size_tensor->CopyFromCpu(size_data.data()); + + // 4. Run predictor + predictor->Run(); + + // 5. 
Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + std::vector out_cpu(ShapeProduction(output_tensor->shape()), 0); + std::cout << "output size is " << ShapeProduction(output_tensor->shape()) + << std::endl; + output_tensor->CopyToCpu(out_cpu.data()); + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << out_cpu[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); + return 0; +} diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 index d659a316cd856fd550e83b125573409f239b8cf2..4a63563c4ff12b825e881327ec77adc5b2f03aeb 100644 --- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 @@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) ############################################################### # How to use one of static libaray: # @@ -40,7 +40,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS # 1. Comment above line using `libpaddle_light_api_shared.so` # 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` -#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) test_model_cv: fetch_opencv test_model_cv.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 index c80b07d5c029a3624a514e07375fd08e8770da25..70d6bed52b84be7d050ef15ab483e8d06342c82d 100644 --- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 @@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) ############################################################### # How to use one of static libaray: # # `libpaddle_api_full_bundled.a` # @@ -39,7 +39,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS # 1. Comment above line using `libpaddle_light_api_shared.so` # 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` -#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) test_model_cv: fetch_opencv test_model_cv.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/mask_detection/mask_detection.cc b/lite/demo/cxx/mask_detection/mask_detection.cc index 748b84365fc70aa59171a6bf8847f554308fdc8c..09a9c0ee158e7d5913a78877711d831fc5738cf1 100644 --- a/lite/demo/cxx/mask_detection/mask_detection.cc +++ b/lite/demo/cxx/mask_detection/mask_detection.cc @@ -81,6 +81,29 @@ void neon_mean_scale(const float* din, } } +cv::Mat crop_img(const cv::Mat& img, + cv::Rect rec, + int res_width, + int res_height) { + float xmin = rec.x; + float ymin = rec.y; + float w = rec.width; + float h = rec.height; + float center_x = xmin + w / 2; + float center_y = ymin + h / 2; + cv::Point2f center(center_x, center_y); + float max_wh = std::max(w / 2, h / 2); + float scale = res_width / (2 * max_wh * 1.5); + cv::Mat rot_mat = cv::getRotationMatrix2D(center, 0.f, scale); + rot_mat.at(0, 2) = + rot_mat.at(0, 2) - (center_x - res_width / 2.0); + rot_mat.at(1, 2) = + rot_mat.at(1, 2) - (center_y - res_width / 2.0); + cv::Mat affine_img; + cv::warpAffine(img, affine_img, rot_mat, cv::Size(res_width, res_height)); + return affine_img; +} + void pre_process(const cv::Mat& img, int width, int height, @@ -89,8 +112,12 @@ void pre_process(const cv::Mat& img, float* data, bool is_scale = false) { cv::Mat resized_img; - cv::resize( - img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + if (img.cols != width || img.rows != height) { + cv::resize( + img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + } else { + resized_img = img; + } cv::Mat imgf; float scale_factor = is_scale ? 
1.f / 256 : 1.f; resized_img.convertTo(imgf, CV_32FC3, scale_factor); @@ -98,12 +125,12 @@ void pre_process(const cv::Mat& img, neon_mean_scale(dimg, data, width * height, mean, scale); } -void RunModel(std::string det_model_dir, - std::string class_model_dir, +void RunModel(std::string det_model_file, + std::string class_model_file, std::string img_path) { // Prepare cv::Mat img = imread(img_path, cv::IMREAD_COLOR); - float shrink = 0.2; + float shrink = 0.4; int width = img.cols; int height = img.rows; int s_width = static_cast(width * shrink); @@ -111,11 +138,12 @@ void RunModel(std::string det_model_dir, // Detection MobileConfig config; - config.set_model_dir(det_model_dir); + config.set_model_from_file(det_model_file); // Create Predictor For Detction Model std::shared_ptr predictor = CreatePaddlePredictor(config); + std::cout << "Load detecion model succeed." << std::endl; // Get Input Tensor std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); @@ -136,9 +164,10 @@ void RunModel(std::string det_model_dir, auto* outptr = output_tensor0->data(); auto shape_out = output_tensor0->shape(); int64_t out_len = ShapeProduction(shape_out); + std::cout << "Detecting face succeed." << std::endl; // Filter Out Detection Box - float detect_threshold = 0.3; + float detect_threshold = 0.7; std::vector detect_result; for (int i = 0; i < out_len / 6; ++i) { if (outptr[1] >= detect_threshold) { @@ -158,10 +187,11 @@ void RunModel(std::string det_model_dir, } // Classification - config.set_model_dir(class_model_dir); + config.set_model_from_file(class_model_file); // Create Predictor For Classification Model predictor = CreatePaddlePredictor(config); + std::cout << "Load classification model succeed." 
<< std::endl; // Get Input Tensor std::unique_ptr input_tensor1(std::move(predictor->GetInput(0))); @@ -172,10 +202,14 @@ void RunModel(std::string det_model_dir, int detect_num = detect_result.size(); std::vector classify_mean = {0.5f, 0.5f, 0.5f}; std::vector classify_scale = {1.f, 1.f, 1.f}; - float classify_threshold = 0.5; for (int i = 0; i < detect_num; ++i) { cv::Rect rec_clip = detect_result[i].rec; - cv::Mat roi = img(rec_clip); + cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h); + + // uncomment two lines below, save roi img to disk + // std::string roi_name = "roi_" + paddle::lite::to_string(i) + // + ".jpg"; + // imwrite(roi_name, roi); // Do PreProcess pre_process(roi, @@ -191,37 +225,60 @@ void RunModel(std::string det_model_dir, // Get Output Tensor std::unique_ptr output_tensor1( - std::move(predictor->GetOutput(1))); + std::move(predictor->GetOutput(0))); auto* outptr = output_tensor1->data(); + float prob = outptr[1]; // Draw Detection and Classification Results - cv::rectangle(img, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); - std::string text = outptr[1] > classify_threshold ? 
"wear mask" : "no mask"; - int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; - double font_scale = 1.f; - int thickness = 1; + bool flag_mask = prob > 0.5f; + cv::Scalar roi_color; + std::string text; + if (flag_mask) { + text = "MASK: "; + roi_color = cv::Scalar(0, 255, 0); + } else { + text = "NO MASK: "; + roi_color = cv::Scalar(0, 0, 255); + prob = 1 - prob; + } + std::string prob_str = std::to_string(prob * 100); + int point_idx = prob_str.find_last_of("."); + + text += prob_str.substr(0, point_idx + 3) + "%"; + int font_face = cv::FONT_HERSHEY_SIMPLEX; + double font_scale = 0.25; + float thickness = 1; cv::Size text_size = cv::getTextSize(text, font_face, font_scale, thickness, nullptr); - float new_font_scale = rec_clip.width * 0.7 * font_scale / text_size.width; - text_size = - cv::getTextSize(text, font_face, new_font_scale, thickness, nullptr); + + int top_space = std::max(0.35 * text_size.height, 2.0); + int bottom_space = top_space + 2; + int right_space = 0.05 * text_size.width; + int back_width = text_size.width + right_space; + int back_height = text_size.height + top_space + bottom_space; + + // Configure text background + cv::Rect text_back = + cv::Rect(rec_clip.x, rec_clip.y - back_height, back_width, back_height); + + // Draw roi object, text, and background + cv::rectangle(img, rec_clip, roi_color, 1); + cv::rectangle(img, text_back, cv::Scalar(225, 225, 225), -1); cv::Point origin; - origin.x = rec_clip.x + 5; - origin.y = rec_clip.y + text_size.height + 5; + origin.x = rec_clip.x; + origin.y = rec_clip.y - bottom_space; cv::putText(img, text, origin, font_face, - new_font_scale, - cv::Scalar(0, 255, 255), - thickness, - cv::LINE_AA); + font_scale, + cv::Scalar(0, 0, 0), + thickness); std::cout << "detect face, location: x=" << rec_clip.x << ", y=" << rec_clip.y << ", width=" << rec_clip.width - << ", height=" << rec_clip.height - << ", wear mask: " << (outptr[1] > classify_threshold) - << std::endl; + << ", height=" << rec_clip.height << ", wear 
mask: " << flag_mask + << ", prob: " << prob << std::endl; } // Write Result to Image File @@ -230,17 +287,19 @@ void RunModel(std::string det_model_dir, std::string img_name = img_path.substr(start + 1, end - start - 1); std::string result_name = img_name + "_mask_detection_result.jpg"; cv::imwrite(result_name, img); + std::cout << "write result to file: " << result_name << ", success." + << std::endl; } int main(int argc, char** argv) { if (argc < 3) { std::cerr << "[ERROR] usage: " << argv[0] - << " detction_model_dir classification_model_dir image_path\n"; + << " detction_model_file classification_model_file image_path\n"; exit(1); } - std::string detect_model_dir = argv[1]; - std::string classify_model_dir = argv[2]; + std::string detect_model_file = argv[1]; + std::string classify_model_file = argv[2]; std::string img_path = argv[3]; - RunModel(detect_model_dir, classify_model_dir, img_path); + RunModel(detect_model_file, classify_model_file, img_path); return 0; } diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc index d0cf59e185e1330b7d8487d562afa0af29236007..518040ebd07bb4e8940f6a885cddd4f3c98143f3 100644 --- a/lite/demo/cxx/mobile_classify/mobile_classify.cc +++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc @@ -126,7 +126,7 @@ void pre_process(const cv::Mat& img, neon_mean_scale(dimg, data, width * height, means, scales); } -void RunModel(std::string model_dir, +void RunModel(std::string model_file, std::string img_path, const std::vector& labels, const int topk, @@ -134,7 +134,7 @@ void RunModel(std::string model_dir, int height) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. 
Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -169,12 +169,12 @@ void RunModel(std::string model_dir, int main(int argc, char** argv) { if (argc < 4) { std::cerr << "[ERROR] usage: " << argv[0] - << " model_dir image_path label_file\n"; + << " model_file image_path label_file\n"; exit(1); } - printf("parameter: model_dir, image_path and label_file are necessary \n"); + printf("parameter: model_file, image_path and label_file are necessary \n"); printf("parameter: topk, input_width, input_height, are optional \n"); - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; std::string label_file = argv[3]; std::vector labels; @@ -190,6 +190,6 @@ int main(int argc, char** argv) { height = atoi(argv[6]); } - RunModel(model_dir, img_path, labels, topk, width, height); + RunModel(model_file, img_path, labels, topk, width, height); return 0; } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 9d923cb87da5244e4550be3fb6936a650ec9b53a..150bcd231c27c25d8510fc8dfa3281a8351514dd 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -12,8 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include +#include #include +#include #include + #include "paddle_api.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -24,13 +29,57 @@ int64_t ShapeProduction(const shape_t& shape) { return res; } -void RunModel(std::string model_dir) { +std::string ShapePrint(const shape_t& shape) { + std::string shape_str{""}; + for (auto i : shape) { + shape_str += std::to_string(i) + " "; + } + return shape_str; +} + +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +void RunModel(std::string model_dir, + const shape_t& input_shape, + int repeats, + int warmup, + int print_output_elem) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); - // To load model transformed by opt after release/v2.3.0, plese use - // `set_model_from_file` listed below. - // config.set_model_from_file(model_dir); + config.set_model_from_file(model_dir); + // NOTE: To load model transformed by model_optimize_tool before + // release/v2.3.0, plese use `set_model_dir` API as listed below. + // config.set_model_dir(model_dir); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -38,31 +87,108 @@ void RunModel(std::string model_dir) { // 3. 
Prepare input data std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize({1, 3, 224, 224}); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); auto* data = input_tensor->mutable_data(); for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { data[i] = 1; } // 4. Run predictor - predictor->Run(); + for (size_t widx = 0; widx < warmup; ++widx) { + predictor->Run(); + } + + double sum_duration = 0.0; // millisecond; + double max_duration = 1e-5; + double min_duration = 1e5; + double avg_duration = -1; + for (size_t ridx = 0; ridx < repeats; ++ridx) { + auto start = GetCurrentUS(); + + predictor->Run(); + + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + max_duration = duration > max_duration ? duration : max_duration; + min_duration = duration < min_duration ? duration : min_duration; + std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration + << " ms" << std::endl; + } + avg_duration = sum_duration / static_cast(repeats); + std::cout << "\n======= benchmark summary =======\n" + << "input_shape(NCHW):" << ShapePrint(input_shape) << "\n" + << "model_dir:" << model_dir << "\n" + << "warmup:" << warmup << "\n" + << "repeats:" << repeats << "\n" + << "max_duration:" << max_duration << "\n" + << "min_duration:" << min_duration << "\n" + << "avg_duration:" << avg_duration << "\n"; // 5. 
Get output - std::unique_ptr output_tensor( - std::move(predictor->GetOutput(0))); - std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; - for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + std::cout << "\n====== output summary ====== " << std::endl; + size_t output_tensor_num = predictor->GetOutputNames().size(); + std::cout << "output tensor num:" << output_tensor_num << std::endl; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + std::unique_ptr output_tensor = + predictor->GetOutput(tidx); + std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; + auto out_shape = output_tensor->shape(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, ShapeProduction(out_shape)); + auto out_std_dev = compute_standard_deviation( + out_data, ShapeProduction(out_shape), true, out_mean); + + std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " elem num:" << ShapeProduction(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " standard deviation:" << out_std_dev << std::endl; + std::cout << "output tensor " << tidx << " mean value:" << out_mean << std::endl; + + // print output + if (print_output_elem) { + for (int i = 0; i < ShapeProduction(out_shape); ++i) { + std::cout << "out[" << tidx << "][" << i + << "]:" << output_tensor->data()[i] << std::endl; + } + } } } int main(int argc, char** argv) { - if (argc < 2) { - std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; - exit(1); + shape_t input_shape{1, 3, 224, 224}; // shape_t ==> std::vector + int repeats = 10; + int warmup = 10; + int print_output_elem = 0; + + if (argc > 2 && argc < 9) { + std::cerr << "usage: ./" << argv[0] << "\n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " " << std::endl; + return 0; } + 
std::string model_dir = argv[1]; - RunModel(model_dir); + if (argc >= 9) { + input_shape[0] = atoi(argv[2]); + input_shape[1] = atoi(argv[3]); + input_shape[2] = atoi(argv[4]); + input_shape[3] = atoi(argv[5]); + repeats = atoi(argv[6]); + warmup = atoi(argv[7]); + print_output_elem = atoi(argv[8]); + } + + RunModel(model_dir, input_shape, repeats, warmup, print_output_elem); + return 0; } diff --git a/lite/demo/cxx/ssd_detection/ssd_detection.cc b/lite/demo/cxx/ssd_detection/ssd_detection.cc index 2408afcbf64a24924eca119a9d9481dc030250c9..0be4561cd8d083f26e562c2346da217bb4b48283 100644 --- a/lite/demo/cxx/ssd_detection/ssd_detection.cc +++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc @@ -162,10 +162,10 @@ std::vector detect_object(const float* data, return rect_out; } -void RunModel(std::string model_dir, std::string img_path) { +void RunModel(std::string model_file, std::string img_path) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -199,11 +199,11 @@ void RunModel(std::string model_dir, std::string img_path) { int main(int argc, char** argv) { if (argc < 3) { - std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + std::cerr << "[ERROR] usage: " << argv[0] << " model_file image_path\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; - RunModel(model_dir, img_path); + RunModel(model_file, img_path); return 0; } diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md index 36d2985a4fd4f243027f8caab9b6c5a8beb94cad..21574a9bf9fd0ebb3ecf1663f49beed93fdf51bb 100644 --- a/lite/demo/cxx/test_cv/README.md +++ b/lite/demo/cxx/test_cv/README.md @@ -1,5 +1,5 @@ # 图像预测库的使用 -1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式 +1. 
下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish or tiny_publish模式 example: ```shell set BUILD_WITH_CV=ON or LITE_WITH_CV=ON @@ -8,7 +8,7 @@ set BUILD_WITH_CV=ON or LITE_WITH_CV=ON --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static -full_publish +tiny_publish ``` 2. 准备模型和优化模型 @@ -17,7 +17,7 @@ example: wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz ./lite/tools/build.sh build_optimize_tool -./build.model_optimize_tool/lite/api/model_optimize_tool +./build.opt/lite/api/opt --optimize_out_type=naive_buffer --optimize_out=model_dir --model_dir=model_dir @@ -68,7 +68,8 @@ make adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push test_model_cv /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/ -adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " @@ -119,7 +120,8 @@ make adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push test_img_propress /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/ -adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 
224 224 /data/local/tmp/mobilenet_v1 " diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc index c2cbd66cc0a15a1032141641d83fbf8db85d20bf..3115ba8f0bf1459541d067d466b80c12548f36a8 100644 --- a/lite/demo/cxx/test_cv/test_img_prepross.cc +++ b/lite/demo/cxx/test_cv/test_img_prepross.cc @@ -28,362 +28,874 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::DataLayoutType LayoutType; using namespace paddle::lite_api; // NOLINT -void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT +void fill_with_mat(cv::Mat& mat, uint8_t* src, int num) { // NOLINT for (int i = 0; i < mat.rows; i++) { for (int j = 0; j < mat.cols; j++) { - int tmp = (i * mat.cols + j) * 3; - cv::Vec3b& rgb = mat.at(i, j); - rgb[0] = src[tmp]; - rgb[1] = src[tmp + 1]; - rgb[2] = src[tmp + 2]; + if (num == 1) { + int tmp = (i * mat.cols + j); + } else if (num == 2) { + int tmp = (i * mat.cols + j) * 2; + cv::Vec2b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } else if (num == 3) { + int tmp = (i * mat.cols + j) * 3; + cv::Vec3b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } else if (num == 4) { + int tmp = (i * mat.cols + j) * 4; + cv::Vec4b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + rgb[3] = src[tmp + 3]; + } else { + std::cout << "it is not support" << std::endl; + return; + } } } } -void test_img(std::vector cluster_id, - std::vector thread_num, - std::string img_path, - std::string dst_path, - ImageFormat srcFormat, - ImageFormat dstFormat, - int width, - int height, - float rotate, - FlipParam flip, - LayoutType layout, - std::string model_dir, - int test_iter = 1) { - // init - // paddle::lite::DeviceInfo::Init(); - // read img and pre-process - cv::Mat img = imread(img_path, cv::IMREAD_COLOR); - float means[3] = {0.485f, 0.456f, 0.406f}; - float scales[3] 
= {0.229f, 0.224f, 0.225f}; - int srch = img.rows; - int srcw = img.cols; - for (auto& cls : cluster_id) { - for (auto& th : thread_num) { - std::cout << "cluster: " << cls << ", threads: " << th << std::endl; - // 1. Set MobileConfig - MobileConfig config; - config.set_model_dir(model_dir); - config.set_power_mode((PowerMode)cls); - config.set_threads(th); - std::cout << "model: " << model_dir; - - // 2. Create PaddlePredictor by MobileConfig - std::shared_ptr predictor = - CreatePaddlePredictor(config); - // 3. Prepare input data from image - std::unique_ptr input_tensor(predictor->GetInput(0)); +double compare_diff(uint8_t* data1, uint8_t* data2, int size, uint8_t* diff_v) { + double diff = 0.0; + for (int i = 0; i < size; i++) { + double val = abs(data1[i] - data2[i]); + diff_v[i] = val; + diff = val > diff ? val : diff; + } + return diff; +} +void print_data(const uint8_t* data, int size) { + for (int i = 0; i < size; i++) { + printf("%d ", data[i]); + if ((i + 1) % 10 == 0) { + std::cout << std::endl; + } + } + std::cout << std::endl; +} +bool test_convert(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + ImageFormat srcFormat, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; - /* - imread(img_path, param) - IMREAD_UNCHANGED(<0) 表示加载原图,不做任何改变 - IMREAD_GRAYSCALE ( 0)表示把原图作为灰度图像加载进来 - IMREAD_COLOR (>0) 表示把原图作为RGB图像加载进来 - */ - cv::Mat img; - if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { - img = imread(img_path, cv::IMREAD_COLOR); - } else if (srcFormat == ImageFormat::GRAY) { - img = imread(img_path, cv::IMREAD_GRAYSCALE); - } else { - printf("this format %d does not support \n", srcFormat); - return; - } - if (img.empty()) { - std::cout << "opencv read image " << img_path.c_str() << " failed" - << 
std::endl; - return; + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // convert bgr-gray + if (dstFormat == srcFormat) { + im_resize = img; + } else if ((dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB) && + srcFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_resize, cv::COLOR_GRAY2BGR); + } else if ((srcFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGBA) && + dstFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_resize, cv::COLOR_BGR2GRAY); + } else if (dstFormat == srcFormat) { + printf("convert format error \n"); + return false; } - int srch = img.rows; - int srcw = img.cols; - int dsth = height; - int dstw = width; + clock_t end = clock(); + to_cv += (end - begin); + } + } - std::cout << " input tensor size, num= " << 1 << ", channel= " << 1 - << ", height= " << srch << ", width= " << srcw - << ", srcFormat= " << (ImageFormat)srcFormat << std::endl; - // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, - if (srcFormat == ImageFormat::GRAY) { - std::cout << "srcFormat: GRAY" << std::endl; - } - if (srcFormat == ImageFormat::BGR) { - std::cout << "srcFormat: BGR" << std::endl; - } - if (srcFormat == ImageFormat::RGB) { - std::cout << "srcFormat: RGB" << std::endl; - } - std::cout << " output tensor size, num=" << 1 << ", channel=" << 1 - << ", height=" << dsth << ", width=" << dstw - << ", dstFormat= " << (ImageFormat)dstFormat << std::endl; + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageConvert(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; - if (dstFormat == ImageFormat::GRAY) { - std::cout << "dstFormat: GRAY" << std::endl; - } - if (dstFormat == ImageFormat::BGR) { - 
std::cout << "dstFormat: BGR" << std::endl; - } - if (dstFormat == ImageFormat::RGB) { - std::cout << "dstFormat: RGB" << std::endl; + std::cout << "---opencv convert run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite convert run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "lite out: " << std::endl; + print_data(diff_v, out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/convert.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + + std::cout << "convert successed!" 
<< std::endl; + return true; + } + } +} + +bool test_flip(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + FlipParam flip, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; - std::cout << "Rotate = " << rotate << ", Flip = " << flip - << ", Layout = " << static_cast(layout) << std::endl; - if (static_cast(layout) != 1 && static_cast(layout) != 3) { - std::cout << "this layout" << static_cast(layout) - << " is no support" << std::endl; + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + cv::flip(img, im_resize, flip); + clock_t end = clock(); + to_cv += (end - begin); + } + } + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageFlip(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; + + std::cout << "---opencv flip run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite flip run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << 
std::endl; + print_data(diff_v, out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/flip.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - int size = 3 * srch * srcw; - if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { - size = 3 * srch * srcw; - } else if (srcFormat == ImageFormat::GRAY) { - size = srch * srcw; + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + std::cout << "flip successed!" 
<< std::endl; + return true; + } + } +} + +bool test_rotate(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + float rotate, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; + + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // rotate 90 + if (rotate == 90) { + cv::flip(img.t(), im_resize, 1); + } else if (rotate == 180) { + cv::flip(img, im_resize, -1); + } else if (rotate == 270) { + cv::flip(img.t(), im_resize, 0); } - uint8_t* src = img.data; + clock_t end = clock(); + to_cv += (end - begin); + } + } + // lite + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageRotate(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; - int out_size = srch * srcw; - int resize = dstw * dsth; + std::cout << "---opencv rotate run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite rotate run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, 
out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/rotate.jpg"; + cv::Mat resize_mat; + int num = 1; if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { - out_size = 3 * srch * srcw; - resize = 3 * dsth * dstw; + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; } else if (dstFormat == ImageFormat::GRAY) { - out_size = srch * srcw; - resize = dsth * dstw; + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - // out - uint8_t* lite_dst = new uint8_t[out_size]; - uint8_t* resize_tmp = new uint8_t[resize]; - uint8_t* tv_out_ratote = new uint8_t[out_size]; - uint8_t* tv_out_flip = new uint8_t[out_size]; - std::vector shape_out = {1, 3, srch, srcw}; + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + std::cout << "rotate successed!" 
<< std::endl; + return true; + } + } +} - input_tensor->Resize(shape_out); - Tensor dst_tensor = *input_tensor; - std::cout << "opencv compute" << std::endl; - cv::Mat im_convert; - cv::Mat im_resize; - cv::Mat im_rotate; - cv::Mat im_flip; - double to_1 = 0; - double to_2 = 0; - double to_3 = 0; - double to_4 = 0; - double to1 = 0; - for (int i = 0; i < test_iter; i++) { - clock_t start = clock(); - clock_t begin = clock(); - // convert bgr-gray - if (dstFormat == srcFormat) { - im_convert = img; - } else if (dstFormat == ImageFormat::BGR && - srcFormat == ImageFormat::GRAY) { - cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR); - } else if (srcFormat == ImageFormat::BGR && - dstFormat == ImageFormat::GRAY) { - cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY); - } else if (dstFormat == srcFormat) { - printf("convert format error \n"); - return; - } - clock_t end = clock(); - to_1 += (end - begin); +bool test_resize(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; - begin = clock(); - // resize default linear - cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); - end = clock(); - to_2 += (end - begin); + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + cv::resize(img, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); + clock_t end = clock(); + to_cv += (end - begin); + } + } + // param + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageResize(src, resize_lite); + clock_t end = clock(); + to_lite += (end - 
begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; - begin = clock(); - // rotate 90 - if (rotate == 90) { - cv::flip(im_convert.t(), im_rotate, 1); - } else if (rotate == 180) { - cv::flip(im_convert, im_rotate, -1); - } else if (rotate == 270) { - cv::flip(im_convert.t(), im_rotate, 0); - } - end = clock(); - to_3 += (end - begin); + std::cout << "---opencv resize run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite resize run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; - begin = clock(); - // flip - cv::flip(im_convert, im_flip, flip); - end = clock(); - to_4 += (end - begin); - clock_t ovet = clock(); - to1 += (ovet - start); + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 10) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/resize.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } + fill_with_mat(resize_mat, resize_lite, num); + 
cv::imwrite(resize_name, resize_mat); + std::cout << "resize successed!" << std::endl; + return true; + } + } +} + +void test_custom(bool has_img, // input is image + std::string img_path, + std::string in_txt, + std::string dst_path, + ImageFormat srcFormat, + ImageFormat dstFormat, + int srcw, + int srch, + int dstw, + int dsth, + float rotate, + FlipParam flip, + int test_iter = 1) { + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + cv::Mat img; + uint8_t* src = nullptr; + int in_size = 0; + if (has_img) { + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = imread(img_path, cv::IMREAD_COLOR); + } else if (srcFormat == ImageFormat::GRAY) { + img = imread(img_path, cv::IMREAD_GRAYSCALE); + } else { + printf("this format %d does not support \n", srcFormat); + return; + } + srcw = img.cols; + srch = img.rows; + src = img.data; + } + bool cv_run = true; + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + cv_run = false; + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + in_size = 3 * srch * srcw; + std::cout << "srcFormat: BGR/RGB" << std::endl; + } else if (srcFormat == ImageFormat::RGBA || srcFormat == ImageFormat::BGRA) { + in_size = 4 * srch * srcw; + std::cout << "srcFormat: BGRA/RGBA" << std::endl; + } else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + in_size = (3 * srch * srcw) / 2; + cv_run = false; + std::cout << "srcFormat: NV12/NV12" << std::endl; + } + int out_size = dstw * dsth; + // out + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } else if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + out_size = 3 * dsth * dstw; + std::cout << "dstFormat: BGR/RGB" << std::endl; + } else if (dstFormat == ImageFormat::RGBA || dstFormat == ImageFormat::BGRA) { + out_size = 4 * dsth * dstw; + std::cout << "dstFormat: BGRA/RGBA" << std::endl; + } else if (dstFormat == 
ImageFormat::NV12 || dstFormat == ImageFormat::NV21) { + out_size = (3 * dsth * dstw) / 2; + cv_run = false; + std::cout << "dstFormat: NV12/NV12" << std::endl; + } - std::cout << "Paddle-lite compute" << std::endl; - double lite_to = 0; - double lite_to_1 = 0; - double lite_to_2 = 0; - double lite_to_3 = 0; - double lite_to_4 = 0; - double lite_to_5 = 0; - TransParam tparam; - tparam.ih = srch; - tparam.iw = srcw; - tparam.oh = dsth; - tparam.ow = dstw; - tparam.flip_param = flip; - tparam.rotate_param = rotate; + if (!has_img) { + src = new uint8_t[in_size]; + // read txt + FILE* fp = fopen(in_txt.c_str(), "r"); + for (int i = 0; i < in_size; i++) { + fscanf(fp, "%d\n", &src[i]); + } + fclose(fp); + int num = 1; + if (srcFormat == ImageFormat::GRAY) { + img = cv::Mat(srch, srcw, CV_8UC1); + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = cv::Mat(srch, srcw, CV_8UC3); + num = 3; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + img = cv::Mat(srch, srcw, CV_8UC4); + num = 4; + } else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + img = cv::Mat(srch, srcw, CV_8UC2); + num = 2; + std::cout << "CV not support NV12"; + } + fill_with_mat(img, src, num); + std::string name = dst_path + "input.jpg"; + cv::imwrite(name, img); // shurutup + } - ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = srch; + tparam.ow = srcw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; - for (int i = 0; i < test_iter; ++i) { - clock_t start = clock(); - clock_t begin = clock(); - image_preprocess.imageConvert(src, lite_dst); - clock_t end = clock(); - lite_to_1 += (end - begin); + TransParam tparam1; + tparam1.ih = srch; + tparam1.iw = srcw; + tparam1.oh = dsth; + tparam1.ow = dstw; + tparam1.flip_param = flip; + tparam1.rotate_param = rotate; - begin = clock(); - 
image_preprocess.imageResize(lite_dst, resize_tmp); - end = clock(); - lite_to_2 += (end - begin); + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + std::cout << "image convert testing" << std::endl; + bool re = test_convert(cv_run, + src, + img, + image_preprocess, + in_size, + out_size, + srcFormat, + dstFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } + std::cout << "image resize testing" << std::endl; + tparam.oh = dsth; + tparam.ow = dstw; + ImagePreprocess image_preprocess1(srcFormat, srcFormat, tparam1); + re = test_resize(cv_run, + src, + img, + image_preprocess1, + in_size, + out_size, + srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (!re) { + return; + } - begin = clock(); - image_preprocess.imageRotate( - lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90); - end = clock(); - lite_to_3 += (end - begin); + std::cout << "image rotate testing" << std::endl; + if (rotate == 90 || rotate == 270) { + tparam.oh = srcw; + tparam.ow = srch; + dsth = srcw; + dstw = srch; + } else { + tparam.oh = srch; + tparam.ow = srcw; + dsth = srch; + dstw = srcw; + } + ImagePreprocess image_preprocess2(srcFormat, srcFormat, tparam); + re = test_rotate(cv_run, + src, + img, + image_preprocess2, + in_size, + out_size, + rotate, + srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (!re) { + return; + } + tparam.oh = srch; + tparam.ow = srcw; + ImagePreprocess image_preprocess3(srcFormat, srcFormat, tparam); + std::cout << "image flip testing" << std::endl; + re = test_flip(cv_run, + src, + img, + image_preprocess3, + in_size, + out_size, + flip, + srcFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } +} - begin = clock(); - image_preprocess.imageFlip( - lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip); - end = clock(); - lite_to_4 += (end - begin); +#if 0 +void test_all_r(std::string dst_path, int test_iter = 1) { + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, 
NV12, + cv::Mat img; + uint8_t* src = nullptr; + int in_size = 0; + for (auto& srcFormat : {1, 3, 4, 11}) { + for (auto& dstFormat : {1, 3, 4, 11}) { + for (auto& srcw : {10, 112, 200}) { + for (auto& srch : {10, 224, 400}) { + for (auto& dstw : {12, 224, 180}) { + for (auto& dsth : {12, 224, 320}) { + for (auto& flip : {-1, 0, 1}) { + for (auto& rotate : {90, 180, 270}) { + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = srch; + tparam.ow = srcw; + tparam.flip_param = (FlipParam)flip; + tparam.rotate_param = rotate; - clock_t over = clock(); - lite_to += (over - start); + TransParam tparam1; + tparam1.ih = srch; + tparam1.iw = srcw; + tparam1.oh = dsth; + tparam1.ow = dstw; + tparam1.flip_param = (FlipParam)flip; + tparam.rotate_param = rotate; - begin = clock(); - image_preprocess.image2Tensor(lite_dst, - &dst_tensor, - (ImageFormat)dstFormat, - srcw, - srch, - layout, - means, - scales); - end = clock(); - lite_to_5 += (end - begin); - } - to_1 = 1000 * to_1 / CLOCKS_PER_SEC; - to_2 = 1000 * to_2 / CLOCKS_PER_SEC; - to_3 = 1000 * to_3 / CLOCKS_PER_SEC; - to_4 = 1000 * to_4 / CLOCKS_PER_SEC; - to1 = 1000 * to1 / CLOCKS_PER_SEC; - std::cout << "opencv convert run time: " << to_1 - << "ms, avg: " << to_1 / test_iter << std::endl; - std::cout << "opencv resize run time: " << to_2 - << "ms, avg: " << to_2 / test_iter << std::endl; - std::cout << "opencv rotate run time: " << to_3 - << "ms, avg: " << to_3 / test_iter << std::endl; - std::cout << "opencv flip time: " << to_4 - << "ms, avg: " << to_4 / test_iter << std::endl; - std::cout << "opencv total run time: " << to1 - << "ms, avg: " << to1 / test_iter << std::endl; - std::cout << "------" << std::endl; + ImagePreprocess image_preprocess( + (ImageFormat)srcFormat, (ImageFormat)dstFormat, tparam); + ImagePreprocess image_preprocess1( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam1); + ImagePreprocess image_preprocess2( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam); + 
int h = srch; + int w = srcw; + if (rotate == 90 || rotate == 270) { + tparam.oh = srcw; + h = srcw; + tparam.ow = srch; + w = srch; + } + ImagePreprocess image_preprocess3( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam); + int in_size = srcw * srch; + int out_size = dstw * dsth; + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + } else if (srcFormat == ImageFormat::BGR || + srcFormat == ImageFormat::RGB) { + in_size = 3 * srch * srcw; + std::cout << "srcFormat: BGR/RGB" << std::endl; + } else if (srcFormat == ImageFormat::RGBA || + srcFormat == ImageFormat::BGRA) { + in_size = 4 * srch * srcw; + std::cout << "srcFormat: BGRA/RGBA" << std::endl; + } else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + in_size = (3 * srch * srcw) / 2; + std::cout << "srcFormat: NV12/NV12" << std::endl; + } + // out + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } else if (dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB) { + out_size = 3 * dsth * dstw; + std::cout << "dstFormat: BGR/RGB" << std::endl; + } else if (dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA) { + out_size = 4 * dsth * dstw; + std::cout << "dstFormat: BGRA/RGBA" << std::endl; + } else if (dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21) { + out_size = (3 * dsth * dstw) / 2; + std::cout << "dstFormat: NV12/NV12" << std::endl; + } + // init + uint8_t* src = new uint8_t[in_size]; + for (int i = 0; i < in_size; i++) { + src[i] = i % 255; + } + cv::Mat img; + int num = 1; + bool cv_run = true; + if (srcFormat == ImageFormat::GRAY) { + img = cv::Mat(srch, srcw, CV_8UC1); + cv_run = false; + } else if (srcFormat == ImageFormat::BGR || + srcFormat == ImageFormat::RGB) { + img = cv::Mat(srch, srcw, CV_8UC3); + num = 3; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + img = cv::Mat(srch, srcw, CV_8UC4); + num = 4; + } 
else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + img = cv::Mat(srch, srcw, CV_8UC2); + num = 2; + cv_run = false; + } + fill_with_mat(img, src, num); + std::string name = dst_path + "input.jpg"; + cv::imwrite(name, img); // shurutup + // convert + bool convert = true; + if (srcFormat == 11 || dstFormat == 11) { + // NV12, cv not support + convert = false; + cv_run = false; + } + if (convert) { + std::cout << "image convert testing"; + bool re = test_convert(cv_run, + src, + img, + image_preprocess, + in_size, + out_size, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } + } - lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC; - lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC; - lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC; - lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC; - lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC; - lite_to = 1000 * lite_to / CLOCKS_PER_SEC; - std::cout << "lite convert run time: " << lite_to_1 - << "ms, avg: " << lite_to_1 / test_iter << std::endl; - std::cout << "lite resize run time: " << lite_to_2 - << "ms, avg: " << lite_to_2 / test_iter << std::endl; - std::cout << "lite rotate run time: " << lite_to_3 - << "ms, avg: " << lite_to_3 / test_iter << std::endl; - std::cout << "lite flip time: " << lite_to_4 - << "ms, avg: " << lite_to_4 / test_iter << std::endl; - std::cout << "lite total run time: " << lite_to - << "ms, avg: " << lite_to / test_iter << std::endl; - std::cout << "lite img2tensor time: " << lite_to_5 - << "ms, avg: " << lite_to_5 / test_iter << std::endl; - std::cout << "------" << std::endl; + // resize + std::cout << "image resize testing"; + bool re = test_resize(cv_run, + src, + img, + image_preprocess1, + in_size, + out_size, + (ImageFormat)srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (convert && !re) { + return; + } + // rotate + std::cout << "image rotate testing"; - double max_ratio = 0; - double max_diff = 
0; - const double eps = 1e-6f; - // save_img - std::cout << "write image: " << std::endl; - std::string resize_name = dst_path + "/resize.jpg"; - std::string convert_name = dst_path + "/convert.jpg"; - std::string rotate_name = dst_path + "/rotate.jpg"; - std::string flip_name = dst_path + "/flip.jpg"; - cv::Mat resize_mat(dsth, dstw, CV_8UC3); - cv::Mat convert_mat(srch, srcw, CV_8UC3); - cv::Mat rotate_mat; - if (rotate == 90 || rotate == 270) { - rotate_mat = cv::Mat(srcw, srch, CV_8UC3); - } else { - rotate_mat = cv::Mat(srch, srcw, CV_8UC3); + re = test_rotate(cv_run, + src, + img, + image_preprocess3, + in_size, + out_size, + rotate, + (ImageFormat)srcFormat, + h, + w, + dst_path, + test_iter); + if (convert && !re) { + return; + } + // flip + std::cout << "image rotate testing"; + re = test_flip(cv_run, + src, + img, + image_preprocess2, + in_size, + out_size, + (FlipParam)flip, + (ImageFormat)srcFormat, + srch, + srcw, + dst_path, + test_iter); + if (convert && !re) { + return; + } + } + } + } + } + } } - cv::Mat flip_mat(srch, srcw, CV_8UC3); - fill_with_mat(resize_mat, resize_tmp); - fill_with_mat(convert_mat, lite_dst); - fill_with_mat(rotate_mat, tv_out_ratote); - fill_with_mat(flip_mat, tv_out_flip); - cv::imwrite(convert_name, convert_mat); - cv::imwrite(resize_name, resize_mat); - cv::imwrite(rotate_name, rotate_mat); - cv::imwrite(flip_name, flip_mat); - delete[] lite_dst; - delete[] resize_tmp; - delete[] tv_out_ratote; - delete[] tv_out_flip; } } } +#endif int main(int argc, char** argv) { if (argc < 7) { std::cerr << "[ERROR] usage: " << argv[0] - << " image_path dst_apth srcFormat dstFormat width height\n"; + << " has_img image_path/txt_path dst_apth srcFormat dstFormat " + "dstw dsth " + << "[options] srcw srch flip rotate test_iter\n "; exit(1); } - std::string image_path = argv[1]; - std::string dst_path = argv[2]; - int srcFormat = atoi(argv[3]); - int dstFormat = atoi(argv[4]); - int width = atoi(argv[5]); - int height = atoi(argv[6]); + 
bool has_img = atoi(argv[1]); + std::string path = argv[2]; + std::string dst_path = argv[3]; + int srcFormat = atoi(argv[4]); + int dstFormat = atoi(argv[5]); + int dstw = atoi(argv[6]); + int dsth = atoi(argv[7]); + int srcw = 100; + int srch = 100; int flip = -1; float rotate = 90; - int layout = 1; - std::string model_dir = "mobilenet_v1"; - if (argc > 7) { - model_dir = argv[7]; - } - if (argc > 8) { - flip = atoi(argv[8]); - } - if (argc > 9) { - rotate = atoi(argv[9]); - } - if (argc > 10) { - layout = atoi(argv[10]); + int test_iter = 10; + if (!has_img) { + std::cout << "It needs srcw and srch"; + srcw = atoi(argv[8]); + srch = atoi(argv[9]); + if (argc > 10) { + flip = atoi(argv[10]); + } + if (argc > 11) { + rotate = atoi(argv[11]); + } + if (argc > 12) { + test_iter = atoi(argv[12]); + } + } else { + if (argc > 8) { + flip = atoi(argv[8]); + } + if (argc > 9) { + rotate = atoi(argv[9]); + } + if (argc > 10) { + test_iter = atoi(argv[10]); + } } - test_img({3}, - {1, 2, 4}, - image_path, - dst_path, - (ImageFormat)srcFormat, - (ImageFormat)dstFormat, - width, - height, - rotate, - (FlipParam)flip, - (LayoutType)layout, - model_dir, - 20); + test_custom(has_img, + path, + path, + dst_path, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + srcw, + srch, + dstw, + dsth, + rotate, + (FlipParam)flip, + test_iter); +#if 0 + test_all_r(dst_path, test_iter); +#endif return 0; } diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc index 24f408bf4a55ea2d499e39902201597c0e8c6e4e..caa085eecb81e54859c1bdd5cd7c0654175b7a9a 100644 --- a/lite/demo/cxx/test_cv/test_model_cv.cc +++ b/lite/demo/cxx/test_cv/test_model_cv.cc @@ -111,7 +111,7 @@ void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { #endif } -void RunModel(std::string model_dir, +void RunModel(std::string model_file, std::string img_path, std::vector input_shape, PowerMode power_mode, @@ -120,7 +120,7 @@ void RunModel(std::string model_dir, 
int warmup = 0) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -161,7 +161,7 @@ void RunModel(std::string model_dir, } std::cout << "================== Speed Report ===================" << std::endl; - std::cout << "Model: " << model_dir + std::cout << "Model: " << model_file << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup << ", repeats: " << test_iter << ", avg time: " << lps / test_iter @@ -187,10 +187,10 @@ void RunModel(std::string model_dir, int main(int argc, char** argv) { if (argc < 7) { std::cerr << "[ERROR] usage: " << argv[0] - << " model_dir image_path input_shape\n"; + << " model_file image_path input_shape\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; std::vector input_shape; input_shape.push_back(atoi(argv[3])); @@ -213,7 +213,7 @@ int main(int argc, char** argv) { if (argc > 10) { warmup = atoi(argv[10]); } - RunModel(model_dir, + RunModel(model_file, img_path, input_shape, (PowerMode)power_mode, diff --git a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc index a9beb1ed28de1f3c28eb5c03b3b660d518ee10c5..d34319050392c74c3fa552bd24c0ea24245ced99 100644 --- a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc +++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc @@ -182,10 +182,10 @@ std::vector detect_object(const float* data, return rect_out; } -void RunModel(std::string model_dir, std::string img_path) { +void RunModel(std::string model_file, std::string img_path) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. 
Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -228,11 +228,11 @@ void RunModel(std::string model_dir, std::string img_path) { int main(int argc, char** argv) { if (argc < 3) { - std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + std::cerr << "[ERROR] usage: " << argv[0] << " model_file image_path\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; - RunModel(model_dir, img_path); + RunModel(model_file, img_path); return 0; } diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 0d8f4d0d192f3563d00bb66778ca4e13a17b93b1..6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id, switch (type) { case AttrType::INT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::FLOAT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::BOOLEAN: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::STRING: return "\"" + desc.GetAttr(name) + "\""; case AttrType::FLOATS: { diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index 58a7959f4eb34cb438bf0e25b49b36110435cc6b..d316eac43f99664fa71cba54b3ab5360852300a0 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -153,16 +153,16 @@ class Module { private: std::string WeightUniqueName() const { - return "w_" + std::to_string(weight_counter_++); + return "w_" + paddle::lite::to_string(weight_counter_++); } std::string TmpVarUniqueName() const { - return "tmp_" + std::to_string(tmp_var_counter_++); + return "tmp_" + paddle::lite::to_string(tmp_var_counter_++); } std::string OpUniqueName() const { - return "op_" + std::to_string(op_counter_++); + return "op_" + 
paddle::lite::to_string(op_counter_++); } std::string KernelUniqueName() const { - return "kernel_" + std::to_string(kernel_counter_++); + return "kernel_" + paddle::lite::to_string(kernel_counter_++); } std::string DataRepr(const std::string &raw_data, PrecisionType dtype); diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 60d5e3b5e234ef19cd144100d07441eb4acf48de..7550d770145d92ebd343f96a82c6f34d72c91ea5 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. -if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() @@ -68,6 +68,7 @@ add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${li add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_conv_compute_arm ARM extra SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_prod_compute_arm ARM extra SRCS reduce_prod_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -82,15 +83,14 @@ add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_comput add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc 
DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) - # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -99,9 +99,20 @@ add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc D add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc 
DEPS ${lite_kernel_deps} math_arm) add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm) +# 4. training kernels +add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) +if(LITE_WITH_TRAIN) + add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) +endif() lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) @@ -121,5 +132,4 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm) - lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm) endif() diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index d50049d48748cf7ec43485a12fa7c65c0171a63d..d609716ee53ec584b8340e9b72498ed95afd5820 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -169,6 +169,16 @@ void RsqrtCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void SquareCompute::Run() { + auto& param = this->Param(); + 
auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_square( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -260,3 +270,8 @@ REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL( + square, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SquareCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index ba1318ea36d01d1c3352679e7b5de12d013c0e84..476d7bb0a32db193d9afb1451507699d0af71736 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -139,6 +139,15 @@ class RsqrtCompute : public KernelLite { virtual ~RsqrtCompute() = default; }; +class SquareCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~SquareCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/activation_grad_compute.cc b/lite/kernels/arm/activation_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..137668fa5e0d1bd07e838b3040a31e084a7475c8 --- /dev/null +++ b/lite/kernels/arm/activation_grad_compute.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/activation_grad_compute.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void SquareGradCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + lite::arm::math::act_square_grad(x_data, + out_grad_data, + x_grad_data, + out_grad_dims.production(), + ctx.threads()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(square_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::SquareGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_grad_compute.h b/lite/kernels/arm/activation_grad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ef03f58fa8cd499192aa6edfe3a7c51b49b14f65 --- /dev/null +++ b/lite/kernels/arm/activation_grad_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class SquareGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~SquareGradCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute.cc b/lite/kernels/arm/argmax_compute.cc index ad279e8f8e1f80639c0b2512f89595d01ef062fd..dda38809875e46835c99b35e564473056391d2c6 100644 --- a/lite/kernels/arm/argmax_compute.cc +++ b/lite/kernels/arm/argmax_compute.cc @@ -30,6 +30,9 @@ void ArgmaxCompute::Run() { lite::Tensor* input = param.X; lite::Tensor* output = param.Out; int axis = param.Axis; + if (axis < 0) { + axis += input->dims().size(); + } lite::arm::math::argmax_func(input, axis, output); return; @@ -47,5 +50,5 @@ REGISTER_LITE_KERNEL(arg_max, paddle::lite::kernels::arm::ArgmaxCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc index 58bdf18474ae69b2bdb863b9818dab41e25bf17b..034d57cdaba77130b319d203c3ae0616720c9d31 100644 --- a/lite/kernels/arm/argmax_compute_test.cc +++ b/lite/kernels/arm/argmax_compute_test.cc @@ -33,7 +33,7 @@ void 
argmax_compute_ref(const operators::ArgmaxParam& param) { int axis = param.Axis; auto x_data = x->data(); - auto output_data = output->mutable_data(); + auto output_data = output->mutable_data(); DDim x_dims = x->dims(); DDim output_dims = output->dims(); @@ -59,7 +59,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { std::greater>()); // out - dtype* out_ptr = output_data + n * out_channel + k; + auto* out_ptr = output_data + n * out_channel + k; *out_ptr = vec[0].second; } } @@ -115,12 +115,12 @@ TEST(argmax_arm, compute) { param.Axis = axis; argmaxOp.SetParam(param); argmaxOp.Launch(); - auto* output_data = output.mutable_data(); + auto* output_data = output.mutable_data(); // obtain output_ref_data param.Out = &output_ref; argmax_compute_ref(param); - auto* output_ref_data = output_ref.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); // compare for (int i = 0; i < output.dims().production(); i++) { diff --git a/lite/kernels/arm/assign_compute.cc b/lite/kernels/arm/assign_compute.cc index b0a5529c368c67c30dfb8517a89bb35c5440e122..8398634bb365c628b64e1ddd2b14984d5f2acb59 100644 --- a/lite/kernels/arm/assign_compute.cc +++ b/lite/kernels/arm/assign_compute.cc @@ -23,16 +23,9 @@ namespace lite { namespace kernels { namespace arm { -void AssignCompute::PrepareForRun() { - // CHECK_OR_FALSE(param_t.Out); -} - void AssignCompute::Run() { - // LOG(INFO) << "into kernel compute run"; auto& param = Param(); - const lite::Tensor* input = param.X; - lite::Tensor* output = param.Out; - output->CopyDataFrom(*input); + param.Out->CopyDataFrom(*param.X); } } // namespace arm @@ -41,7 +34,7 @@ void AssignCompute::Run() { } // namespace paddle REGISTER_LITE_KERNEL( - assign, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AssignCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + assign, kARM, kAny, kNCHW, paddle::lite::kernels::arm::AssignCompute, def) + 
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/assign_compute.h b/lite/kernels/arm/assign_compute.h index 3f0dd8e281047c4201ba4561dbd60250ce5749d2..e144486b5970b4e4e82c58148e33ccc5b2d37ff4 100644 --- a/lite/kernels/arm/assign_compute.h +++ b/lite/kernels/arm/assign_compute.h @@ -22,10 +22,10 @@ namespace lite { namespace kernels { namespace arm { -class AssignCompute : public KernelLite { +class AssignCompute : public KernelLite { public: using param_t = operators::AssignParam; - void PrepareForRun() override; + void Run() override; virtual ~AssignCompute() = default; diff --git a/lite/kernels/arm/assign_value_compute.cc b/lite/kernels/arm/assign_value_compute.cc index 45f28ba36369cc79d70d683894c8a934b9308863..1d097e336f156966689823f4ef6d0d36bc536545 100644 --- a/lite/kernels/arm/assign_value_compute.cc +++ b/lite/kernels/arm/assign_value_compute.cc @@ -58,9 +58,9 @@ void AssignValueCompute::Run() { REGISTER_LITE_KERNEL(assign_value, kARM, - kFloat, + kAny, kNCHW, paddle::lite::kernels::arm::AssignValueCompute, def) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/assign_value_compute.h b/lite/kernels/arm/assign_value_compute.h index f0c33f865bb770adc64a1727521fad10d0516ede..32b1fb41ab733dc3827496833a633dd415f098b9 100644 --- a/lite/kernels/arm/assign_value_compute.h +++ b/lite/kernels/arm/assign_value_compute.h @@ -22,7 +22,7 @@ namespace lite { namespace kernels { namespace arm { -class AssignValueCompute : public KernelLite { +class AssignValueCompute : public KernelLite { public: using param_t = operators::AssignValueParam; diff --git a/lite/kernels/arm/beam_search_compute.cc b/lite/kernels/arm/beam_search_compute.cc index 
5ac53b3b96d0ba676e2909d6102e9edded5e9a92..437ba070b7eaf2d6edc8ecd2dd161f57c8fac345 100644 --- a/lite/kernels/arm/beam_search_compute.cc +++ b/lite/kernels/arm/beam_search_compute.cc @@ -20,8 +20,6 @@ namespace lite { namespace kernels { namespace arm { -void BeamSearchCompute::PrepareForRun() {} - void BeamSearchCompute::Run() { auto& ctx = this->ctx_->template As(); auto& param = this->Param(); @@ -50,11 +48,17 @@ REGISTER_LITE_KERNEL(beam_search, kNCHW, paddle::lite::kernels::arm::BeamSearchCompute, def) - .BindInput("pre_ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("pre_scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("selected_ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("selected_scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("parent_idx", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("pre_ids", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("pre_scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("selected_ids", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("selected_scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("parent_idx", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); diff --git a/lite/kernels/arm/beam_search_compute.h b/lite/kernels/arm/beam_search_compute.h index ebd72732bb25e826c24f20cd28588b170f344268..854696e5b9f40b480f2c92592245e52f46bc8f14 100644 --- a/lite/kernels/arm/beam_search_compute.h +++ b/lite/kernels/arm/beam_search_compute.h @@ -25,10 +25,6 @@ namespace arm { class BeamSearchCompute : public KernelLite { public: - using param_t = operators::BeamSearchParam; 
- - void PrepareForRun() override; - void Run() override; ~BeamSearchCompute() {} diff --git a/lite/kernels/arm/beam_search_decode_compute.cc b/lite/kernels/arm/beam_search_decode_compute.cc index 49ca51bf697f272dacf55db655bc237aff2cc460..e0d4ae3f13c6b8bf2364ab5d50ec45bb245377c6 100644 --- a/lite/kernels/arm/beam_search_decode_compute.cc +++ b/lite/kernels/arm/beam_search_decode_compute.cc @@ -38,7 +38,7 @@ const size_t kSentenceLevel = 1; template struct Sentence { - std::vector word_ids; + std::vector word_ids; std::vector scores; }; @@ -73,7 +73,7 @@ struct BeamSearchDecoder { std::vector source_level_lod = {0}; std::vector sentence_level_lod = {0}; - std::vector id_data; + std::vector id_data; std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { @@ -117,9 +117,9 @@ struct BeamSearchDecoder { *(id_tensor->mutable_lod()) = lod; id_tensor->Resize({static_cast(id_data.size())}); - auto id_ptr = id_tensor->mutable_data(); + auto id_ptr = id_tensor->mutable_data(); TargetCopy( - TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float)); + TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(int64_t)); *(score_tensor->mutable_lod()) = lod; score_tensor->Resize({static_cast(score_data.size())}); @@ -169,7 +169,7 @@ struct BeamSearchDecoder { ++candidate_idx) { prefix_idx_vector.push_back(prefix_idx); size_t idx = prefix_idx_vector.size() - 1; - auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_id = cur_ids.data()[candidate_idx]; auto cur_score = cur_scores.data()[candidate_idx]; sentence_vector.at(idx).word_ids.push_back(cur_id); sentence_vector.at(idx).scores.push_back(cur_score); @@ -184,7 +184,7 @@ struct BeamSearchDecoder { cur_ids.lod().at(kSentenceLevel)[prefix_idx]; for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { auto candidate_idx = prefix_idx_vector.at(idx); - auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_id = cur_ids.data()[candidate_idx]; auto cur_score = 
cur_scores.data()[candidate_idx]; if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { // to skip redundant end tokens @@ -293,8 +293,12 @@ REGISTER_LITE_KERNEL(beam_search_decode, kNCHW, paddle::lite::kernels::arm::BeamSearchDecodeCompute, def) - .BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Ids", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Scores", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("SentenceIds", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("SentenceScores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc index 525e5aefd63474cfac09900e9c411ca5e5868311..6dac97dcbc59991d4680ab1a98a54a900573f631 100644 --- a/lite/kernels/arm/calib_compute.cc +++ b/lite/kernels/arm/calib_compute.cc @@ -23,24 +23,24 @@ namespace lite { namespace kernels { namespace arm { -void CalibComputeFp32ToInt8::Run() { - auto& param = this->Param(); +template +void CalibComputeFp32ToInt8::Run() { + auto& param = this->template Param(); std::vector scale = {param.scale}; - const auto* din = param.input->data(); - auto* dout = param.output->mutable_data(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); lite::arm::math::fp32_to_int8( din, dout, scale.data(), 1, 1, param.input->numel()); - return; } -void CalibComputeInt8ToFp32::Run() { - auto& param = this->Param(); - const auto* din = param.input->data(); +template +void CalibComputeInt8ToFp32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); std::vector scale = {param.scale}; - auto* 
dout = param.output->mutable_data(); + auto* dout = param.output->template mutable_data(); lite::arm::math::int8_to_fp32( din, dout, scale.data(), 1, 1, param.input->numel()); - return; } } // namespace arm @@ -48,43 +48,116 @@ void CalibComputeInt8ToFp32::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .Finalize(); -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) + +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + 
+REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h index 8d9a32bc245579b861607389bac3a4258a0e7abe..a4c8b4c1232101416e95171d70ab629f6a37177b 100644 --- a/lite/kernels/arm/calib_compute.h +++ b/lite/kernels/arm/calib_compute.h @@ -21,8 +21,9 @@ namespace lite { namespace kernels { namespace arm { +template class CalibComputeFp32ToInt8 - : public KernelLite { + : public KernelLite { public: using param_t = operators::CalibParam; @@ -33,8 +34,9 @@ class CalibComputeFp32ToInt8 private: }; +template class 
CalibComputeInt8ToFp32 - : public KernelLite { + : public KernelLite { public: using param_t = operators::CalibParam; diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index 266ae1fc916af4303aca274c39b9b4923fdbb154..3b3ef07e105c583b7e3eb8b64b14610ca0f9e41a 100644 --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -73,7 +73,7 @@ void CastCompute::Run() { } // namespace paddle REGISTER_LITE_KERNEL( - cast, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::CastCompute, def) + cast, kARM, kAny, kNCHW, paddle::lite::kernels::arm::CastCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/cast_compute.h b/lite/kernels/arm/cast_compute.h index d342a405ad593b8457b2899fa3ee6ae843d8f792..1f8da056a8be61de20b5d6e98e455e850b9c9f8d 100644 --- a/lite/kernels/arm/cast_compute.h +++ b/lite/kernels/arm/cast_compute.h @@ -23,7 +23,7 @@ namespace lite { namespace kernels { namespace arm { -class CastCompute : public KernelLite { +class CastCompute : public KernelLite { public: using param_t = operators::CastParam; diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc index 6118cbc6e403645cada84d2434497b084636a4a3..709942a0d9f385e4ba55be32657633c0edc378cf 100644 --- a/lite/kernels/arm/compare_compute.cc +++ b/lite/kernels/arm/compare_compute.cc @@ -73,8 +73,6 @@ inline void get_mid_dims(const lite::DDim &x_dims, (*post) *= x_dims[i]; } } -template