Unverified commit 5dd85c2c authored by: H HappyAngel, committed by: GitHub

Merge pull request #89 from PaddlePaddle/develop

pull code
......@@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
build*
......@@ -61,6 +61,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
......@@ -76,6 +77,7 @@ lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
lite_option(LITE_WITH_ARM_CLANG "Set to ON when the ARM compiler (arm_lang) is clang." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
if(ANDROID OR IOS OR ARMLINUX)
......@@ -130,7 +132,8 @@ endif()
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
message(STATUS "Building the mobile framework")
include(cross_compiling/postproject)
include(cross_compiling/npu) # check and prepare NPU DDK
include(device/npu) # check and prepare NPU DDK
include(device/xpu) # check and prepare XPU SDK
# We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
# So the following third party dependencies are not needed.
......@@ -171,7 +174,7 @@ endif()
########################################################################################
if(LITE_WITH_XPU)
include(xpu)
include(device/xpu)
endif()
include(external/mklml) # download mklml package
......
......@@ -122,6 +122,9 @@ if (LITE_WITH_ARM)
endif()
endif()
if (LITE_WITH_TRAIN)
add_definitions("-DLITE_WITH_TRAIN")
endif()
if (WITH_ARM_DOTPROD)
add_definitions("-DWITH_ARM_DOTPROD")
......
......@@ -23,7 +23,7 @@ endif()
get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH)
find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH})
find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH} NO_DEFAULT_PATH)
if(NOT AR_TOOL)
message(ERROR "Failed to find AR_TOOL in ${AR_PATH}")
......
......@@ -57,10 +57,14 @@ function(check_linker_flag)
endforeach()
set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (LITE_ON_TINY_PUBLISH)
if(NOT LITE_WITH_PYTHON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
......
......@@ -17,15 +17,16 @@ if(NOT LITE_WITH_NPU)
endif()
if(NOT DEFINED NPU_DDK_ROOT)
set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT})
if(NOT NPU_DDK_ROOT)
message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON")
endif()
set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT})
if(NOT NPU_DDK_ROOT)
message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON")
endif()
endif()
message(STATUS "NPU_DDK_ROOT: ${NPU_DDK_ROOT}")
find_path(NPU_DDK_INC NAMES HiAiModelManagerService.h
PATHS ${NPU_DDK_ROOT}/include NO_DEFAULT_PATH)
PATHS ${NPU_DDK_ROOT}/include
NO_DEFAULT_PATH)
if(NOT NPU_DDK_INC)
message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include")
endif()
......@@ -34,21 +35,24 @@ include_directories("${NPU_DDK_ROOT}/include")
set(NPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(NPU_SUB_LIB_PATH "lib64")
set(NPU_SUB_LIB_PATH "lib64")
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(NPU_SUB_LIB_PATH "lib")
set(NPU_SUB_LIB_PATH "lib")
endif()
find_library(NPU_DDK_HIAI_FILE NAMES hiai
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
find_library(NPU_DDK_IR_FILE NAMES hiai_ir
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
......@@ -76,6 +80,3 @@ endif()
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
......@@ -17,15 +17,16 @@ if(NOT LITE_WITH_XPU)
endif()
if(NOT DEFINED XPU_SDK_ROOT)
set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT})
if(NOT XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT})
if(NOT XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
endif()
message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
......@@ -34,7 +35,8 @@ include_directories("${XPU_SDK_ROOT}/XTCL/include")
include_directories("${XPU_SDK_ROOT}/XTDK/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so)
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
......@@ -45,7 +47,8 @@ else()
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so)
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
......@@ -56,7 +59,8 @@ else()
endif()
find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XPU_API_FILE)
message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}")
......@@ -67,7 +71,8 @@ else()
endif()
find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XPU_RT_FILE)
message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}")
......@@ -78,18 +83,12 @@ else()
endif()
find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
if(NOT XPU_SDK_XPU_JITC_FILE)
message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}")
add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE})
endif()
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
......@@ -99,7 +98,7 @@ else()
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
......@@ -275,6 +275,11 @@ set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
# file to record faked kernels for opt python lib
set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt")
file(WRITE ${fake_kernels_src_list} "") # clean
if(LITE_BUILD_TAILOR)
set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
......@@ -303,62 +308,74 @@ function(add_kernel TARGET device level)
return()
endif()
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
# the source list is collected so model_optimize_tool can generate fake kernels.
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
# when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated,
# no need to continue the compilation of the true kernel source.
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
return()
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
if ("${device}" STREQUAL "Host")
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
if (NOT LITE_WITH_ARM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "NPU")
if (NOT LITE_WITH_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "BM")
if (NOT LITE_WITH_BM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
......@@ -366,6 +383,9 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "CUDA")
if (NOT LITE_WITH_CUDA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "")
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_MLU)
return()
endif()
if(NOT DEFINED NEUWARE_HOME)
set(NEUWARE_HOME $ENV{NEUWARE_HOME})
if(NOT NEUWARE_HOME)
message(FATAL_ERROR "Must set NEUWARE_HOME or env NEUWARE_HOME when LITE_WITH_MLU=ON")
endif()
endif()
message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}")
find_path(CNML_INC NAMES cnml.h
PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
if(NOT CNML_INC)
message(FATAL_ERROR "Can not find cnml.h in ${NEUWARE_HOME}/include")
endif()
find_path(CNRT_INC NAMES cnrt.h
PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
if(NOT CNRT_INC)
message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include")
endif()
include_directories("${NEUWARE_HOME}/include")
find_library(CNML_LIB_FILE NAMES cnml
PATHS ${NEUWARE_HOME}/lib64)
if(NOT CNML_LIB_FILE)
message(FATAL_ERROR "Can not find CNML Library in ${NEUWARE_HOME}/lib64")
else()
message(STATUS "Found CNML Library: ${CNML_LIB_FILE}")
add_library(cnml_lib SHARED IMPORTED GLOBAL)
set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE})
endif()
find_library(CNRT_LIB_FILE NAMES cnrt
PATHS ${NEUWARE_HOME}/lib64)
if(NOT CNRT_LIB_FILE)
message(FATAL_ERROR "Can not find CNRT Library in ${NEUWARE_HOME}/lib64")
else()
message(STATUS "Found CNRT Library: ${CNRT_LIB_FILE}")
add_library(cnrt_lib SHARED IMPORTED GLOBAL)
set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE})
endif()
# Testing Tools
Basic profiler collects per-kernel timing statistics on the CPU.
## How to enable
Follow the **full_publish** part of [Compile and Install](../installation/source_compile) to set up the environment, then add `-DLITE_WITH_PROFILE=ON` when running cmake to enable the corresponding support.
## Usage example
After the model finishes executing, a profiler log like the following is printed automatically:
```
kernel average min max count
feed/def/1/4/2 0 0 0 1
conv2d/def/4/1/1 1175 1175 1175 1
conv2d/def/4/1/1 1253 1253 1253 1
depthwise_conv2d/def/4/1/1 519 519 519 1
conv2d/def/4/1/1 721 721 721 1
elementwise_add/def/4/1/1 18 18 18 1
conv2d/def/4/1/1 2174 2174 2174 1
depthwise_conv2d/def/4/1/1 380 380 380 1
conv2d/def/4/1/1 773 773 773 1
elementwise_add/def/4/1/1 2 2 2 1
conv2d/def/4/1/1 1248 1248 1248 1
depthwise_conv2d/def/4/1/1 492 492 492 1
conv2d/def/4/1/1 1150 1150 1150 1
elementwise_add/def/4/1/1 33 33 33 1
elementwise_add/def/4/1/1 3 3 3 1
conv2d/def/4/1/1 1254 1254 1254 1
depthwise_conv2d/def/4/1/1 126 126 126 1
```
# X2Paddle Supported Models
## Multi-framework support
|Model | caffe | tensorflow | onnx |
|---|---|---|---|
|mobilenetv1 | Y | Y | |
|mobilenetv2 | Y | Y | Y |
|resnet18 | Y | Y | |
|resnet50 | Y | Y | Y |
|mnasnet | Y | Y | |
|efficientnet | Y | Y | Y |
|squeezenetv1.1 | Y | Y | Y |
|shufflenet | Y | Y | |
|mobilenet_ssd | Y | Y | |
|mobilenet_yolov3 | | Y | |
|inceptionv4 | | | |
|mtcnn | Y | Y | |
|facedetection | Y | | |
|unet | Y | Y | |
|ocr_attention | | | |
|vgg16 | | | |
# CV Image Preprocessing
# CV Image Preprocessing API
Set the `BUILD_CV` variable in the build script `Paddle-Lite/lite/tools/build.sh` to `ON`, and see [Compile from Source](../source_compile) for the other build options, to make sure Lite compiles correctly. The `CV` image acceleration library will then be built in, and the API header `paddle_image_preprocess.h` will be generated.
Set the `BUILD_CV` variable in the build script `Paddle-Lite/lite/tools/build.sh` to `ON`, and see [Compile from Source](../user_guides/source_compile) for the other build options, to make sure Lite compiles correctly. The `CV` image acceleration library will then be built in, and the API header `paddle_image_preprocess.h` will be generated.
- Hardware platform: `ARM`
- Operating systems: `MAC` and `LINUX`
......
# C++ API Documentation
# C++ API
## CreatePaddlePredictor
......@@ -260,7 +260,7 @@ class MobileConfig;
`MobileConfig` is used to configure a lightweight PaddlePredictor, e.g. the path of a NaiveBuffer-format model, the memory address of the model (when loading the model from memory), the power mode, the number of worker threads, and so on.
*Note: the input model must be converted to an optimized NaiveBuffer-format model with the [Model Optimize Tool](../model_optimize_tool).*
*Note: the input model must be converted to an optimized NaiveBuffer-format model with the [Model Optimize Tool](../user_guides/model_optimize_tool).*
Example:
......@@ -277,13 +277,13 @@ config.set_power_mode(LITE_POWER_HIGH);
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
```
### `set_model_from_file(model_dir)`
### `set_model_from_file(model_file)`
Sets the model file; used when the model is loaded from disk.
Parameters:
- `model_dir(std::string)` - path of the model file
- `model_file(std::string)` - path of the model file
Returns: `None`
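For reference, a minimal sketch of the disk-loading flow (the model file name here is hypothetical):

```c++
// Load a NaiveBuffer model produced by the opt tool from disk (sketch).
MobileConfig config;
config.set_model_from_file("mobilenet_v1_opt.nb");  // hypothetical file name
std::shared_ptr<PaddlePredictor> predictor =
    CreatePaddlePredictor<MobileConfig>(config);
```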
......@@ -589,7 +589,7 @@ for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
Gets a pointer to an output Tensor by its name.
**Note**: the `GetTensor` interface is a debugging interface for developers; it can output any node of the [converted](../model_optimize_tool) model. If `GetTensor(InputName)` returns an empty `Tensor`, a likely cause is that the Tensor named `InputName` was fused away during the **subgraph fusion** step of model conversion.
**Note**: the `GetTensor` interface is a debugging interface for developers; it can output any node of the [converted](../user_guides/model_optimize_tool) model. If `GetTensor(InputName)` returns an empty `Tensor`, a likely cause is that the Tensor named `InputName` was fused away during the **subgraph fusion** step of model conversion.
Parameters:
......
# Java API Documentation
# Java API
## MobileConfig
......@@ -301,6 +301,12 @@ Tensor is Paddle-Lite's data container, used to wrap the underlying data and
Example:
```java
// Import the Java API
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.Tensor;
import com.baidu.paddle.lite.Predictor;
import com.baidu.paddle.lite.PowerMode;
// Set up MobileConfig
MobileConfig config = new MobileConfig();
config.setModelDir(modelPath);
......@@ -325,7 +331,7 @@ input.setData(inputBuffer);
predictor.run();
// Get the output Tensor
Tensor output = predictor.getOutput(0);
Tensor result = predictor.getOutput(0);
// Get the output data
float[] output = result.getFloatData();
for (int i = 0; i < 1000; ++i) {
......
# Python API Documentation
# Python API
## create_paddle_predictor
......
# Benchmark Data
# Performance Data
See [benchmark_tools](benchmark_tools); the **one-click benchmark** is recommended.
......@@ -15,14 +15,12 @@
* int8 models
* mobilenet_v1
* mobilenet_v2
* resnet50
* Test devices (Android NDK ndk-r17c)
* Snapdragon 855
* xiaomi mi9, snapdragon 855
* 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz
* Snapdragon 845
* xiaomi mi8, 845
* 2.8GHz (4 big cores), 1.7GHz (4 little cores)
......@@ -30,20 +28,12 @@
* Snapdragon 835
* xiaomi mix2, snapdragon 835
* 2.45GHz (4 big cores), 1.9GHz (4 little cores)
* Snapdragon 625
* oppo R9s, snapdragon625
* A53 x 8, big core@2.0GHz
* Snapdragon 653
* 360 N5, snapdragon 653
* 4 x A73@2.0GHz + 4 x A53@1.4GHz
* Kirin 970
* HUAWEI Mate10
* Test notes
* branch: release/2.0.0
* branch: release/v2.3.0
* warmup=10, repeats=30; the average time is reported, in ms
* When the number of threads is 1, ```DeviceInfo::Global().SetRunMode``` is set to LITE_POWER_HIGH; otherwise LITE_POWER_NO_BIND is used (see the sketch after this list)
* The model's input image dimensions are {1, 3, 224, 224}, and every input value is 1
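For reference, a sketch of the equivalent setting through the public API (the model path is hypothetical; `DeviceInfo` itself is an internal interface):

```c++
// Reproduce the benchmark's single-thread setting with MobileConfig (sketch).
MobileConfig config;
config.set_model_from_file("mobilenet_v1_opt.nb");  // hypothetical model file
config.set_power_mode(LITE_POWER_HIGH);  // 1 thread: LITE_POWER_HIGH
config.set_threads(1);                   // >1 thread: use LITE_POWER_NO_BIND instead
```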
......@@ -55,78 +45,59 @@
#### paddlepaddle model
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15
mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84
shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66
squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71
mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55
mobilenet_v1 |33.27 |19.52 |11.14 |31.72 |18.76 |10.24 |
mobilenet_v2 |29.08 |15.79 |9.25 |25.89 |14.17 |8.38 |
shufflenet_v2 |4.40 |3.09 |2.30 |4.28 |3.02 |2.35 |
squeezenet_v1.1 |19.96 |12.61 |8.76 |18.25 |11.46 |7.97 |
mnasnet |21.00 |12.54 |7.28 |19.65 |11.65 |6.96 |
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
Snapdragon 845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64
mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63
shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97
squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06
mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09
mobilenet_v1 |66.36 |35.97 |19.45 |62.66 |33.87 |17.85 |
mobilenet_v2 |45.86 |25.53 |14.6 |41.58 |23.24 |13.39 |
shufflenet_v2 |7.58 |4.89 |3.41 |7.44 |4.91 |3.58 |
squeezenet_v1.1 |37.15 |22.74 |13.51 |34.69 |21.27 |12.74 |
mnasnet |40.09 |21.73 |11.91 |38.19 |21.02 |12.11 |
Kirin 980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41
mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38
shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82
squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66
mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68
Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43
mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53
shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91
squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62
mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56
mobilenet_v1 |96.98 |53.92 |32.24 |89.31 |48.02 |27.58 |
mobilenet_v2 |67.72 |37.66 |23.82 |60.10 |34.36 |21.05 |
shufflenet_v2 |10.72 |6.62 |4.63 |10.10 |6.44 |4.63 |
squeezenet_v1.1 |53.89 |33.28 |20.73 |50.83 |32.31 |19.51 |
mnasnet |59.55 |33.53 |20.32 |56.21 |31.58 |19.06 |
#### caffe model
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 |
mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 |
shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 |
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 |
mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 |
shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 |
mobilenet_v1 |33.36 |19.45 |11.26 |31.63 |18.74 |10.31 |
mobilenet_v2 |31.63 |19.21 |11.61 |28.34 |17.14 |10.16 |
shufflenet_v2 |4.46 |3.08 |2.32 |4.26 |2.98 |2.35 |
Kirin 980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
Snapdragon 845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 |
mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 |
shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 |
mobilenet_v1 |66.32 |35.83 |19.56 |62.52 |33.79 |17.91 |
mobilenet_v2 |58.46 |32.69 |18.56 |53.72 |29.86 |16.80 |
shufflenet_v2 |7.65 |4.82 |3.46 |7.55 |4.97 |3.62 |
Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 |
mobilenet_v2 |87.40 |50.25 |31.85 |85.55 |48.11 |28.24 |
shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 |
mobilenet_v1 |95.38 |54.09 |32.03 |95.05 |48.33 |27.54 |
mobilenet_v2 |88.46 |48.98 |30.23 |79.28 |44.64 |27.10 |
shufflenet_v2 |10.07 |6.51 |4.61 |10.31 |6.50 |4.66 |
#### int8 quantized model test data
......@@ -136,6 +107,7 @@ threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 |
mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 |
Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
......
# Benchmark Testing Method
# Testing Method
This document describes how to benchmark Paddle-Lite with an Android phone from the terminal in an **Ubuntu 16.04 cross-compilation environment**, and introduces two benchmark methods:
......@@ -57,7 +57,7 @@ wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/bench
#### Method 2: build the benchmark_bin file from source
Prepare the build environment following [Compile from Source](../installation/source_compile), pull the latest PaddleLite release code, and run the following in the repository root:
Prepare the build environment following [Compile from Source](../user_guides/source_compile), pull the latest PaddleLite release code, and run the following in the repository root:
```shell
###########################################
......@@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
> Performance numbers differ across phones, system versions, and models.
```shell
run benchmark armv7
run benchmark armv8
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 159.8427 ms
-- mobilenet_v1 avg = 235.0072 ms
-- mobilenet_v2 avg = 173.0387 ms
-- shufflenet_v2 avg = 76.0040 ms
-- squeezenet_v11 avg = 164.2957 ms
mnasnet min = 19.83500 max = 19.38500 average = 19.65503
mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983
mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623
shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890
squeezenet min = 17.67400 max = 17.47900 average = 17.57677
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 83.1287 ms
-- mobilenet_v1 avg = 121.6029 ms
-- mobilenet_v2 avg = 86.6175 ms
-- shufflenet_v2 avg = 41.5761 ms
-- squeezenet_v11 avg = 87.8678 ms
mnasnet min = 11.85600 max = 11.72000 average = 11.77127
mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593
mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450
shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400
squeezenet min = 12.07100 max = 11.33400 average = 11.41253
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 73.3880 ms
-- mobilenet_v1 avg = 119.0739 ms
-- mobilenet_v2 avg = 85.3050 ms
-- shufflenet_v2 avg = 38.0762 ms
-- squeezenet_v11 avg = 64.2201 ms
mnasnet min = 7.19300 max = 7.02600 average = 7.08480
mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267
mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707
shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477
squeezenet min = 8.60000 max = 7.85200 average = 7.98407
--------------------------------------
run benchmark armv8
run benchmark armv7
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 165.3073 ms
-- mobilenet_v1 avg = 306.0188 ms
-- mobilenet_v2 avg = 195.1884 ms
-- shufflenet_v2 avg = 99.3692 ms
-- squeezenet_v11 avg = 156.6971 ms
mnasnet min = 20.98300 max = 20.81400 average = 20.92527
mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490
mobilenetv2 min = 25.91400 max = 25.61700 average = 25.73097
shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757
squeezenet min = 19.31800 max = 19.20000 average = 19.26530
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 90.2290 ms
-- mobilenet_v1 avg = 157.0007 ms
-- mobilenet_v2 avg = 118.1607 ms
-- shufflenet_v2 avg = 68.6804 ms
-- squeezenet_v11 avg = 91.3090 ms
mnasnet min = 12.59900 max = 12.46600 average = 12.52207
mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897
mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843
shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863
squeezenet min = 12.87900 max = 12.12900 average = 12.22530
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 179.9730 ms
-- mobilenet_v1 avg = 204.0684 ms
-- mobilenet_v2 avg = 181.6486 ms
-- shufflenet_v2 avg = 123.2728 ms
-- squeezenet_v11 avg = 412.9046 ms
mnasnet min = 7.31400 max = 7.12900 average = 7.20357
mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383
mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907
shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360
squeezenet min = 8.27000 max = 8.10600 average = 8.19000
--------------------------------------
```
# Android Demo
## Application scenarios
The Paddle-Lite demo project [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) contains demo projects for the [Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo), [iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo), and [Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo) platforms, covering four application scenarios: [face detection](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/face_detection_demo), [human segmentation](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/human_segmentation_demo), [image classification](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo), and [object detection](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo).
### 1. Face detection
Face detection is Paddle-Lite's human-face detection demo. It provides accurate, real-time face detection on mobile devices and can support business scenarios built on face detection. Example results of on-device inference:
<p align="center"><img width="300" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/face.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/face2.jpg"/></p>
### 2. Human segmentation
Human segmentation is Paddle-Lite's image segmentation demo. It provides real-time human segmentation on mobile devices and can be applied to automatic ID-photo matting, area measurement, intelligent transportation (marking lanes and traffic signs), and similar scenarios. Example results of on-device inference:
<p align="center"><img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/human.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/human2.jpg"/></p>
### 3. Image classification
Image classification is Paddle-Lite's image processing demo. It provides real-time object recognition on mobile devices and can be applied to automatic sorting or quality inspection on production lines, recognizing medical images, assisting doctors' visual diagnosis, and similar scenarios. Example results of on-device inference:
<p align="center"><img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat2.jpg"/></p>
### 4. Object detection
Object detection is Paddle-Lite's image recognition demo. It can detect the position, name, and number of multiple objects on mobile devices, and can be applied to video surveillance (detecting prohibited objects or behavior), industrial quality inspection (the number and position of tiny defects), medical diagnosis (cell counting, traditional-medicine identification), and similar scenarios. Example results of on-device inference:
<p align="center"><img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog2.jpg"/></p>
## Deploying the Android demo
Below we use the **object detection demo (object_detection_demo)** as an example to explain deployment.
**Goal**: deploy an Android APP based on the Paddle-Lite inference library to a phone and run object detection.
**Required environment**: Android Studio; an Android phone (with USB debugging enabled); the [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) project downloaded locally.
**Deployment steps**:
1. The object detection Android demo is located at `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo`.
2. Open the object_detection_demo project with Android Studio (this step requires network access).
3. Connect the phone to the computer, enable **USB debugging** and **file transfer mode**, and connect your device in Android Studio (the phone must allow installing software over USB).
![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png)
4. Press the Run button; the APP is built and installed on the phone automatically. (This process automatically downloads the Paddle-Lite inference library and the model, so network access is required.)
On success it looks as follows. Figure 1: the APP installed on the phone. Figure 2: the APP after launch; it automatically detects and labels the objects in the image.
<p align="center"><img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp0.png"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp1.jpg"/></p>
## Structure of the Android demo
The code structure of the Android demo is shown below:
<p align="center"><img width="600" height="450" src="http://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_struct.png"/>
1. Predictor.java: the inference code
```shell
# Location:
object_detection_demo/app/src/main/java/com/baidu/paddle/lite/demo/object_detection/Predictor.java
```
2. model.nb: the model file (a Paddle-Lite model converted by the opt tool); pascalvoc_label_list: the `labels` file used when the model was trained
```shell
# Location:
object_detection_demo/app/src/main/assets/models/ssd_mobilenet_v1_pascalvoc_for_cpu/model.nb
object_detection_demo/app/src/main/assets/labels/pascalvoc_label_list
```
3. libpaddle_lite_jni.so and PaddlePredictor.jar: the Paddle-Lite Java inference library and Jar package
```shell
# Location
object_detection_demo/app/src/main/jniLibs/arm64-v8a/libpaddle_lite_jni.so
object_detection_demo/app/libs/PaddlePredictor.jar
```
4. build.gradle: the gradle script defining the build process. (No changes needed; it defines the automatic download of the Paddle-Lite inference library and the model.)
```shell
# Location
object_detection_demo/app/build.gradle
```
## Code walkthrough (running inference with the Paddle-Lite Java API)
The Android demo is built on the Java API; calling the Paddle-Lite Java API takes the following five steps. For a more detailed API description see [Paddle-Lite Java API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html).
```java
// Import the Java API
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.Tensor;
import com.baidu.paddle.lite.Predictor;
import com.baidu.paddle.lite.PowerMode;
// 1. Set up the configuration: MobileConfig
MobileConfig config = new MobileConfig();
config.setModelFromFile(<modelPath>); // Set the Paddle-Lite model path
config.setPowerMode(PowerMode.LITE_POWER_NO_BIND); // Set the CPU power mode
config.setThreads(4); // Set the number of worker threads
// 2. Create the PaddlePredictor
PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
// 3. Set the input data
long[] dims = {100, 100};
float[] inputBuffer = new float[10000];
for (int i = 0; i < 10000; ++i) {
inputBuffer[i] = i;
}
Tensor input = predictor.getInput(0);
input.resize(dims);
input.setData(inputBuffer);
// 4. Run inference
predictor.run();
// 5. Get the output data
Tensor result = predictor.getOutput(0);
float[] output = result.getFloatData();
for (int i = 0; i < 1000; ++i) {
System.out.println(output[i]);
}
```
......@@ -2,7 +2,7 @@
## 1. Download the latest inference library
The inference library download page is [Paddle-Lite prebuilt libraries](../installation/release_lib); choose a suitable version as needed.
The inference library download page is [Paddle-Lite prebuilt libraries](../user_guides/release_lib); choose a suitable version as needed.
Taking the **Android-ARMv8 architecture** as an example, you can download the following version:
......@@ -173,7 +173,7 @@ predictor->Run();
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
// Convert to data
auto output_data=output_tensor<float>();
auto output_data=output_tensor->data<float>();
```
......
# Model Inference with CUDA in Lite
# Deploying PaddleLite Inference with CUDA
Lite supports building and running with CUDA on the x86_64 and arm64 architectures (e.g. TX2).
......@@ -28,7 +28,27 @@ cd Paddle-Lite
./lite/tools/build.sh --build_python=ON cuda
```
After the build finishes, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
## Build output
The CUDA build output is located at `build_cuda/inference_lite_lib`.
**Contents**:
1. `bin` folder: executable tools; currently empty
2. `cxx` folder: the C++ libraries and corresponding headers
   - `include`: header files
   - `lib`: library files
     - bundled static library:
       - `libpaddle_api_full_bundled.a`: static library containing both the full_api and light_api functionality
     - bundled dynamic library:
       - `libpaddle_full_api_shared.so`: dynamic library containing both the full_api and light_api functionality
3. `third_party` folder: third-party libraries
4. `demo` folder: C++ demo.
If the python option was enabled, `lite_core.so` is also generated under `build_cuda/inference_lite_lib/python/lib/`.
## Run
......@@ -36,7 +56,6 @@ cd Paddle-Lite
Step 1: download the darknet_yolov3 model; for model details see [here](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3)
```
# Download the model
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz
......@@ -47,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
Step 2: run
**NOTE:** this example uses the python interface; a C++ interface and example will be released later.
**NOTE:** this example uses the python interface.
``` python
#-*- coding: utf-8 -*-
......@@ -107,4 +126,14 @@ print (output_tensor.float_data()[:6])
```
**NOTE:** CUDA support is still under active development.
**NOTE:** this example uses the C++ interface.
```
cd build_cuda/inference_lite_lib/demo/cxx/
mkdir build && cd build
cmake ..
make
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz
tar -zxf yolov3_infer.tar.gz
./demo yolov3_infer
```
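For reference, a sketch of how the CUDA target is selected when driving the full (CxxConfig) C++ API directly (the model directory below is hypothetical):

```c++
// Select CUDA kernels via valid places (sketch; kHost is the CPU fallback).
CxxConfig config;
config.set_model_dir("yolov3_infer");  // hypothetical: the extracted model dir
config.set_valid_places({
    Place{TARGET(kCUDA), PRECISION(kFloat)},
    Place{TARGET(kHost), PRECISION(kFloat)},
});
std::shared_ptr<PaddlePredictor> predictor =
    CreatePaddlePredictor<CxxConfig>(config);
```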
# Model Inference with FPGA in Lite
# Deploying PaddleLite Inference with FPGA
Paddle Lite supports model inference on the ARM-based FPGAs zu3/zu5/zu9 and provides armv8 cross compilation.
......@@ -22,7 +22,7 @@ CMake build options:
- Set `LITE_WITH_FPGA=ON` and `LITE_WITH_ARM=ON`
The other build options are the same as for the ARM build; see ["ARM build of Paddle Lite under Docker"](../source_compile).
The other build options are the same as for the ARM build; see ["ARM build of Paddle Lite under Docker"](../user_guides/source_compile).
Example:
```shell
cmake .. \
......
# iOS Demo
## Application scenarios
The Paddle-Lite demo project [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) contains demo projects for the [Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo), [iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo), and [Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo) platforms. The iOS demo covers two application scenarios: [image classification](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo) and [object detection](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo).
### 1. Image classification
Image classification is Paddle-Lite's image processing demo. It provides real-time object recognition on mobile devices and can be applied to automatic sorting or quality inspection on production lines, recognizing medical images, assisting doctors' visual diagnosis, and similar scenarios. Example results of on-device inference:
<p align="center"><img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/tabby_cat2.jpg"/></p>
### 2. Object detection
Object detection is Paddle-Lite's image recognition demo. It can detect the position, name, and number of multiple objects on mobile devices, and can be applied to video surveillance (detecting prohibited objects or behavior), industrial quality inspection (the number and position of tiny defects), medical diagnosis (cell counting, traditional-medicine identification), and similar scenarios. Example results of on-device inference:
<p align="center"><img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog.jpg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="250" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/demo/dog2.jpg"/></p>
## Deploying the iOS demo
Below we use the **object detection demo (object_detection_demo)** as an example to explain how to deploy the iOS project.
**Goal**: deploy an iOS APP based on the Paddle-Lite inference library to an iPhone and run object detection.
**Required environment**: a Mac with Xcode installed; an iPhone; the [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) project downloaded locally.
**Deployment steps**:
1. The object detection iOS demo is located at `Paddle-Lite-Demo\PaddleLite-ios-demo\object_detection_demo`.
2. Run the `download_dependencies.sh` script in a terminal to download the model and the Paddle-Lite inference library automatically:
```shell
cd PaddleLite-ios-demo       # 1. Enter Paddle-Lite-Demo\PaddleLite-ios-demo in the terminal
sh download_dependencies.sh  # 2. Run the script to download the dependencies (network access required)
```
When the download finishes, the message `Extract done` is shown.
3. Open the `object_detection_demo/detection_demo.xcodeproj` file with Xcode and modify the project configuration:
change the `General/Identity` and `Signing&Capabilities` settings to your own bundle identifier and team name. (This is mandatory; otherwise the project will not build.)
![Xcode1](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode1.png)
![Xcode2](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode2.png)
4. Connect the iPhone to the computer and connect your phone in Xcode. (The first time an iPhone is connected to the computer, select and trust this computer in the iPhone's `Settings -> General -> Device Management`.)
<p align="center"><img width="600" height="250" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode-phone.jpg"/>
5. Press the Run button in the top-left corner; the APP is built and installed on the phone automatically. On the iPhone, trust the APP (go to `Settings -> General -> Device Management`, select the newly installed APP, and `verify the app`).
On success it looks as follows. Figure 1: the APP installed on the phone. Figure 2: the APP after launch; it automatically detects and labels the objects in the image.
<p align="center"><img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/IOS2.jpeg"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/IOS3.jpeg"/></p>
## Structure of the iOS demo
The code structure of the iOS demo is shown below:
<p align="center"><img width="600" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/IOS-struct.png"/>
1. mobilenetv1-ssd: the model file (a Paddle-Lite model converted by the opt tool)
```shell
# Location:
ios-detection_demo/detection_demo/models/mobilenetv1-ssd
```
2. libpaddle_api_light_bundled.a and paddle_api.h: the Paddle-Lite C++ inference library and headers
```shell
# Location:
# iOS inference library
ios-detection_demo/detection_demo/lib/libpaddle_api_light_bundled.a
# Inference library headers
ios-detection_demo/detection_demo/include/paddle_api.h
ios-detection_demo/detection_demo/include/paddle_use_kernels.h
ios-detection_demo/detection_demo/include/paddle_use_ops.h
```
3. ViewController.mm: the main inference code
```shell
# Location
ios-detection_demo/detection_demo/ViewController.mm
```
## Code walkthrough (running inference with the Paddle-Lite C++ API)
The iOS demo is built on the C++ API; calling the Paddle-Lite C++ API takes the following five steps. For a more detailed API description see [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/cxx_api_doc.html).
```c++
#include <iostream>
// Include the C++ API
#include "paddle_lite/paddle_api.h"
#include "paddle_lite/paddle_use_ops.h"
#include "paddle_lite/paddle_use_kernels.h"
// 1. Set up MobileConfig
MobileConfig config;
config.set_model_from_file(<modelPath>); // Set the path of the NaiveBuffer model
config.set_power_mode(LITE_POWER_NO_BIND); // Set the CPU power mode
config.set_threads(4); // Set the number of worker threads
// 2. Create the PaddlePredictor
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
// 3. Set the input data
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize({1, 3, 224, 224});
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
data[i] = 1;
}
// 4. Run inference
predictor->Run();
// 5. Get the output data
std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
<< std::endl;
}
```
......@@ -9,7 +9,7 @@
## Build
First, in the PaddleLite development [Docker image](../installation/source_compile), pull the latest PaddleLite code and build the inference library for your phone's architecture.
First, in the PaddleLite development [Docker image](../user_guides/source_compile), pull the latest PaddleLite code and build the inference library for your phone's architecture.
Below we take the arm8 architecture as an example. Enter the paddlelite directory and run the following commands:
```shell
......
# Using the Huawei NPU
# Deploying PaddleLite Inference with the (Huawei) NPU
Paddle Lite is the first inference framework to support the NPU of Huawei's self-developed DaVinci architecture (the NPU in the Kirin 810/990 SoCs).
It works by analyzing the Paddle model online, converting Paddle operators into HiAI IR, and then calling the HiAI IR/Builder/Runtime APIs to generate and execute the HiAI model.
......@@ -91,7 +91,7 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared tiny_publish
```
Note: to keep the build environment consistent, it is recommended to set up the Docker development environment described in [Compile from Source](../installation/source_compile) before running the commands above.
Note: to keep the build environment consistent, it is recommended to set up the Docker development environment described in [Compile from Source](../user_guides/source_compile) before running the commands above.
## Generating an optimized NPU model
......@@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=npu,arm \
--prefer_int8_kernel=(true|false) \
--record_tailoring_info =(true|false)
```
- The model generated by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually generate the NPU HiAI model. Only at execution time are the marked Paddle operators converted into HiAI IR and the HiAI model finally generated and executed; see PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation.
......@@ -111,13 +110,13 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
## Loading and running an NPU model through the Java API
- Usage is the same as in the [Java demo](../user_guides/java_demo); no extra parameters need to be set, just replace the model with an NPU model. The Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) supports both CPU and NPU models.
- Usage is the same as in the [Java demo](java_demo); no extra parameters need to be set, just replace the model with an NPU model. The Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) supports both CPU and NPU models.
Note: when copying libpaddle_lite_jni.so, since it depends on the HiAI DDK .so libraries and libc++_shared.so, all the .so files under ai_ddk_lib/lib or ai_ddk_lib/lib64 in the HiAI DDK, plus libc++_shared.so, must be copied into the same directory as libpaddle_lite_jni.so.
## Loading and running an NPU model through the C++ API
- Usage is the same as in the [C++ demo](../user_guides/cpp_demo); likewise no extra parameters need to be set, just replace the model with an NPU model.
- Usage is the same as in the [C++ demo](cpp_demo); likewise no extra parameters need to be set, just replace the model with an NPU model.
Note: 1) Android emulators cannot be used; a real device is required, and it must be a Huawei phone with an NPU. 2) When pushing the target program to the phone with adb push, all the .so files under ai_ddk_lib/lib or ai_ddk_lib/lib64 in the HiAI DDK, plus libc++_shared.so, must be pushed into the same directory as the target program.
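For reference, a minimal sketch of loading an NPU-marked model with the C++ API (the model directory name below is hypothetical):

```c++
// An NPU model converted by model_optimize_tool is loaded like any other
// model; the HiAI model is generated and executed at runtime.
MobileConfig config;
config.set_model_dir("mobilenet_v1_npu_opt");  // hypothetical opt output dir
std::shared_ptr<PaddlePredictor> predictor =
    CreatePaddlePredictor<MobileConfig>(config);
```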
......
# Model Inference with OpenCL in Lite
# Deploying PaddleLite Inference with OpenCL
Lite supports running OpenCL-based programs on Android; cross compilation for armv8 and armv7 under Ubuntu is currently supported.
......@@ -11,18 +11,45 @@ Lite supports running OpenCL-based programs on Android; currently supported under Ubuntu
See the **Source Compilation Guide - Environment Preparation** chapter for details.
### Build options
|Option|Description|Values|
|--------|--------|--------|
|--arm_os|target operating system|currently only `android` is supported (and is the default)|
|--arm_abi|architecture type, armv8 or armv7|default `armv8`, i.e. arm64-v8a; `armv7` is armeabi-v7a|
|--arm_lang|compiler used to build the target files|default gcc; gcc and clang are supported|
### Example: building the Paddle-Lite OpenCL library
Note: the example targets android-armv8-opencl with the Docker container build environment, CMake 3.10, and android-ndk-r17c under `/opt/`.
#### Build command for Lite users (no unit tests, with build artifacts)
- `arm_os`: `[android]`; linux is not supported yet;
- `arm_abi`: `[armv7 | armv8]`;
- `arm_lang`: `[gcc]`; clang is not supported yet;
- `build_extra`: `[OFF | ON]`, builds the full set of ops and kernels; the binary is larger and the build takes longer;
- `build_cv`: `[OFF | ON]`, builds the CV preprocessing module implemented with ARM CPU NEON;
- `android_stl`: `[c++_shared | c++_static]`, how the paddlelite library links `android_stl`; `c++_shared` yields a smaller dynamic library, but remember to ship a `libc++_shared.so` (from the Android NDK) matching the paddlelite build (armv7 or armv8);
Note: run the build via `./lite/tools/build.sh`.
```bash
# Assume we are in the root directory of the Lite source tree
# Export NDK_ROOT; check whether your install directory differs from this example
export NDK_ROOT=/opt/android-ndk-r17c
# Remove the .h files auto-generated by the previous CMake run
rm ./lite/api/paddle_use_kernels.h
rm ./lite/api/paddle_use_ops.h
# Build with the specified options
./lite/tools/build.sh \
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
--build_extra=OFF \
--build_cv=OFF \
--android_stl=c++_shared \
opencl
```
#### Build command for Lite developers (with unit tests and build artifacts)
Note: run the build via `./lite/tools/ci_build.sh`; this command builds both the armv7 and armv8 OpenCL libraries. Although artifacts are produced, building the unit tests can make the package large, so this is not recommended.
```bash
# Assume we are in the root directory of the Lite source tree
......@@ -38,16 +65,20 @@ rm ./lite/api/paddle_use_ops.h
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
build_test_arm_opencl
build_opencl
```
Note: to debug a cl kernel, assuming the build script above has already run (i.e. the cmake files have been generated): simply edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`, save it, and run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` in the project root, which automatically picks up the modifications; then switch to the build directory and run `make publish_inference` (or the executable name of the unit test you want to build). The cl kernel content is packed automatically into the build artifacts, e.g. the .so, or into the corresponding unit-test executable.
### Build artifacts
The build artifacts are in the `inference_lite_lib.android.armv8.opencl` folder under `build.lite.android.armv8.gcc.opencl`; only the key artifacts are listed here:
- `cxx`: the C++ headers and libraries of the build target;
- `demo`: contains two demos that use `libpaddle_api_full_bundled.a` and `libpaddle_api_light_bundled.a`, in the `mobile_full` and `mobile_light` folders respectively. To build a demo, work inside `mobile_full` or `mobile_light`:
  - `mobile_full`: uses cxx config and can load fluid models directly; to use OpenCL, enable the `DEMO_USE_OPENCL` macro in `mobilenetv1_full_api.cc`, see the code comments;
  - `mobile_light`: uses mobile config and can only load models optimized by `model_optimize_tool`;
- `opencl`: this directory holds the OpenCL kernel implementations
  - `mobile_light`: uses mobile config and can only load models optimized by `model_optimize_tool`
Note: the OpenCL kernels are already packed into the dynamic library.
```bash
.
......@@ -65,40 +96,23 @@ rm ./lite/api/paddle_use_ops.h
| |-- libpaddle_api_light_bundled.a
| |-- libpaddle_full_api_shared.so
| `-- libpaddle_light_api_shared.so
|-- demo
| `-- cxx
| |-- Makefile.def
| |-- README.md
| |-- include
| | |-- paddle_api.h
| | |-- paddle_lite_factory_helper.h
| | |-- paddle_place.h
| | |-- paddle_use_kernels.h
| | |-- paddle_use_ops.h
| | `-- paddle_use_passes.h
| |-- mobile_full
| | |-- Makefile
| | `-- mobilenetv1_full_api.cc
| `-- mobile_light
| |-- Makefile
| `-- mobilenetv1_light_api.cc
`-- opencl
`-- cl_kernel
|-- buffer
| |-- depthwise_conv2d_kernel.cl
| |-- elementwise_add_kernel.cl
| |-- fc_kernel.cl
| |-- im2col_kernel.cl
| |-- layout_kernel.cl
| |-- mat_mul_kernel.cl
| |-- pool_kernel.cl
| `-- relu_kernel.cl
|-- cl_common.h
`-- image
|-- channel_add_kernel.cl
|-- elementwise_add_kernel.cl
|-- pool_kernel.cl
`-- relu_kernel.cl
`-- demo
`-- cxx
|-- Makefile.def
|-- README.md
|-- include
| |-- paddle_api.h
| |-- paddle_lite_factory_helper.h
| |-- paddle_place.h
| |-- paddle_use_kernels.h
| |-- paddle_use_ops.h
| `-- paddle_use_passes.h
|-- mobile_full
| |-- Makefile
| `-- mobilenetv1_full_api.cc
`-- mobile_light
|-- Makefile
`-- mobilenetv1_light_api.cc
```
See the next section for examples that call `libpaddle_api_full_bundled.a` and `libpaddle_api_light_bundled.a`.
......@@ -109,48 +123,9 @@ rm ./lite/api/paddle_use_ops.h
Below, taking the android, ARMv8, gcc environment as an example, we walk through 3 examples of running OpenCL-based ARM GPU inference on a phone.
**Note:** all the commands below are run from the Lite source root. Before the 3 examples, the following commands must be run first to prepare the environment:
```bash
# Create the OpenCL file directories under /data/local/tmp
adb shell mkdir -p /data/local/tmp/opencl
adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer
adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image
# Push the OpenCL kernel files to /data/local/tmp/opencl
adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/
adb push lite/backends/opencl/cl_kernel/buffer/* /data/local/tmp/opencl/cl_kernel/buffer/
adb push lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/
```
### Example 1: the demos from the build artifacts
```bash
######################################################################
# Build the mobile_full demo                                         #
######################################################################
# Steps:                                                             #
# 0. Make sure Paddle-Lite was built with OpenCL;                    #
# 1. Edit `mobilenetv1_full_api.cc` and enable `DEMO_USE_OPENCL`;    #
# 2. Build the `mobile_full` demo in `demo/cxx/mobile_full`;         #
# 3. Push the demo, model, and opencl kernel files to the phone;     #
# 4. Run the demo and get the expected results.                      #
######################################################################
adb shell mkdir /data/local/tmp/opencl/mobilenet_v1
chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api
adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/
adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1
# use mobile_full run mobilenet_v1
# `GLOG_v` is log level
adb shell "export GLOG_v=0; \
/data/local/tmp/opencl/mobilenetv1_full_api \
--model_dir=/data/local/tmp/opencl/mobilenet_v1 \
--optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model"
######################################################################
# Build the mobile_light demo                                        #
######################################################################
......@@ -158,33 +133,40 @@ adb shell "export GLOG_v=0; \
# 0. Make sure Paddle-Lite was built with OpenCL;                    #
# 1. Build model_optimize_tool and optimize the model with the       #
#    `targets` parameter set to `opencl`;                            #
# 2. Build the `mobile_light` demo in `demo/cxx/mobile_light`;       #
# 3. Push the demo, the model, and the opencl kernel files to the phone; #
# 3. Push the demo and the model file to the phone;                  #
# 4. Run the demo and get the expected results.                      #
######################################################################
# Create the OpenCL file directory under /data/local/tmp
adb shell mkdir -p /data/local/tmp/opencl
# use model_optimize_tool to optimize model
./build.model_optimize_tool/lite/api/model_optimize_tool \
--model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
--optimize_out_type=naive_buffer \
--optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
--optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/mobilenetv1_opt \
--valid_targets=opencl
adb shell mkdir /data/local/tmp/opencl/mobilenet_v1
adb shell mkdir /data/local/tmp/opencl/mobilenet_v1/
chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api
adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/
adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1
adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/mobilenetv1_opt.nb /data/local/tmp/opencl/
# use mobile_light run mobilenet_v1
adb shell "export GLOG_v=5; \
adb shell "export GLOG_v=1; \
/data/local/tmp/opencl/mobilenetv1_light_api \
--model_dir=/data/local/tmp/opencl/"
/data/local/tmp/opencl/mobilenetv1_opt.nb"
```
**Note:** `GLOG_v` specifies the VLOG level to display; the default is 0. The weights are loaded on the first run, so the first execution takes slightly longer. Typically set warmup to 10 and repeats to a larger number of runs.
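For reference, a minimal sketch of what the mobile_light demo does through the light C++ API (paths follow the commands above):

```c++
// Load the opt-optimized model and run one inference (sketch).
MobileConfig config;
config.set_model_from_file("/data/local/tmp/opencl/mobilenetv1_opt.nb");
std::shared_ptr<PaddlePredictor> predictor =
    CreatePaddlePredictor<MobileConfig>(config);
std::unique_ptr<Tensor> input(std::move(predictor->GetInput(0)));
input->Resize({1, 3, 224, 224});
auto* data = input->mutable_data<float>();
for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;
predictor->Run();
```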
### Example 2: the test_mobilenetv1 unit test
- **Prepare the files**
```bash
# Create the OpenCL file directory under /data/local/tmp
adb shell mkdir -p /data/local/tmp/opencl
# Push the mobilenet_v1 model files to /data/local/tmp/opencl
adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1
adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/
......@@ -195,41 +177,25 @@ adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/loc
- **Run OpenCL inference**
Run the OpenCL program with the following commands, where:
- `--cl_path` specifies the directory containing the OpenCL kernel files, i.e. cl\_kernel;
- `--model_dir` specifies the directory containing the model files.
```bash
adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1
adb shell /data/local/tmp/opencl/test_mobilenetv1 \
--cl_path=/data/local/tmp/opencl \
--model_dir=/data/local/tmp/opencl/mobilenet_v1 \
--warmup=1 \
--repeats=1
adb shell "export GLOG_v=1; \
/data/local/tmp/opencl-image/test_mobilenetv1 \
--model_dir=/data/local/tmp/opencl-image/mobilenetv1_fluid/ \
--warmup=10 \
--repeats=100"
```
**Note:** the weights are loaded the first time each Op kernel runs, so the first execution takes slightly longer. Typically set warmup to 1 and repeats to a larger number of runs.
### Example 3: the test_layout_opencl unit test
- **Prepare the files**
```bash
# Push the OpenCL unit test program test_layout_opencl to /data/local/tmp/opencl
adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/
```
- **Run OpenCL inference**
```bash
adb shell mkdir -p /data/local/tmp/opencl
adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl
adb shell /data/local/tmp/opencl/test_layout_opencl
adb shell "export GLOG_v=4; \
/data/local/tmp/opencl/test_layout_opencl"
```
### How to use it in code
See the demo code of Example 1:
......
# Using the X86 Inference Library
# Deploying PaddleLite Inference on X86
Paddle-Lite supports building the x86 inference library in a Docker or Linux environment. For environment setup see [Environment Preparation](../installation/source_compile).
Paddle-Lite supports building the x86 inference library in a Docker or Linux environment. For environment setup see [Environment Preparation](../user_guides/source_compile).
(Note: a non-Docker Linux environment must be Ubuntu 16.04)
......@@ -9,8 +9,8 @@ Paddle-Lite supports building the x86 inference library in a Docker or Linux environment. For environment setup see
1. Download the code
```bash
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
# A version after release/v2.0.0 is required
git checkout <release_tag>
# Switch to the release branch
git checkout release/v2.3
```
2. Build from source
......@@ -42,43 +42,56 @@ The x86 build output is located at `build.lite.x86/inference_lite_lib`
## Example: using the x86 inference API
1. We provide an example of running mobilenet_v1 with the x86 API on Linux: [mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip). After downloading and extracting it, the contents are as follows:
![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png)
`mobilenet_v1` is the model; `lib` and `include` are the Paddle-Lite inference library and headers; `third_party` holds the third-party `mklml` build dependency; `mobilenet_full_api.cc` is the source of the x86 example; `build.sh` is the build script.
2. Demo contents and usage
``` bash
# 1. Build
sh build.sh
```
The build result is `mobilenet_full_api` in the current directory.
``` bash
# 2. Run inference
mobilenet_full_api mobilenet_v1
```
`mobilenet_v1` is the model path in the current directory, and `mobilenet_full_api` is the executable built in step 1.
3. Example source `mobilenet_full_api.cc`:
```c++
#include <gflags/gflags.h>
#include <iostream>
#include <vector>
#include "paddle_api.h" // NOLINT
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT
#include "paddle_api.h"
using namespace paddle::lite_api; // NOLINT
DEFINE_string(model_dir, "", "Model dir path.");
DEFINE_string(optimized_model_dir, "", "Optimized model dir.");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
using namespace paddle::lite_api; // NOLINT
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
void RunModel() {
// 1. Set CxxConfig
CxxConfig config;
config.set_model_file(FLAGS_model_dir + "model");
config.set_param_file(FLAGS_model_dir + "params");
config.set_valid_places({
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}
});
void RunModel(std::string model_dir) {
// 1. Create CxxConfig
CxxConfig config;
config.set_model_dir(model_dir);
config.set_valid_places({
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kHost), PRECISION(kFloat)}
});
// 2. Create PaddlePredictor by CxxConfig
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<CxxConfig>(config);
// 3. Prepare input data
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize(shape_t({1, 3, 224, 224}));
input_tensor->Resize({1, 3, 224, 224});
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
data[i] = 1;
......@@ -90,15 +103,21 @@ void RunModel() {
// 5. Get output
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl;
std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
std::cout << "Output[" << i << "]:" << output_tensor->data<float>()[i] << std::endl;
std::cout << "Output[" << i << "]: " << output_tensor->data<float>()[i]
<< std::endl;
}
}
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
RunModel();
if (argc < 2) {
std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n";
exit(1);
}
std::string model_dir = argv[1];
RunModel(model_dir);
return 0;
}
```
# How to Add a Layout
# Adding a Layout
In Paddle-Lite, a Place contains the Target, Layout, and Precision information used to register and select a model's concrete kernels. The following explains how to add a new Layout, taking the Place layouts `ImageDefault`, `ImageFolder`, and `ImageNW` as examples.
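For reference, a sketch of what a Place looks like in C++ (the enum values below follow `paddle_place.h`, with the new layouts assumed to be registered; treat the exact spellings as assumptions):

```c++
// A Place bundles target, precision, and data layout; kernels are
// registered against a Place and selected by matching it.
Place arm_nchw{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)};
Place ocl_image{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)};
```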
......
# How to Add a Pass
# Adding a Pass
This document introduces the `Pass` structure in `Lite` from three angles: **what a Pass is**, **the implementation and interface of a Pass**, and **the general Pass registration flow**. Finally, it uses `Fc_fuse_pass` as an example to explain the role and registration of a `fusion_pass`.
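For reference, a sketch of how such a pass is typically registered (the names follow the fc fuse pass; treat the exact macro arguments as assumptions):

```c++
// Register the pass with the MIR pass registry so the optimizer can look
// it up by name during graph optimization.
REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass);
```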
......
# How to Add an OP
# Adding an OP
The following uses argmax as an example to explain in detail how to add a new op.
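For reference, a sketch of the registration step that ties the new op class to its op type name (treat the exact class and type names as assumptions):

```c++
// Register the OpLite subclass under the op type "arg_max" so the
// framework can instantiate it when loading a model.
REGISTER_LITE_OP(arg_max, paddle::lite::operators::ArgmaxOpLite);
```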
......
......@@ -14,11 +14,11 @@ Welcome to Paddle-Lite's documentation!
introduction/tech_highlights
introduction/architecture
introduction/support_hardware
introduction/roadmap
introduction/support_operation_list
.. toctree::
:maxdepth: 1
:caption: Benchmark Data and Methods
:caption: Benchmark
:name: sec-benchmark
benchmark/benchmark
......@@ -26,60 +26,60 @@ Welcome to Paddle-Lite's documentation!
.. toctree::
:maxdepth: 1
:caption: Installation
:name: sec-install
installation/release_lib
installation/source_compile
.. toctree::
:maxdepth: 1
:caption: User Guides
:caption: Usage
:name: sec-user-guides
user_guides/tutorial
user_guides/release_lib
user_guides/source_compile
user_guides/x2paddle
user_guides/model_optimize_tool
user_guides/cpp_demo
user_guides/java_demo
user_guides/android_ios_app_demo
user_guides/post_quant_with_data
user_guides/post_quant_no_data
user_guides/model_quantization
user_guides/debug
user_guides/library_tailoring
.. toctree::
:maxdepth: 1
:caption: Advanced User Guides
advanced_user_guides/x2paddle
advanced_user_guides/x2paddle_models_doc
advanced_user_guides/post_quant_with_data
advanced_user_guides/post_quant_no_data
advanced_user_guides/model_quantization
advanced_user_guides/support_operation_list
advanced_user_guides/add_operation
advanced_user_guides/add_layout
advanced_user_guides/add_new_pass
advanced_user_guides/test_tools
advanced_user_guides/debug_tools
advanced_user_guides/npu
advanced_user_guides/opencl
advanced_user_guides/fpga
advanced_user_guides/cuda
advanced_user_guides/x86
advanced_user_guides/cv
:caption: Deployment Examples
:name: sec-demo_guides
demo_guides/cpp_demo
demo_guides/java_demo
demo_guides/android_app_demo
demo_guides/ios_app_demo
demo_guides/x86
demo_guides/cuda
demo_guides/opencl
demo_guides/fpga
demo_guides/npu
.. toctree::
:maxdepth: 1
:caption: Developer Documentation
:caption: API Documentation
api_reference/cxx_api_doc
api_reference/java_api_doc
api_reference/python_api_doc
api_reference/cv
.. toctree::
:maxdepth: 1
:caption: Developer Contribution
develop_guides/for-developer
develop_guides/architecture-intro
develop_guides/add_operation
develop_guides/add_layout
develop_guides/add_new_pass
.. toctree::
:maxdepth: 1
:caption: Roadmap
:name: sec-roadmap
introduction/roadmap
.. toctree::
:maxdepth: 1
......
# FAQ
Questions and suggestions can be filed as Issues. To speed up resolution, please first search for similar issues; we answer promptly!
You are also welcome to join the official Paddle-Lite QQ group: 696965088
1. PaddleLite was cross-compiled on the host; when the resulting libpaddle_light_api_shared.so and executable are copied to the board and run, the error shown below appears. How do I fix it?
![host_target_compiling_env_miss_matched](https://user-images.githubusercontent.com/9973393/75761527-31b8b700-5d74-11ea-8a9a-0bc0253ee003.png)
- The cause is a mismatch between the host cross-compilation environment and the runtime environment on the target board: libpaddle_light_api_shared.so was linked against a GLIBC newer than the one on the board. There are currently four fixes (the first is recommended, to stay consistent with the official build environment): 1) on the host, rebuild libpaddle_light_api_shared.so using the Docker setup from [source compilation](../user_guides/source_compile); 2) on the host, rebuild with an ARM GCC and GLIBC matching the target's versions; 3) on the target board, rebuild locally following the ARM Linux native-build steps in [source compilation](../user_guides/source_compile); 4) on the target board, upgrade GLIBC to the host's version, i.e. GLIBC 2.27.
......@@ -23,5 +23,10 @@
- Added INT8 quantization, with a complete end-to-end example from PaddleSlim training to PaddleLite deployment
- Support loading models from memory, enabling simple encryption for APPs
## 2.3
[v2.3 project](https://github.com/PaddlePaddle/Paddle-Lite/milestone/3?closed=1)
## 2.6
[v2.6 project](https://github.com/PaddlePaddle/Paddle-Lite/milestones/v2.6)
# Supported Hardware
## ARM CPU
......
# Supported OPs
## Ops (158 operators in total)
......@@ -110,6 +110,7 @@
- equal
- gather
- generate_proposals
- graph_op
- greater_equal
- greater_than
- gru
......@@ -153,7 +154,6 @@
- sequence_expand
- sequence_expand_as
- sequence_pool
- sequence_pool_concat
- sequence_reshape
- sequence_reverse
- sequence_softmax
......@@ -226,7 +226,6 @@
- generate_proposals
- greater_equal
- greater_than
- grid_sampler
- gru
- gru_unit
- hard_sigmoid
......@@ -312,9 +311,6 @@
- gelu
- gru
- layer_norm
- leaky_relu
- lookup_table
- lookup_table_v2
- match_matrix_tensor
- matmul
- mul
......@@ -392,11 +388,9 @@
- yolo_box
### OpenCL kernels
- concat
- conv2d
- depthwise_conv2d
- elementwise_add
- elementwise_mul
- fc
- fusion_elementwise_add_activation
- layout
......@@ -404,10 +398,5 @@
- io_copy
- io_copy_once
- mul
- nearest_interp
- pool2d
- relu
- reshape
- reshape2
- scale
- sigmoid
# Android/iOS APP demo
See [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)
# Debugging
## Profiler tool
The basic profiler collects per-kernel execution-time statistics on the CPU.
### How to enable:
Set up the environment following the **full_publish** part of [source compilation](../user_guides/source_compile) and add `-DLITE_WITH_PROFILE=ON` at cmake time to enable the feature.
### Example:
After the model finishes running, a profiler log like the following is printed automatically
```
kernel average min max count
feed/def/1/4/2 0 0 0 1
conv2d/def/4/1/1 1175 1175 1175 1
conv2d/def/4/1/1 1253 1253 1253 1
depthwise_conv2d/def/4/1/1 519 519 519 1
conv2d/def/4/1/1 721 721 721 1
elementwise_add/def/4/1/1 18 18 18 1
conv2d/def/4/1/1 2174 2174 2174 1
depthwise_conv2d/def/4/1/1 380 380 380 1
conv2d/def/4/1/1 773 773 773 1
elementwise_add/def/4/1/1 2 2 2 1
conv2d/def/4/1/1 1248 1248 1248 1
depthwise_conv2d/def/4/1/1 492 492 492 1
conv2d/def/4/1/1 1150 1150 1150 1
elementwise_add/def/4/1/1 33 33 33 1
elementwise_add/def/4/1/1 3 3 3 1
conv2d/def/4/1/1 1254 1254 1254 1
depthwise_conv2d/def/4/1/1 126 126 126 1
```
## Debug tool
The **Lite Model Debug Tool** is a basic utility for checking runtime diffs between tensors (both variables and weights) of the Paddle-Lite framework and the Paddle-Fluid framework.
### How to build:
1. Set up the environment and build following the **full_publish** part of [source compilation](../user_guides/source_compile).
2. In the generated `build` directory, run `make lite_model_debug_tool`; `lite_model_debug_tool` is produced under `lite/tools/debug` in the build directory.
### Workflow:
1. Run `/bin/bash check_model.sh --model_dir=<your_model_path> --build_root_dir=<your_cmake_root_dir> debug_cpp_stage` to obtain the model's runtime topology, variable, and weight information under Paddle-Lite. The topology is stored in a file named `topo_file.txt` by default, and the variable and weight information in a file named `tensor_cpp.txt` by default.
2. Run `/bin/bash check_model.sh --model_dir=<your_model_path> --build_root_dir=<your_cmake_root_dir> debug_py_stage` to run inference with the fluid framework and collect the same model's variable and weight information under fluid (note: the fluid model is executed through fluid's Python API, so make sure it is installed correctly before this step). The debug tool then automatically compares the Paddle-Lite output against the Paddle-Fluid output to check for runtime diffs. The Paddle-Fluid output is stored in a file named `tensor_py.txt` by default, and the diff information in a file named `diff.txt` (by default, only the first variable with a diff in topological order is reported).
### Notes:
1. The reported values are the final values of the corresponding variables/weights **after one full inference run**, so if optimizations such as variable reuse or subgraph fusion were applied during inference, the output may deviate accordingly.
2. By default, the debug tool compares using all-ones input.
3. By default, to keep the results comparable with the Paddle-Fluid framework, the debug tool disables all of Paddle-Lite's optimization strategies.
4. Paddle-Lite's execution environment depends on your build options; for example, if you enabled LITE_WITH_ARM, the tool's `debug_cpp_stage` must also run on an ARM platform.
### Diff output:
If the debug tool detects a diff, `diff.txt` contains information structured like the following
......@@ -37,7 +72,7 @@ dropout_0.tmp_0 {1,1536,1,1} 0.599913 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.01202671
The second line carries the op information: which op produced the diff and its input/output variable names. "Tensor File info" is the Paddle-Lite output, while "Fluid Tensor info" is the corresponding Paddle-Fluid output.
In the example, `dropout_0.tmp_1` has no tensor information because the tool detected that it is unused in the rest of the inference flow and therefore cannot affect the result; it is automatically suppressed to keep the output concise.
### Other options:
| Option | Description |
| --------------------------- | ------------------------------------------------------------ |
......
# Tailoring the Inference Library
Paddle-Lite supports **tailoring the inference library to a model**. A regular Paddle-Lite build packs every registered operator into the inference library, inflating the library size; **library tailoring** packs, for a given optimized model, only the operators that model needs, effectively reducing the library file size.
......@@ -39,7 +39,7 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编
For example:
```bash
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish
```
**Note**: `../mobilenet_v1NB` in the command above is the output path of the model converted in step 1
......@@ -88,9 +88,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编
#include <stdio.h>
#include <vector>
#include "paddle_api.h" // NOLINT
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
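With the tailored library, the light API is used exactly as before; here is a minimal sketch, assuming the usual `MobileConfig` flow (the model path is a placeholder):
```cpp
#include <memory>
#include <string>
#include "paddle_api.h"  // NOLINT

using namespace paddle::lite_api;  // NOLINT

// Loads the optimized (naive_buffer) model with the tailored library; the
// tailored build links only the ops/kernels recorded for this model.
void RunTailoredModel(const std::string& model_dir) {
  MobileConfig config;
  config.set_model_dir(model_dir);
  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  // Prepare inputs and fetch outputs exactly as in the demo code above.
}
```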
......@@ -182,4 +179,4 @@ int main(int argc, char** argv) {
1. The models in a model set **must** be either all combined-parameter models or all non-combined-parameter models.
2. For non-combined models, the topology file must be named `__model__`; for combined models, every model in the set must use the same topology and parameter file names, specified via `--model_filename` and `--param_filename`.
3. The models in a model set **must** be either all INT8-quantized models or all non-INT8-quantized models.
4. The model optimization tool must be built from Paddle-Lite code newer than `release/v2.1.0`.
# Model optimization tool: opt
Paddle-Lite provides a variety of strategies for automatically optimizing the original trained model, including quantization, subgraph fusion, hybrid scheduling, and kernel selection. To make the optimization process easy to use, we provide the **opt** tool, which runs the optimization steps automatically and outputs a lightweight, optimal, executable model.
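Besides the command-line binary, this patch also adds a C++ `OptBase` class (see `lite/api/opt_base.cc` further down). A minimal sketch of driving the same optimization from C++, assuming `OptBase` is default-constructible and with placeholder paths:
```cpp
#include "lite/api/opt_base.h"

int main() {
  paddle::lite_api::OptBase opt;
  opt.SetModelDir("./mobilenet_v1");         // non-combined fluid model
  opt.SetValidPlaces("arm");                 // same syntax as --valid_targets
  opt.SetModelType("naive_buffer");          // lightweight format for mobile
  opt.SetOptimizeOut("./mobilenet_v1_opt");  // output path
  opt.RunOptimize(false);                    // true also records tailoring info
  return 0;
}
```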
......@@ -12,9 +12,12 @@ Paddle-Lite 提供了多种策略来自动优化原始的训练模型,其中
1. **Recommended!** Go to the Paddle-Lite GitHub [release page](https://github.com/PaddlePaddle/Paddle-Lite/releases) and download the conversion tool `opt` for the chosen release
(before release/v2.2.0 the tool was named model_optimize_tool; from release/v2.3.0 on it is named opt)
2. This document provides downloads of the `release/v2.3` and `release/v2.2.0` optimization tools
|Version | Linux | MacOS|
|---|---|---|
| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) |
3. If none of the release builds fits your environment, download the Paddle-Lite source and build the opt tool from source
......@@ -25,7 +28,7 @@ git checkout <release-version-tag>
./lite/tools/build.sh build_optimize_tool
```
The build output is located at `Paddle-Lite/build.opt/lite/api/opt`
**Note**: before building opt from source, first [set up the Paddle-Lite development environment](source_compile)
## Using opt
......@@ -80,7 +83,6 @@ PaddlePaddle模型有两种保存格式:
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86|npu|xpu) \
       --record_tailoring_info=(true|false)
```
......@@ -92,12 +94,12 @@ PaddlePaddle模型有两种保存格式:
| --optimize_out_type | Output model type. Two types are currently supported: protobuf and naive_buffer, where naive_buffer is a more lightweight serialization/deserialization implementation. If you need to run inference on mobile, set this option to naive_buffer. Defaults to protobuf. |
| --optimize_out | Output path for the optimized model. |
| --valid_targets | The backends the model may execute on; defaults to arm. Currently x86, arm, opencl, npu, and xpu are supported, and several backends may be given at once (separated by spaces); the Model Optimize Tool then automatically picks the best option. For Huawei NPU support (the DaVinci-architecture NPU in Kirin 810/990 SoCs), set it to npu, arm. |
| --record_tailoring_info | When using the [library tailoring](./library_tailoring.html) feature, set this option to true to record the kernel and OP information contained in the optimized model; defaults to false. |
* If the fluid model to optimize is in the non-combined form, set `--model_dir` and ignore `--model_file` and `--param_file`.
* If the fluid model to optimize is in the combined form, set `--model_file` and `--param_file` and ignore `--model_dir`.
* The optimized model is a single file whose name ends in `.nb`.
* The `prefer_int8_kernel` flag has been removed; `opt` now detects quantized models automatically and applies the corresponding optimizations.
### Feature 2: collect model operator statistics and check support
......
......@@ -237,7 +237,7 @@ python compress.py \
Next, turn the raw quantized model into one suitable for direct deployment on mobile.
Set up the build environment following [source compilation](source_compile) and make sure the build succeeds. Following [the model conversion guide](model_optimize_tool), first build the model_optimize_tool, then run the command below to optimize the quantization-aware-trained model (note: adjust model_file, param_file, and optimize_out yourself).
```bash
./model_optimize_tool \
--model_file=mobilenet_v1_quant/float/model \
......@@ -245,7 +245,6 @@ python compress.py \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
  --valid_targets=arm
```
As noted earlier, after quantization-aware training the parameters of the model under the float directory are within the int8 value range but are still stored as float32, so no real parameter compression has happened yet. After optimization with model\_optimize\_tool, however, the corresponding quantized parameters are re-stored as int8, achieving the compression, and the model structure is optimized as well (e.g. various operator fuse passes are applied).
......@@ -260,7 +259,7 @@ adb push mobilenet_v1_quant_opt /data/local/tmp
### Running the optimized quantized model with mobilenetv1\_light\_api
After configuring the build environment per [source compilation](source_compile), run the following in Paddle-Lite to get the light-API demo:
```bash
cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light
......@@ -288,7 +287,7 @@ Output[700]: 0.002509
Output[800]: 0.000538
Output[900]: 0.000969
```
For how to use the Paddle-Lite API from C++, jump to [here](../demo_guides/cpp_demo); you can also consult the code example in [mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc).
## FAQ
......
......@@ -37,11 +37,13 @@
Let's start with an example of invoking calibration-free post-training quantization.
```python
from paddle.fluid.contrib.slim.quantization import WeightQuantization
model_dir = 'path/to/fp32_model_params'
save_model_dir = 'path/to/save_model_path'
weight_quant = WeightQuantization(model_dir=model_dir)
weight_quant.quantize_weight_to_int(save_model_dir=save_model_dir,
                                    weight_bits=16,
quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
```
......@@ -73,14 +75,17 @@ WeightQuantization.quantize_weight_to_int(save_model_dir,
## 3 Quantized model inference
Currently, quantized models produced by calibration-free post-training quantization cannot be loaded and executed by PaddlePaddle; they can only be deployed for inference with PaddleLite.
It is straightforward: first convert the quantized model into a mobile inference model with PaddleLite's conversion tool (opt), then load the converted model for inference deployment.
Note that PaddleLite supports models produced by calibration-free post-training quantization only from version 2.3 on, so both the conversion tool and the inference library must be version 2.3 or later.
### 3.1 Model conversion
Prepare the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool); downloading it from the Release page is recommended, then use it as documented there.
Because this kind of model dequantizes its quantized weights and then actually loads and executes an FP32 inference model, the opt flag --prefer_int8_kernel does not need to be set to true; set the other flags per the documentation as usual.
For example, for inference on an Android phone's ARM CPU, the conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
......@@ -91,4 +96,4 @@ WeightQuantization.quantize_weight_to_int(save_model_dir,
### 3.2 Quantized model inference
Like an FP32 model, the converted quantized model can be loaded for inference in an Android/iOS APP; see [C++ Demo](../demo_guides/cpp_demo), [Java Demo](../demo_guides/java_demo), and [Android/iOS Demo](../demo_guides/android_app_demo).
......@@ -41,7 +41,7 @@
### 2.3 Configuring the calibration data generator
Post-training quantization with calibration data reads the calibration data asynchronously; you only need to configure a sample_generator matching the model's inputs. sample_generator is a Python generator that **must return exactly one sample per call**, and it is used as the data source for `DataLoader.set_sample_generator()`.
See the [asynchronous data loading documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/use_py_reader.html) and the example in this document to learn how to configure the calibration data generator.
### 2.4 Invoking post-training quantization with calibration data
......@@ -147,18 +147,17 @@ with fluid.name_scope('skip_quant'):
Prepare the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool); downloading it from the Release page is recommended.
Use the conversion tool as described in [model conversion](../user_guides/model_optimize_tool), setting the flags to match your situation. For example, for inference on an Android phone's ARM CPU, the conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
  --valid_targets=arm
```
### 3.2 Quantized model inference
Like an FP32 model, the converted quantized model can be loaded for inference in an Android/iOS APP; see [C++ Demo](../demo_guides/cpp_demo), [Java Demo](../demo_guides/java_demo), and [Android/iOS Demo](../demo_guides/android_app_demo).
## 4 Example
......
# Prebuilt Libraries
## Build variants
......@@ -11,7 +11,7 @@
- arm_stl=`c++_static/c++_shared` how the Lite inference library links the STL, either statically or dynamically
- build_extra=`ON/OFF` whether to build the full OP set; OFF builds only the basic CV-related OPs, [flag details](library)
- `tiny_publish/full_publish` build modes: `tiny_publish` builds the mobile deployment library, while `full_publish` builds the deployment library together with the third-party dependencies
......@@ -52,8 +52,8 @@
| OS | Download |
| :---------: | :--------------: |
| Linux | [release/v2.3](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) |
| MacOS | [release/v2.3](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
......@@ -63,7 +63,7 @@
- [Build from source for Android](./source_compile.html#paddlelite)
- [Build from source for iOS](./source_compile.html#paddlelite)
- [Build from source for ArmLinux](./source_compile.html#paddlelite)
- [Build from source for x86](../demo_guides/x86)
- [Build from source for opencl](../demo_guides/opencl)
- [Build from source for CUDA](../demo_guides/cuda)
- [Build from source for FPGA](../demo_guides/fpga)
# Building the Inference Library
PaddleLite also ships official prebuilt release libraries; see the [documentation](release_lib).
PaddleLite provides a one-step mobile source build script, `lite/tools/build.sh`. The build flow is:
1. Environment setup (pick one): a Docker cross-compilation environment, or a Linux cross-compilation environment
2. Build: invoke the `build.sh` script for a one-step build
......@@ -234,6 +236,8 @@ brew cask install java
## Part 2: Building PaddleLite
**Note: for building the OpenCL, Huawei NPU, FPGA, CUDA, or X86 inference libraries, or the CV module, see the corresponding chapters of the advanced user guide.**
### Download the code
```shell
......
......@@ -24,8 +24,7 @@ $ ./opt \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
  --valid_targets=(arm|opencl|x86)
```
Here optimize_out is the desired output path for the optimized model, and optimize_out_type selects the output serialization format; Protobuf and Naive Buffer are currently supported, the latter being a more lightweight serialization/deserialization implementation. If you will run inference with Lite on mobile, set optimize_out_type=naive_buffer.
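The same conversion can also be scripted from C++; the following is a hedged sketch based on the `SaveOptimizedModel` call that `opt_base.cc` in this patch uses internally (paths are placeholders):
```cpp
#include "lite/api/paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

int main() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");
  config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  // naive_buffer is the lightweight serialization required on mobile;
  // the last argument records tailoring info when set to true.
  predictor->SaveOptimizedModel("./mobilenet_v1_opt",
                                LiteModelType::kNaiveBuffer,
                                false);
  return 0;
}
```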
......@@ -48,8 +47,8 @@ $ ./opt \
## 4. Lite API
For your convenience we provide C++, Java, and Python APIs, with complete usage examples for each: [complete C++ example](../demo_guides/cpp_demo), [complete Java example](../demo_guides/java_demo), [complete Python example](../demo_guides/cuda). The examples show how to use the C++/Java/Python APIs quickly so you can integrate them into your own project. Note that, to reduce third-party dependencies and improve the portability of the Lite inference framework, using the Lite API on mobile requires a model in Naive Buffer storage format; see section 2, `model optimization`, for how to produce one.
## 5. Testing tools
To help you understand and use the Lite framework better, we provide the [Debug tool](debug#debug) and the [Profile tool](debug#profiler) for users with further needs. The Lite Model Debug Tool checks whether corresponding variable values differ between the Lite and PaddlePaddle frameworks during inference, quickly pinpointing the problematic Op and making issues easy to reproduce and track down. The Profile Monitor Tool reports each Op's execution time cost, automatically collecting execution counts and the longest, shortest, and average execution times as a baseline for performance tuning. See the [dedicated topic](debug) for more.
# Model conversion tool: X2Paddle
X2Paddle converts Caffe, TensorFlow, and ONNX models into models supported by Paddle.
[X2Paddle](https://github.com/PaddlePaddle/X2Paddle) supports converting Caffe/TensorFlow models into PaddlePaddle models. The models currently supported by X2Paddle are listed in [x2paddle_model_zoo](https://github.com/PaddlePaddle/X2Paddle/blob/develop/x2paddle_model_zoo.md)
## Multi-framework support
|Model | Caffe | TensorFlow | ONNX |
|---|---|---|---|
|mobilenetv1 | Y | Y | |
|mobilenetv2 | Y | Y | Y |
|resnet18 | Y | Y | |
|resnet50 | Y | Y | Y |
|mnasnet | Y | Y | |
|efficientnet | Y | Y | Y |
|squeezenetv1.1 | Y | Y | Y |
|shufflenet | Y | Y | |
|mobilenet_ssd | Y | Y | |
|mobilenet_yolov3 | | Y | |
|inceptionv4 | | | |
|mtcnn | Y | Y | |
|facedetection | Y | | |
|unet | Y | Y | |
|ocr_attention | | | |
|vgg16 | | | |
## Installation
```
......
......@@ -12,6 +12,7 @@ message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
message(STATUS "LITE_WITH_ARM_LANG:\t${LITE_WITH_ARM_LANG}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
......@@ -64,6 +65,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_NPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.npu")
endif(LITE_WITH_NPU)
if (LITE_WITH_XPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu")
endif(LITE_WITH_XPU)
if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA)
......@@ -79,7 +83,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
if (LITE_WITH_PYTHON)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so")
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND python setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/")
......@@ -91,6 +104,7 @@ if (LITE_WITH_PYTHON)
endif()
add_dependencies(publish_inference_python_lib lite_pybind)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
endif()
......@@ -123,7 +137,29 @@ if (LITE_WITH_X86)
endif()
if(LITE_WITH_CUDA)
add_dependencies(publish_inference paddle_full_api_shared)
add_custom_target(publish_inference_cuda_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cuda_cxx_lib)
add_custom_target(publish_inference_cuda_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos)
add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -186,6 +222,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference tiny_publish_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
endif()
endif()
......@@ -282,6 +319,10 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/opencl"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel" "${INFER_LITE_PUBLISH_ROOT}/opencl"
)
if (NOT LITE_ON_TINY_PUBLISH)
add_dependencies(publish_inference_cxx_lib publish_inference_opencl)
else()
add_dependencies(tiny_publish_cxx_lib publish_inference_opencl)
endif()
endif()
endif()
......@@ -45,7 +45,11 @@ else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc)
set(TARGET_COMPILE_FLAGS "-fdata-sections")
if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc
  set(TARGET_COMPILE_FLAGS "${TARGET_COMPILE_FLAGS} -flto")
endif()
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMPILE_FLAGS}")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
......@@ -78,6 +82,7 @@ message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
......@@ -143,38 +148,40 @@ if(WITH_TESTING)
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz)
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
if(LITE_WITH_X86)
lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/googlenet)
add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz)
lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1)
add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz)
lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu)
add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz)
lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple)
add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz)
lite_cc_test(test_resnet50_lite_x86 SRCS test_resnet50_lite_x86.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
add_dependencies(test_resnet50_lite_x86 extern_lite_download_resnet50_tar_gz)
lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn)
add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz)
endif()
if(LITE_WITH_BM)
lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
......@@ -229,6 +236,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
......@@ -295,6 +303,11 @@ if (LITE_ON_TINY_PUBLISH)
return()
endif()
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
......@@ -377,6 +390,16 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
......@@ -25,7 +25,11 @@ if (NOT LITE_ON_TINY_PUBLISH)
endif()
else()
add_library(paddle_lite_jni SHARED "")
set(TARGET_COMPILE_FLAGS "-fdata-sections")
if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc
  set(TARGET_COMPILE_FLAGS "${TARGET_COMPILE_FLAGS} -flto")
endif()
set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMPILE_FLAGS})
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
if (LITE_WITH_NPU)
......
......@@ -17,11 +17,6 @@
#include <jni.h>
/* Header for class com_baidu_paddle_lite_PaddlePredictor */
#include "lite/api/paddle_lite_factory_helper.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
......
......@@ -21,9 +21,6 @@
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/light_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/core/mir/pass_registry.h"
DEFINE_string(model_dir, "", "");
......
......@@ -23,31 +23,28 @@
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/core/device_info.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_dir,
              "",
              "the path of the model; the model and param files are under "
              "model_dir.");
DEFINE_string(model_filename,
              "",
              "the filename of the model file; set it when the model is in "
              "the combined format.");
DEFINE_string(param_filename,
              "",
              "the filename of the param file; set it when the model is in "
              "the combined format.");
DEFINE_string(input_shape,
              "1,3,224,224",
              "set input shapes according to the model, "
              "such as 1,3,224,224");
DEFINE_string(input_img_path, "", "the path of input image");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(power_mode,
......@@ -80,12 +77,13 @@ inline double GetCurrentUS() {
return 1e+6 * time.tv_sec + time.tv_usec;
}
void OutputOptModel(const std::string& save_optimized_model_dir) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
if (!FLAGS_model_filename.empty() && !FLAGS_param_filename.empty()) {
config.set_model_file(FLAGS_model_dir + "/" + FLAGS_model_filename);
config.set_param_file(FLAGS_model_dir + "/" + FLAGS_param_filename);
}
std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)},
};
......@@ -109,7 +107,7 @@ void OutputOptModel(const std::string& save_optimized_model_dir,
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<int64_t>& input_shape,
const std::string& model_dir,
const std::string model_name) {
// set config and create predictor
......@@ -121,17 +119,27 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
auto predictor = lite_api::CreatePaddlePredictor(config);
// set input
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(input_shape);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
if (FLAGS_input_img_path.empty()) {
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
} else {
std::fstream fs(FLAGS_input_img_path);
if (!fs.is_open()) {
LOG(FATAL) << "open input image " << FLAGS_input_img_path << " error.";
}
for (int i = 0; i < input_num; i++) {
fs >> input_data[i];
}
// LOG(INFO) << "input data:" << input_data[0] << " " <<
// input_data[input_num-1];
}
// warmup
......@@ -178,25 +186,12 @@ int main(int argc, char** argv) {
exit(0);
}
if (FLAGS_model_dir.back() == '/') {
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
std::string model_name = FLAGS_model_dir.substr(found + 1);
std::string save_optimized_model_dir = FLAGS_model_dir + "opt2";
auto split_string =
[](const std::string& str_in) -> std::vector<std::string> {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
};
std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2";
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
......@@ -214,22 +209,18 @@ int main(int argc, char** argv) {
return shape;
};
std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape);
// Output optimized model if needed
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(save_optimized_model_dir);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
std::string run_model_dir =
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shape, run_model_dir, model_name);
#endif
return 0;
}
......@@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
const std::vector<std::string> quant_dequant_op = {
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
"fake_quantize_moving_average_abs_max",
"fake_quantize_dequantize_moving_average_abs_max",
"fake_dequantize_max_abs",
"fake_channel_wise_dequantize_max_abs"};
bool is_quantized_model = false;
for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model;
++i) {
auto *block_desc = program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) {
auto *op_desc = block_desc->GetOp<cpp::OpDesc>(j);
std::string op_type = op_desc->Type();
if (std::find(quant_dequant_op.begin(),
quant_dequant_op.end(),
op_type) != quant_dequant_op.end()) {
is_quantized_model = true;
}
}
}
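  // A quantized model was detected above; register the ARM int8 place so
  // that int8 kernels become selectable during kernel picking.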
if (is_quantized_model) {
inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(desc, scope_, inner_places);
core::KernelPickFactor factor;
......@@ -333,16 +359,16 @@ lite::Tensor *Predictor::GetInputByName(const std::string &name) {
}
}
// #ifdef LITE_WITH_TRAIN
// void Predictor::FeedVars(const std::vector<framework::Tensor> &tensors) {
// auto var = scope_->FindVar("feed");
// auto &feed_list = *(var->GetMutable<std::vector<lite::Tensor>>());
// feed_list.resize(tensors.size());
// for (size_t i = 0; i < tensors.size(); ++i)
// feed_list[i].ShareDataWith(tensors[i]);
// }
// #endif
} // namespace lite
} // namespace paddle
......@@ -101,14 +101,14 @@ class LITE_API Predictor {
bool record_info = false);
void SaveOpKernelInfo(const std::string& model_dir);
// #ifdef LITE_WITH_TRAIN
// void Run(const std::vector<framework::Tensor>& tensors) {
// FeedVars(tensors);
// program_->Run();
// }
// void FeedVars(const std::vector<framework::Tensor>& tensors);
// #endif
private:
Optimizer optimizer_;
......
......@@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) {
int main(int argc, char** argv) {
CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
paddle::lite::Run(argv[1], atoi(argv[2]));
return 0;
}
......
......@@ -38,11 +38,13 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
if (places[0].target == TARGET(kOpenCL) &&
use_layout_preprocess_pass != std::string::npos) {
passes = {"type_layout_cast_preprocess_pass"};
VLOG(1) << "add pass:" << passes[0];
}
raw_predictor_.Build(config, places, passes);
mode_ = config.power_mode();
threads_ = config.threads();
......
......@@ -13,6 +13,12 @@
// limitations under the License.
#include "lite/api/light_api.h"
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#include <algorithm>
namespace paddle {
......
......@@ -58,6 +58,7 @@ void LightPredictorImpl::Run() {
std::shared_ptr<lite_api::PaddlePredictor> LightPredictorImpl::Clone() {
LOG(FATAL) << "The Clone API is not supported in LigthPredictor";
return nullptr;
}
std::string LightPredictorImpl::GetVersion() const { return lite::version(); }
......
......@@ -12,11 +12,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
namespace paddle {
namespace lite_api {
......
......@@ -15,9 +15,6 @@
#include "lite/api/light_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
DEFINE_string(optimized_model, "", "");
......
......@@ -16,9 +16,6 @@
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
......
......@@ -53,9 +53,13 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Run();
}
double sum_duration = 0.0; // millisecond;
for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor.Run();
auto duration = (GetCurrentUS() - start) / 1000.0;
sum_duration += duration;
VLOG(1) << "run_idx:" << i << " " << duration << " ms";
}
if (save_model) {
......@@ -68,8 +72,7 @@ void TestModel(const std::vector<Place>& valid_places,
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
<< ", spend " << sum_duration / FLAGS_repeats << " ms in average.";
std::vector<std::vector<float>> ref;
ref.emplace_back(std::vector<float>(
......@@ -92,7 +95,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......@@ -115,24 +118,22 @@ void TestModel(const std::vector<Place>& valid_places,
}
// Get detailed result
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tensor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, out_dims.production());
auto out_std_dev = compute_standard_deviation<float>(
out_data, out_dims.production(), true, out_mean);
VLOG(1) << "output tensor dims:" << out_dims;
VLOG(1) << "output tensor elements num:" << out_dims.production();
VLOG(1) << "output tensor standard deviation:" << out_std_dev;
VLOG(1) << "output tensor mean value:" << out_mean;
// print result
for (int i = 0; i < out_dims.production(); ++i) {
......
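// NOTE: compute_mean / compute_standard_deviation are referenced above but
// defined outside this hunk; the following is a sketch matching their call
// sites (a sketch under stated assumptions, not necessarily the exact
// upstream implementation).
#include <cmath>
#include <cstddef>

// Arithmetic mean over a raw buffer of `length` elements.
template <typename T>
double compute_mean(const T* in, const size_t length) {
  double sum = 0.;
  for (size_t i = 0; i < length; ++i) sum += in[i];
  return sum / length;
}

// Population standard deviation; reuses `mean` when has_mean is true.
template <typename T>
double compute_standard_deviation(const T* in,
                                  const size_t length,
                                  bool has_mean = false,
                                  double mean = 0.) {
  if (!has_mean) mean = compute_mean<T>(in, length);
  double variance = 0.;
  for (size_t i = 0; i < length; ++i) {
    variance += std::pow(in[i] - mean, 2);
  }
  variance /= length;
  return std::sqrt(variance);
}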
......@@ -54,9 +54,13 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Run();
}
double sum_duration = 0.0; // millisecond;
for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor.Run();
auto duration = (GetCurrentUS() - start) / 1000.0;
sum_duration += duration;
VLOG(1) << "run_idx:" << i << " " << duration << " ms";
}
if (save_model) {
......@@ -69,8 +73,7 @@ void TestModel(const std::vector<Place>& valid_places,
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
<< ", spend " << sum_duration / FLAGS_repeats << " ms in average.";
std::vector<std::vector<float>> ref;
// i = 1
......@@ -94,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......@@ -117,24 +120,22 @@ void TestModel(const std::vector<Place>& valid_places,
}
// Get detailed result
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tensor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, out_dims.production());
auto out_std_dev = compute_standard_deviation<float>(
out_data, out_dims.production(), true, out_mean);
VLOG(1) << "output tensor dims:" << out_dims;
VLOG(1) << "output tensor elements num:" << out_dims.production();
VLOG(1) << "output tensor standard deviation:" << out_std_dev;
VLOG(1) << "output tensor mean value:" << out_mean;
// print result
for (int i = 0; i < out_dims.production(); ++i) {
......
......@@ -17,9 +17,6 @@
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
......@@ -141,7 +138,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -17,9 +17,6 @@
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
......@@ -253,7 +250,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -17,9 +17,6 @@
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
......@@ -267,7 +264,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -32,18 +32,10 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
predictor.Build(FLAGS_model_dir, "", "", valid_places);
auto* init_scores = predictor.GetInput(2);
init_scores->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_scores = init_scores->mutable_data<float>();
auto scores_size = init_scores->dims().production();
for (int i = 0; i < scores_size; i++) {
data_scores[i] = 0;
}
......@@ -53,7 +45,7 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
auto* init_ids = predictor.GetInput(1);
init_ids->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_ids = init_ids->mutable_data<int64_t>();
auto ids_size = init_ids->dims().production();
for (int i = 0; i < ids_size; i++) {
data_ids[i] = 0;
......@@ -62,6 +54,13 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
*lod_ids = lod_i;
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 48, 512})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
......@@ -102,6 +101,7 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
TEST(OcrAttention, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kARM), PRECISION(kInt64)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
......
......@@ -67,7 +67,6 @@ DEFINE_string(valid_targets,
"arm",
"The targets this model optimized for, should be one of (arm, "
"opencl, x86), splitted by space");
DEFINE_bool(print_supported_ops,
false,
"Print supported operators on the inputed target");
......@@ -88,7 +87,10 @@ std::vector<Place> ParserValidPlaces() {
auto target_reprs = lite::Split(FLAGS_valid_targets, ",");
for (auto& target_repr : target_reprs) {
if (target_repr == "arm") {
valid_places.emplace_back(
Place{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kARM), PRECISION(kInt32), DATALAYOUT(kNCHW)});
} else if (target_repr == "opencl") {
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
......@@ -118,11 +120,6 @@ std::vector<Place> ParserValidPlaces() {
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
return valid_places;
}
......@@ -252,7 +249,6 @@ void PrintHelpInfo() {
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--prefer_int8_kernel=(true|false)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/api/opt_base.h"
#include "all_kernel_faked.cc" // NOLINT
namespace paddle {
namespace lite_api {
void OptBase::SetModelDir(const std::string& model_path) {
opt_config_.set_model_dir(model_path);
}
void OptBase::SetModelFile(const std::string& model_path) {
opt_config_.set_model_file(model_path);
}
void OptBase::SetParamFile(const std::string& param_path) {
opt_config_.set_param_file(param_path);
}
void OptBase::SetModelType(std::string optimize_out_type) {
if (optimize_out_type == "protobuf") {
model_type_ = LiteModelType::kProtobuf;
} else if (optimize_out_type == "naive_buffer") {
model_type_ = LiteModelType::kNaiveBuffer;
} else {
LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
}
}
void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.clear();
auto target_reprs = lite::Split(valid_places, ",");
for (auto& target_repr : target_reprs) {
if (target_repr == "arm") {
valid_places_.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
"'valid_targets'",
target_repr.c_str());
}
}
CHECK(!valid_places_.empty())
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
}
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) {
optimize_out_path_ = optimized_out_path;
}
void OptBase::RunOptimize(bool record_strip_info) {
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
    auto resulted_model_name =
        record_strip_info ? "information of stripped model" : "optimized model";
    std::cout << "Save the " << resulted_model_name
              << " into: " << optimize_out_path_ << " successfully";
}
}
// collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
const std::string& filename) {
std::set<std::string> total;
for (const auto& name : models) {
std::string model_path =
lite::Join<std::string>({output_dir, name, filename}, "/");
auto lines = lite::ReadLines(model_path);
total.insert(lines.begin(), lines.end());
}
std::string output_path =
lite::Join<std::string>({output_dir, filename}, "/");
lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
output_path);
}
void OptBase::SetModelSetDir(const std::string& model_set_path) {
model_set_dir_ = model_set_path;
}
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
  // 1. Create the output directory for the optimized model set.
lite::MkDirRecur(optimize_out_path_);
auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
}
  // 2. Optimize each model in the input model set dir.
std::string model_file = opt_config_.model_file();
std::string param_file = opt_config_.param_file();
for (const auto& name : model_dirs) {
std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path =
lite::Join<std::string>({input_model_dir, model_file}, "/");
auto param_file_path =
lite::Join<std::string>({input_model_dir, param_file}, "/");
}
std::cout << "Start optimize model: " << input_model_dir;
opt_config_.set_model_dir(input_model_dir);
opt_config_.set_model_file(model_file);
opt_config_.set_param_file(param_file);
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
std::cout << "Optimize done. ";
}
// 3. if record_strip_info = true, we will record striping info
if (record_strip_info) {
// Collect all models information
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully";
}
}
void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
" Arguments of help information:\n"
" `help()` Print help infomation\n"
" Arguments of model optimization:\n"
" `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n"
" `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n"
" Arguments of model checking and ops information:\n"
" `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid "
"places\n"
" `check_if_model_supported()` Check if the input model is "
"supported\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
}
// 2. Print the support info of the input ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
std::vector<std::string> lite_supported_targets = {"kHost",
"kX86",
"kCUDA",
"kARM",
"kOpenCL",
"kFPGA",
"kNPU",
"kXPU",
"kAny",
"kUnk"};
  // Get the length of the first column: the maximum length of the op_type
size_t maximum_optype_length = 0;
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
maximum_optype_length = it->first.size() > maximum_optype_length
? it->first.size()
: maximum_optype_length;
}
std::cout << std::setiosflags(std::ios::internal);
  // Print the first row: OP_name target1 target2 ...
std::cout << std::setw(maximum_optype_length) << "OP_name";
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
std::cout << std::setw(10) << lite_supported_targets[i].substr(1);
}
std::cout << std::endl;
  // Print the names of the supported ops and mark whether each target supports
  // them; only the input ops (valid_ops) are covered
for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) {
std::cout << std::setw(maximum_optype_length) << *op;
// Check: If this kernel doesn't match any operator, we will skip it.
if (supported_ops.find(*op) == supported_ops.end()) {
continue;
}
// Print OP info.
auto ops_valid_places = supported_ops.at(*op);
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
lite_supported_targets[i]) != ops_valid_places.end()) {
std::cout << std::setw(10) << "Y";
} else {
std::cout << std::setw(10) << " ";
}
}
std::cout << std::endl;
}
}
void OptBase::DisplayKernelsInfo() { // Display kernel information
std::cout << ::paddle::lite::KernelRegistry::Global().DebugString();
}
void OptBase::PrintAllOps() {
// 1. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < supported_ops_target.size(); i++) {
auto ops = supported_ops_target[i];
valid_ops.insert(ops.begin(), ops.end());
}
// 2. Print support info of these ops
PrintOpsInfo(valid_ops);
}
void OptBase::PrintSupportedOps() {
// 1. Get the valid hardware targets
std::vector<TargetType> target_types = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
target_types.push_back(valid_places_[i].target);
}
  std::string targets_str = TargetToStr(target_types[0]);
  for (size_t i = 1; i < target_types.size(); i++) {
    targets_str = targets_str + "," + TargetToStr(target_types[i]);
  }
std::cout << "Supported OPs on '" << targets_str << "': " << std::endl;
target_types.push_back(TARGET(kHost));
target_types.push_back(TARGET(kUnk));
// 2. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end());
}
// 3. Print support info of these ops
PrintOpsInfo(valid_ops);
}
// test whether this model is supported
void OptBase::CheckIfModelSupported(bool print_ops_info) {
// 1. parse valid places and valid targets
auto valid_ops = supported_ops_target[static_cast<int>(TARGET(kHost))];
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (size_t i = 0; i < valid_places_.size(); i++) {
auto target = valid_places_[i].target;
auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
}
// get valid ops
std::set<std::string> valid_ops_set(valid_ops.begin(), valid_ops.end());
  // 2. Load the model into a program to get the ops it contains
std::string prog_path = opt_config_.model_dir() + "/__model__";
if (!(opt_config_.model_file()).empty() &&
!(opt_config_.param_file()).empty()) {
prog_path = opt_config_.model_file();
}
lite::cpp::ProgramDesc cpp_prog;
framework::proto::ProgramDesc pb_proto_prog =
*lite::LoadProgram(prog_path, false);
lite::pb::ProgramDesc pb_prog(&pb_proto_prog);
// Transform to cpp::ProgramDesc
lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog);
std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops;
for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
auto op_type = op_desc.Type();
input_model_ops.insert(op_type);
if (valid_ops_set.count(op_type) == 0) {
unsupported_ops.insert(op_type);
}
}
}
// 3. Print ops_info of input model and check if this model is supported
if (print_ops_info) {
std::cout << "OPs in the input model include:\n";
PrintOpsInfo(input_model_ops);
}
if (!unsupported_ops.empty()) {
std::string unsupported_ops_str = *unsupported_ops.begin();
for (auto op_str = ++unsupported_ops.begin();
op_str != unsupported_ops.end();
op_str++) {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
}
std::vector<TargetType> targets = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
targets.push_back(valid_places_[i].target);
}
std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]);
for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]);
}
LOG(ERROR) << "Error: This model is not supported, because "
<< unsupported_ops.size() << " ops are not supported on '"
<< targets_str << "'. These unsupported ops are: '"
<< unsupported_ops_str << "'.";
exit(1);
}
if (print_ops_info) {
std::cout << "Paddle-Lite supports this model!" << std::endl;
exit(1);
}
}
} // namespace lite_api
} // namespace paddle
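For orientation, here is a minimal sketch of driving OptBase from C++, using only the setters implemented above and declared in opt_base.h below; the model paths are hypothetical and error handling is omitted:

// Minimal usage sketch for OptBase (paths are hypothetical).
#include "lite/api/opt_base.h"

int main() {
  paddle::lite_api::OptBase opt;
  opt.SetModelDir("./mobilenet_v1");        // non-combined model directory
  opt.SetValidPlaces("arm,opencl");         // parsed by SetValidPlaces above
  opt.SetModelType("naive_buffer");         // kNaiveBuffer is the default
  opt.SetOptimizeOut("./mobilenet_v1_opt");
  opt.RunOptimize(false);                   // false: do not record tailoring info
  return 0;
}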
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines Opt and basic functions about model transformation.
*/
#ifndef PADDLE_LITE_OPT_H_ // NOLINT
#define PADDLE_LITE_OPT_H_
#include <algorithm>
#include <iomanip>
#include <set>
#include <string>
#include <vector>
// stores the map that records the source_file path of each kernel.
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
// version of Paddle-lite
#include "lite/core/version.h"
// model parser functions to pre-load model to verify if this model is supported
#include "lite/model_parser/compatible_pb.h"
#include "lite/model_parser/pb/program_desc.h"
#include "lite/utils/string.h"
// records all the ops supported by paddle-lite
#include "supported_kernel_op_info.h" // NOLINT
namespace paddle {
namespace lite_api {
/// OptBase provides the basic interfaces of the opt tool for model
/// transformation and optimization.
class LITE_API OptBase {
public:
OptBase() = default;
void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path);
void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path);
// set optimized_model type
void SetModelType(std::string model_type);
// transform and save the optimized model
void RunOptimize(bool record_strip_info = false);
  // functions for printing info
// 1. help info
void PrintHelpInfo();
// 2. PrintOpsInfo
void PrintOpsInfo(const std::set<std::string> &valid_ops =
{}); // print supported ops on target_types
void PrintAllOps(); // print all ops
void PrintSupportedOps(); // print ops supported on valid_places_
void DisplayKernelsInfo(); // Display kernel information
// 3. Check if this model is supported
void CheckIfModelSupported(bool print_ops_info = true);
private:
CxxConfig opt_config_;
// valid places for the optimized_model
std::vector<Place> valid_places_;
// filename of the optimized_model
std::string optimize_out_path_;
// type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer};
  // Dir path of a set of models; when set, every model in the dir is optimized in turn
std::string model_set_dir_;
void RunOptimizeFromModelSet(bool record_strip_info = false);
};
} // namespace lite_api
} // namespace paddle
#endif // NOLINT
......@@ -38,6 +38,7 @@ void Tensor::Resize(const shape_t &shape) {
tensor(raw_tensor_)->Resize(shape);
}
// Tensor::data
template <>
const float *Tensor::data() const {
return ctensor(raw_tensor_)->data<float>();
......@@ -47,15 +48,19 @@ const int8_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int8_t>();
}
template <>
const uint8_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<uint8_t>();
}
template <>
const int64_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int64_t>();
}
template <>
const int32_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int32_t>();
}
// Tensor::mutable_data
template <>
int *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int>(type);
......@@ -69,6 +74,10 @@ int8_t *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int8_t>(type);
}
template <>
uint8_t *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<uint8_t>(type);
}
template <>
int64_t *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int64_t>(type);
}
......@@ -116,18 +125,22 @@ void Tensor::CopyToCpu(T *data) const {
template void Tensor::CopyFromCpu<int, TargetType::kHost>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kHost>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kHost>(const int8_t *);
template void Tensor::CopyFromCpu<uint8_t, TargetType::kHost>(const uint8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *);
template void Tensor::CopyFromCpu<uint8_t, TargetType::kARM>(const uint8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyToCpu(int8_t *) const;
template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const;
template void Tensor::CopyToCpu(uint8_t *) const;
shape_t Tensor::shape() const {
return ctensor(raw_tensor_)->dims().Vectorize();
......
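Given the new uint8_t specializations above, feeding a uint8 input could look like the following sketch (predictor setup omitted; the shape is illustrative):

// Sketch: filling a uint8 input tensor via the new specializations.
#include <cstdint>
#include <vector>
#include "lite/api/paddle_api.h"

void FillUint8Input(paddle::lite_api::Tensor* input) {
  input->Resize({1, 3, 224, 224});
  std::vector<uint8_t> src(1 * 3 * 224 * 224, 127);  // dummy pixel data
  input->CopyFromCpu<uint8_t, paddle::lite_api::TargetType::kHost>(src.data());
  // Reading back uses the matching const accessor:
  const uint8_t* data = input->data<uint8_t>();
  (void)data;
}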
......@@ -206,7 +206,7 @@ class LITE_API MobileConfig : public ConfigBase {
};
template <typename ConfigT>
std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&);
LITE_API std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&);
} // namespace lite_api
} // namespace paddle
......
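As a point of reference, the exported factory is typically used as in this sketch (assuming a MobileConfig pointing at an optimized model; the path is hypothetical):

// Sketch: creating a light predictor through the LITE_API factory.
#include <memory>
#include "lite/api/paddle_api.h"

std::shared_ptr<paddle::lite_api::PaddlePredictor> MakePredictor() {
  paddle::lite_api::MobileConfig config;
  config.set_model_dir("./mobilenet_v1_opt");  // hypothetical optimized model
  return paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(
      config);
}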
......@@ -15,9 +15,6 @@
#include "lite/api/paddle_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
DEFINE_string(model_dir, "", "");
......
......@@ -45,6 +45,21 @@ std::string Place::DebugString() const {
return os.str();
}
const std::string& ActivationTypeToStr(ActivationType act) {
static const std::string act2string[] = {"unk",
"Relu",
"Relu6",
"PRelu",
"LeakyRelu",
"Sigmoid",
"Tanh",
"Swish",
"Exp"};
auto x = static_cast<int>(act);
CHECK_LT(x, static_cast<int>(ActivationType::NUM));
return act2string[x];
}
const std::string& TargetToStr(TargetType target) {
static const std::string target2string[] = {"unk",
"host",
......
......@@ -96,7 +96,9 @@ enum class ActivationType : int {
kLeakyRelu = 4,
kSigmoid = 5,
kTanh = 6,
kSwish = 7
kSwish = 7,
kExp = 8,
NUM = 9,
};
static size_t PrecisionTypeLength(PrecisionType type) {
......@@ -148,6 +150,8 @@ _ForEachPrecisionType(DefinePrecisionTypeTrait);
#define PRECISION(item__) paddle::lite_api::PrecisionType::item__
#define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__
const std::string& ActivationTypeToStr(ActivationType act);
const std::string& TargetToStr(TargetType target);
const std::string& PrecisionToStr(PrecisionType precision);
......
......@@ -24,7 +24,7 @@ USE_MIR_PASS(generate_program_pass);
USE_MIR_PASS(io_copy_kernel_pick_pass);
USE_MIR_PASS(argument_type_display_pass);
USE_MIR_PASS(runtime_context_assign_pass);
USE_MIR_PASS(graph_visualze);
USE_MIR_PASS(graph_visualize_pass);
USE_MIR_PASS(lite_conv_bn_fuse_pass);
USE_MIR_PASS(lite_fc_fuse_pass);
......@@ -46,3 +46,4 @@ USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
......@@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON)
return()
endif()
# to create setup.py for packaging the whl for Paddle-Lite and opt
execute_process(
COMMAND git describe --tags --exact-match
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_TAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_subdirectory(pybind)
#add_subdirectory(interface)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(PYBIND_DEPS pybind python paddle_api_light paddle_api)
if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
......
......@@ -26,13 +26,11 @@
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/opt_base.h"
#endif
#include "lite/api/light_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/tensor.h"
namespace py = pybind11;
......@@ -50,10 +48,27 @@ using lite_api::PrecisionType;
using lite_api::DataLayoutType;
using lite_api::Place;
using lite::LightPredictorImpl;
using lite_api::OptBase;
#ifndef LITE_ON_TINY_PUBLISH
using lite::CxxPaddleApiImpl;
static void BindLiteCxxPredictor(py::module *m);
void BindLiteOpt(py::module *m) {
py::class_<OptBase> opt_base(*m, "Opt");
opt_base.def(py::init<>())
.def("set_model_dir", &OptBase::SetModelDir)
.def("set_modelset_dir", &OptBase::SetModelSetDir)
.def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut)
.def("set_model_type", &OptBase::SetModelType)
.def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps)
.def("display_kernels_info", &OptBase::DisplayKernelsInfo)
.def("print_all_ops", &OptBase::PrintAllOps);
}
#endif
static void BindLiteLightPredictor(py::module *m);
static void BindLiteCxxConfig(py::module *m);
......
......@@ -22,11 +22,15 @@ namespace lite {
namespace pybind {
void BindLiteApi(pybind11::module *m);
void BindLiteOpt(pybind11::module *m);
PYBIND11_MODULE(lite_core, m) {
PYBIND11_MODULE(lite, m) {
m.doc() = "C++ core of Paddle-Lite";
BindLiteApi(&m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteOpt(&m);
#endif
}
} // namespace pybind
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module for packing the whl installer for Paddle-Lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
'binary distribution'
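    # Returning True marks the distribution as binary (platform-specific),
    # since the package ships the native lite.so.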
    def has_ext_modules(self):
        return True
# Get the Paddle-Lite version; if it's not based on a release tag, use the commit id instead
PADDLELITE_COMMIT = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
    PADDLELITE_VERSION = PADDLELITE_COMMIT
else:
    PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
# set the rpath of lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
os.remove(LIB_PATH+'/__init__.py')
# set dir path of each package
PACKAGE_DIR = {
# The paddle.fluid.proto will be generated while compiling.
# So that package points to other directory.
'paddlelite.libs': LIB_PATH,
'paddlelite': LITE_PATH
}
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
description='Paddle-Lite Library',
packages=['paddlelite', 'paddlelite.libs'],
package_dir=PACKAGE_DIR,
package_data=PACKAGE_DATA,
distclass=BinaryDistribution
)
......@@ -80,7 +80,7 @@ void TestModel(const std::vector<Place>& valid_places) {
fclose(fp);
}
TEST(ResNet50, test_bm) {
TEST(Classify, test_bm) {
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......
......@@ -17,6 +17,7 @@
#include <gflags/gflags.h>
#include <sys/time.h>
#include <time.h>
#include <cmath>
// for eval
DEFINE_string(model_dir, "", "model dir");
......@@ -43,5 +44,31 @@ inline double GetCurrentUS() {
return 1e+6 * time.tv_sec + time.tv_usec;
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
} // namespace lite
} // namespace paddle
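A quick usage sketch for the two statistics helpers above (assuming this test helper header is included; the values are illustrative):

// Example: mean and standard deviation of a small output buffer.
#include <vector>

void StatsExample() {
  std::vector<float> out = {1.f, 2.f, 3.f, 4.f};
  double mean = paddle::lite::compute_mean<float>(out.data(), out.size());
  // mean == 2.5
  double sd = paddle::lite::compute_standard_deviation<float>(
      out.data(), out.size(), true, mean);
  // sd == sqrt(1.25) ~= 1.118
  (void)sd;
}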
......@@ -28,11 +28,10 @@ DEFINE_int32(batch, 1, "batch");
namespace paddle {
namespace lite {
namespace test_transformer {
std::vector<std::string> inputed_lines;
void LoadInputLines(const char* filename) {
void load_input_lines(const char* filename) {
static const int max_line_buf_size = 100 * 1024 * 1024;
char* line_buffer = (char*)calloc(max_line_buf_size, sizeof(char)); // NOLINT
FILE* input_file = fopen(filename, "r");
......@@ -49,7 +48,7 @@ void LoadInputLines(const char* filename) {
line_buffer = NULL;
fclose(input_file);
}
void Split2(const std::string& main_str,
void split2(const std::string& main_str,
std::vector<std::string>& str_list, // NOLINT
const std::string& delimiter) {
size_t pre_pos = 0;
......@@ -75,19 +74,19 @@ void Split2(const std::string& main_str,
}
} // NOLINT
void PadBatchInput(std::vector<std::string>& input_lines, // NOLINT
int pad_idx,
int n_head,
Tensor* src_word,
Tensor* src_pos,
Tensor* src_attn_bias,
Tensor* trg_word,
Tensor* init_scores,
Tensor* init_idx,
Tensor* trg_bias,
int line_start,
int batch_size,
int bos_idx) {
void pad_batch_input(std::vector<std::string>& input_lines, // NOLINT
int pad_idx,
int n_head,
Tensor* src_word,
Tensor* src_pos,
Tensor* src_attn_bias,
Tensor* trg_word,
Tensor* init_scores,
Tensor* init_idx,
Tensor* trg_bias,
int line_start,
int batch_size,
int bos_idx) {
int max_len = 0;
int max_line = input_lines.size();
......@@ -98,27 +97,27 @@ void PadBatchInput(std::vector<std::string>& input_lines, // NOLINT
std::vector<std::string> split_str;
test_transformer::Split2(cur_line, split_str, " ");
test_transformer::split2(cur_line, split_str, " ");
batch_lines.push_back(split_str);
max_len = max_len >= split_str.size() ? max_len : split_str.size();
}
src_word->Resize(std::vector<DDim::value_type>({batch_size, max_len, 1}));
src_pos->Resize(std::vector<DDim::value_type>({batch_size, max_len, 1}));
src_word->Resize(std::vector<DDim::value_type>({batch_size, max_len}));
src_pos->Resize(std::vector<DDim::value_type>({batch_size, max_len}));
src_attn_bias->Resize(
std::vector<DDim::value_type>({batch_size, n_head, max_len, max_len}));
trg_bias->Resize(
std::vector<DDim::value_type>({batch_size, n_head, 1, max_len}));
float* src_word_data = src_word->mutable_data<float>();
float* src_pos_data = src_pos->mutable_data<float>();
std::vector<DDim::value_type>({batch_size, n_head, max_len, max_len}));
auto* src_word_data = src_word->mutable_data<int64_t>();
auto* src_pos_data = src_pos->mutable_data<int64_t>();
float* src_bias_data = src_attn_bias->mutable_data<float>();
float* trg_bias_data = trg_bias->mutable_data<float>();
for (int i = 0; i < batch_size; ++i) {
std::vector<std::string> cur_words = batch_lines[i];
int fill_len = cur_words.size();
int src_bias_start = i * n_head * max_len * max_len;
int trg_bias_start = i * n_head * max_len;
int trg_bias_start = i * n_head * max_len * max_len;
for (int j = 0; j < fill_len; ++j) {
src_word_data[i * max_len + j] = (atoi(cur_words[j].c_str()));
src_pos_data[i * max_len + j] = j;
......@@ -137,22 +136,24 @@ void PadBatchInput(std::vector<std::string>& input_lines, // NOLINT
int value_ind = j % max_len + src_bias_start;
src_bias_data[j] = src_bias_data[value_ind];
}
for (int j = trg_bias_start; j < trg_bias_start + n_head * max_len; ++j) {
for (int j = trg_bias_start;
j < trg_bias_start + n_head * max_len * max_len;
++j) {
int value_ind = j % max_len + trg_bias_start;
trg_bias_data[j] = trg_bias_data[value_ind];
}
}
trg_word->Resize(std::vector<DDim::value_type>({batch_size, 1, 1}));
auto* trg_word_data = trg_word->mutable_data<float>();
for (int i = 0; i < batch_size; ++i) {
trg_word->Resize(std::vector<DDim::value_type>({batch_size, max_len}));
auto* trg_word_data = trg_word->mutable_data<int64_t>();
for (int i = 0; i < batch_size * max_len; ++i) {
trg_word_data[i] = bos_idx;
}
init_scores->Resize(std::vector<DDim::value_type>({batch_size, 1}));
init_idx->Resize(std::vector<DDim::value_type>({batch_size}));
float* score_data = init_scores->mutable_data<float>();
float* idx_data = init_idx->mutable_data<float>();
auto* idx_data = init_idx->mutable_data<int32_t>();
for (int i = 0; i < init_scores->numel(); ++i) {
score_data[i] = 0;
}
......@@ -175,21 +176,25 @@ void PadBatchInput(std::vector<std::string>& input_lines, // NOLINT
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
#endif
lite::Predictor predictor;
std::string test_data_path = FLAGS_input;
predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
predictor.Build("",
FLAGS_model_dir + "/__model__",
FLAGS_model_dir + "/weights",
valid_places);
// predictor.Build(FLAGS_model_dir, "", "", valid_places);
int n_head = 8;
int batch_size = FLAGS_batch;
int bos_idx = 0;
int eos_idx = 1;
LOG(INFO) << "reading";
test_transformer::LoadInputLines(test_data_path.c_str());
LOG(INFO) << "reading finished";
test_transformer::load_input_lines(test_data_path.c_str());
auto* trg_bias = predictor.GetInput(6);
auto* src_word = predictor.GetInput(0);
......@@ -205,28 +210,31 @@ void TestModel(const std::vector<Place>& valid_places,
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
auto start_i = GetCurrentUS();
PadBatchInput(test_transformer::inputed_lines,
eos_idx,
n_head,
src_word, // src_word
src_pos, // src_pos
src_bias, // src_bias
trg_word, // trg_word
init_score, // init_score
init_idx, // init_idx
trg_bias, // trg_bias
i * batch_size,
batch_size,
bos_idx);
LOG(INFO) << "src_word:" << src_word->dims();
auto start_ii = GetCurrentUS();
LOG(INFO) << i << "->ii:" << (start_ii - start_i) / 1000.0;
pad_batch_input(test_transformer::inputed_lines,
eos_idx,
n_head,
src_word, // src_word
src_pos, // src_pos
src_bias, // src_bias
trg_word, // trg_word
init_score, // init_score
init_idx, // init_idx
trg_bias, // trg_bias
i * batch_size,
batch_size,
bos_idx);
predictor.Run();
auto start_iii = GetCurrentUS();
LOG(INFO) << i << "->iii:" << (start_iii - start_ii) / 1000.0;
auto* outs = predictor.GetOutputs();
LOG(INFO) << "out:" << (*outs)[0].dims();
auto* outs = predictor.GetOutput(0);
auto o_data = outs->data<int64_t>();
auto lod = outs->lod();
for (int i = 0; i < outs->numel(); ++i) {
LOG(INFO) << o_data[i];
}
for (int i = 0; i < lod.size(); ++i) {
for (int j = 0; j < lod[i].size(); ++j) {
LOG(INFO) << lod[i][j];
}
}
}
LOG(INFO) << "================== Speed Report ===================";
......@@ -234,25 +242,18 @@ void TestModel(const std::vector<Place>& valid_places,
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* outs = predictor.GetOutputs();
for (auto out : *outs) {
LOG(INFO) << "======"
<< "here";
LOG(INFO) << out;
}
LOG(INFO) << "======"
<< "hereggg";
}
TEST(OcrAttention, test_arm) {
} // namespace lite
} // namespace paddle
using namespace paddle::lite; // NOLINT
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kInt64)},
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kHost), PRECISION(kFloat)},
});
TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}));
}
} // namespace lite
} // namespace paddle
......@@ -68,6 +68,8 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
gemv_arm_int8.cc
conv3x3s1_direct_fp32.cc
conv3x3s2_direct_fp32.cc
conv3x3s1p01_depthwise_fp32_relu.cc
conv3x3s2p01_depthwise_fp32_relu.cc
conv3x3s1p01_depthwise_fp32.cc
conv3x3s2p01_depthwise_fp32.cc
conv3x3s1px_depthwise_fp32.cc
......
......@@ -700,6 +700,35 @@ void act_rsqrt<float>(const float* din, float* dout, int size, int threads) {
}
}
template <>
void act_square<float>(const float* din, float* dout, int size, int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = ptr_in[0] * ptr_in[0];
ptr_in++;
ptr_out++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
const float* dout_grad,
float* din_grad,
int size,
int threads) {
const float* ptr_out_grad = dout_grad;
float* ptr_in_grad = din_grad;
for (int i = 0; i < size; ++i) {
ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0];
ptr_out_grad++;
ptr_in_grad++;
din++;
}
}
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -69,6 +69,15 @@ void act_hard_sigmoid(const T* din,
template <typename T>
void act_rsqrt(const T* din, T* dout, int size, int threads);
template <typename T>
void act_square(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
const T* din, const T* dout_grad, T* din_grad, int size, int threads);
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
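A small sketch exercising the square activation above and, under LITE_WITH_TRAIN, its gradient (d(x^2)/dx = 2x); the header path is an assumption and the values are illustrative:

// Forward: y = x * x; backward: dx = dy * 2 * x.
#include "lite/backends/arm/math/activation.h"  // assumed header location

void SquareExample() {
  float x[3] = {1.f, -2.f, 3.f};
  float y[3];
  paddle::lite::arm::math::act_square<float>(x, y, 3, /*threads=*/1);
  // y == {1, 4, 9}
#ifdef LITE_WITH_TRAIN
  float dy[3] = {1.f, 1.f, 1.f};
  float dx[3];
  paddle::lite::arm::math::act_square_grad<float>(x, dy, dx, 3, /*threads=*/1);
  // dx == {2, -4, 6}
#endif
}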
......@@ -70,7 +70,7 @@ void PruneEndBeams(const Tensor *pre_ids,
std::vector<std::vector<Item>> *items,
size_t lod_level,
int end_id) {
auto *pre_ids_data = pre_ids->data<float>();
auto *pre_ids_data = pre_ids->data<int64_t>();
auto &high_level = abs_lod[lod_level];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
......@@ -152,10 +152,10 @@ std::vector<std::vector<Item>> SelectTopBeamSizeItems(const Tensor *pre_ids,
// find the current candidates
// auto abs_lod = framework::ToAbsOffset(scores->lod());
auto abs_lod = scores->lod();
auto *pre_ids_data = pre_ids->data<float>();
auto *pre_ids_data = pre_ids->data<int64_t>();
auto *pre_scores_data = pre_scores->data<float>();
auto *ids_data = ids ? ids->data<int>() : nullptr;
auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
auto *scores_data = scores->data<float>();
size_t num_seqs = abs_lod[lod_level].size() - 1;
......@@ -236,7 +236,7 @@ void beam_search(const Tensor *pre_ids,
if (parent_idx) {
parent_idx->Resize(dims);
}
auto *selected_ids_data = selected_ids->mutable_data<float>();
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
auto *parent_idx_data =
parent_idx ? parent_idx->mutable_data<int>() : nullptr;
......
......@@ -508,6 +508,8 @@ void act_switch_3x3s1(const float* inr0,
"x5",
"x6",
"x7");
#else
#if 1 // def LITE_WITH_ARM_CLANG
#else
asm volatile(COMPUTE RELU STORE
: [r0] "+r"(inr0),
......@@ -541,6 +543,7 @@ void act_switch_3x3s1(const float* inr0,
"r3",
"r4",
"r5");
#endif
#endif
break;
case lite_api::ActivationType::kRelu6:
......@@ -593,6 +596,8 @@ void act_switch_3x3s1(const float* inr0,
"x5",
"x6",
"x7");
#else
#if 1 // def LITE_WITH_ARM_CLANG
#else
asm volatile(COMPUTE RELU RELU6 STORE
: [r0] "+r"(inr0),
......@@ -626,6 +631,7 @@ void act_switch_3x3s1(const float* inr0,
"r3",
"r4",
"r5");
#endif
#endif
break;
case lite_api::ActivationType::kLeakyRelu:
......@@ -678,6 +684,8 @@ void act_switch_3x3s1(const float* inr0,
"x5",
"x6",
"x7");
#else
#if 1 // def LITE_WITH_ARM_CLANG
#else
asm volatile(COMPUTE LEAKY_RELU STORE
: [r0] "+r"(inr0),
......@@ -711,6 +719,7 @@ void act_switch_3x3s1(const float* inr0,
"r3",
"r4",
"r5");
#endif
#endif
break;
default:
......@@ -768,6 +777,8 @@ void act_switch_3x3s1(const float* inr0,
"x5",
"x6",
"x7");
#else
#if 1 // def LITE_WITH_ARM_CLANG
#else
asm volatile(COMPUTE STORE
: [r0] "+r"(inr0),
......@@ -801,6 +812,7 @@ void act_switch_3x3s1(const float* inr0,
"r3",
"r4",
"r5");
#endif
#endif
}
}
......@@ -988,6 +1000,8 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
w8,
vbias,
act_param);
#else
#if 1 // def LITE_WITH_ARM_CLANG
#else
act_switch_3x3s1(inr0,
inr1,
......@@ -1008,6 +1022,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
vbias,
vbias,
act_param);
#endif
#endif
outl[0] += 4;
outl[1] += 4;
......
......@@ -102,7 +102,7 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
if (h + hout_r_block > hout) {
h_kernel = hout - h;
}
int hs = h - padh;
int hs = h * 2 - padh;
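    // For stride 2, the first input row feeding output row h is h * 2 - padh.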
int he = hs + h_kernel * 2 + 3;
#pragma omp parallel for num_threads(threads)
......
......@@ -2203,7 +2203,7 @@ inline void act_switch_c8_fp32(const float* din_ptr,
[cnt] "+r"(cnt_loop)
:
: "cc",
"meemory",
"memory",
"q0",
"q1",
"q2",
......
......@@ -207,6 +207,118 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
int padh,
ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p0_bias_s_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s1p1_bias_s_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
const float* din,
const float* weights,
const float* bias,
bool flag_bias,
bool flag_relu,
const int num,
const int ch_in,
const int h_in,
const int w_in,
const int h_out,
const int w_out,
ARMContext* ctx);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -573,6 +573,22 @@ template void conv_im2col_gemm_int8<float>(const int8_t* i_data,
ARMContext* ctx,
const float* scale);
template void im2col<float>(const float* data_im,
int channels,
int height,
int width,
int kernel_h,
int kernel_w,
int pad_top,
int pad_bottom,
int pad_left,
int pad_right,
int stride_h,
int stride_w,
int dilation_h,
int dilation_w,
float* data_col);
void conv_depthwise_3x3_fp32(const void* din,
void* dout,
int num,
......@@ -613,6 +629,26 @@ void conv_depthwise_3x3_fp32(const void* din,
act_param,
ctx);
} else {
#ifdef __aarch64__
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
ch_out,
h_out,
w_out,
ch_in,
h_in,
w_in,
reinterpret_cast<const float*>(weights),
bias,
param,
act_param,
ctx);
#else
#ifdef LITE_WITH_ARM_CLANG
LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, "
"this can run in basic";
#else
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -627,6 +663,8 @@ void conv_depthwise_3x3_fp32(const void* din,
param,
act_param,
ctx);
#endif
#endif
}
} else if (stride == 2) {
if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
......
......@@ -359,6 +359,24 @@ void conv_compute_2x2_3x3_small(const float* input,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx);
template <typename Dtype>
void im2col(const Dtype* data_im,
int channels,
int height,
int width,
int kernel_h,
int kernel_w,
int pad_top,
int pad_bottom,
int pad_left,
int pad_right,
int stride_h,
int stride_w,
int dilation_h,
int dilation_w,
Dtype* data_col);
} // namespace math
} // namespace arm
} // namespace lite
......
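For context, here is an illustrative call of the im2col template declared above, for a 3x3 / stride-1 / pad-1 convolution; the wrapper and the header path are hypothetical, and buffer sizing follows from the arguments:

#include <vector>
#include "lite/backends/arm/math/conv_impl.h"  // assumed header location

// With a 3x3 kernel, stride 1 and pad 1, the output spatial size equals the
// input's, so data_col needs channels * 3 * 3 * height * width floats.
void Im2ColExample(const float* data_im, int channels, int height, int width,
                   std::vector<float>* data_col) {
  data_col->resize(static_cast<size_t>(channels) * 9 * height * width);
  paddle::lite::arm::math::im2col<float>(
      data_im, channels, height, width,
      /*kernel_h=*/3, /*kernel_w=*/3,
      /*pad_top=*/1, /*pad_bottom=*/1, /*pad_left=*/1, /*pad_right=*/1,
      /*stride_h=*/1, /*stride_w=*/1, /*dilation_h=*/1, /*dilation_w=*/1,
      data_col->data());
}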
(Diffs of the remaining files are collapsed.)