add static lite demo (#2899)

7ebb6f29 · Guanghua Yu · GitHub · f331f0a1 · 7ebb6f29 · 7ebb6f29
6 changed file
--- a/deploy/lite/README.md
+++ b/deploy/lite/README.md
@@ -195,18 +195,19 @@ cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/
 执行完成后，detection文件夹下将有如下文件格式：
 ```
-demo/cxx/clas/
+demo/cxx/detection/
 |-- debug/
-|   |--ppyolo_tiny.nb                优化后的检测器模型文件
+|   |--ppyolo_tiny.nb                   优化后的检测器模型文件
-|   |--000000014439.jpg                    	待测试图像
+|   |--000000014439.jpg                 待测试图像
-|   |--coco_label_list.txt                类别映射文件
+|   |--coco_label_list.txt              类别映射文件
 |   |--libpaddle_light_api_shared.so    C++预测库文件
-|   |--config_ppyolo_tiny.txt                       分类预测超参数配置
+|   |--config_ppyolo_tiny.txt           检测模型预测超参数配置
-|-- image_classfication.cpp            	图像分类代码文件
+|-- run_detection.cc                    目标检测代码文件
-|-- Makefile                    				编译文件
+|-- Makefile                            编译文件
 ```
-#### 注意：
+**注意：**
 * 上述文件中，`coco_label_list.txt` 是COCO数据集的类别映射文件，如果使用自定义的类别，需要更换该类别映射文件。
 *  `config_ppyolo_tiny.txt` 包含了检测器的超参数，如下：

--- a/static/deploy/lite/Makefile
+++ b/static/deploy/lite/Makefile
+# Builds the `detect_system` demo binary from run_detection.cc.
+# ARM_ABI is exported so that the included ../Makefile.def can read it
+# (presumably to pick the toolchain flags -- confirm against Makefile.def).
+ARM_ABI = arm8
+export ARM_ABI
+include ../Makefile.def
+LITE_ROOT=../../../
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+OPENCV_VERSION=opencv4.1.0
+OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
+# Same directory as before, now derived from THIRD_PARTY_DIR instead of a
+# second hard-coded copy of the path.
+OPENCV_INCLUDE = -I${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/include
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
+###############################################################
+# How to use one of static library:                           #
+#  `libpaddle_api_full_bundled.a`                             #
+#  `libpaddle_api_light_bundled.a`                            #
+###############################################################
+# Note: default use lite's shared library.                    #
+###############################################################
+# 1. Comment above line using `libpaddle_light_api_shared.so`
+# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
+#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+# Default target: link the demo binary.
+detect_system: detect_system.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) detect_system.o -o detect_system  $(CXX_LIBS) $(LDFLAGS)
+# fetch_opencv is an order-only prerequisite: the OpenCV headers and libraries
+# must exist before compiling, but the (always-run) fetch step must not force
+# a recompile and relink on every invocation of make.
+# NOTE(review): SYSROOT_COMPLILE is kept as spelled; it is expected to be
+# defined under that name in ../Makefile.def -- confirm before renaming.
+detect_system.o: run_detection.cc | fetch_opencv
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o detect_system.o -c run_detection.cc
+# Downloads and unpacks the prebuilt OpenCV archive unless it is already there.
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} ||  mkdir -p ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+      (echo "fetch opencv libs" && \
+      wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+      tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+.PHONY: clean fetch_opencv
+clean:
+	rm -f detect_system.o
+	rm -f detect_system
--- a/static/deploy/lite/README.md
+++ b/static/deploy/lite/README.md
+# Paddle-Lite端侧部署
+本教程将介绍基于[Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite) 在移动端部署PaddleDetection的**静态图**模型的详细步骤。
+Paddle Lite是飞桨轻量化推理引擎，为手机、IOT端提供高效推理能力，并广泛整合跨平台硬件，为端侧部署及应用落地问题提供轻量化的部署方案。
+## 1. 准备环境
+### 运行准备
+- 电脑（编译Paddle Lite）
+- 安卓手机（armv7或armv8）
+### 1.1 准备交叉编译环境
+交叉编译环境用于编译 Paddle Lite 和 PaddleDetection 的C++ demo。
+支持多种开发环境，不同开发环境的编译流程请参考对应文档。
+1. [Docker](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#docker)
+2. [Linux](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#linux)
+3. [MAC OS](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#mac-os)
+### 1.2 准备预测库
+预测库有两种获取方式：
+1. [**建议**]直接下载，预测库下载链接如下：
+      |平台|预测库下载链接|
+      |-|-|
+      |Android|[arm7](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.8/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.with_cv.tar.gz) / [arm8](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.8/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv.tar.gz)|
+      |iOS|[arm7](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.8/inference_lite_lib.ios.armv7.with_cv.with_extra.tiny_publish.tar.gz) / [arm8](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.8/inference_lite_lib.ios.armv8.with_cv.with_extra.tiny_publish.tar.gz)|
+      注：1. 如果是从 Paddle-Lite [官方文档](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc)下载的预测库，注意选择`with_extra=ON，with_cv=ON`的下载链接。
+2. 编译Paddle-Lite得到预测库，Paddle-Lite的编译方式如下：
+```shell
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+# 如果使用编译方式，建议使用develop分支编译预测库
+git checkout develop
+./lite/tools/build_android.sh  --arch=armv8  --with_cv=ON --with_extra=ON
+```
+**注意**：编译Paddle-Lite获得预测库时，需要打开`--with_cv=ON --with_extra=ON`两个选项，`--arch`表示`arm`版本，这里指定为armv8，更多编译命令介绍请参考[链接](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_andriod.html#id2)。
+直接下载预测库并解压后，可以得到`inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv/`文件夹，通过编译Paddle-Lite得到的预测库位于`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/`文件夹下。
+预测库的文件目录如下：
+```
+inference_lite_lib.android.armv8/
+|-- cxx                                        C++ 预测库和头文件
+|   |-- include                                C++ 头文件
+|   |   |-- paddle_api.h
+|   |   |-- paddle_image_preprocess.h
+|   |   |-- paddle_lite_factory_helper.h
+|   |   |-- paddle_place.h
+|   |   |-- paddle_use_kernels.h
+|   |   |-- paddle_use_ops.h
+|   |   `-- paddle_use_passes.h
+|   `-- lib                                           C++预测库
+|       |-- libpaddle_api_light_bundled.a             C++静态库
+|       `-- libpaddle_light_api_shared.so             C++动态库
+|-- java                                     Java预测库
+|   |-- jar
+|   |   `-- PaddlePredictor.jar
+|   |-- so
+|   |   `-- libpaddle_lite_jni.so
+|   `-- src
+|-- demo                                     C++和Java示例代码
+|   |-- cxx                                  C++  预测库demo
+|   `-- java                                 Java 预测库demo
+```
+## 2 开始运行
+### 2.1 模型优化
+Paddle-Lite 提供了多种策略来自动优化原始的模型，其中包括量化、子图融合、混合调度、Kernel优选等方法，使用Paddle-Lite的`opt`工具可以自动对inference模型进行优化，目前支持两种优化方式，优化后的模型更轻量，模型运行速度更快。
+**注意**：如果已经准备好了 `.nb` 结尾的模型文件，可以跳过此步骤。
+#### 2.1.1 安装paddle_lite_opt工具
+安装paddle_lite_opt工具有如下两种方法：
+1. [**建议**]pip安装paddlelite并进行转换
+    ```shell
+    pip install paddlelite
+    ```
+2. 源码编译Paddle-Lite生成opt工具
+    模型优化需要Paddle-Lite的`opt`可执行文件，可以通过编译Paddle-Lite源码获得，编译步骤如下：
+    ```shell
+    # 如果准备环境时已经clone了Paddle-Lite，则不用重新clone Paddle-Lite
+    git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+    cd Paddle-Lite
+    git checkout develop
+    # 启动编译
+    ./lite/tools/build.sh build_optimize_tool
+    ```
+    编译完成后，`opt`文件位于`build.opt/lite/api/`下，可通过如下方式查看`opt`的运行选项和使用方式；
+    ```shell
+    cd build.opt/lite/api/
+    ./opt
+    ```
+    `opt`的使用方式与参数与上面的`paddle_lite_opt`完全一致。
+之后使用`paddle_lite_opt`工具可以进行inference模型的转换。`paddle_lite_opt`的部分参数如下：
+|选项|说明|
+|-|-|
+|--model_file|待优化的PaddlePaddle模型（combined形式）的网络结构文件路径|
+|--param_file|待优化的PaddlePaddle模型（combined形式）的权重文件路径|
+|--optimize_out_type|输出模型类型，目前支持两种类型：protobuf和naive_buffer，其中naive_buffer是一种更轻量级的序列化/反序列化实现，默认为naive_buffer|
+|--optimize_out|优化模型的输出路径|
+|--valid_targets|指定模型可执行的backend，默认为arm。目前可支持x86、arm、opencl、npu、xpu，可以同时指定多个backend(以空格分隔)，Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU（Kirin 810/990 Soc搭载的达芬奇架构NPU），应当设置为npu, arm|
+更详细的`paddle_lite_opt`工具使用说明请参考[使用opt转化模型文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/opt/opt_bin.html)
+`--model_file`表示inference模型的model文件地址，`--param_file`表示inference模型的param文件地址；`--optimize_out`用于指定输出文件的名称（不需要添加`.nb`的后缀）。直接在命令行中运行`paddle_lite_opt`，也可以查看所有参数及其说明。
+#### 2.1.2 转换示例
+下面以PaddleDetection中的 `PP-YOLO-tiny` 模型为例，介绍使用`paddle_lite_opt`完成预训练模型到inference模型，再到Paddle-Lite优化模型的转换。
+```shell
+# 进入PaddleDetection根目录
+cd PaddleDetection_root_path
+# 进入静态图模型文件夹
+cd static
+# 将预训练模型导出为inference模型
+python tools/export_model.py -c configs/ppyolo/ppyolo_tiny.yml -o weights=https://paddlemodels.bj.bcebos.com/object_detection/ppyolo_tiny.pdparams
+# 将inference模型转化为Paddle-Lite优化模型
+paddle_lite_opt --model_file=output/ppyolo_tiny/__model__ --param_file=output/ppyolo_tiny/__params__ --optimize_out=ppyolo_tiny
+```
+最终在当前文件夹下生成`ppyolo_tiny.nb`的文件。
+**注意**：`--optimize_out` 参数为优化后模型的保存路径，无需加后缀`.nb`；`--model_file` 参数为模型结构信息文件的路径，`--param_file` 参数为模型权重信息文件的路径，请注意文件名。
+### 2.2 与手机联调
+首先需要进行一些准备工作。
+1. 准备一台arm8的安卓手机，如果编译的预测库和opt文件是armv7，则需要arm7的手机，并修改Makefile中`ARM_ABI = arm7`。
+2. 电脑上安装ADB工具，用于调试。 ADB安装方式如下：
+    2.1. MAC电脑安装ADB:
+    ```shell
+    brew install --cask android-platform-tools
+    ```
+    2.2. Linux安装ADB
+    ```shell
+    sudo apt update
+    sudo apt install -y wget adb
+    ```
+    2.3. Windows安装ADB
+    Windows上安装需要去谷歌的安卓平台下载ADB软件包进行安装：[链接](https://developer.android.com/studio)
+3. 手机连接电脑后，开启手机`USB调试`选项，选择`文件传输`模式，在电脑终端中输入：
+```shell
+adb devices
+```
+如果有device输出，则表示安装成功，如下所示：
+```
+List of devices attached
+744be294    device
+```
+4. 准备优化后的模型、预测库文件、测试图像和类别映射文件。
+```shell
+cd PaddleDetection_root_path
+cd static/deploy/lite/
+# 将预测库文件、测试图像和使用的类别字典文件放置在预测库中的demo/cxx/detection文件夹下
+inference_lite_path=/{lite prediction library path}/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv/
+mkdir -p  $inference_lite_path/demo/cxx/detection/debug/
+cp ../../ppyolo_tiny.nb $inference_lite_path/demo/cxx/detection/debug/
+cp  ./coco_label_list.txt  $inference_lite_path/demo/cxx/detection/debug/
+cp Makefile run_detection.cc  $inference_lite_path/demo/cxx/detection/
+cp ./config_ppyolo_tiny.txt  $inference_lite_path/demo/cxx/detection/debug/
+cp ../../demo/000000014439.jpg  $inference_lite_path/demo/cxx/detection/debug/
+# 进入lite demo的工作目录
+cd $inference_lite_path
+cd demo/cxx/detection/
+# 将C++预测动态库so文件复制到debug文件夹中
+cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/
+```
+执行完成后，detection文件夹下将有如下文件格式：
+```
+demo/cxx/detection/
+|-- debug/
+|   |--ppyolo_tiny.nb                   优化后的检测器模型文件
+|   |--000000014439.jpg                 待测试图像
+|   |--coco_label_list.txt              类别映射文件
+|   |--libpaddle_light_api_shared.so    C++预测库文件
+|   |--config_ppyolo_tiny.txt           检测模型预测超参数配置
+|-- run_detection.cc                    目标检测代码文件
+|-- Makefile                            编译文件
+```
+**注意：**
+* 上述文件中，`coco_label_list.txt` 是COCO数据集的类别映射文件，如果使用自定义的类别，需要更换该类别映射文件。
+*  `config_ppyolo_tiny.txt` 包含了检测器的超参数，如下：
+```shell
+model_file ./ppyolo_tiny.nb         # 模型文件地址
+label_path ./coco_label_list.txt    # 类别映射文本文件
+num_threads 1                       # 线程数
+enable_benchmark 1                  # 是否运行benchmark
+Resize 320,320                      # resize图像尺寸
+keep_ratio False                    # 是否keep ratio
+mean 0.485,0.456,0.406              # 预处理均值
+std 0.229,0.224,0.225               # 预处理标准差
+precision fp32                      # 模型精度
+```
+5. 启动调试，上述步骤完成后就可以使用ADB将文件夹 `debug/` push到手机上运行，步骤如下：
+```shell
+# 执行编译，得到可执行文件detect_system
+make
+# 将编译得到的可执行文件移动到debug文件夹中
+mv detect_system ./debug/
+# 将上述debug文件夹push到手机上
+adb push debug /data/local/tmp/
+adb shell
+cd /data/local/tmp/debug
+export LD_LIBRARY_PATH=/data/local/tmp/debug:$LD_LIBRARY_PATH
+# detect_system可执行文件的使用方式为:
+# ./detect_system 配置文件路径  测试图像路径
+./detect_system ./config_ppyolo_tiny.txt ./000000014439.jpg
+```
+如果对代码做了修改，则需要重新编译并push到手机上。
+运行效果如下：
+<div align="center">
+    <img src="../../../docs/images/lite_demo.jpg" width="600">
+</div>
+## FAQ
+Q1：如果想更换模型怎么办，需要重新按照流程走一遍吗？  
+A1：如果已经走通了上述步骤，更换模型只需要替换 `.nb` 模型文件即可，同时要注意修改下配置文件中的 `.nb` 文件路径以及类别映射文件（如有必要）。
+Q2：换一个图测试怎么做？  
+A2：替换 debug 下的测试图像为你想要测试的图像，使用 ADB 再次 push 到手机上即可。
--- a/static/deploy/lite/coco_label_list.txt
+++ b/static/deploy/lite/coco_label_list.txt
+person
+bicycle
+car
+motorcycle
+airplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+backpack
+umbrella
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+couch
+potted plant
+bed
+dining table
+toilet
+tv
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
\ No newline at end of file
--- a/static/deploy/lite/config_ppyolo_tiny.txt
+++ b/static/deploy/lite/config_ppyolo_tiny.txt
+model_file ./ppyolo_tiny.nb
+label_path ./coco_label_list.txt
+num_threads 1
+precision fp32
+enable_benchmark 1
+arch YOLO
+image_shape 3,320,320
+Resize 320,320
+keep_ratio False
+mean 0.485,0.456,0.406
+std 0.229,0.224,0.225
+PadStride 0
--- a/static/deploy/lite/run_detection.cc
+++ b/static/deploy/lite/run_detection.cc
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include "paddle_api.h"  // NOLINT
+using namespace paddle::lite_api;  // NOLINT
+using namespace std;
+// One accepted detection, as produced by visualize_result.
+struct Object {
+  // Bounding box after clipping to the image rectangle.
+  cv::Rect rec;
+  // Class index read from the model output; defaults to 0 so a freshly
+  // constructed Object never holds an indeterminate value.
+  int class_id = 0;
+  // Confidence score read from the model output.
+  float prob = 0.f;
+};
+// Preprocessing parameters (and optionally data) for one input image.
+struct ImageBlob {
+  // Two values taken from the "Resize" config entry; used as the resize
+  // target and as the spatial dimensions of the input tensor.
+  std::vector<float> im_shape_;
+  // Buffer for image data after preprocessing. Initialised to nullptr so that
+  // copying a default-constructed ImageBlob never reads an indeterminate
+  // pointer.
+  const float* im_data_ = nullptr;
+  // Per-channel mean subtracted from the pixel values.
+  std::vector<float> mean_;
+  // Per-channel divisor (the "std" config entry) applied after the mean
+  // subtraction.
+  std::vector<float> scale_;
+};
+// Prints the benchmark summary (config, data, model and perf sections) to
+// stdout.
+//
+// det_time : at least three timings in milliseconds, in the order
+//            [0] preprocess, [1] inference, [2] postprocess.
+// config   : must contain "precision", "num_threads" and "model_file";
+//            std::map::at throws std::out_of_range for a missing key.
+// img_num  : number of images the timings cover; must be positive.
+//
+// Prints an error to stderr and returns without a summary if det_time has
+// fewer than three entries or img_num is not positive.
+void PrintBenchmarkLog(const std::vector<double>& det_time,
+                       const std::map<std::string, std::string>& config,
+                       int img_num) {
+  if (det_time.size() < 3 || img_num <= 0) {
+    std::cerr << "[ERROR] benchmark log needs 3 timings and img_num > 0\n";
+    return;
+  }
+  // Accumulate in double (an int seed truncates the running sum) and convert
+  // milliseconds to the seconds that the label announces.
+  const double total_seconds =
+      std::accumulate(det_time.begin(), det_time.end(), 0.0) / 1000.0;
+  std::cout << "----------------- Config info ------------------" << std::endl;
+  std::cout << "runtime_device: armv8" << std::endl;
+  std::cout << "precision: " << config.at("precision") << std::endl;
+  std::cout << "num_threads: " << config.at("num_threads") << std::endl;
+  std::cout << "---------------- Data info ---------------------" << std::endl;
+  std::cout << "batch_size: " << 1 << std::endl;
+  std::cout << "---------------- Model info --------------------" << std::endl;
+  std::cout << "Model_name: " << config.at("model_file") << std::endl;
+  std::cout << "---------------- Perf info ---------------------" << std::endl;
+  std::cout << "Total number of predicted data: " << img_num
+            << " and total time spent(s): " << total_seconds << std::endl;
+  // All three stages are reported as a per-image average.
+  std::cout << "preproce_time(ms): " << det_time[0] / img_num
+            << ", inference_time(ms): " << det_time[1] / img_num
+            << ", postprocess_time(ms): " << det_time[2] / img_num
+            << std::endl;
+}
+// Reads the class-name file at `path`, one label per line, and returns the
+// labels in file order.
+//
+// The whole line is the label. The previous implementation dropped everything
+// before the first space, which turned multi-word names such as
+// "traffic light" into " light". A trailing '\r' (CRLF files) is stripped.
+// Returns an empty vector, after printing an error to stderr, when the file
+// cannot be opened.
+std::vector<std::string> LoadLabels(const std::string &path) {
+  std::vector<std::string> labels;
+  std::ifstream file(path);
+  if (!file) {
+    std::cerr << "[ERROR] cannot open label file: " << path << "\n";
+    return labels;
+  }
+  std::string line;
+  // Looping on getline itself (not on the stream state) avoids appending a
+  // bogus empty label after the last line has been consumed.
+  while (std::getline(file, line)) {
+    if (!line.empty() && line[line.size() - 1] == '\r') {
+      line.erase(line.size() - 1);
+    }
+    labels.push_back(line);
+  }
+  return labels;
+}
+// Returns every line of the text file at `path`, in file order. When the file
+// cannot be opened, prints "no such file" to stdout and returns an empty
+// vector.
+std::vector<std::string> ReadDict(std::string path) {
+  std::vector<std::string> lines;
+  std::ifstream stream(path);
+  if (!stream) {
+    std::cout << "no such file" << std::endl;
+    return lines;
+  }
+  for (std::string current; std::getline(stream, current);) {
+    lines.push_back(current);
+  }
+  return lines;
+}
+// Splits `str` into tokens separated by any character contained in `delim`.
+//
+// Matches the strtok behaviour of the previous implementation: `delim` is a
+// set of separator characters, runs of separators count as one, and empty
+// tokens are never produced. Unlike that implementation it allocates no raw
+// buffers (both were leaked) and does not touch strtok's hidden global state.
+// Returns an empty vector for an empty `str`.
+std::vector<std::string> split(const std::string &str,
+                               const std::string &delim) {
+  std::vector<std::string> res;
+  std::string::size_type begin = str.find_first_not_of(delim);
+  while (begin != std::string::npos) {
+    const std::string::size_type end = str.find_first_of(delim, begin);
+    // substr clamps the length, so end == npos takes the rest of the string.
+    res.push_back(str.substr(begin, end - begin));
+    begin = str.find_first_not_of(delim, end);
+  }
+  return res;
+}
+// Parses the whitespace-separated "key value" config file at `config_path`.
+//
+// For each line the first token is the key and the second token the value;
+// any further tokens (e.g. a trailing "# comment") are ignored. Lines with
+// fewer than two tokens (blank or key-only) are skipped instead of being
+// indexed out of bounds. When a key occurs more than once the last
+// occurrence wins. Returns an empty map if the file cannot be read.
+std::map<std::string, std::string> LoadConfigTxt(std::string config_path) {
+  std::map<std::string, std::string> dict;
+  const std::vector<std::string> lines = ReadDict(config_path);
+  for (const std::string& line : lines) {
+    // Tabs and a CRLF line ending are treated as separators as well, so they
+    // never end up inside a value.
+    const std::vector<std::string> tokens = split(line, " \t\r");
+    if (tokens.size() < 2) {
+      continue;
+    }
+    dict[tokens[0]] = tokens[1];
+  }
+  return dict;
+}
+// Dumps every "key : value" pair of `config` to stdout, in key order, framed
+// by a header and a footer line.
+void PrintConfig(const std::map<std::string, std::string> &config) {
+  std::cout << "=======PaddleDetection lite demo config======" << std::endl;
+  for (const auto& entry : config) {
+    std::cout << entry.first << " : " << entry.second << std::endl;
+  }
+  std::cout << "===End of PaddleDetection lite demo config===" << std::endl;
+}
+// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
+void neon_mean_scale(const float* din,
+                     float* dout,
+                     int size,
+                     const std::vector<float> mean,
+                     const std::vector<float> scale) {
+  if (mean.size() != 3 || scale.size() != 3) {
+    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
+    exit(1);
+  }
+  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
+  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
+  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
+  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
+  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
+  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
+  float* dout_c0 = dout;
+  float* dout_c1 = dout + size;
+  float* dout_c2 = dout + size * 2;
+  int i = 0;
+  for (; i < size - 3; i += 4) {
+    float32x4x3_t vin3 = vld3q_f32(din);
+    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
+    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
+    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
+    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
+    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
+    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
+    vst1q_f32(dout_c0, vs0);
+    vst1q_f32(dout_c1, vs1);
+    vst1q_f32(dout_c2, vs2);
+    din += 12;
+    dout_c0 += 4;
+    dout_c1 += 4;
+    dout_c2 += 4;
+  }
+  for (; i < size; i++) {
+    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
+    *(dout_c0++) = (*(din++) - mean[1]) * scale[1];
+    *(dout_c0++) = (*(din++) - mean[2]) * scale[2];
+  }
+}
+// Draws every detection whose score exceeds `thresh` onto `image`, logs it to
+// stdout and returns the accepted detections.
+//
+// data        : `count` consecutive records of 6 floats each, read as
+//               [class_id, score, x_min, y_min, x_max, y_max].
+// count       : number of records in `data`.
+// thresh      : a record is considered only if score > thresh.
+// image       : image the boxes and labels are drawn on (modified in place).
+// class_names : label text indexed by class_id; a class_id outside this list
+//               is shown as its number instead of being read out of bounds.
+//
+// A record is accepted when its unclipped width and height are positive and
+// its score is <= 1. Exits the process if `data` is nullptr.
+std::vector<Object> visualize_result(
+                        const float* data,
+                        int count,
+                        float thresh,
+                        cv::Mat& image,
+                        const std::vector<std::string> &class_names) {
+  if (data == nullptr) {
+    std::cerr << "[ERROR] data can not be nullptr\n";
+    exit(1);
+  }
+  const int kRecordLen = 6;
+  const cv::Rect image_bounds(0, 0, image.cols, image.rows);
+  std::vector<Object> rect_out;
+  for (int iw = 0; iw < count; iw++) {
+    const float* det = data + iw * kRecordLen;
+    if (!(det[1] > thresh)) {
+      continue;
+    }
+    const int x = static_cast<int>(det[2]);
+    const int y = static_cast<int>(det[3]);
+    const int w = static_cast<int>(det[4] - det[2] + 1);
+    const int h = static_cast<int>(det[5] - det[3] + 1);
+    Object obj;
+    obj.class_id = static_cast<int>(det[0]);
+    obj.prob = det[1];
+    obj.rec = cv::Rect(x, y, w, h) & image_bounds;
+    if (!(w > 0 && h > 0 && obj.prob <= 1)) {
+      continue;
+    }
+    rect_out.push_back(obj);
+    cv::rectangle(image, obj.rec, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
+    // Guard the label lookup: the class index comes straight from the model
+    // output and the label file may be shorter than the model's class list.
+    const bool known_class =
+        obj.class_id >= 0 &&
+        static_cast<size_t>(obj.class_id) < class_names.size();
+    const std::string class_name =
+        known_class ? class_names[obj.class_id] : std::to_string(obj.class_id);
+    const std::string str_prob = std::to_string(obj.prob);
+    // Keep three decimals of the score.
+    const std::string text =
+        class_name + ": " + str_prob.substr(0, str_prob.find(".") + 4);
+    const int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
+    const double font_scale = 1.f;
+    const int thickness = 1;
+    cv::Size text_size =
+        cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
+    // Scale the font so the label spans about half of the box width.
+    const float new_font_scale = w * 0.5 * font_scale / text_size.width;
+    text_size = cv::getTextSize(
+        text, font_face, new_font_scale, thickness, nullptr);
+    cv::Point origin;
+    origin.x = x + 3;
+    origin.y = y + text_size.height + 3;
+    cv::putText(image,
+                text,
+                origin,
+                font_face,
+                new_font_scale,
+                cv::Scalar(0, 255, 255),
+                thickness,
+                cv::LINE_AA);
+    std::cout << "detection, image size: " << image.cols << ", "
+              << image.rows
+              << ", detect object: " << class_name
+              << ", score: " << obj.prob << ", location: x=" << x
+              << ", y=" << y << ", width=" << w << ", height=" << h
+              << std::endl;
+  }
+  return rect_out;
+}
+// Builds a MobileConfig with the given thread count and model file and
+// returns the predictor created from it.
+std::shared_ptr<PaddlePredictor> LoadModel(std::string model_file,
+                                           int num_theads) {
+  MobileConfig mobile_cfg;
+  mobile_cfg.set_threads(num_theads);
+  mobile_cfg.set_model_from_file(model_file);
+  return CreatePaddlePredictor<MobileConfig>(mobile_cfg);
+}
+// Builds the preprocessing parameters from the "Resize", "mean" and "std"
+// config entries (comma-separated lists).
+//
+// img    : currently unused; the result depends on the config only.
+// config : must contain "Resize" (at least 2 integers), "mean" and "std"
+//          (3 floats each). std::map::at throws std::out_of_range for a
+//          missing key; std::stoi / std::stof throw on malformed numbers.
+//
+// Returns an ImageBlob whose im_shape_ holds the first two Resize values and
+// whose mean_ / scale_ hold the parsed mean / std lists. Exits the process
+// if a list has the wrong number of values.
+ImageBlob prepare_imgdata(const cv::Mat& img,
+                          std::map<std::string,
+                          std::string> config) {
+  (void)img;
+  const std::vector<std::string> size_str = split(config.at("Resize"), ",");
+  const std::vector<std::string> mean_str = split(config.at("mean"), ",");
+  const std::vector<std::string> std_str = split(config.at("std"), ",");
+  // Validate before indexing; the previous code read size element [1]
+  // unconditionally.
+  if (size_str.size() < 2 || mean_str.size() != 3 || std_str.size() != 3) {
+    std::cerr << "[ERROR] config needs Resize with 2 values and mean/std "
+                 "with 3 values each\n";
+    exit(1);
+  }
+  ImageBlob img_data;
+  img_data.im_shape_ = {
+      static_cast<float>(std::stoi(size_str[0])),
+      static_cast<float>(std::stoi(size_str[1]))
+  };
+  std::transform(mean_str.begin(), mean_str.end(),
+                 std::back_inserter(img_data.mean_),
+                 [](std::string const& s){return std::stof(s);});
+  std::transform(std_str.begin(), std_str.end(),
+                 std::back_inserter(img_data.scale_),
+                 [](std::string const& s){return std::stof(s);});
+  return img_data;
+}
+// Converts `img` from BGR to RGB, resizes it (bicubic) to the size stored in
+// img_data.im_shape_, rescales the pixels by 1/255 into floats and hands the
+// result to neon_mean_scale, which writes the normalised planes into `data`.
+void preprocess(const cv::Mat& img, const ImageBlob img_data, float* data) {
+  const float shape0 = img_data.im_shape_[0];
+  const float shape1 = img_data.im_shape_[1];
+
+  cv::Mat converted;
+  cv::cvtColor(img, converted, cv::COLOR_BGR2RGB);
+  cv::resize(converted, converted, cv::Size(shape0, shape1), 0.f, 0.f,
+             cv::INTER_CUBIC);
+
+  cv::Mat as_float;
+  converted.convertTo(as_float, CV_32FC3, 1 / 255.f);
+
+  const float* pixels = reinterpret_cast<const float*>(as_float.data);
+  const int pixel_count = int(shape0 * shape1);
+  neon_mean_scale(pixels, data, pixel_count, img_data.mean_, img_data.scale_);
+}
+// Runs the detector on one image and records the stage timings.
+//
+// config   : must contain "model_file", "label_path", "num_threads" and the
+//            entries read by prepare_imgdata.
+// img_path : image to read; the annotated copy is written next to it as
+//            "<path without extension>_result.jpg".
+// repeats  : number of timed predictor runs; repeats / 2 untimed warm-up
+//            runs are executed first.
+// times    : receives three values in milliseconds: preprocess time, average
+//            inference time per run, postprocess time (drawing and writing
+//            the result image included).
+//
+// Exits the process if `times` is nullptr, `repeats` is not positive or the
+// image cannot be read.
+void RunModel(std::map<std::string, std::string> config,
+              std::string img_path,
+              const int repeats,
+              std::vector<double>* times) {
+  if (times == nullptr || repeats <= 0) {
+    std::cerr << "[ERROR] RunModel needs a times vector and repeats > 0\n";
+    exit(1);
+  }
+  std::string model_file = config.at("model_file");
+  std::string label_path = config.at("label_path");
+  // Load Labels
+  std::vector<std::string> class_names = LoadLabels(label_path);
+  auto predictor = LoadModel(model_file, std::stoi(config.at("num_threads")));
+  cv::Mat img = cv::imread(img_path, cv::IMREAD_COLOR);
+  // imread returns an empty Mat on failure; stop here with a clear message
+  // rather than failing later inside the colour conversion.
+  if (img.empty()) {
+    std::cerr << "[ERROR] failed to read image: " << img_path << "\n";
+    exit(1);
+  }
+  auto img_data = prepare_imgdata(img, config);
+  // The tensor shape wants integers; convert explicitly, because a float
+  // inside a braced initializer list is a narrowing conversion.
+  const int64_t shape0 = static_cast<int64_t>(img_data.im_shape_[0]);
+  const int64_t shape1 = static_cast<int64_t>(img_data.im_shape_[1]);
+  auto preprocess_start = std::chrono::steady_clock::now();
+  // 1. Prepare input data from image
+  // input 0: the normalised image, NCHW
+  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
+  input_tensor0->Resize({1, 3, shape0, shape1});
+  auto* data0 = input_tensor0->mutable_data<float>();
+  preprocess(img, img_data, data0);
+  // input 1: two ints holding the resized shape.
+  // NOTE(review): boxes are later drawn on the original-size image; confirm
+  // that the model expects the resized shape here and not the original one.
+  std::unique_ptr<Tensor> input_tensor1(std::move(predictor->GetInput(1)));
+  input_tensor1->Resize({1, 2});
+  auto* data1 = input_tensor1->mutable_data<int>();
+  data1[0] = static_cast<int>(shape0);
+  data1[1] = static_cast<int>(shape1);
+  auto preprocess_end = std::chrono::steady_clock::now();
+  // 2. Run predictor
+  // warm up
+  for (int i = 0; i < repeats / 2; i++) {
+    predictor->Run();
+  }
+  auto inference_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < repeats; i++) {
+    predictor->Run();
+  }
+  auto inference_end = std::chrono::steady_clock::now();
+  // 3. Get output and post process
+  auto postprocess_start = std::chrono::steady_clock::now();
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  const float* outptr = output_tensor->data<float>();
+  auto shape_out = output_tensor->shape();
+  int64_t cnt = 1;
+  for (auto& i : shape_out) {
+    cnt *= i;
+  }
+  // Every detection record occupies 6 floats.
+  auto rec_out = visualize_result(
+      outptr, static_cast<int>(cnt / 6), 0.5f, img, class_names);
+  // Strip only a real extension: the last '.' counts only when it follows
+  // the last path separator. Searching for the first '.' turned
+  // "./image.jpg" into "_result.jpg".
+  const std::string::size_type last_sep = img_path.find_last_of("/\\");
+  const std::string::size_type last_dot = img_path.rfind('.');
+  const bool has_extension =
+      last_dot != std::string::npos &&
+      (last_sep == std::string::npos || last_dot > last_sep);
+  const std::string result_name =
+      (has_extension ? img_path.substr(0, last_dot) : img_path) +
+      "_result.jpg";
+  cv::imwrite(result_name, img);
+  auto postprocess_end = std::chrono::steady_clock::now();
+  typedef std::chrono::duration<double, std::milli> Millis;
+  times->push_back(Millis(preprocess_end - preprocess_start).count());
+  times->push_back(Millis(inference_end - inference_start).count() / repeats);
+  times->push_back(Millis(postprocess_end - postprocess_start).count());
+}
+// Entry point: detect_system <config_path> <image_path>.
+//
+// Loads and prints the config, runs the detector once (50 timed runs when
+// "enable_benchmark" is non-zero, 1 otherwise) and prints the benchmark log.
+// Returns 1 with an error message when an exception escapes, for example a
+// missing config key or a malformed number, instead of aborting silently.
+int main(int argc, char** argv) {
+  if (argc < 3) {
+    std::cerr << "[ERROR] usage: " << argv[0] << " config_path image_path\n";
+    exit(1);
+  }
+  const std::string config_path = argv[1];
+  const std::string img_path = argv[2];
+  try {
+    // load config
+    auto config = LoadConfigTxt(config_path);
+    PrintConfig(config);
+    const bool enable_benchmark =
+        std::stoi(config.at("enable_benchmark")) != 0;
+    const int repeats = enable_benchmark ? 50 : 1;
+    std::vector<double> det_times;
+    RunModel(config, img_path, repeats, &det_times);
+    PrintBenchmarkLog(det_times, config, 1);
+  } catch (const std::exception& e) {
+    std::cerr << "[ERROR] " << e.what() << "\n";
+    return 1;
+  }
+  return 0;
+}