Merge pull request #1652 from RainFrost1/ppshitu_lite

添加PP-ShiTu Lite

Merge pull request #1652 from RainFrost1/ppshitu_lite
添加PP-ShiTu Lite
b5cf03e7 · Wei Shengyu · GitHub · 6fec774f · 267f6508 · b5cf03e7
17 changed file
--- a/deploy/lite_shitu/Makefile
+++ b/deploy/lite_shitu/Makefile
+# Target ABI of the Android device: arm7 or arm8.
+# Keep the value free of trailing blanks; make would store them in ARM_ABI
+# and the comparison below would stop matching.
+ARM_ABI = arm8
+export ARM_ABI
+
+# Translate the short ABI name into the platform directory name used by the
+# prebuilt OpenCV package.
+ifneq ($(ARM_ABI), arm8)
+    ARM_PLAT=armeabi-v7a
+else
+    ARM_PLAT=arm64-v8a
+endif
+${info ARM_ABI: ${ARM_ABI}}
+${info ARM_PLAT: ${ARM_PLAT}; option[arm7/arm8]}
+
+include ../Makefile.def
+
+# Root of the Paddle-Lite inference library (headers and libs live in cxx/).
+LITE_ROOT=../../../
+${info LITE_ROOT: $(abspath ${LITE_ROOT})}
+
+# Directory that receives the downloaded third-party packages.
+THIRD_PARTY_DIR=third_party
+${info THIRD_PARTY_DIR: $(abspath ${THIRD_PARTY_DIR})}
+
+
+# Prebuilt OpenCV static libraries; the listed order is the link order.
+OPENCV_VERSION=opencv4.1.0
+OPENCV_ABI_DIR = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}
+OPENCV_LIBS = $(addprefix ${OPENCV_ABI_DIR}/libs/, \
+                  libopencv_imgcodecs.a \
+                  libopencv_imgproc.a \
+                  libopencv_core.a) \
+              $(addprefix ${OPENCV_ABI_DIR}/3rdparty/libs/, \
+                  libtegra_hal.a \
+                  liblibjpeg-turbo.a \
+                  liblibwebp.a \
+                  liblibpng.a \
+                  liblibjasper.a \
+                  liblibtiff.a \
+                  libIlmImf.a \
+                  libtbb.a \
+                  libcpufeatures.a)
+
+
+# By default the demo links against Paddle-Lite's shared library.
+LITE_LIBS = -L${LITE_ROOT}/cxx/lib/ -lpaddle_light_api_shared
+###############################################################
+# Linking against a static library instead:                   #
+#  `libpaddle_api_full_bundled.a`                             #
+#  `libpaddle_api_light_bundled.a`                            #
+###############################################################
+# 1. Comment out the LITE_LIBS line above
+#    (the one using `libpaddle_light_api_shared.so`).
+# 2. Uncomment the line below, which uses `libpaddle_api_light_bundled.a`.
+# LITE_LIBS = ${LITE_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
+
+# Everything passed to the linker after the object files.
+CXX_LIBS = $(LITE_LIBS) ${OPENCV_LIBS} $(SYSTEM_LIBS)
+
+# Objects built from this demo's own sources: one .o in the current directory
+# for every src/*.cc file.
+LOCAL_DIRSRCS=$(wildcard src/*.cc)
+LOCAL_SRCS=$(notdir $(LOCAL_DIRSRCS))
+LOCAL_OBJS=$(LOCAL_SRCS:.cc=.o)
+
+# Objects built from the downloaded jsoncpp sources.
+JSON_OBJS = json_reader.o json_value.o json_writer.o
+
+# The fetch_* targets are phony, i.e. always considered out of date. They are
+# therefore listed as order-only prerequisites (after `|`): they still run
+# before the recipe, but no longer force every object file and the final
+# binary to be rebuilt on each invocation of make.
+pp_shitu: $(LOCAL_OBJS) $(JSON_OBJS) | fetch_opencv
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) $(LOCAL_OBJS) $(JSON_OBJS) -o pp_shitu $(CXX_LIBS) $(LDFLAGS)
+
+# Download and unpack the prebuilt OpenCV package; each step is skipped when
+# its result is already present.
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} ||  mkdir ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+      (echo "fetch opencv libs" && \
+      wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+      tar -zxf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+
+# Download and unpack the jsoncpp sources; each step is skipped when its
+# result is already present.
+fetch_json_code:
+	@ test -d ${THIRD_PARTY_DIR} ||  mkdir ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz || \
+      (echo "fetch jsoncpp_code.tar.gz" && \
+      wget -P ${THIRD_PARTY_DIR} https://bj.bcebos.com/v1/paddledet/deploy/jsoncpp_code.tar.gz )
+	@ test -d ${THIRD_PARTY_DIR}/jsoncpp_code || \
+      tar -zxf ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz -C ${THIRD_PARTY_DIR}
+
+# Include search paths used by both compile rules below.
+LOCAL_INCLUDES = -I./ -Iinclude
+OPENCV_INCLUDE = -I${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/include
+JSON_INCLUDE = -I${THIRD_PARTY_DIR}/jsoncpp_code/include
+CXX_INCLUDES = ${LOCAL_INCLUDES} ${INCLUDES} ${OPENCV_INCLUDE} ${JSON_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+
+# Demo sources need both downloaded packages to be unpacked first (headers).
+$(LOCAL_OBJS): %.o: src/%.cc | fetch_opencv fetch_json_code
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@
+
+# jsoncpp sources come out of the downloaded archive.
+$(JSON_OBJS): %.o: ${THIRD_PARTY_DIR}/jsoncpp_code/%.cpp | fetch_json_code
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@
+
+.PHONY: clean fetch_opencv fetch_json_code
+# Remove the build products; downloaded packages are kept.
+clean:
+	rm -rf $(LOCAL_OBJS) $(JSON_OBJS)
+	rm -f pp_shitu
--- a/deploy/lite_shitu/README.md
+++ b/deploy/lite_shitu/README.md
+# PP-ShiTu在Paddle-Lite端侧部署
+
+本教程将介绍基于[Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite) 在移动端部署PP-ShiTu模型的详细步骤。
+
+Paddle Lite是飞桨轻量化推理引擎，为手机、IOT端提供高效推理能力，并广泛整合跨平台硬件，为端侧部署及应用落地问题提供轻量化的部署方案。
+
+## 1. 准备环境
+
+### 运行准备
+- 电脑（编译Paddle Lite）
+- 安卓手机（armv7或armv8）
+
+### 1.1 准备交叉编译环境
+交叉编译环境用于编译 Paddle Lite 和 PP-ShiTu 的C++ demo。
+支持多种开发环境，不同开发环境的编译流程请参考对应文档，请确保安装完成Java jdk、Android NDK(R17以上)。
+
+1. [Docker](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#docker)
+2. [Linux](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#linux)
+3. [MAC OS](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#mac-os)
+
+### 1.2 准备预测库
+
+预测库有两种获取方式：
+1. [**建议**]直接下载，预测库下载链接如下：
+      |平台| 架构 | 预测库下载链接|
+      |-|-|-|
+      |Android| arm7 | [inference_lite_lib](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10-rc/inference_lite_lib.android.armv7.clang.c++_static.with_extra.with_cv.tar.gz) |
+      | Android | arm8 | [inference_lite_lib](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10-rc/inference_lite_lib.android.armv8.clang.c++_static.with_extra.with_cv.tar.gz)  |
+      | Android | arm8(FP16) | [inference_lite_lib](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10-rc/inference_lite_lib.android.armv8_clang_c++_static_with_extra_with_cv_with_fp16.tiny_publish_427e46.zip)  |
+
+**注意**：1. 如果是从 Paddle-Lite [官方文档](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc)下载的预测库，注意选择`with_extra=ON，with_cv=ON`的下载链接。2. 目前只提供Android端demo，IOS端demo可以参考[Paddle-Lite IOS demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)
+
+
+2. 编译Paddle-Lite得到预测库，Paddle-Lite的编译方式如下：
+```shell
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+# 如果使用编译方式，建议使用develop分支编译预测库
+git checkout develop
+# FP32
+./lite/tools/build_android.sh --arch=armv8 --toolchain=clang --with_cv=ON --with_extra=ON
+# FP16
+./lite/tools/build_android.sh --arch=armv8 --toolchain=clang --with_cv=ON --with_extra=ON --with_arm82_fp16=ON
+```
+
+**注意**：编译Paddle-Lite获得预测库时，需要打开`--with_cv=ON --with_extra=ON`两个选项，`--arch`表示`arm`版本，这里指定为armv8，更多编译命令介绍请参考[链接](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_andriod.html#id2)。
+
+直接下载预测库并解压后，可以得到`inference_lite_lib.android.armv8.clang.c++_static.with_extra.with_cv/`文件夹，通过编译Paddle-Lite得到的预测库位于`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/`文件夹下。
+预测库的文件目录如下：
+
+```
+inference_lite_lib.android.armv8/
+|-- cxx                                        C++ 预测库和头文件
+|   |-- include                                C++ 头文件
+|   |   |-- paddle_api.h
+|   |   |-- paddle_image_preprocess.h
+|   |   |-- paddle_lite_factory_helper.h
+|   |   |-- paddle_place.h
+|   |   |-- paddle_use_kernels.h
+|   |   |-- paddle_use_ops.h
+|   |   `-- paddle_use_passes.h
+|   `-- lib                                           C++预测库
+|       |-- libpaddle_api_light_bundled.a             C++静态库
+|       `-- libpaddle_light_api_shared.so             C++动态库
+|-- java                                     Java预测库
+|   |-- jar
+|   |   `-- PaddlePredictor.jar
+|   |-- so
+|   |   `-- libpaddle_lite_jni.so
+|   `-- src
+|-- demo                                     C++和Java示例代码
+|   |-- cxx                                  C++  预测库demo
+|   `-- java                                 Java 预测库demo
+```
+
+## 2 开始运行
+
+### 2.1 模型优化
+
+Paddle-Lite 提供了多种策略来自动优化原始的模型，其中包括量化、子图融合、混合调度、Kernel优选等方法，使用Paddle-Lite的`opt`工具可以自动对inference模型进行优化，目前支持两种优化方式，优化后的模型更轻量，模型运行速度更快。
+
+**注意**：如果已经准备好了 `.nb` 结尾的模型文件，可以跳过此步骤。
+
+#### 2.1.1 安装paddle_lite_opt工具
+安装`paddle_lite_opt`工具有如下两种方法：
+1. [**建议**]pip安装paddlelite并进行转换
+    ```shell
+    pip install paddlelite==2.10rc
+    ```
+
+2. 源码编译Paddle-Lite生成`paddle_lite_opt`工具
+
+    模型优化需要Paddle-Lite的`opt`可执行文件，可以通过编译Paddle-Lite源码获得，编译步骤如下：
+    ```shell
+    # 如果准备环境时已经clone了Paddle-Lite，则不用重新clone Paddle-Lite
+    git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+    cd Paddle-Lite
+    git checkout develop
+    # 启动编译
+    ./lite/tools/build.sh build_optimize_tool
+    ```
+
+    编译完成后，`opt`文件位于`build.opt/lite/api/`下，可通过如下方式查看`opt`的运行选项和使用方式；
+    ```shell
+    cd build.opt/lite/api/
+    ./opt
+    ```
+
+    `opt`的使用方式与参数与上面的`paddle_lite_opt`完全一致。
+
+之后使用`paddle_lite_opt`工具可以进行inference模型的转换。`paddle_lite_opt`的部分参数如下：
+
+|选项|说明|
+|-|-|
+|--model_file|待优化的PaddlePaddle模型（combined形式）的网络结构文件路径|
+|--param_file|待优化的PaddlePaddle模型（combined形式）的权重文件路径|
+|--optimize_out_type|输出模型类型，目前支持两种类型：protobuf和naive_buffer，其中naive_buffer是一种更轻量级的序列化/反序列化实现，默认为naive_buffer|
+|--optimize_out|优化模型的输出路径|
+|--valid_targets|指定模型可执行的backend，默认为arm。目前可支持x86、arm、opencl、npu、xpu，可以同时指定多个backend(以空格分隔)，Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU（Kirin 810/990 Soc搭载的达芬奇架构NPU），应当设置为npu, arm|
+
+更详细的`paddle_lite_opt`工具使用说明请参考[使用opt转化模型文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/opt/opt_bin.html)
+
+`--model_file`表示inference模型的model文件地址，`--param_file`表示inference模型的param文件地址；`optimize_out`用于指定输出文件的名称（不需要添加`.nb`的后缀）。直接在命令行中运行`paddle_lite_opt`，也可以查看所有参数及其说明。
+
+
+#### 2.1.3 转换示例
+
+下面介绍使用`paddle_lite_opt`完成主体检测模型和识别模型的预训练模型，转成inference模型，最终转换成Paddle-Lite的优化模型的过程。
+
+##### 2.1.3.1 转换主体检测模型
+
+```shell
+# 当前目录为 $PaddleClas/deploy/lite_shitu
+# $code_path需替换成相应的运行目录,可以根据需要，将$code_path设置成需要的目录
+export code_path=~
+cd $code_path
+git clone https://github.com/PaddlePaddle/PaddleDetection.git
+# 进入PaddleDetection根目录
+cd PaddleDetection
+
+# 将预训练模型导出为inference模型
+python tools/export_model.py -c configs/picodet/application/mainbody_detection/picodet_lcnet_x2_5_640_mainbody.yml -o weights=https://paddledet.bj.bcebos.com/models/picodet_lcnet_x2_5_640_mainbody.pdparams  --output_dir=inference
+
+# 将inference模型转化为Paddle-Lite优化模型
+paddle_lite_opt --model_file=inference/picodet_lcnet_x2_5_640_mainbody/model.pdmodel --param_file=inference/picodet_lcnet_x2_5_640_mainbody/model.pdiparams --optimize_out=inference/picodet_lcnet_x2_5_640_mainbody/mainbody_det
+
+# 将转好的模型复制到lite_shitu目录下
+cd $PaddleClas/deploy/lite_shitu
+mkdir models
+cp $code_path/PaddleDetection/inference/picodet_lcnet_x2_5_640_mainbody/mainbody_det.nb $PaddleClas/deploy/lite_shitu/models
+```
+
+##### 2.1.3.2 转换识别模型
+
+```shell
+# 转换inference model
+待补充,生成的inference model存储在PaddleClas/inference下，同时生成label.txt，也存在此文件夹下
+
+# 转换为Paddle-Lite模型
+paddle_lite_opt --model_file=inference/inference.pdmodel --param_file=inference/inference.pdiparams --optimize_out=inference/rec
+
+# 将模型、label文件拷贝到lite_shitu下
+cp inference/rec.nb deploy/lite_shitu/models/
+cp inference/label.txt deploy/lite_shitu/models/
+cd deploy/lite_shitu
+```
+
+**注意**：`--optimize_out` 参数为优化后模型的保存路径，无需加后缀`.nb`；`--model_file` 参数为模型结构信息文件的路径，`--param_file` 参数为模型权重信息文件的路径，请注意文件名。
+
+##### 2.1.3.3 准备测试图像
+
+```shell
+mkdir images
+# 根据需要准备测试图像，可以在images文件夹中存放多张图像
+cp ../images/wangzai.jpg images/
+```
+
+
+
+##### 2.1.3.4 将yaml文件转换成json文件
+
+```shell
+# 如果测试单张图像
+python generate_json_config.py --det_model_path models/mainbody_det.nb  --rec_model_path models/rec.nb --rec_label_path models/label.txt --img_path images/wangzai.jpg
+# or
+# 如果测试多张图像
+python generate_json_config.py --det_model_path models/mainbody_det.nb  --rec_model_path models/rec.nb --rec_label_path models/label.txt --img_dir images
+
+# 执行完成后，会在lite_shitu下生成shitu_config.json配置文件
+
+```
+
+### 2.2 与手机联调
+
+首先需要进行一些准备工作。
+1. 准备一台arm8的安卓手机，如果编译的预测库是armv7，则需要arm7的手机，并修改Makefile中`ARM_ABI=arm7`。
+2. 电脑上安装ADB工具，用于调试。 ADB安装方式如下：
+
+    2.1. MAC电脑安装ADB:
+
+    ```shell
+    brew install --cask android-platform-tools
+    ```
+    2.2. Linux安装ADB
+    ```shell
+    sudo apt update
+    sudo apt install -y wget adb
+    ```
+    2.3. Windows安装ADB
+
+    win上安装需要去谷歌的安卓平台下载ADB软件包进行安装：[链接](https://developer.android.com/studio)
+
+3. 手机连接电脑后，开启手机`USB调试`选项，选择`文件传输`模式，在电脑终端中输入：
+
+```shell
+adb devices
+```
+如果有device输出，则表示安装成功，如下所示：
+```
+List of devices attached
+744be294    device
+```
+
+4. 编译lite部署代码生成移动端可执行文件
+
+```shell
+cd $PaddleClas/deploy/lite_shitu
+
+inference_lite_path=/{lite prediction library path}/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv/
+mkdir $inference_lite_path/demo/cxx/ppshitu_lite
+
+cp -r Makefile src/ include/ *.json models/ images/ $inference_lite_path/demo/cxx/ppshitu_lite
+
+cd $inference_lite_path/demo/cxx/ppshitu_lite
+
+# 执行编译，等待完成后得到可执行文件pp_shitu
+make ARM_ABI=arm8
+# 如果是arm7，则执行 make ARM_ABI=arm7（等号两侧不能有空格；或者在Makefile中修改该项）
+```
+
+5. 准备优化后的模型、预测库文件、测试图像。
+
+```shell
+mkdir deploy
+mv models deploy/
+mv images deploy/
+cp pp_shitu deploy/
+# 运行时需要的配置文件也要放入deploy文件夹
+cp shitu_config.json deploy/
+cd deploy
+
+# 将C++预测动态库so文件复制到deploy文件夹中
+cp ../../../cxx/lib/libpaddle_light_api_shared.so ./
+```
+
+执行完成后，deploy文件夹下将有如下文件格式：
+
+```
+deploy/
+|-- models/
+|   |--mainbody_det.nb             优化后的主体检测模型文件
+|   |--rec.nb                      优化后的识别模型文件
+|   |--label.txt                   识别模型的label文件
+|-- images/
+|   ...                            图片文件
+|-- pp_shitu                       生成的移动端执行文件
+|-- shitu_config.json              执行时参数配置文件
+|-- libpaddle_light_api_shared.so  Paddle-Lite库文件
+```
+
+**注意：**
+*  `shitu_config.json` 包含了目标检测的超参数，请按需进行修改
+
+6. 启动调试，上述步骤完成后就可以使用ADB将文件夹 `deploy/` push到手机上运行，步骤如下：
+
+```shell
+# 将上述deploy文件夹push到手机上
+adb push deploy /data/local/tmp/
+
+adb shell
+cd /data/local/tmp/deploy
+export LD_LIBRARY_PATH=/data/local/tmp/deploy:$LD_LIBRARY_PATH
+
+# 修改权限为可执行
+chmod 777 pp_shitu
+# 执行程序
+./pp_shitu shitu_config.json
+```
+
+如果对代码做了修改，则需要重新编译并push到手机上。
+
+运行效果如下：
+
+![](../../docs/images/ppshitu_lite_demo.png)
+
+## FAQ
+Q1：如果想更换模型怎么办，需要重新按照流程走一遍吗？
+A1：如果已经走通了上述步骤，更换模型只需要替换 `.nb` 模型文件即可，同时要注意修改下配置文件中的 `.nb` 文件路径以及类别映射文件（如有必要）。
+
+Q2：换一个图测试怎么做？
+A2：替换 deploy 下的测试图像为你想要测试的图像，并重新生成json配置文件（或者直接修改图像路径），使用 ADB 再次 push 到手机上即可。
--- a/deploy/lite_shitu/generate_json_config.py
+++ b/deploy/lite_shitu/generate_json_config.py
+import argparse
+import json
+import os
+
+import yaml
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--yaml_path', type=str, default='../configs/inference_drink.yaml')
+    parser.add_argument(
+        '--img_dir',
+        type=str,
+        default=None,
+        help='The dir path for inference images')
+    parser.add_argument(
+        '--img_path',
+        type=str,
+        default=None,
+        help='The dir path for inference images')
+    parser.add_argument(
+        '--det_model_path',
+        type=str,
+        default='./det.nb',
+        help="The model path for mainbody  detection")
+    parser.add_argument(
+        '--rec_model_path',
+        type=str,
+        default='./rec.nb',
+        help="The rec model path")
+    parser.add_argument(
+        '--rec_label_path',
+        type=str,
+        default='./label.txt',
+        help='The rec model label')
+    parser.add_argument(
+        '--arch',
+        type=str,
+        default='PicoDet',
+        help='The model structure for detection model')
+    parser.add_argument(
+        '--fpn-stride',
+        type=list,
+        default=[8, 16, 32, 64],
+        help="The fpn strid for detection model")
+    parser.add_argument(
+        '--keep_top_k',
+        type=int,
+        default=100,
+        help='The params for nms(postprocess for detection)')
+    parser.add_argument(
+        '--nms-name',
+        type=str,
+        default='MultiClassNMS',
+        help='The nms name for postprocess of detection model')
+    parser.add_argument(
+        '--nms_threshold',
+        type=float,
+        default=0.5,
+        help='The nms nms_threshold for detection postprocess')
+    parser.add_argument(
+        '--nms_top_k',
+        type=int,
+        default=1000,
+        help='The nms_top_k in postprocess of detection model')
+    parser.add_argument(
+        '--score_threshold',
+        type=float,
+        default=0.3,
+        help='The score_threshold for postprocess of detection')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    config_yaml = yaml.safe_load(open(args.yaml_path))
+    config_json = {}
+    config_json["Global"] = {}
+    config_json["Global"][
+        "infer_imgs"] = args.img_path if args.img_path else config_yaml[
+            "Global"]["infer_imgs"]
+    if args.img_dir is not None:
+        config_json["Global"]["infer_imgs_dir"] = args.img_dir
+        config_json["Global"]["infer_imgs"] = None
+    else:
+        config_json["Global"][
+            "infer_imgs"] = args.img_path if args.img_path else config_yaml[
+                "Global"]["infer_imgs"]
+    config_json["Global"]["batch_size"] = config_yaml["Global"]["batch_size"]
+    config_json["Global"]["cpu_num_threads"] = min(
+        config_yaml["Global"]["cpu_num_threads"], 4)
+    config_json["Global"]["image_shape"] = config_yaml["Global"]["image_shape"]
+    config_json["Global"]["det_model_path"] = args.det_model_path
+    config_json["Global"]["rec_model_path"] = args.rec_model_path
+    config_json["Global"]["rec_label_path"] = args.rec_label_path
+    config_json["Global"]["label_list"] = config_yaml["Global"]["labe_list"]
+    config_json["Global"]["rec_nms_thresold"] = config_yaml["Global"][
+        "rec_nms_thresold"]
+    config_json["Global"]["max_det_results"] = config_yaml["Global"][
+        "max_det_results"]
+    config_json["Global"]["det_fpn_stride"] = args.fpn_stride
+    config_json["Global"]["det_arch"] = args.arch
+    config_json["Global"]["return_k"] = config_yaml["IndexProcess"]["return_k"]
+
+    # config_json["DetPreProcess"] = config_yaml["DetPreProcess"]
+    config_json["DetPreProcess"] = {}
+    config_json["DetPreProcess"]["transform_ops"] = []
+    for x in config_yaml["DetPreProcess"]["transform_ops"]:
+        k = list(x.keys())[0]
+        y = x[k]
+        y['type'] = k
+        config_json["DetPreProcess"]["transform_ops"].append(y)
+
+    config_json["DetPostProcess"] = {
+        "keep_top_k": args.keep_top_k,
+        "name": args.nms_name,
+        "nms_threshold": args.nms_threshold,
+        "nms_top_k": args.nms_top_k,
+        "score_threshold": args.score_threshold
+    }
+    #  config_json["RecPreProcess"] = config_yaml["RecPreProcess"]
+    config_json["RecPreProcess"] = {}
+    config_json["RecPreProcess"]["transform_ops"] = []
+    for x in config_yaml["RecPreProcess"]["transform_ops"]:
+        k = list(x.keys())[0]
+        y = x[k]
+        if y is not None:
+            y["type"] = k
+            config_json["RecPreProcess"]["transform_ops"].append(y)
+
+    with open('shitu_config.json', 'w') as fd:
+        json.dump(config_json, fd, indent=4)
+
+
+# Run the conversion only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/deploy/lite_shitu/include/config_parser.h
+++ b/deploy/lite_shitu/include/config_parser.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "json/json.h"
+
+#ifdef _WIN32
+#define OS_PATH_SEP "\\"
+#else
+#define OS_PATH_SEP "/"
+#endif
+
+namespace PPShiTu {
+
+// Declared here, defined outside this header. Presumably parses the JSON file
+// named by `jsonfile` into `jsondata` -- confirm against the implementation.
+void load_jsonf(std::string jsonfile, Json::Value& jsondata);
+
+// Inference model configuration parser.
+//
+// Copies the detector-related entries of the JSON configuration into public
+// members so that the detector can read them without touching JSON again.
+class ConfigPaser {
+ public:
+  ConfigPaser() {}
+
+  ~ConfigPaser() {}
+
+  // Reads the detector settings from `config`.
+  //
+  // Required entries: Global.det_arch, DetPreProcess and Global.label_list.
+  // If one of them is missing, a message is written to stderr and false is
+  // returned; members filled before the failure keep their new values.
+  // DetPostProcess and Global.det_fpn_stride are optional.
+  bool load_config(const Json::Value& config) {
+    // Looked up once; the const operator[] yields a null value when absent.
+    const Json::Value& global = config["Global"];
+
+    // Get model arch : YOLO, SSD, RetinaNet, RCNN, Face
+    if (global.isMember("det_arch")) {
+      arch_ = global["det_arch"].as<std::string>();
+    } else {
+      std::cerr << "Please set model arch,"
+                << "support value : YOLO, SSD, RetinaNet, RCNN, Face."
+                << std::endl;
+      return false;
+    }
+
+    // Get Preprocess for preprocessing
+    if (config.isMember("DetPreProcess")) {
+      preprocess_info_ = config["DetPreProcess"]["transform_ops"];
+    } else {
+      std::cerr << "Please set Preprocess." << std::endl;
+      return false;
+    }
+
+    // Get label_list for visualization
+    if (global.isMember("label_list")) {
+      label_list_.clear();
+      // Iterate by const reference: a by-value loop variable would deep-copy
+      // every Json::Value element.
+      for (const auto& label : global["label_list"]) {
+        label_list_.emplace_back(label.as<std::string>());
+      }
+    } else {
+      std::cerr << "Please set label_list." << std::endl;
+      return false;
+    }
+
+    // Get NMS for postprocess
+    if (config.isMember("DetPostProcess")) {
+      nms_info_ = config["DetPostProcess"];
+    }
+
+    // Get fpn_stride in PicoDet
+    if (global.isMember("det_fpn_stride")) {
+      fpn_stride_.clear();
+      for (const auto& stride : global["det_fpn_stride"]) {
+        fpn_stride_.emplace_back(stride.as<int>());
+      }
+    }
+
+    return true;
+  }
+
+  // Value of Global.det_arch.
+  std::string arch_;
+  // Value of DetPreProcess.transform_ops.
+  Json::Value preprocess_info_;
+  // Value of DetPostProcess (null when the section is absent).
+  Json::Value nms_info_;
+  // Entries of Global.label_list, in file order.
+  std::vector<std::string> label_list_;
+  // Entries of Global.det_fpn_stride, in file order.
+  std::vector<int> fpn_stride_;
+};
+
+}  // namespace PPShiTu
--- a/deploy/lite_shitu/include/object_detector.h
+++ b/deploy/lite_shitu/include/object_detector.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ctime>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include <stdlib.h>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include "json/json.h"
+
+#include "paddle_api.h"  // NOLINT
+
+#include "include/config_parser.h"
+#include "include/preprocess_op.h"
+#include "include/utils.h"
+#include "include/picodet_postprocess.h"
+
+using namespace paddle::lite_api;  // NOLINT
+
+namespace PPShiTu {
+
+// Generate visualization colormap for each class.
+// Declared here, defined outside this header; `num_class` is the number of
+// classes. NOTE(review): the layout of the returned vector is not visible
+// here -- confirm against the implementation.
+std::vector<int> GenerateColorMap(int num_class);
+
+// Visualize detection results.
+// Declared here, defined outside this header. Takes the image, the detection
+// results, the label names (`lables`), a colormap and a flag `is_rbox`, and
+// returns a cv::Mat. NOTE(review): presumably the returned image is a copy of
+// `img` with the results drawn on it -- confirm against the implementation.
+cv::Mat VisualizeResult(const cv::Mat& img,
+                        const std::vector<PPShiTu::ObjectResult>& results,
+                        const std::vector<std::string>& lables,
+                        const std::vector<int>& colormap,
+                        const bool is_rbox);
+
+// Main-body detector built on a Paddle-Lite predictor.
+//
+// The constructor parses the detector configuration, sets up the
+// preprocessing pipeline and loads the model named by Global.det_model_path.
+class ObjectDetector {
+ public:
+  // `config` is the parsed JSON configuration and `cpu_threads` is forwarded
+  // to LoadModel(). `model_dir` and `batch_size` are accepted but not used in
+  // this constructor.
+  //
+  // Terminates the process with exit(-1) when the configuration is incomplete
+  // or Global.det_model_path is empty.
+  explicit ObjectDetector(const Json::Value& config,
+                          const std::string& model_dir,
+                          int cpu_threads = 1,
+                          const int batch_size = 1) {
+    // load_config() has already reported what is missing; continuing with a
+    // half-filled configuration would only fail later and less clearly.
+    if (!config_.load_config(config)) {
+      std::cerr << "Failed to load detection config" << std::endl;
+      exit(-1);
+    }
+    printf("config created\n");
+    preprocessor_.Init(config_.preprocess_info_);
+    printf("before object detector\n");
+    const std::string det_model_path =
+        config["Global"]["det_model_path"].as<std::string>();
+    if (det_model_path.empty()) {
+      std::cout << "Please set [det_model_path] in config file" << std::endl;
+      exit(-1);
+    }
+    LoadModel(det_model_path, cpu_threads);
+    printf("create object detector\n");
+  }
+
+  // Load Paddle inference model
+  void LoadModel(std::string model_file, int num_theads);
+
+  // Run predictor
+  void Predict(const std::vector<cv::Mat>& imgs,
+               const int warmup = 0,
+               const int repeats = 1,
+               std::vector<PPShiTu::ObjectResult>* result = nullptr,
+               std::vector<int>* bbox_num = nullptr,
+               std::vector<double>* times = nullptr);
+
+  // Get Model Label list
+  const std::vector<std::string>& GetLabelList() const {
+    return config_.label_list_;
+  }
+
+ private:
+  // Preprocess image and copy data to input buffer
+  void Preprocess(const cv::Mat& image_mat);
+  // Postprocess result
+  void Postprocess(const std::vector<cv::Mat> mats,
+                   std::vector<PPShiTu::ObjectResult>* result,
+                   std::vector<int> bbox_num,
+                   bool is_rbox);
+
+  std::shared_ptr<PaddlePredictor> predictor_;
+  Preprocessor preprocessor_;
+  ImageBlob inputs_;
+  std::vector<float> output_data_;
+  std::vector<int> out_bbox_num_data_;
+  // NOTE(review): nothing in this header assigns threshold_, so it is given
+  // a defined value here; 0.5 is an assumption -- confirm the intended
+  // default against the implementation file.
+  float threshold_ = 0.5f;
+  ConfigPaser config_;
+};
+
+}  // namespace PPShiTu
--- a/deploy/lite_shitu/include/picodet_postprocess.h
+++ b/deploy/lite_shitu/include/picodet_postprocess.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <utility>
+#include <ctime>
+#include <numeric>
+
+#include "include/utils.h"
+
+namespace PPShiTu {
+
+// Post-processing entry point for PicoDet outputs; declared here, defined
+// outside this header. Results are written through `results`.
+// Defaults: score_threshold 0.3, nms_threshold 0.5, num_class 80, reg_max 7.
+// NOTE(review): the expected layout of `outs`, `im_shape` and `scale_factor`
+// is not visible here -- confirm against the implementation.
+void PicoDetPostProcess(std::vector<PPShiTu::ObjectResult>* results,
+                         std::vector<const float *> outs,
+                         std::vector<int> fpn_stride,
+                         std::vector<float> im_shape,
+                         std::vector<float> scale_factor,
+                         float score_threshold = 0.3,
+                         float nms_threshold = 0.5,
+                         int num_class = 80,
+                         int reg_max = 7);
+
+}  // namespace PPShiTu
--- a/deploy/lite_shitu/include/preprocess_op.h
+++ b/deploy/lite_shitu/include/preprocess_op.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include "json/json.h"
+
+namespace PPShiTu {
+
+// Container for everything the preprocessing operators produce for one
+// image. All members are public and are filled in by the operators' Run().
+class ImageBlob {
+ public:
+  // Image width and height.
+  std::vector<float> im_shape_;
+  // Buffer for the image data after preprocessing.
+  std::vector<float> im_data_;
+  // Shape of the data fed to the network (after padding).
+  std::vector<float> in_net_shape_;
+  // Evaluation image width and height (currently disabled).
+  // std::vector<float>  eval_im_size_f_;
+  // Scale factor between the resized image and the original image.
+  std::vector<float> scale_factor_;
+};
+
+// Abstract interface implemented by every preprocessing operator.
+class PreprocessOp {
+ public:
+  // Virtual destructor so that destroying an operator through a PreprocessOp
+  // pointer is well defined.
+  virtual ~PreprocessOp() = default;
+  // Configures the operator from its JSON description `item`.
+  virtual void Init(const Json::Value& item) = 0;
+  // Executes the operator on image `im` with blob `data`; what is modified is
+  // up to each implementation.
+  virtual void Run(cv::Mat* im, ImageBlob* data) = 0;
+};
+
+// Operator without configuration; the Preprocessor registers it under the
+// name "InitInfo" before any configured operator. Run() is defined elsewhere.
+class InitInfo : public PreprocessOp {
+ public:
+  // Nothing to configure.
+  void Init(const Json::Value& item) override {}
+  void Run(cv::Mat* im, ImageBlob* data) override;
+};
+
+// Normalization operator. Init() reads "mean", "std" and "is_scale" from the
+// operator's JSON description; the arithmetic lives in Run(), defined
+// elsewhere.
+class NormalizeImage : public PreprocessOp {
+ public:
+  void Init(const Json::Value& item) override {
+    mean_.clear();
+    scale_.clear();
+    // Iterate by const reference: a by-value loop variable would deep-copy
+    // every Json::Value element.
+    for (const auto& value : item["mean"]) {
+      mean_.emplace_back(value.as<float>());
+    }
+    for (const auto& value : item["std"]) {
+      scale_.emplace_back(value.as<float>());
+    }
+    is_scale_ = item["is_scale"].as<bool>();
+  }
+
+  void Run(cv::Mat* im, ImageBlob* data) override;
+
+ private:
+  // Entries of the "mean" array, in file order.
+  std::vector<float> mean_;
+  // Entries of the "std" array, in file order.
+  std::vector<float> scale_;
+  // Value of the "is_scale" flag; false until Init() has run.
+  bool is_scale_ = false;
+};
+
+// Operator without configuration, created for the name "DetPermute".
+// Run() is defined elsewhere.
+class Permute : public PreprocessOp {
+ public:
+  // Nothing to configure.
+  void Init(const Json::Value& item) override {}
+  void Run(cv::Mat* im, ImageBlob* data) override;
+};
+
+// Resize operator. Init() reads "interp", "keep_ratio" and "target_size" from
+// the operator's JSON description; Run() and GenerateScale() are defined
+// elsewhere.
+class Resize : public PreprocessOp {
+ public:
+  void Init(const Json::Value& item) override {
+    interp_ = item["interp"].as<int>();
+    // max_size_ = item["target_size"].as<int>();
+    keep_ratio_ = item["keep_ratio"].as<bool>();
+    target_size_.clear();
+    // Iterate by const reference: a by-value loop variable would deep-copy
+    // every Json::Value element.
+    for (const auto& dim : item["target_size"]) {
+      target_size_.emplace_back(dim.as<int>());
+    }
+  }
+
+  // Compute best resize scale for x-dimension, y-dimension
+  std::pair<float, float> GenerateScale(const cv::Mat& im);
+
+  void Run(cv::Mat* im, ImageBlob* data) override;
+
+ private:
+  // Value of "interp"; 0 until Init() has run.
+  int interp_ = 0;
+  // Value of "keep_ratio"; false until Init() has run.
+  bool keep_ratio_ = false;
+  // Entries of "target_size", in file order.
+  std::vector<int> target_size_;
+  // Not written anywhere in this header.
+  std::vector<int> in_net_shape_;
+};
+
+// Models with FPN need input shape % stride == 0.
+// Init() reads "stride" from the operator's JSON description; Run() is
+// defined elsewhere.
+class PadStride : public PreprocessOp {
+ public:
+  void Init(const Json::Value& item) override {
+    stride_ = item["stride"].as<int>();
+  }
+
+  void Run(cv::Mat* im, ImageBlob* data) override;
+
+ private:
+  // Value of "stride"; 0 until Init() has run.
+  int stride_ = 0;
+};
+
+// Operator created for the name "TopDownEvalAffine". Init() reads the
+// "trainsize" array from the operator's JSON description; Run() is defined
+// elsewhere.
+class TopDownEvalAffine : public PreprocessOp {
+ public:
+  void Init(const Json::Value& item) override {
+    trainsize_.clear();
+    // Iterate by const reference: a by-value loop variable would deep-copy
+    // every Json::Value element.
+    for (const auto& dim : item["trainsize"]) {
+      trainsize_.emplace_back(dim.as<int>());
+    }
+  }
+
+  void Run(cv::Mat* im, ImageBlob* data) override;
+
+ private:
+  // Fixed to 1; not read from the configuration.
+  int interp_ = 1;
+  // Entries of "trainsize", in file order.
+  std::vector<int> trainsize_;
+};
+
+// Declared here, defined outside this header. All arguments except
+// `expandratio` (default 0.15) are non-const references.
+// NOTE(review): presumably crops the region described by `area` out of `img`
+// into `crop_img` and reports `center`/`scale` -- confirm against the
+// implementation.
+void CropImg(cv::Mat& img,
+             cv::Mat& crop_img,
+             std::vector<int>& area,
+             std::vector<float>& center,
+             std::vector<float>& scale,
+             float expandratio = 0.15);
+
+// Owns the preprocessing operators named in the configuration and runs them
+// on an image (Run() and RUN_ORDER are defined elsewhere).
+class Preprocessor {
+ public:
+  // Creates and configures one operator per entry of `config_node`; each
+  // entry names its operator in the "type" field. "InitInfo" is always
+  // registered first. Entries naming an unknown operator are skipped
+  // (CreateOp() reports them on stderr) instead of dereferencing the null
+  // pointer that CreateOp() returns for them.
+  void Init(const Json::Value& config_node) {
+    // initialize image info at first
+    ops_["InitInfo"] = std::make_shared<InitInfo>();
+    for (const auto& item : config_node) {
+      const auto op_name = item["type"].as<std::string>();
+
+      auto op = CreateOp(op_name);
+      if (!op) {
+        continue;
+      }
+      op->Init(item);
+      ops_[op_name] = std::move(op);
+    }
+  }
+
+  // Maps an operator name from the configuration to a new operator instance.
+  // Returns nullptr, after printing a message to stderr, for unknown names.
+  std::shared_ptr<PreprocessOp> CreateOp(const std::string& name) {
+    if (name == "DetResize") {
+      return std::make_shared<Resize>();
+    } else if (name == "DetPermute") {
+      return std::make_shared<Permute>();
+    } else if (name == "DetNormalizeImage") {
+      return std::make_shared<NormalizeImage>();
+    } else if (name == "DetPadStride") {
+      // use PadStride instead of PadBatch
+      return std::make_shared<PadStride>();
+    } else if (name == "TopDownEvalAffine") {
+      return std::make_shared<TopDownEvalAffine>();
+    }
+    std::cerr << "can not find function of OP: " << name
+              << " and return: nullptr" << std::endl;
+    return nullptr;
+  }
+
+  void Run(cv::Mat* im, ImageBlob* data);
+
+  // Operator names; defined outside this header. NOTE(review): presumably the
+  // order in which Run() applies the operators -- confirm.
+  static const std::vector<std::string> RUN_ORDER;
+
+ private:
+  // Registered operators, keyed by the name used in the configuration.
+  std::unordered_map<std::string, std::shared_ptr<PreprocessOp>> ops_;
+};
+
+}  // namespace PPShiTu
--- a/deploy/lite_shitu/include/recognition.h
+++ b/deploy/lite_shitu/include/recognition.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle_api.h" // NOLINT
+#include "json/json.h"
+#include <arm_neon.h>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <opencv2/opencv.hpp>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <vector>
+
+using namespace paddle::lite_api; // NOLINT
+using namespace std;
+
+namespace PPShiTu {
+
+struct RESULT {
+  std::string class_name; // label text of the recognized class
+  int class_id;           // numeric id of the recognized class
+  float score;            // score attached to this recognition result
+};
+
+// Recognition model wrapper: creates the Paddle Lite predictor, loads the
+// label list and reads the preprocessing parameters from the JSON config.
+class Recognition {
+
+public:
+  // Builds the recognizer from the full runtime config.
+  // Reads Global.rec_model_path, Global.rec_label_path, the optional
+  // Global.return_k and RecPreProcess.transform_ops. Exits the process when
+  // either path is empty.
+  explicit Recognition(const Json::Value &config_file) {
+    MobileConfig config;
+    const Json::Value &global = config_file["Global"];
+
+    const std::string model_path = global["rec_model_path"].as<std::string>();
+    if (model_path.empty()) {
+      std::cout << "Please set [rec_model_path] in config file" << std::endl;
+      exit(-1);
+    }
+    config.set_model_from_file(model_path);
+    this->predictor = CreatePaddlePredictor<MobileConfig>(config);
+
+    const std::string label_path = global["rec_label_path"].as<std::string>();
+    if (label_path.empty()) {
+      std::cout << "Please set [rec_label_path] in config file" << std::endl;
+      exit(-1);
+    }
+    LoadLabel(label_path);
+    SetPreProcessParam(config_file["RecPreProcess"]["transform_ops"]);
+    // Only override the default top-k when the key is actually present
+    // (the previous check was inverted and never read a configured value).
+    if (global.isMember("return_k")) {
+      this->topk = global["return_k"].as<int>();
+    }
+    printf("rec model create!\n");
+  }
+
+  // Appends one label per line of the file at `path` to label_list.
+  // For lines containing a space, everything before the first space is
+  // dropped; the kept text still starts with that space.
+  void LoadLabel(std::string path) {
+    std::ifstream file(path);
+    if (!file.is_open()) {
+      std::cout << "Failed to open label file: " << path << std::endl;
+      return;
+    }
+    std::string line;
+    // Loop on getline's result so no empty entry is appended at end of file.
+    while (std::getline(file, line)) {
+      const std::string::size_type pos = line.find(" ");
+      if (pos != std::string::npos) {
+        line = line.substr(pos);
+      }
+      this->label_list.push_back(line);
+    }
+  }
+
+  // Reads the preprocessing parameters from the list of transform ops.
+  // "ResizeImage" supplies `size`; "NormalizeImage" supplies `mean`, `scale`
+  // and `std`, the latter stored as reciprocals (1 / value).
+  void SetPreProcessParam(const Json::Value &config_file) {
+    for (const auto &item : config_file) {
+      const auto op_name = item["type"].as<std::string>();
+      if (op_name == "ResizeImage") {
+        this->size = item["size"].as<int>();
+      } else if (op_name == "NormalizeImage") {
+        this->mean.clear();
+        this->std.clear();
+        for (const auto &value : item["mean"]) {
+          this->mean.emplace_back(value.as<float>());
+        }
+        for (const auto &value : item["std"]) {
+          this->std.emplace_back(1 / value.as<float>());
+        }
+        this->scale = item["scale"].as<double>();
+      }
+    }
+  }
+
+  // The following members are defined out of line.
+  std::vector<RESULT> RunRecModel(const cv::Mat &img, double &cost_time);
+  std::vector<RESULT> PostProcess(const float *output_data, int output_size,
+                                  cv::Mat &output_image);
+  cv::Mat ResizeImage(const cv::Mat &img);
+  void NeonMeanScale(const float *din, float *dout, int size);
+
+private:
+  std::shared_ptr<PaddlePredictor> predictor;
+  // Labels loaded by LoadLabel(), one per line of the label file.
+  std::vector<std::string> label_list;
+  // Defaults used until a "NormalizeImage" op overrides them.
+  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
+  // Reciprocals of the per-channel std values.
+  std::vector<float> std = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
+  double scale = 0.00392157;
+  // Default until a "ResizeImage" op overrides it.
+  float size = 224;
+  // Default until Global.return_k overrides it.
+  int topk = 5;
+};
+} // namespace PPShiTu
--- a/deploy/lite_shitu/include/utils.h
+++ b/deploy/lite_shitu/include/utils.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <ctime>
+#include <include/recognition.h>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace PPShiTu {
+
+// One detected object, optionally carrying its recognition results.
+struct ObjectResult {
+  // Box coordinates; the detector fills {xmin, ymin, xmax, ymax}, or eight values {x1, y1, ..., x4, y4} for rotated boxes
+  std::vector<int> rect;
+  // Class id of detected object
+  int class_id;
+  // Confidence of detected object
+  float confidence;
+
+  // Recognition results attached to this box (filled by the caller after running the rec model)
+  std::vector<RESULT> rec_result;
+};
+
+// Declaration only; defined elsewhere. Takes the boxes by mutable reference.
+// NOTE(review): presumably filters `input_boxes` in place by non-maximum
+// suppression with `nms_threshold`, and `rec_nms` switches to recognition
+// scores -- confirm against the definition.
+void nms(std::vector<ObjectResult> &input_boxes, float nms_threshold, bool rec_nms=false);
+
+} // namespace PPShiTu
--- a/deploy/lite_shitu/src/config_parser.cc
+++ b/deploy/lite_shitu/src/config_parser.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/config_parser.h"
+
+namespace PPShiTu {
+
+// Parses the JSON file at `jsonfile` into `jsondata`.
+// On failure (file cannot be opened, or the content does not parse) a
+// message is written to stdout and the function returns without a usable
+// document; callers see this as missing/empty entries.
+void load_jsonf(std::string jsonfile, Json::Value &jsondata) {
+  std::ifstream ifs(jsonfile);
+  if (!ifs.is_open()) {
+    // Report the real cause instead of a misleading JSON syntax error.
+    std::cout << "Failed to open json file: " << jsonfile << std::endl;
+    return;
+  }
+
+  Json::CharReaderBuilder builder;
+  builder["collectComments"] = true;
+  JSONCPP_STRING errs;
+  if (!parseFromStream(builder, ifs, &jsondata, &errs)) {
+    std::cout << errs << std::endl;
+    return;
+  }
+}
+
+}  // namespace PPShiTu
--- a/deploy/lite_shitu/src/main.cc
+++ b/deploy/lite_shitu/src/main.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <math.h>
+#include <numeric>
+#include <stdarg.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <vector>
+
+#include "include/config_parser.h"
+#include "include/object_detector.h"
+#include "include/preprocess_op.h"
+#include "include/recognition.h"
+#include "json/json.h"
+
+// Runtime configuration; main() fills it from the JSON file given on the command line.
+Json::Value RT_Config;
+
+// Returns everything before the last OS_PATH_SEP in `filepath`, or an empty
+// string when the path contains no separator.
+static std::string DirName(const std::string &filepath) {
+  const std::string::size_type sep_pos = filepath.rfind(OS_PATH_SEP);
+  const bool has_separator = (sep_pos != std::string::npos);
+  return has_separator ? filepath.substr(0, sep_pos) : std::string("");
+}
+
+// True when stat() succeeds on `path`.
+static bool PathExists(const std::string &path) {
+  struct stat info;
+  const int status = stat(path.c_str(), &info);
+  return status == 0;
+}
+
+// Creates the single directory `path` with mode 0755 unless it already
+// exists. Throws std::runtime_error when mkdir() fails.
+static void MkDir(const std::string &path) {
+  if (!PathExists(path)) {
+    const int status = mkdir(path.c_str(), 0755);
+    if (status != 0) {
+      throw std::runtime_error(path + " mkdir failed!");
+    }
+  }
+}
+
+// Creates `path` and any missing parent directories, parents first.
+// Does nothing for an empty path or one that already exists.
+static void MkDirs(const std::string &path) {
+  const bool nothing_to_do = path.empty() || PathExists(path);
+  if (!nothing_to_do) {
+    // Make sure the parent exists before creating this level.
+    MkDirs(DirName(path));
+    MkDir(path);
+  }
+}
+
+// Runs the detector over `batch_imgs` in chunks of `batch_size_det` images
+// and appends the kept detections to `im_result`.
+//   batch_imgs     - images to run detection on.
+//   im_result      - output; detections are appended, never cleared here.
+//   batch_size_det - images per Predict() call; values <= 0 are treated as 1.
+//   max_det_num    - at most this many boxes are taken per image.
+//   run_benchmark  - selects Predict(…, 50, 50, …) over Predict(…, 0, 1, …).
+//   det            - detector to use; a null pointer makes this a no-op.
+// Boxes whose class_id is -1 are dropped.
+void DetPredictImage(const std::vector<cv::Mat> &batch_imgs,
+                     std::vector<PPShiTu::ObjectResult> &im_result,
+                     const int batch_size_det, const int max_det_num,
+                     const bool run_benchmark, PPShiTu::ObjectDetector *det) {
+  if (det == nullptr || batch_imgs.empty()) {
+    return;
+  }
+  // Guard against a zero/negative batch size (division by zero before).
+  const size_t step_size =
+      batch_size_det > 0 ? static_cast<size_t>(batch_size_det) : 1;
+
+  for (size_t first = 0; first < batch_imgs.size(); first += step_size) {
+    const size_t last = std::min(first + step_size, batch_imgs.size());
+    // Only the images of this step are predicted; previously every step
+    // re-ran the whole batch and read back the first images' results.
+    const std::vector<cv::Mat> step_imgs(batch_imgs.begin() + first,
+                                         batch_imgs.begin() + last);
+
+    // Store all detected result
+    std::vector<PPShiTu::ObjectResult> result;
+    std::vector<int> bbox_num;
+    std::vector<double> det_times;
+
+    if (run_benchmark) {
+      det->Predict(step_imgs, 50, 50, &result, &bbox_num, &det_times);
+    } else {
+      det->Predict(step_imgs, 0, 1, &result, &bbox_num, &det_times);
+    }
+
+    // `result` holds the boxes of all images back to back; bbox_num[i] is
+    // the number of boxes belonging to image i.
+    size_t item_start_idx = 0;
+    for (size_t i = 0; i < step_imgs.size() && i < bbox_num.size(); ++i) {
+      const int keep = std::min(bbox_num[i], max_det_num);
+      for (int j = 0; j < keep; ++j) {
+        const size_t item_idx = item_start_idx + j;
+        if (item_idx >= result.size()) {
+          break;
+        }
+        const PPShiTu::ObjectResult &item = result[item_idx];
+        if (item.class_id == -1) {
+          continue;
+        }
+        im_result.push_back(item);
+      }
+      if (bbox_num[i] > 0) {
+        item_start_idx += bbox_num[i];
+      }
+    }
+  }
+}
+
+// Prints `image_path` followed by one line per entry of `det_result`: the
+// box, and the score and label of its first recognition result.
+// Entries without four box coordinates or without any recognition result
+// are reported as such instead of being read out of bounds.
+void PrintResult(const std::string &image_path,
+                 std::vector<PPShiTu::ObjectResult> &det_result) {
+  printf("%s:\n", image_path.c_str());
+  for (size_t i = 0; i < det_result.size(); ++i) {
+    const PPShiTu::ObjectResult &det = det_result[i];
+    const int index = static_cast<int>(i);
+    if (det.rect.size() < 4 || det.rec_result.empty()) {
+      printf("\tresult%d: no recognition result\n", index);
+      continue;
+    }
+    const PPShiTu::RESULT &best = det.rec_result[0];
+    printf("\tresult%d: bbox[%d, %d, %d, %d], score: %f, label: %s\n", index,
+           det.rect[0], det.rect[1], det.rect[2], det.rect[3], best.score,
+           best.class_name.c_str());
+  }
+}
+
+// Entry point: argv[1] is the JSON config, the optional argv[2] an image
+// directory that overrides the config's image settings. For every image the
+// detector runs first, the whole image is added as an extra box, each box is
+// cropped and recognized, and the results are printed after recognition NMS.
+int main(int argc, char **argv) {
+  std::cout << "Usage: " << argv[0]
+            << " [config_path](option) [image_dir](option)\n";
+  if (argc < 2) {
+    std::cout << "Usage: ./main det_runtime_config.json" << std::endl;
+    return -1;
+  }
+  const std::string config_path = argv[1];
+  const std::string img_dir = (argc >= 3) ? argv[2] : "";
+
+  // Parsing command-line
+  PPShiTu::load_jsonf(config_path, RT_Config);
+  Json::Value &global_cfg = RT_Config["Global"];
+  if (global_cfg["det_model_path"].as<std::string>().empty()) {
+    std::cout << "Please set [det_model_path] in " << config_path << std::endl;
+    return -1;
+  }
+  if (global_cfg["infer_imgs"].as<std::string>().empty() && img_dir.empty()) {
+    std::cout << "Please set [infer_imgs] in " << config_path
+              << " Or use command: <" << argv[0] << " [shitu_config]"
+              << " [image_dir]>" << std::endl;
+    return -1;
+  }
+  if (!img_dir.empty()) {
+    std::cout << "Use image_dir in command line overide the path in config file"
+              << std::endl;
+    global_cfg["infer_imgs_dir"] = img_dir;
+    global_cfg["infer_imgs"] = "";
+  }
+
+  // Load model and create a object detector
+  PPShiTu::ObjectDetector det(RT_Config,
+                              global_cfg["det_model_path"].as<std::string>(),
+                              global_cfg["cpu_num_threads"].as<int>(),
+                              global_cfg["batch_size"].as<int>());
+  // create rec model
+  PPShiTu::Recognition rec(RT_Config);
+
+  // Do inference on input image
+  std::vector<PPShiTu::ObjectResult> det_result;
+  std::vector<cv::Mat> batch_imgs;
+  double rec_time;
+
+  const bool has_single_image =
+      !global_cfg["infer_imgs"].as<std::string>().empty();
+  if (!has_single_image &&
+      global_cfg["infer_imgs_dir"].as<std::string>().empty()) {
+    // Neither a single image nor a directory is configured.
+    return 0;
+  }
+
+  // Collect the image paths: one configured file, or a globbed directory.
+  std::vector<std::string> all_img_paths;
+  if (has_single_image) {
+    all_img_paths.push_back(global_cfg["infer_imgs"].as<std::string>());
+    if (global_cfg["batch_size"].as<int>() > 1) {
+      std::cout << "batch_size_det should be 1, when set `image_file`."
+                << std::endl;
+      return -1;
+    }
+  } else {
+    std::vector<cv::String> cv_all_img_paths;
+    cv::glob(global_cfg["infer_imgs_dir"].as<std::string>(), cv_all_img_paths);
+    for (const auto &globbed : cv_all_img_paths) {
+      all_img_paths.push_back(globbed);
+    }
+  }
+
+  for (const std::string &img_path : all_img_paths) {
+    cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR);
+    if (!srcimg.data) {
+      std::cerr << "[ERROR] image read failed! image path: " << img_path
+                << "\n";
+      exit(-1);
+    }
+    // The models are fed RGB; imread delivers BGR.
+    cv::cvtColor(srcimg, srcimg, cv::COLOR_BGR2RGB);
+    batch_imgs.push_back(srcimg);
+    DetPredictImage(batch_imgs, det_result,
+                    global_cfg["batch_size"].as<int>(),
+                    global_cfg["max_det_results"].as<int>(), false, &det);
+
+    // add the whole image for recognition to improve recall
+    PPShiTu::ObjectResult result_whole_img = {
+        {0, 0, srcimg.cols, srcimg.rows}, 0, 1.0};
+    det_result.push_back(result_whole_img);
+
+    // get rec result
+    for (auto &detection : det_result) {
+      const int box_w = detection.rect[2] - detection.rect[0];
+      const int box_h = detection.rect[3] - detection.rect[1];
+      const cv::Rect box(detection.rect[0], detection.rect[1], box_w, box_h);
+      cv::Mat crop_img = srcimg(box);
+      std::vector<PPShiTu::RESULT> rec_out =
+          rec.RunRecModel(crop_img, rec_time);
+      detection.rec_result.assign(rec_out.begin(), rec_out.end());
+    }
+
+    // rec nms
+    PPShiTu::nms(det_result, global_cfg["rec_nms_thresold"].as<float>(), true);
+    PrintResult(img_path, det_result);
+    batch_imgs.clear();
+    det_result.clear();
+  }
+  return 0;
+}
--- a/deploy/lite_shitu/src/object_detector.cc
+++ b/deploy/lite_shitu/src/object_detector.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <sstream>
+// for setprecision
+#include <chrono>
+#include <iomanip>
+#include "include/object_detector.h"
+
+namespace PPShiTu {
+
+// Creates predictor_ from the model file, using `num_theads` threads and
+// the LITE_POWER_HIGH power mode.
+void ObjectDetector::LoadModel(std::string model_file, int num_theads) {
+  MobileConfig lite_config;
+  // Setter order is kept as-is: threads, model file, power mode.
+  lite_config.set_threads(num_theads);
+  lite_config.set_model_from_file(model_file);
+  lite_config.set_power_mode(LITE_POWER_HIGH);
+
+  this->predictor_ = CreatePaddlePredictor<MobileConfig>(lite_config);
+}
+
+// Draws every detection in `results` onto a copy of `img` and returns the
+// copy: the box outline (four edges for rotated boxes, a rectangle
+// otherwise), plus a filled label background and the text
+// "<label> <confidence>" anchored at the first rect point.
+// `lables` and `colormap` are indexed by class id (three colormap entries
+// per class).
+cv::Mat VisualizeResult(const cv::Mat& img,
+                        const std::vector<PPShiTu::ObjectResult>& results,
+                        const std::vector<std::string>& lables,
+                        const std::vector<int>& colormap,
+                        const bool is_rbox = false) {
+  cv::Mat canvas = img.clone();
+  for (int i = 0; i < results.size(); ++i) {
+    const PPShiTu::ObjectResult& det = results[i];
+    const std::vector<int>& rect = det.rect;
+
+    // Label text: class name followed by the confidence with 4 decimals.
+    std::ostringstream label_stream;
+    label_stream << std::fixed << std::setprecision(4);
+    label_stream << lables[det.class_id] << " ";
+    label_stream << det.confidence;
+    const std::string text = label_stream.str();
+
+    // Per-class color taken from the flat colormap.
+    const int color_base = 3 * det.class_id;
+    const cv::Scalar roi_color = cv::Scalar(colormap[color_base + 0],
+                                            colormap[color_base + 1],
+                                            colormap[color_base + 2]);
+
+    const int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
+    const double font_scale = 0.5f;
+    // Handed to integer thickness parameters below, where it truncates.
+    const float thickness = 0.5;
+    const cv::Size text_size =
+        cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
+
+    if (!is_rbox) {
+      // Axis-aligned box stored as two corner points.
+      const int box_w = rect[2] - rect[0];
+      const int box_h = rect[3] - rect[1];
+      const cv::Rect roi = cv::Rect(rect[0], rect[1], box_w, box_h);
+      cv::rectangle(canvas, roi, roi_color, 2);
+    } else {
+      // Rotated box: connect each of the four corners to the next one.
+      for (int k = 0; k < 4; k++) {
+        const cv::Point from =
+            cv::Point(rect[(k * 2) % 8], rect[(k * 2 + 1) % 8]);
+        const cv::Point to =
+            cv::Point(rect[(k * 2 + 2) % 8], rect[(k * 2 + 3) % 8]);
+        cv::line(canvas, from, to, roi_color, 2);
+      }
+    }
+
+    cv::Point origin;
+    origin.x = rect[0];
+    origin.y = rect[1];
+
+    // Filled background sitting directly above the anchor point.
+    const cv::Rect text_back = cv::Rect(
+        rect[0], rect[1] - text_size.height, text_size.width, text_size.height);
+    cv::rectangle(canvas, text_back, roi_color, -1);
+    cv::putText(canvas,
+                text,
+                origin,
+                font_face,
+                font_scale,
+                cv::Scalar(255, 255, 255),
+                thickness);
+  }
+  return canvas;
+}
+
+// Runs the preprocessing ops on a deep copy of `ori_im` and stores the
+// outcome in inputs_; the caller's image is left untouched.
+void ObjectDetector::Preprocess(const cv::Mat& ori_im) {
+  // Work on a copy so the original mat stays available for postprocess.
+  cv::Mat working_copy;
+  ori_im.copyTo(working_copy);
+  // No BGR->RGB conversion here; the color conversion call is disabled.
+  preprocessor_.Run(&working_copy, &inputs_);
+}
+
+// Converts the flat detector output in output_data_ into ObjectResult
+// entries, image by image.
+//   mats     - the input images, one per entry of `bbox_num`.
+//   result   - cleared, then filled with one entry per box.
+//   bbox_num - number of boxes belonging to each image.
+//   is_rbox  - true: 10 floats per box (class, score, 4 corner points);
+//              false: 6 floats per box (class, score, xmin, ymin, xmax, ymax).
+// Coordinates are multiplied by the image size only for the "Face" arch.
+void ObjectDetector::Postprocess(const std::vector<cv::Mat> mats,
+                                 std::vector<PPShiTu::ObjectResult>* result,
+                                 std::vector<int> bbox_num,
+                                 bool is_rbox = false) {
+  result->clear();
+  const bool scale_by_image = (config_.arch_ == "Face");
+  const int floats_per_box = is_rbox ? 10 : 6;
+  const int coord_count = floats_per_box - 2;
+
+  int first_box = 0;
+  for (int im_id = 0; im_id < mats.size(); im_id++) {
+    const cv::Mat& raw_mat = mats[im_id];
+    const int rh = scale_by_image ? raw_mat.rows : 1;
+    const int rw = scale_by_image ? raw_mat.cols : 1;
+
+    const int end_box = first_box + bbox_num[im_id];
+    for (int j = first_box; j < end_box; j++) {
+      const float* record = output_data_.data() + j * floats_per_box;
+
+      PPShiTu::ObjectResult result_item;
+      // Coordinates alternate x, y: x scales with width, y with height.
+      for (int k = 0; k < coord_count; ++k) {
+        const int factor = (k % 2 == 0) ? rw : rh;
+        const int coord = (record[2 + k] * factor);
+        result_item.rect.push_back(coord);
+      }
+      // Class id
+      result_item.class_id = static_cast<int>(round(record[0]));
+      // Confidence score
+      result_item.confidence = record[1];
+      result->push_back(result_item);
+    }
+    first_box = end_box;
+  }
+}
+
+// Runs the detector on `imgs` and reports detections and timings.
+//   imgs     - input images; each one goes through Preprocess().
+//   warmup   - number of predictor runs executed before the timed runs.
+//   repeats  - number of timed predictor runs; the reported inference time
+//              is the total divided by this value.
+//   result   - cleared, then filled with the detections.
+//   bbox_num - PicoDet: the total number of results is appended; otherwise
+//              cleared and filled from the model's box-count output.
+//   times    - three values are appended, in milliseconds: preprocess,
+//              inference per run, postprocess.
+void ObjectDetector::Predict(const std::vector<cv::Mat>& imgs,
+                             const int warmup,
+                             const int repeats,
+                             std::vector<PPShiTu::ObjectResult>* result,
+                             std::vector<int>* bbox_num,
+                             std::vector<double>* times) {
+  auto preprocess_start = std::chrono::steady_clock::now();
+  int batch_size = imgs.size();
+
+  // in_data_batch
+  std::vector<float> in_data_all;
+  std::vector<float> im_shape_all(batch_size * 2);
+  std::vector<float> scale_factor_all(batch_size * 2);
+  // Preprocess image
+  for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
+    cv::Mat im = imgs.at(bs_idx);
+    Preprocess(im);
+    // Two values per image are copied out of inputs_ after each Preprocess().
+    im_shape_all[bs_idx * 2] = inputs_.im_shape_[0];
+    im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1];
+
+    scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0];
+    scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1];
+
+    // TODO: reduce cost time
+    in_data_all.insert(
+        in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end());
+  }
+  auto preprocess_end = std::chrono::steady_clock::now();
+  std::vector<const float *> output_data_list_;
+  // Prepare input tensor
+
+  // Only the inputs named "image", "im_shape" and "scale_factor" are filled.
+  auto input_names = predictor_->GetInputNames();
+  for (const auto& tensor_name : input_names) {
+    auto in_tensor = predictor_->GetInputByName(tensor_name);
+    if (tensor_name == "image") {
+      int rh = inputs_.in_net_shape_[0];
+      int rw = inputs_.in_net_shape_[1];
+      in_tensor->Resize({batch_size, 3, rh, rw});
+      auto* inptr = in_tensor->mutable_data<float>();
+      std::copy_n(in_data_all.data(), in_data_all.size(), inptr);
+    } else if (tensor_name == "im_shape") {
+      in_tensor->Resize({batch_size, 2});
+      auto* inptr = in_tensor->mutable_data<float>();
+      std::copy_n(im_shape_all.data(), im_shape_all.size(), inptr);
+    } else if (tensor_name == "scale_factor") {
+      in_tensor->Resize({batch_size, 2});
+      auto* inptr = in_tensor->mutable_data<float>();
+      std::copy_n(scale_factor_all.data(), scale_factor_all.size(), inptr);
+    }
+  }
+
+  // Run predictor
+  // warmup
+  for (int i = 0; i < warmup; i++) {
+    predictor_->Run();
+    // Get output tensor
+    auto output_names = predictor_->GetOutputNames();
+    if (config_.arch_ == "PicoDet") {
+      for (int j = 0; j < output_names.size(); j++) {
+        auto output_tensor = predictor_->GetTensor(output_names[j]);
+        const float* outptr = output_tensor->data<float>();
+        std::vector<int64_t> output_shape = output_tensor->shape();
+        // Pointers collected here are discarded by the clear() further down.
+        output_data_list_.push_back(outptr);
+      }
+    } else {
+      auto out_tensor = predictor_->GetTensor(output_names[0]);
+      auto out_bbox_num = predictor_->GetTensor(output_names[1]);
+    }
+  }
+
+  bool is_rbox = false;
+  // Timed section: `repeats` back-to-back runs.
+  auto inference_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < repeats; i++) {
+    predictor_->Run();
+  }
+  auto inference_end = std::chrono::steady_clock::now();
+  auto postprocess_start = std::chrono::steady_clock::now();
+  // Get output tensor
+  output_data_list_.clear();
+  // Defaults, overwritten from the output shapes in the PicoDet branch.
+  int num_class = 1;
+  int reg_max = 7;
+  auto output_names = predictor_->GetOutputNames();
+  // TODO: Unified model output.
+  if (config_.arch_ == "PicoDet") {
+    for (int i = 0; i < output_names.size(); i++) {
+      auto output_tensor = predictor_->GetTensor(output_names[i]);
+      const float* outptr = output_tensor->data<float>();
+      std::vector<int64_t> output_shape = output_tensor->shape();
+      // The first output's last dimension gives the class count.
+      if (i == 0) {
+        num_class = output_shape[2];
+      }
+      // The output at index fpn_stride_.size() gives reg_max (dim / 4 - 1).
+      if (i == config_.fpn_stride_.size()) {
+        reg_max = output_shape[2] / 4 - 1;
+      }
+      output_data_list_.push_back(outptr);
+    }
+  } else {
+    auto output_tensor = predictor_->GetTensor(output_names[0]);
+    auto output_shape = output_tensor->shape();
+    auto out_bbox_num = predictor_->GetTensor(output_names[1]);
+    auto out_bbox_num_shape = out_bbox_num->shape();
+    // Calculate output length
+    int output_size = 1;
+    for (int j = 0; j < output_shape.size(); ++j) {
+      output_size *= output_shape[j];
+    }
+    // A last dimension divisible by 10 is treated as rotated-box output.
+    is_rbox = output_shape[output_shape.size() - 1] % 10 == 0;
+
+    if (output_size < 6) {
+      std::cerr << "[WARNING] No object detected." << std::endl;
+    }
+    output_data_.resize(output_size);
+    std::copy_n(
+        output_tensor->mutable_data<float>(), output_size, output_data_.data());
+
+    int out_bbox_num_size = 1;
+    for (int j = 0; j < out_bbox_num_shape.size(); ++j) {
+      out_bbox_num_size *= out_bbox_num_shape[j];
+    }
+    out_bbox_num_data_.resize(out_bbox_num_size);
+    std::copy_n(out_bbox_num->mutable_data<int>(),
+                out_bbox_num_size,
+                out_bbox_num_data_.data());
+  }
+  // Postprocessing result
+
+  result->clear();
+  if (config_.arch_ == "PicoDet") {
+    // Uses the im_shape_/scale_factor_ currently held in inputs_.
+    PPShiTu::PicoDetPostProcess(
+        result, output_data_list_, config_.fpn_stride_, 
+        inputs_.im_shape_, inputs_.scale_factor_,
+        config_.nms_info_["score_threshold"].as<float>(), 
+        config_.nms_info_["nms_threshold"].as<float>(), num_class, reg_max);
+    bbox_num->push_back(result->size());
+  } else {
+    Postprocess(imgs, result, out_bbox_num_data_, is_rbox);
+    bbox_num->clear();
+    for (int k = 0; k < out_bbox_num_data_.size(); k++) {
+      int tmp = out_bbox_num_data_[k];
+      bbox_num->push_back(tmp);
+    }
+  }
+  auto postprocess_end = std::chrono::steady_clock::now();
+
+  // Durations are in seconds; the factor 1000 converts them to milliseconds.
+  std::chrono::duration<float> preprocess_diff =
+      preprocess_end - preprocess_start;
+  times->push_back(double(preprocess_diff.count() * 1000));
+  std::chrono::duration<float> inference_diff = inference_end - inference_start;
+  times->push_back(double(inference_diff.count() / repeats * 1000));
+  std::chrono::duration<float> postprocess_diff =
+      postprocess_end - postprocess_start;
+  times->push_back(double(postprocess_diff.count() * 1000));
+}
+
+// Builds a flat color table with three entries per class id. The bits of
+// the class id are dealt out three at a time, one to each channel, filling
+// every channel from its most significant bit downwards.
+std::vector<int> GenerateColorMap(int num_class) {
+  std::vector<int> colormap(3 * num_class, 0);
+  for (int cls = 0; cls < num_class; ++cls) {
+    int* channels = colormap.data() + cls * 3;
+    int shift = 7;
+    // Consume the id three bits per round, moving one bit position down.
+    for (int bits = cls; bits != 0; bits >>= 3, --shift) {
+      for (int ch = 0; ch < 3; ++ch) {
+        channels[ch] |= (((bits >> ch) & 1) << shift);
+      }
+    }
+  }
+  return colormap;
+}
+
+}  // namespace PPShiTu
--- a/deploy/lite_shitu/src/picodet_postprocess.cc
+++ b/deploy/lite_shitu/src/picodet_postprocess.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// The code is based on:
+// https://github.com/RangiLyu/nanodet/blob/main/demo_mnn/nanodet_mnn.cpp
+
+#include "include/picodet_postprocess.h"
+#include <cmath>
+
+namespace PPShiTu {
+
+// Fast approximation of exp(x): the scaled input is written straight into
+// the bit pattern of a float.
+// Inputs outside the representable range are handled explicitly, because
+// converting a negative or too-large value to uint32_t is undefined
+// behaviour: very negative x yields 0, very large x yields +infinity, and
+// NaN is passed through.
+float fast_exp(float x) {
+  union {
+    uint32_t i;
+    float f;
+  } v{};
+  const double scaled = (1 << 23) * (1.4426950409 * x + 126.93490512f);
+  if (std::isnan(scaled)) {
+    return x;
+  }
+  if (scaled <= 0.0) {
+    // Underflow: exp(x) is indistinguishable from zero here.
+    return 0.0f;
+  }
+  // 2139095040 == 0x7f800000, the bit pattern of +infinity.
+  if (scaled >= 2139095040.0) {
+    return HUGE_VALF;
+  }
+  v.i = static_cast<uint32_t>(scaled);
+  return v.f;
+}
+
+// Softmax over `length` values of `src`, written to `dst`. The maximum is
+// subtracted before exponentiating; fast_exp() supplies the exponential.
+// Always returns 0.
+template <typename _Tp>
+int activation_function_softmax(const _Tp *src, _Tp *dst, int length) {
+  const _Tp *const src_end = src + length;
+  const _Tp peak = *std::max_element(src, src_end);
+
+  // First pass: exponentials and their running sum.
+  _Tp total{0};
+  for (int k = 0; k < length; ++k) {
+    const _Tp weight = fast_exp(src[k] - peak);
+    dst[k] = weight;
+    total += weight;
+  }
+
+  // Second pass: normalize by the sum.
+  for (_Tp *out = dst; out != dst + length; ++out) {
+    *out /= total;
+  }
+
+  return 0;
+}
+
+// PicoDet decode: turns the distance distributions of one feature-map cell
+// into a box.
+//   dfl_det  - 4 * (reg_max + 1) values, one distribution per box side.
+//   label    - class id stored in the result.
+//   score    - confidence stored in the result.
+//   x, y     - column and row of the cell on the feature map.
+//   stride   - stride of that feature map.
+//   im_shape - {height, width} used to clamp the box.
+//   reg_max  - largest distance bin index.
+// Each side's distance is the softmax-weighted mean bin index times the
+// stride, measured from the cell centre.
+PPShiTu::ObjectResult
+disPred2Bbox(const float *&dfl_det, int label, float score, int x, int y,
+             int stride, std::vector<float> im_shape, int reg_max) {
+  const float ct_x = (x + 0.5) * stride;
+  const float ct_y = (y + 0.5) * stride;
+  const int bins = reg_max + 1;
+
+  std::vector<float> dis_pred(4);
+  // One scratch buffer reused for all four sides (was new[]/delete[] each).
+  std::vector<float> dis_after_sm(bins);
+  for (int i = 0; i < 4; i++) {
+    activation_function_softmax(dfl_det + i * bins, dis_after_sm.data(), bins);
+    float dis = 0;
+    for (int j = 0; j < bins; j++) {
+      dis += j * dis_after_sm[j];
+    }
+    dis *= stride;
+    dis_pred[i] = dis;
+  }
+
+  // im_shape is {height, width}: x is bounded by the width, y by the height
+  // (the two bounds were swapped before).
+  const float max_x = im_shape[1];
+  const float max_y = im_shape[0];
+  int xmin = (int)(std::max)(ct_x - dis_pred[0], .0f);
+  int ymin = (int)(std::max)(ct_y - dis_pred[1], .0f);
+  int xmax = (int)(std::min)(ct_x + dis_pred[2], max_x);
+  int ymax = (int)(std::min)(ct_y + dis_pred[3], max_y);
+
+  PPShiTu::ObjectResult result_item;
+  result_item.rect = {xmin, ymin, xmax, ymax};
+  result_item.class_id = label;
+  result_item.confidence = score;
+
+  return result_item;
+}
+
+// PicoDet post-processing: decodes every FPN level, runs NMS per class and
+// appends the surviving boxes, rescaled by `scale_factor`, to `results`.
+//   outs       - raw outputs: class scores for each level, followed by the
+//                box distributions for each level.
+//   fpn_stride - stride of each level.
+//   im_shape   - {height, width} the feature-map sizes are derived from.
+//   scale_factor - x coordinates are divided by [1], y coordinates by [0].
+//   score_threshold - a cell is kept only when its best score exceeds this.
+void PicoDetPostProcess(std::vector<PPShiTu::ObjectResult> *results,
+                        std::vector<const float *> outs,
+                        std::vector<int> fpn_stride,
+                        std::vector<float> im_shape,
+                        std::vector<float> scale_factor, float score_threshold,
+                        float nms_threshold, int num_class, int reg_max) {
+  std::vector<std::vector<PPShiTu::ObjectResult>> per_class(num_class);
+  const int in_h = im_shape[0];
+  const int in_w = im_shape[1];
+
+  for (int level = 0; level < fpn_stride.size(); ++level) {
+    const int stride = fpn_stride[level];
+    const int feature_h = ceil((float)in_h / stride);
+    const int feature_w = ceil((float)in_w / stride);
+
+    for (int row = 0; row < feature_h; ++row) {
+      for (int col = 0; col < feature_w; ++col) {
+        const int cell = row * feature_w + col;
+        const float *cell_scores = outs[level] + (cell * num_class);
+
+        // Best strictly positive score of this cell and its class.
+        float best_score = 0;
+        int best_label = 0;
+        for (int label = 0; label < num_class; label++) {
+          if (cell_scores[label] > best_score) {
+            best_score = cell_scores[label];
+            best_label = label;
+          }
+        }
+        if (!(best_score > score_threshold)) {
+          continue;
+        }
+
+        const float *bbox_pred =
+            outs[level + fpn_stride.size()] + (cell * 4 * (reg_max + 1));
+        per_class[best_label].push_back(
+            disPred2Bbox(bbox_pred, best_label, best_score, col, row, stride,
+                         im_shape, reg_max));
+      }
+    }
+  }
+
+  for (int cls = 0; cls < (int)per_class.size(); cls++) {
+    PPShiTu::nms(per_class[cls], nms_threshold);
+
+    for (const auto &kept : per_class[cls]) {
+      // Rescale a copy; the per-class list itself is left untouched.
+      PPShiTu::ObjectResult box = kept;
+      box.rect[0] = box.rect[0] / scale_factor[1];
+      box.rect[2] = box.rect[2] / scale_factor[1];
+      box.rect[1] = box.rect[1] / scale_factor[0];
+      box.rect[3] = box.rect[3] / scale_factor[0];
+      results->push_back(box);
+    }
+  }
+}
+
+} // namespace PPShiTu
--- a/deploy/lite_shitu/src/preprocess_op.cc
+++ b/deploy/lite_shitu/src/preprocess_op.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "include/preprocess_op.h"
+
+namespace PPShiTu {
+
+// Seeds the blob's shape bookkeeping from the image as it arrives:
+// im_shape_ and in_net_shape_ both become {rows, cols} and scale_factor_
+// starts at {1, 1}. The image itself is not modified.
+void InitInfo::Run(cv::Mat* im, ImageBlob* data) {
+  const float height = static_cast<float>(im->rows);
+  const float width = static_cast<float>(im->cols);
+
+  data->im_shape_ = {height, width};
+  data->scale_factor_ = {1., 1.};
+  data->in_net_shape_ = {height, width};
+}
+
+// Converts *im to CV_32FC3 in place (multiplying by 1/255 when is_scale_ is
+// set) and then standardises every pixel channel c as
+// (value - mean_[c]) / scale_[c]. `data` is not touched.
+void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) {
+  const double factor = is_scale_ ? 1. / 255.0 : 1.0;
+  im->convertTo(*im, CV_32FC3, factor);
+
+  for (int row = 0; row < im->rows; ++row) {
+    for (int col = 0; col < im->cols; ++col) {
+      cv::Vec3f& pixel = im->at<cv::Vec3f>(row, col);
+      for (int c = 0; c < 3; ++c) {
+        pixel[c] = (pixel[c] - mean_[c]) / scale_[c];
+      }
+    }
+  }
+}
+
+// Rewrites the interleaved image as consecutive single-channel planes inside
+// data->im_data_: the image is converted to CV_32FC3, the buffer is resized to
+// channels * rows * cols floats, and channel i is extracted into the i-th
+// rows * cols slice.
+void Permute::Run(cv::Mat* im, ImageBlob* data) {
+  im->convertTo(*im, CV_32FC3);
+
+  const int height = im->rows;
+  const int width = im->cols;
+  const int channels = im->channels();
+
+  data->im_data_.resize(channels * height * width);
+
+  // Each header below wraps a slice of im_data_, so extractChannel writes
+  // straight into the output buffer.
+  float* plane = data->im_data_.data();
+  for (int c = 0; c < channels; ++c, plane += height * width) {
+    cv::extractChannel(*im, cv::Mat(height, width, CV_32FC1, plane), c);
+  }
+}
+
+// Resizes *im in place by the factors returned from GenerateScale() and
+// records the outcome in *data.
+//
+// im_shape_ and in_net_shape_ are both written as {rows, cols} of the resized
+// image, which is the layout InitInfo, PadStride and TopDownEvalAffine use for
+// the same fields. scale_factor_ is stored as {vertical factor, horizontal
+// factor}.
+void Resize::Run(cv::Mat* im, ImageBlob* data) {
+  const auto resize_scale = GenerateScale(*im);
+  cv::resize(
+      *im, *im, cv::Size(), resize_scale.first, resize_scale.second, interp_);
+
+  // Read the shape back from the resized image instead of predicting it as
+  // cols * scale / rows * scale: the prediction is unrounded and was stored
+  // width-first, which disagrees with every other op once the image is not
+  // square.
+  const float resized_h = static_cast<float>(im->rows);
+  const float resized_w = static_cast<float>(im->cols);
+  data->im_shape_ = {resized_h, resized_w};
+  data->in_net_shape_ = {resized_h, resized_w};
+  data->scale_factor_ = {
+      resize_scale.second, resize_scale.first,
+  };
+}
+
+// Computes the {horizontal, vertical} factors by which `im` must be scaled.
+//
+// Without keep_ratio_ the axes are scaled independently so the result is
+// target_size_[1] wide and target_size_[0] high. With keep_ratio_ one common
+// factor is used: the smaller of (smallest target / shorter image side) and
+// (largest target / longer image side).
+std::pair<float, float> Resize::GenerateScale(const cv::Mat& im) {
+  const int width = im.cols;
+  const int height = im.rows;
+
+  if (!keep_ratio_) {
+    const float scale_x =
+        static_cast<float>(target_size_[1]) / static_cast<float>(width);
+    const float scale_y =
+        static_cast<float>(target_size_[0]) / static_cast<float>(height);
+    return std::pair<float, float>(scale_x, scale_y);
+  }
+
+  const int longer_side = std::max(width, height);
+  const int shorter_side = std::min(width, height);
+  const int largest_target =
+      *std::max_element(target_size_.begin(), target_size_.end());
+  const int smallest_target =
+      *std::min_element(target_size_.begin(), target_size_.end());
+
+  const float short_ratio = static_cast<float>(smallest_target) /
+                            static_cast<float>(shorter_side);
+  const float long_ratio =
+      static_cast<float>(largest_target) / static_cast<float>(longer_side);
+  const float ratio = std::min(short_ratio, long_ratio);
+  return std::pair<float, float>(ratio, ratio);
+}
+
+// Pads *im in place on the bottom and right with zeros so that both
+// dimensions become multiples of stride_, then records the padded
+// {rows, cols} in data->in_net_shape_. Does nothing at all when stride_ is
+// not positive.
+void PadStride::Run(cv::Mat* im, ImageBlob* data) {
+  if (stride_ <= 0) {
+    return;
+  }
+  // Rows/columns missing to reach the next multiple of stride_; 0 when the
+  // dimension is already aligned.
+  const int pad_bottom = (stride_ - im->rows % stride_) % stride_;
+  const int pad_right = (stride_ - im->cols % stride_) % stride_;
+  cv::copyMakeBorder(
+      *im, *im, 0, pad_bottom, 0, pad_right, cv::BORDER_CONSTANT, cv::Scalar(0));
+  data->in_net_shape_ = {
+      static_cast<float>(im->rows), static_cast<float>(im->cols),
+  };
+}
+
+// Resizes *im in place to trainsize_[0] x trainsize_[1] (width x height) and
+// stores the result as {height, width} in data->in_net_shape_.
+void TopDownEvalAffine::Run(cv::Mat* im, ImageBlob* data) {
+  const cv::Size target(trainsize_[0], trainsize_[1]);
+  cv::resize(*im, *im, target, 0, 0, interp_);
+  // todo: Simd::ResizeBilinear();
+  const float net_h = static_cast<float>(trainsize_[1]);
+  const float net_w = static_cast<float>(trainsize_[0]);
+  data->in_net_shape_ = {net_h, net_w};
+}
+
+// Order in which Preprocessor::Run visits the preprocessing ops. Each entry
+// is the key that is looked up in ops_; a name with no registered op is
+// simply skipped.
+const std::vector<std::string> Preprocessor::RUN_ORDER = {"InitInfo",
+                                                          "DetTopDownEvalAffine",
+                                                          "DetResize",
+                                                          "DetNormalizeImage",
+                                                          "DetPadStride",
+                                                          "DetPermute"};
+
+// Applies every registered op to (im, data), following RUN_ORDER. Names in
+// RUN_ORDER that have no entry in ops_ are skipped.
+void Preprocessor::Run(cv::Mat* im, ImageBlob* data) {
+  for (const std::string& op_name : RUN_ORDER) {
+    auto entry = ops_.find(op_name);
+    if (entry == ops_.end()) {
+      continue;
+    }
+    entry->second->Run(im, data);
+  }
+}
+
+// Cuts an expanded region around `area` out of `img`.
+//
+// `area` is read as {x1, y1, x2, y2} and clamped to the image first. The
+// half extents are then adjusted so that height : width is 4 : 3 (the
+// shorter direction is grown), enlarged by (1 + expandratio) around the
+// clamped box centre, and clamped to the image again. `crop_img` receives
+// that region of `img`; `center` is replaced by the centre {x, y} of the
+// final crop and `scale` by its {x2 - x1, y2 - y1} extent.
+void CropImg(cv::Mat& img,
+             cv::Mat& crop_img,
+             std::vector<int>& area,
+             std::vector<float>& center,
+             std::vector<float>& scale,
+             float expandratio) {
+  const int last_col = img.cols - 1;
+  const int last_row = img.rows - 1;
+
+  // Requested box, limited to the image.
+  const int left = std::max(0, area[0]);
+  const int top = std::max(0, area[1]);
+  const int right = std::min(last_col, area[2]);
+  const int bottom = std::min(last_row, area[3]);
+
+  // Centre and half extents, truncated to whole pixels.
+  const int center_x = static_cast<int>((left + right) / 2.);
+  const int center_y = static_cast<int>((top + bottom) / 2.);
+  int half_h = static_cast<int>((bottom - top) / 2.);
+  int half_w = static_cast<int>((right - left) / 2.);
+
+  // Grow whichever half extent is too small for a 4:3 (h:w) box.
+  const bool too_narrow = half_h * 3 > half_w * 4;
+  if (too_narrow) {
+    half_w = static_cast<int>(half_h * 0.75);
+  } else {
+    half_h = static_cast<int>(half_w * 4 / 3);
+  }
+
+  const float grow = 1 + expandratio;
+  const int crop_x1 = std::max(0, center_x - static_cast<int>(half_w * grow));
+  const int crop_y1 = std::max(0, center_y - static_cast<int>(half_h * grow));
+  const int crop_x2 =
+      std::min(last_col, static_cast<int>(center_x + half_w * grow));
+  const int crop_y2 =
+      std::min(last_row, static_cast<int>(center_y + half_h * grow));
+
+  // Range ends are exclusive, hence the + 1.
+  const cv::Range row_span(crop_y1, crop_y2 + 1);
+  const cv::Range col_span(crop_x1, crop_x2 + 1);
+  crop_img = img(row_span, col_span);
+
+  center.clear();
+  center.push_back(static_cast<float>((crop_x1 + crop_x2) / 2));
+  center.push_back(static_cast<float>((crop_y1 + crop_y2) / 2));
+
+  scale.clear();
+  scale.push_back(static_cast<float>(crop_x2 - crop_x1));
+  scale.push_back(static_cast<float>(crop_y2 - crop_y1));
+}
+
+}  // namespace PPShiTu
--- a/deploy/lite_shitu/src/recognition.cc
+++ b/deploy/lite_shitu/src/recognition.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/recognition.h"
+
+namespace PPShiTu {
+// Runs the recognition model on `img` and returns the post-processed results.
+//
+// The image is resized by ResizeImage(), converted to float with the `scale`
+// member as multiplier, and written into input 0 of the predictor through
+// NeonMeanScale(). Output 1 of the predictor is handed to PostProcess().
+//
+// cost_time receives the elapsed time in seconds, at microsecond resolution,
+// covering predictor->Run() and fetching the output tensor.
+std::vector<RESULT> Recognition::RunRecModel(const cv::Mat &img,
+                                             double &cost_time) {
+
+  // Bring the image to the network's input size.
+  cv::Mat resize_image = ResizeImage(img);
+
+  cv::Mat img_fp;
+  resize_image.convertTo(img_fp, CV_32FC3, scale);
+
+  // Prepare input data from image
+  std::unique_ptr<Tensor> input_tensor(std::move(this->predictor->GetInput(0)));
+  input_tensor->Resize({1, 3, img_fp.rows, img_fp.cols});
+  auto *data0 = input_tensor->mutable_data<float>();
+
+  const float *dimg = reinterpret_cast<const float *>(img_fp.data);
+  NeonMeanScale(dimg, data0, img_fp.rows * img_fp.cols);
+
+  // steady_clock is monotonic, so the measured interval cannot be distorted
+  // (or turn negative) when the wall clock is adjusted during inference.
+  const auto start = std::chrono::steady_clock::now();
+  // Run predictor
+  this->predictor->Run();
+
+  // Get output and post process
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(this->predictor->GetOutput(1)));
+  auto *output_data = output_tensor->data<float>();
+  const auto end = std::chrono::steady_clock::now();
+  const auto duration =
+      std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  cost_time = double(duration.count()) *
+              std::chrono::microseconds::period::num /
+              std::chrono::microseconds::period::den;
+
+  // Total number of elements in the output tensor.
+  int output_size = 1;
+  for (auto dim : output_tensor->shape()) {
+    output_size *= dim;
+  }
+
+  cv::Mat output_image;
+  auto results = PostProcess(output_data, output_size, output_image);
+  return results;
+}
+
+// Normalises `size` interleaved 3-channel float pixels from `din` and writes
+// them as three consecutive planes of `size` floats each into `dout`:
+// plane c holds (value - mean[c]) * std[c]. Pixels are processed four at a
+// time with NEON; the remainder is handled by a scalar loop.
+//
+// Prints an error and terminates the process if mean or std does not hold
+// exactly three values.
+void Recognition::NeonMeanScale(const float *din, float *dout, int size) {
+
+  if (this->mean.size() != 3 || this->std.size() != 3) {
+    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
+    exit(1);
+  }
+
+  // Per-channel constants broadcast to all four lanes.
+  const float32x4_t vmean[3] = {vdupq_n_f32(this->mean[0]),
+                                vdupq_n_f32(this->mean[1]),
+                                vdupq_n_f32(this->mean[2])};
+  const float32x4_t vfactor[3] = {vdupq_n_f32(this->std[0]),
+                                  vdupq_n_f32(this->std[1]),
+                                  vdupq_n_f32(this->std[2])};
+
+  // Write cursors, one per output plane.
+  float *plane[3] = {dout, dout + size, dout + size * 2};
+
+  int done = 0;
+  while (done < size - 3) {
+    // vld3q_f32 de-interleaves 4 pixels (12 floats) into one vector per
+    // channel.
+    const float32x4x3_t pixels = vld3q_f32(din);
+    for (int c = 0; c < 3; ++c) {
+      const float32x4_t centered = vsubq_f32(pixels.val[c], vmean[c]);
+      vst1q_f32(plane[c], vmulq_f32(centered, vfactor[c]));
+      plane[c] += 4;
+    }
+    din += 12;
+    done += 4;
+  }
+
+  // Scalar tail for the last size % 4 pixels.
+  while (done < size) {
+    for (int c = 0; c < 3; ++c) {
+      *plane[c] = (*din - this->mean[c]) * this->std[c];
+      ++plane[c];
+      ++din;
+    }
+    ++done;
+  }
+}
+
+// Returns a copy of `img` resized to a square of this->size x this->size.
+cv::Mat Recognition::ResizeImage(const cv::Mat &img) {
+  const cv::Size target(this->size, this->size);
+  cv::Mat resized;
+  cv::resize(img, resized, target);
+  return resized;
+}
+// Selects the topk highest entries of output_data[0 .. output_size) and
+// returns them in descending score order.
+//
+// Every slot starts as (index 0, score 0), so only scores strictly greater
+// than 0 can enter the ranking; slots that are never filled are reported as
+// class 0 with score 0. class_name is taken from label_list when the index
+// is within range and is "Unknown" otherwise. A non-positive topk yields an
+// empty result. `output_image` is not used.
+std::vector<RESULT> Recognition::PostProcess(const float *output_data,
+                                             int output_size,
+                                             cv::Mat &output_image) {
+
+  const int k = this->topk > 0 ? static_cast<int>(this->topk) : 0;
+
+  // Heap-backed storage instead of variable-length arrays, which are not
+  // standard C++ and have no defined behaviour for a size <= 0.
+  std::vector<int> top_indices(k, 0);
+  std::vector<double> top_scores(k, 0.0);
+
+  for (int i = 0; i < output_size; i++) {
+    // The candidate is carried as a double so that values displaced from the
+    // ranking travel down the list without being rounded to float.
+    double score = output_data[i];
+    int index = i;
+    for (int j = 0; j < k; j++) {
+      if (score > top_scores[j]) {
+        // Put the candidate into slot j and continue with the entry it
+        // displaced. A plain exchange is exact; swapping by addition and
+        // subtraction is not for floating-point values.
+        const double displaced_score = top_scores[j];
+        const int displaced_index = top_indices[j];
+        top_scores[j] = score;
+        top_indices[j] = index;
+        score = displaced_score;
+        index = displaced_index;
+      }
+    }
+  }
+
+  const int label_count = static_cast<int>(this->label_list.size());
+  std::vector<RESULT> results(k);
+  for (int i = 0; i < k; i++) {
+    results[i].class_name = "Unknown";
+    if (top_indices[i] >= 0 && top_indices[i] < label_count) {
+      results[i].class_name = this->label_list[top_indices[i]];
+    }
+    results[i].score = top_scores[i];
+    results[i].class_id = top_indices[i];
+  }
+  return results;
+}
+}
--- a/deploy/lite_shitu/src/utils.cc
+++ b/deploy/lite_shitu/src/utils.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/utils.h"
+
+namespace PPShiTu {
+
+// Greedy non-maximum suppression, performed in place on `input_boxes`.
+//
+// The boxes are first sorted in descending order, by `confidence` or, when
+// rec_nms is true, by rec_result[0].score (so every box must then carry at
+// least one rec_result). Walking the sorted list, every later box whose IoU
+// with an earlier surviving box is >= nms_threshold is erased. `rect` is read
+// as {x1, y1, x2, y2} with inclusive corners, hence the + 1 in widths and
+// heights.
+void nms(std::vector<ObjectResult> &input_boxes, float nms_threshold,
+         bool rec_nms) {
+  // The comparators take const references: by-value parameters would copy two
+  // ObjectResult objects on every single comparison.
+  if (!rec_nms) {
+    std::sort(input_boxes.begin(), input_boxes.end(),
+              [](const ObjectResult &a, const ObjectResult &b) {
+                return a.confidence > b.confidence;
+              });
+  } else {
+    std::sort(input_boxes.begin(), input_boxes.end(),
+              [](const ObjectResult &a, const ObjectResult &b) {
+                return a.rec_result[0].score > b.rec_result[0].score;
+              });
+  }
+
+  // Box areas, kept index-aligned with input_boxes while entries are erased.
+  std::vector<float> vArea(input_boxes.size());
+  for (int i = 0; i < static_cast<int>(input_boxes.size()); ++i) {
+    const auto &rect = input_boxes[i].rect;
+    vArea[i] = (rect[2] - rect[0] + 1) * (rect[3] - rect[1] + 1);
+  }
+
+  for (int i = 0; i < static_cast<int>(input_boxes.size()); ++i) {
+    // j only advances when box j survives; after an erase the next candidate
+    // has moved into position j.
+    for (int j = i + 1; j < static_cast<int>(input_boxes.size());) {
+      float xx1 = (std::max)(input_boxes[i].rect[0], input_boxes[j].rect[0]);
+      float yy1 = (std::max)(input_boxes[i].rect[1], input_boxes[j].rect[1]);
+      float xx2 = (std::min)(input_boxes[i].rect[2], input_boxes[j].rect[2]);
+      float yy2 = (std::min)(input_boxes[i].rect[3], input_boxes[j].rect[3]);
+      float w = (std::max)(float(0), xx2 - xx1 + 1);
+      float h = (std::max)(float(0), yy2 - yy1 + 1);
+      float inter = w * h;
+      float ovr = inter / (vArea[i] + vArea[j] - inter);
+      if (ovr >= nms_threshold) {
+        input_boxes.erase(input_boxes.begin() + j);
+        vArea.erase(vArea.begin() + j);
+      } else {
+        j++;
+      }
+    }
+  }
+}
+
+} // namespace PPShiTu
--- a/docs/images/ppshitu_lite_demo.png
+++ b/docs/images/ppshitu_lite_demo.png