lite deploy x86 ok (#4199)

lite deploy armv8 ok

lite deploy x86 ok (#4199)
lite deploy armv8 ok
a268c2ef · zhiboniu · GitHub · 3a9d38b5 · a268c2ef · 3a9d38b5
19 changed file
--- a/deploy/lite/Makefile
+++ b/deploy/lite/Makefile
-ARM_ABI = arm8
+ARM_ABI = arm8#[arm7/arm8]
 export ARM_ABI
+ifeq ($(ARM_ABI), arm8)
+    ARM_PLAT=arm64-v8a
+else
+    ARM_PLAT=armeabi-v7a
+endif
+${info ARM_ABI: ${ARM_ABI}}
+${info ARM_PLAT: ${ARM_PLAT}; option[arm7/arm8]}
 include ../Makefile.def
 LITE_ROOT=../../../
+${info LITE_ROOT: $(abspath ${LITE_ROOT})}
-THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+THIRD_PARTY_DIR=third_party
+${info THIRD_PARTY_DIR: $(abspath ${THIRD_PARTY_DIR})}
-OPENCV_VERSION=opencv4.1.0
-OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
-OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
-CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+OPENCV_VERSION=opencv4.1.0
+OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgcodecs.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgproc.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_core.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libtegra_hal.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibjpeg-turbo.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibwebp.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibpng.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibjasper.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibtiff.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libIlmImf.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libtbb.a \
+              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libcpufeatures.a
-CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
+LITE_LIBS = -L${LITE_ROOT}/cxx/lib/ -lpaddle_light_api_shared
 ###############################################################
 # How to use one of static libaray:                           #
 #  `libpaddle_api_full_bundled.a`                             #
@@ -37,14 +43,18 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY
 ###############################################################
 # 1. Comment above line using `libpaddle_light_api_shared.so`
 # 2. Undo comment below line using `libpaddle_api_light_bundled.a`
+# LITE_LIBS = ${LITE_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
+CXX_LIBS = $(LITE_LIBS) ${OPENCV_LIBS} $(SYSTEM_LIBS)
-#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+LOCAL_DIRSRCS=$(wildcard src/*.cc)
+LOCAL_SRCS=$(notdir $(LOCAL_DIRSRCS))
+LOCAL_OBJS=$(patsubst %.cpp, %.o, $(patsubst %.cc, %.o, $(LOCAL_SRCS)))
-detect_system: fetch_opencv detect_system.o
+JSON_OBJS = json_reader.o json_value.o json_writer.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) detect_system.o -o detect_system  $(CXX_LIBS) $(LDFLAGS)
-detect_system.o: run_detection.cc
+main: $(LOCAL_OBJS) $(JSON_OBJS) fetch_opencv
-	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o detect_system.o -c run_detection.cc
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) $(LOCAL_OBJS) $(JSON_OBJS) -o main $(CXX_LIBS) $(LDFLAGS)
 fetch_opencv:
 	@ test -d ${THIRD_PARTY_DIR} ||  mkdir ${THIRD_PARTY_DIR}
@@ -52,10 +62,29 @@ fetch_opencv:
      (echo "fetch opencv libs" && \
      wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
 	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
-      tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+      tar -zxf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+fetch_json_code:
+	@ test -d ${THIRD_PARTY_DIR} ||  mkdir ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz || \
+      (echo "fetch jsoncpp_code.tar.gz" && \
+      wget -P ${THIRD_PARTY_DIR} https://bj.bcebos.com/v1/paddledet/deploy/jsoncpp_code.tar.gz )
+	@ test -d ${THIRD_PARTY_DIR}/jsoncpp_code || \
+      tar -zxf ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz -C ${THIRD_PARTY_DIR}
+LOCAL_INCLUDES = -I./ -Iinclude
+OPENCV_INCLUDE = -I${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/include
+JSON_INCLUDE = -I${THIRD_PARTY_DIR}/jsoncpp_code/include
+CXX_INCLUDES = ${LOCAL_INCLUDES} ${INCLUDES} ${OPENCV_INCLUDE} ${JSON_INCLUDE} -I$(LITE_ROOT)/cxx/include
+$(LOCAL_OBJS): %.o: src/%.cc fetch_opencv fetch_json_code
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@
+$(JSON_OBJS): %.o: ${THIRD_PARTY_DIR}/jsoncpp_code/%.cpp fetch_json_code
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@
-.PHONY: clean
+.PHONY: clean fetch_opencv fetch_json_code
 clean:
-	rm -f detect_system.o
+	rm -rf $(LOCAL_OBJS) $(JSON_OBJS)
-	rm -f detect_system
+	rm -f main
--- a/deploy/lite/Makefile_armv7
+++ b/deploy/lite/Makefile_armv7
-ARM_ABI = arm7
-export ARM_ABI
-include ../Makefile.def
-LITE_ROOT=../../../
-THIRD_PARTY_DIR=${LITE_ROOT}/third_party
-OPENCV_VERSION=opencv4.1.0
-OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
-              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
-OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
-CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
-CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
-###############################################################
-# How to use one of static libaray:                           #
-#  `libpaddle_api_full_bundled.a`                             #
-#  `libpaddle_api_light_bundled.a`                            #
-###############################################################
-# Note: default use lite's shared library.                    #
-###############################################################
-# 1. Comment above line using `libpaddle_light_api_shared.so`
-# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
-#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
-detect_system: fetch_opencv detect_system.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) detect_system.o -o detect_system  $(CXX_LIBS) $(LDFLAGS)
-detect_system.o: run_detection.cc
-	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o detect_system.o -c run_detection.cc
-fetch_opencv:
-	@ test -d ${THIRD_PARTY_DIR} ||  mkdir ${THIRD_PARTY_DIR}
-	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
-      (echo "fetch opencv libs" && \
-      wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
-	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
-      tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
-.PHONY: clean
-clean:
-	rm -f detect_system.o
-	rm -f detect_system
--- a/deploy/lite/README.md
+++ b/deploy/lite/README.md
@@ -130,10 +130,13 @@ cd PaddleDetection_root_path
 python tools/export_model.py -c configs/ppyolo/ppyolo_tiny_650e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_tiny_650e_coco.pdparams
 # 将inference模型转化为Paddle-Lite优化模型
-paddle_lite_opt --model_file=output_inference/ppyolo_tiny_650e_coco/model.pdmodel --param_file=output_inference/ppyolo_tiny_650e_coco/model.pdiparams --optimize_out=ppyolo_tiny
+paddle_lite_opt  --valid_targets=arm --optimize_out_type=naive_buffe --model_file=output_inference/ppyolo_tiny_650e_coco/model.pdmodel --param_file=output_inference/ppyolo_tiny_650e_coco/model.pdiparams --optimize_out=output_inference/ppyolo_tiny_650e_coco/model
+# 将inference模型配置转化为json格式
+python deploy/lite/convert_yml_to_json.py output_inference/ppyolo_tiny_650e_coco/infer_cfg.yml
 ```
-最终在当前文件夹下生成`ppyolo_tiny.nb`的文件。
+最终在output_inference/ppyolo_tiny_650e_coco/文件夹下生成`ppyolo_tiny.nb` 和 `infer_cfg.json`的文件。
 **注意**：`--optimize_out` 参数为优化后模型的保存路径，无需加后缀`.nb`；`--model_file` 参数为模型结构信息文件的路径，`--param_file` 参数为模型权重信息文件的路径，请注意文件名。
@@ -168,82 +171,96 @@ List of devices attached
 744be294    device
 ```
-4. 准备优化后的模型、预测库文件、测试图像和类别映射文件。
+4. 编译lite部署代码生成移动端可执行文件
 ```shell
-cd PaddleDetection_root_path
+cd {PadddleDetection_Root}
 cd deploy/lite/
-# 将预测库文件、测试图像和使用的类别字典文件放置在预测库中的demo/cxx/detection文件夹下
 inference_lite_path=/{lite prediction library path}/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv/
-mkdir -p  $inference_lite_path/demo/cxx/detection/debug/
+mkdir $inference_lite_path/demo/cxx/lite
-cp ../../ppyolo_tiny.nb $inference_lite_path/demo/cxx/detection/debug/
-cp  ./coco_label_list.txt  $inference_lite_path/demo/cxx/detection/debug/
-cp Makefile run_detection.cc  $inference_lite_path/demo/cxx/detection/
-cp ./config_ppyolo_tiny.txt  $inference_lite_path/demo/cxx/detection/debug/
-cp ../../demo/000000014439.jpg  $inference_lite_path/demo/cxx/detection/debug/
+cp -r Makefile src/ include/ runtime_config.json $inference_lite_path/demo/cxx/lite
-# 进入lite demo的工作目录
+cd $inference_lite_path/demo/cxx/lite
-cd /{lite prediction library path}/inference_lite_lib.android.armv8/
-cd demo/cxx/detection/
+# 执行编译，等待完成后得到可执行文件main
+make ARM_ABI = arm8
+#如果是arm7，则执行 make ARM_ABI = arm7 (或者在Makefile中修改该项)
-# 将C++预测动态库so文件复制到debug文件夹中
-cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/
 ```
-执行完成后，detection文件夹下将有如下文件格式：
+5. 准备优化后的模型、预测库文件、测试图像。
-```
+```shell
-demo/cxx/detection/
+mdkir deploy
-|-- debug/
+cp main runtime_config.json deploy/
-|   |--ppyolo_tiny.nb                   优化后的检测器模型文件
+cd deploy
-|   |--000000014439.jpg                 待测试图像
+mkdir model_det
-|   |--coco_label_list.txt              类别映射文件
+mkdir model_keypoint
-|   |--libpaddle_light_api_shared.so    C++预测库文件
-|   |--config_ppyolo_tiny.txt           检测模型预测超参数配置
-|-- run_detection.cc                    目标检测代码文件
-|-- Makefile                            编译文件
-```
-**注意：**
+# 将优化后的模型、预测库文件、测试图像放置在预测库中的demo/cxx/detection文件夹下
+cp {PadddleDetection_Root}/output_inference/ppyolo_tiny_650e_coco/model.nb ./model_det/
+cp {PadddleDetection_Root}/output_inference/ppyolo_tiny_650e_coco/infer_cfg.json ./model_det/
-* 上述文件中，`coco_label_list.txt` 是COCO数据集的类别映射文件，如果使用自定义的类别，需要更换该类别映射文件。
+# 如果需要关键点模型，则只需一下操作
+cp {PadddleDetection_Root}/output_inference/hrnet_w32_256x192/model.nb ./model_keypoint/
+cp {PadddleDetection_Root}/output_inference/hrnet_w32_256x192/infer_cfg.json ./model_keypoint/
-*  `config_ppyolo_tiny.txt` 包含了检测器的超参数，如下：
+# 将测试图像复制到deploy文件夹中
+cp [your_test_img].jpg ./demo.jpg
-```shell
+# 将C++预测动态库so文件复制到deploy文件夹中
-model_file ./ppyolo_tiny.nb         # 模型文件地址
+cp ../../../cxx/lib/libpaddle_light_api_shared.so ./
-label_path ./coco_label_list.txt    # 类别映射文本文件
-num_threads 1                       # 线程数
-enable_benchmark 1                  # 是否运行benchmark
-Resize 320,320                      # resize图像尺寸
-keep_ratio False                    # 是否keep ratio
-mean 0.485,0.456,0.406              # 预处理均值
-std 0.229,0.224,0.225               # 预处理方差
-precision fp32                      # 模型精度
 ```
-5. 启动调试，上述步骤完成后就可以使用ADB将文件夹 `debug/` push到手机上运行，步骤如下：
+执行完成后，deploy文件夹下将有如下文件格式：
+```
+deploy/
+|-- model_det/
+|   |--mdoel.nb                   优化后的检测模型文件
+|   |--infer_cfg.json             检测器模型配置文件
+|-- model_keypoint/
+|   |--mdoel.nb                   优化后的关键点模型文件
+|   |--infer_cfg.json             关键点模型配置文件
+|-- main                          生成的移动端执行文件
+|-- runtime_config.json           移动端执行时参数配置文件
+|-- libpaddle_light_api_shared.so Paddle-Lite库文件
+```
+**注意：**
+*  `runtime_config.json` 包含了检测器的超参数，请按需进行修改（注意配置中路径及文件需存在）：
 ```shell
-# 执行编译，得到可执行文件detect_system
+{
-# 如果是编译armv7的执行程序，需要使用 Makefile_armv7 替换 Makefile 文件
+  "model_dir_det": "./model_det/",              #检测器模型路径
-make
+  "batch_size_det": 1,                          #检测预测时batchsize
+  "threshold_det": 0.5,                         #检测器输出阈值
+  "model_dir_keypoint": "./model_keypoint/",    #关键点模型路径（不使用需为空字符）
+  "batch_size_keypoint": 8,                     #关键点预测时batchsize
+  "threshold_keypoint": 0.5,                    #关键点输出阈值
+  "image_file": "demo.jpg",                     #测试图片
+  "image_dir": "",                              #测试图片文件夹
+  "run_benchmark": false,                       #性能测试开关
+  "cpu_threads": 1                              #线程数
+}
+```
-# 将编译得到的可执行文件移动到debug文件夹中
+6. 启动调试，上述步骤完成后就可以使用ADB将文件夹 `deploy/` push到手机上运行，步骤如下：
-mv detect_system ./debug/
-# 将上述debug文件夹push到手机上
+```shell
-adb push debug /data/local/tmp/
+# 将上述deploy文件夹push到手机上
+adb push deploy /data/local/tmp/
 adb shell
-cd /data/local/tmp/debug
+cd /data/local/tmp/deploy
-export LD_LIBRARY_PATH=/data/local/tmp/debug:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/data/local/tmp/deploy:$LD_LIBRARY_PATH
-# detect_system可执行文件的使用方式为:
+# 修改权限为可执行
-# ./detect_system 配置文件路径  测试图像路径
+chmod 777 main
-./detect_system ./config_ppyolo_tiny.txt ./000000014439.jpg
+# 执行程序
+./main
 ```
 如果对代码做了修改，则需要重新编译并push到手机上。
@@ -260,4 +277,4 @@ Q1：如果想更换模型怎么办，需要重新按照流程走一遍吗？
 A1：如果已经走通了上述步骤，更换模型只需要替换 `.nb` 模型文件即可，同时要注意修改下配置文件中的 `.nb` 文件路径以及类别映射文件（如有必要）。
 Q2：换一个图测试怎么做？  
-A2：替换 debug 下的测试图像为你想要测试的图像，使用 ADB 再次 push 到手机上即可。
+A2：替换 deploy 下的测试图像为你想要测试的图像，使用 ADB 再次 push 到手机上即可。
--- a/deploy/lite/coco_label_list.txt
+++ b/deploy/lite/coco_label_list.txt
-person
-bicycle
-car
-motorcycle
-airplane
-bus
-train
-truck
-boat
-traffic light
-fire hydrant
-stop sign
-parking meter
-bench
-bird
-cat
-dog
-horse
-sheep
-cow
-elephant
-bear
-zebra
-giraffe
-backpack
-umbrella
-handbag
-tie
-suitcase
-frisbee
-skis
-snowboard
-sports ball
-kite
-baseball bat
-baseball glove
-skateboard
-surfboard
-tennis racket
-bottle
-wine glass
-cup
-fork
-knife
-spoon
-bowl
-banana
-apple
-sandwich
-orange
-broccoli
-carrot
-hot dog
-pizza
-donut
-cake
-chair
-couch
-potted plant
-bed
-dining table
-toilet
-tv
-laptop
-mouse
-remote
-keyboard
-cell phone
-microwave
-oven
-toaster
-sink
-refrigerator
-book
-clock
-vase
-scissors
-teddy bear
-hair drier
-toothbrush
\ No newline at end of file
--- a/deploy/lite/config_ppyolo_tiny.txt
+++ b/deploy/lite/config_ppyolo_tiny.txt
-model_file ./ppyolo_tiny.nb
-label_path ./coco_label_list.txt
-num_threads 1
-precision fp32
-enable_benchmark 1
-arch YOLO
-image_shape 3,320,320
-Resize 320,320
-keep_ratio False
-mean 0.485,0.456,0.406
-std 0.229,0.224,0.225
-PadStride 0
--- a/deploy/lite/convert_yml_to_json.py
+++ b/deploy/lite/convert_yml_to_json.py
+import yaml
+import json
+import sys
+yamlf = sys.argv[1]
+assert yamlf.endswith(".yml")
+with open(yamlf, 'r') as rf:
+    yaml_data = yaml.safe_load(rf)
+jsonf = yamlf[:-4] + ".json"
+with open(jsonf, 'w') as wf:
+    json.dump(yaml_data, wf, indent=4)
--- a/deploy/lite/include/config_parser.h
+++ b/deploy/lite/include/config_parser.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+#include "json/json.h"
+#ifdef _WIN32
+#define OS_PATH_SEP "\\"
+#else
+#define OS_PATH_SEP "/"
+#endif
+namespace PaddleDetection {
+void load_jsonf(std::string jsonfile, Json::Value& jsondata);
+// Inference model configuration parser
+class ConfigPaser {
+ public:
+  ConfigPaser() {}
+  ~ConfigPaser() {}
+  bool load_config(const std::string& model_dir,
+                   const std::string& cfg = "infer_cfg") {
+    Json::Value config;
+    load_jsonf(model_dir + OS_PATH_SEP + cfg + ".json", config);
+    // Get model arch : YOLO, SSD, RetinaNet, RCNN, Face
+    if (config.isMember("arch")) {
+      arch_ = config["arch"].as<std::string>();
+    } else {
+      std::cerr << "Please set model arch,"
+                << "support value : YOLO, SSD, RetinaNet, RCNN, Face."
+                << std::endl;
+      return false;
+    }
+    // Get draw_threshold for visualization
+    if (config.isMember("draw_threshold")) {
+      draw_threshold_ = config["draw_threshold"].as<float>();
+    } else {
+      std::cerr << "Please set draw_threshold." << std::endl;
+      return false;
+    }
+    // Get Preprocess for preprocessing
+    if (config.isMember("Preprocess")) {
+      preprocess_info_ = config["Preprocess"];
+    } else {
+      std::cerr << "Please set Preprocess." << std::endl;
+      return false;
+    }
+    // Get label_list for visualization
+    if (config.isMember("label_list")) {
+      label_list_.clear();
+      for (auto item : config["label_list"]) {
+        label_list_.emplace_back(item.as<std::string>());
+      }
+    } else {
+      std::cerr << "Please set label_list." << std::endl;
+      return false;
+    }
+    return true;
+  }
+  float draw_threshold_;
+  std::string arch_;
+  Json::Value preprocess_info_;
+  std::vector<std::string> label_list_;
+};
+}  // namespace PaddleDetection
--- a/deploy/lite/include/keypoint_detector.h
+++ b/deploy/lite/include/keypoint_detector.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <ctime>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include "paddle_api.h"  // NOLINT
+#include "include/config_parser.h"
+#include "include/keypoint_postprocess.h"
+#include "include/preprocess_op.h"
+using namespace paddle::lite_api;  // NOLINT
+namespace PaddleDetection {
+// Object KeyPoint Result
+struct KeyPointResult {
+  // Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf
+  std::vector<float> keypoints;
+  int num_joints = -1;
+};
+// Visualiztion KeyPoint Result
+cv::Mat VisualizeKptsResult(const cv::Mat& img,
+                            const std::vector<KeyPointResult>& results,
+                            const std::vector<int>& colormap);
+class KeyPointDetector {
+ public:
+  explicit KeyPointDetector(const std::string& model_dir,
+                            int cpu_threads = 1,
+                            const int batch_size = 1) {
+    config_.load_config(model_dir);
+    threshold_ = config_.draw_threshold_;
+    preprocessor_.Init(config_.preprocess_info_);
+    printf("before keypoint detector\n");
+    LoadModel(model_dir, cpu_threads);
+    printf("create keypoint detector\n");
+  }
+  // Load Paddle inference model
+  void LoadModel(std::string model_file, int num_theads);
+  // Run predictor
+  void Predict(const std::vector<cv::Mat> imgs,
+               std::vector<std::vector<float>>& center,
+               std::vector<std::vector<float>>& scale,
+               const double threshold = 0.5,
+               const int warmup = 0,
+               const int repeats = 1,
+               std::vector<KeyPointResult>* result = nullptr,
+               std::vector<double>* times = nullptr);
+  // Get Model Label list
+  const std::vector<std::string>& GetLabelList() const {
+    return config_.label_list_;
+  }
+ private:
+  // Preprocess image and copy data to input buffer
+  void Preprocess(const cv::Mat& image_mat);
+  // Postprocess result
+  void Postprocess(const std::vector<float> output,
+                   const std::vector<int64_t> output_shape,
+                   const std::vector<int64_t> idxout,
+                   const std::vector<int64_t> idx_shape,
+                   std::vector<KeyPointResult>* result,
+                   std::vector<std::vector<float>>& center,
+                   std::vector<std::vector<float>>& scale);
+  std::shared_ptr<PaddlePredictor> predictor_;
+  Preprocessor preprocessor_;
+  ImageBlob inputs_;
+  std::vector<float> output_data_;
+  std::vector<int64_t> idx_data_;
+  float threshold_;
+  ConfigPaser config_;
+};
+}  // namespace PaddleDetection
--- a/deploy/lite/include/keypoint_postprocess.h
+++ b/deploy/lite/include/keypoint_postprocess.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <vector>
+std::vector<float> get_3rd_point(std::vector<float>& a, std::vector<float>& b);
+std::vector<float> get_dir(float src_point_x, float src_point_y, float rot_rad);
+void affine_tranform(
+    float pt_x, float pt_y, cv::Mat& trans, float* x, int p, int num);
+cv::Mat get_affine_transform(std::vector<float>& center,
+                             std::vector<float>& scale,
+                             float rot,
+                             std::vector<int>& output_size,
+                             int inv);
+void transform_preds(float* coords,
+                     std::vector<float>& center,
+                     std::vector<float>& scale,
+                     std::vector<int>& output_size,
+                     std::vector<int>& dim,
+                     float* target_coords);
+void box_to_center_scale(std::vector<int>& box,
+                         int width,
+                         int height,
+                         std::vector<float>& center,
+                         std::vector<float>& scale);
+void get_max_preds(float* heatmap,
+                   std::vector<int64_t>& dim,
+                   float* preds,
+                   float* maxvals,
+                   int batchid,
+                   int joint_idx);
+void get_final_preds(float* heatmap,
+                     std::vector<int64_t>& dim,
+                     int64_t* idxout,
+                     std::vector<int64_t>& idxdim,
+                     std::vector<float>& center,
+                     std::vector<float> scale,
+                     float* preds,
+                     int batchid);
--- a/deploy/lite/include/object_detector.h
+++ b/deploy/lite/include/object_detector.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <ctime>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include "paddle_api.h"  // NOLINT
+#include "include/config_parser.h"
+#include "include/preprocess_op.h"
+using namespace paddle::lite_api;  // NOLINT
+namespace PaddleDetection {
+// Object Detection Result
+struct ObjectResult {
+  // Rectangle coordinates of detected object: left, right, top, down
+  std::vector<int> rect;
+  // Class id of detected object
+  int class_id;
+  // Confidence of detected object
+  float confidence;
+};
+// Generate visualization colormap for each class
+std::vector<int> GenerateColorMap(int num_class);
+// Visualiztion Detection Result
+cv::Mat VisualizeResult(const cv::Mat& img,
+                        const std::vector<ObjectResult>& results,
+                        const std::vector<std::string>& lables,
+                        const std::vector<int>& colormap,
+                        const bool is_rbox);
+class ObjectDetector {
+ public:
+  explicit ObjectDetector(const std::string& model_dir,
+                          int cpu_threads = 1,
+                          const int batch_size = 1) {
+    config_.load_config(model_dir);
+    printf("config created\n");
+    threshold_ = config_.draw_threshold_;
+    preprocessor_.Init(config_.preprocess_info_);
+    printf("before object detector\n");
+    LoadModel(model_dir, cpu_threads);
+    printf("create object detector\n");
+  }
+  // Load Paddle inference model
+  void LoadModel(std::string model_file, int num_theads);
+  // Run predictor
+  void Predict(const std::vector<cv::Mat>& imgs,
+               const double threshold = 0.5,
+               const int warmup = 0,
+               const int repeats = 1,
+               std::vector<ObjectResult>* result = nullptr,
+               std::vector<int>* bbox_num = nullptr,
+               std::vector<double>* times = nullptr);
+  // Get Model Label list
+  const std::vector<std::string>& GetLabelList() const {
+    return config_.label_list_;
+  }
+ private:
+  // Preprocess image and copy data to input buffer
+  void Preprocess(const cv::Mat& image_mat);
+  // Postprocess result
+  void Postprocess(const std::vector<cv::Mat> mats,
+                   std::vector<ObjectResult>* result,
+                   std::vector<int> bbox_num,
+                   bool is_rbox);
+  std::shared_ptr<PaddlePredictor> predictor_;
+  Preprocessor preprocessor_;
+  ImageBlob inputs_;
+  std::vector<float> output_data_;
+  std::vector<int> out_bbox_num_data_;
+  float threshold_;
+  ConfigPaser config_;
+};
+}  // namespace PaddleDetection
--- a/deploy/lite/include/preprocess_op.h
+++ b/deploy/lite/include/preprocess_op.h
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include "json/json.h"
+namespace PaddleDetection {
+// Object for storing all preprocessed data
+class ImageBlob {
+ public:
+  // image width and height
+  std::vector<float> im_shape_;
+  // Buffer for image data after preprocessing
+  std::vector<float> im_data_;
+  // in net data shape(after pad)
+  std::vector<float> in_net_shape_;
+  // Evaluation image width and height
+  // std::vector<float>  eval_im_size_f_;
+  // Scale factor for image size to origin image size
+  std::vector<float> scale_factor_;
+};
+// Abstraction of preprocessing opration class
+class PreprocessOp {
+ public:
+  virtual void Init(const Json::Value& item) = 0;
+  virtual void Run(cv::Mat* im, ImageBlob* data) = 0;
+};
+class InitInfo : public PreprocessOp {
+ public:
+  virtual void Init(const Json::Value& item) {}
+  virtual void Run(cv::Mat* im, ImageBlob* data);
+};
+class NormalizeImage : public PreprocessOp {
+ public:
+  virtual void Init(const Json::Value& item) {
+    mean_.clear();
+    scale_.clear();
+    for (auto tmp : item["mean"]) {
+      mean_.emplace_back(tmp.as<float>());
+    }
+    for (auto tmp : item["std"]) {
+      scale_.emplace_back(tmp.as<float>());
+    }
+    is_scale_ = item["is_scale"].as<bool>();
+  }
+  virtual void Run(cv::Mat* im, ImageBlob* data);
+ private:
+  // CHW or HWC
+  std::vector<float> mean_;
+  std::vector<float> scale_;
+  bool is_scale_;
+};
+class Permute : public PreprocessOp {
+ public:
+  virtual void Init(const Json::Value& item) {}
+  virtual void Run(cv::Mat* im, ImageBlob* data);
+};
+class Resize : public PreprocessOp {
+ public:
+  virtual void Init(const Json::Value& item) {
+    interp_ = item["interp"].as<int>();
+    // max_size_ = item["target_size"].as<int>();
+    keep_ratio_ = item["keep_ratio"].as<bool>();
+    target_size_.clear();
+    for (auto tmp : item["target_size"]) {
+      target_size_.emplace_back(tmp.as<int>());
+    }
+  }
+  // Compute best resize scale for x-dimension, y-dimension
+  std::pair<float, float> GenerateScale(const cv::Mat& im);
+  virtual void Run(cv::Mat* im, ImageBlob* data);
+ private:
+  int interp_;
+  bool keep_ratio_;
+  std::vector<int> target_size_;
+  std::vector<int> in_net_shape_;
+};
+// Models with FPN need input shape % stride == 0
+class PadStride : public PreprocessOp {
+ public:
+  virtual void Init(const Json::Value& item) {
+    stride_ = item["stride"].as<int>();
+  }
+  virtual void Run(cv::Mat* im, ImageBlob* data);
+ private:
+  int stride_;
+};
+class TopDownEvalAffine : public PreprocessOp {
+ public:
+  virtual void Init(const Json::Value& item) {
+    trainsize_.clear();
+    for (auto tmp : item["trainsize"]) {
+      trainsize_.emplace_back(tmp.as<int>());
+    }
+  }
+  virtual void Run(cv::Mat* im, ImageBlob* data);
+ private:
+  int interp_ = 1;
+  std::vector<int> trainsize_;
+};
+void CropImg(cv::Mat& img,
+             cv::Mat& crop_img,
+             std::vector<int>& area,
+             std::vector<float>& center,
+             std::vector<float>& scale,
+             float expandratio = 0.15);
+class Preprocessor {
+ public:
+  void Init(const Json::Value& config_node) {
+    // initialize image info at first
+    ops_["InitInfo"] = std::make_shared<InitInfo>();
+    for (const auto& item : config_node) {
+      auto op_name = item["type"].as<std::string>();
+      ops_[op_name] = CreateOp(op_name);
+      ops_[op_name]->Init(item);
+    }
+  }
+  std::shared_ptr<PreprocessOp> CreateOp(const std::string& name) {
+    if (name == "Resize") {
+      return std::make_shared<Resize>();
+    } else if (name == "Permute") {
+      return std::make_shared<Permute>();
+    } else if (name == "NormalizeImage") {
+      return std::make_shared<NormalizeImage>();
+    } else if (name == "PadStride") {
+      // use PadStride instead of PadBatch
+      return std::make_shared<PadStride>();
+    } else if (name == "TopDownEvalAffine") {
+      return std::make_shared<TopDownEvalAffine>();
+    }
+    std::cerr << "can not find function of OP: " << name
+              << " and return: nullptr" << std::endl;
+    return nullptr;
+  }
+  void Run(cv::Mat* im, ImageBlob* data);
+ public:
+  static const std::vector<std::string> RUN_ORDER;
+ private:
+  std::unordered_map<std::string, std::shared_ptr<PreprocessOp>> ops_;
+};
+}  // namespace PaddleDetection
--- a/deploy/lite/run_detection.cc
+++ b/deploy/lite/run_detection.cc
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <fstream>
-#include <iostream>
-#include <vector>
-#include <chrono>
-#include <numeric>
-#include "opencv2/core.hpp"
-#include "opencv2/imgcodecs.hpp"
-#include "opencv2/imgproc.hpp"
-#include "paddle_api.h"  // NOLINT
-using namespace paddle::lite_api;  // NOLINT
-using namespace std;
-struct Object {
-  cv::Rect rec;
-  int class_id;
-  float prob;
-};
-// Object for storing all preprocessed data
-struct ImageBlob {
-  // image width and height
-  std::vector<int> im_shape_;
-  // Buffer for image data after preprocessing
-  const float* im_data_;
-  // Scale factor for image size to origin image size
-  std::vector<float> scale_factor_;
-  std::vector<float> mean_;
-  std::vector<float> scale_;
-};
-void PrintBenchmarkLog(std::vector<double> det_time,
-                       std::map<std::string, std::string> config,
-                       int img_num) {
-  std::cout << "----------------- Config info ------------------" << std::endl;
-  std::cout << "runtime_device: armv8" << std::endl;
-  std::cout << "precision: " << config.at("precision") << std::endl;
-  std::cout << "num_threads: " << config.at("num_threads") << std::endl;
-  std::cout << "---------------- Data info ---------------------" << std::endl;
-  std::cout << "batch_size: " << 1 << std::endl;
-  std::cout << "---------------- Model info --------------------" << std::endl;
-  std::cout << "Model_name: " << config.at("model_file") << std::endl;
-  std::cout << "---------------- Perf info ---------------------" << std::endl;
-  std::cout << "Total number of predicted data: " << img_num
-            << " and total time spent(s): "
-            << std::accumulate(det_time.begin(), det_time.end(), 0) << std::endl;
-  std::cout << "preproce_time(ms): " << det_time[0] / img_num
-            << ", inference_time(ms): " << det_time[1] / img_num
-            << ", postprocess_time(ms): " << det_time[2] << std::endl;
-}
-std::vector<std::string> LoadLabels(const std::string &path) {
-  std::ifstream file;
-  std::vector<std::string> labels;
-  file.open(path);
-  while (file) {
-    std::string line;
-    std::getline(file, line);
-    std::string::size_type pos = line.find(" ");
-    if (pos != std::string::npos) {
-      line = line.substr(pos);
-    }
-    labels.push_back(line);
-  }
-  file.clear();
-  file.close();
-  return labels;
-}
-std::vector<std::string> ReadDict(std::string path) {
-  std::ifstream in(path);
-  std::string filename;
-  std::string line;
-  std::vector<std::string> m_vec;
-  if (in) {
-    while (getline(in, line)) {
-      m_vec.push_back(line);
-    }
-  } else {
-    std::cout << "no such file" << std::endl;
-  }
-  return m_vec;
-}
-std::vector<std::string> split(const std::string &str,
-                               const std::string &delim) {
-  std::vector<std::string> res;
-  if ("" == str)
-    return res;
-  char *strs = new char[str.length() + 1];
-  std::strcpy(strs, str.c_str());
-  char *d = new char[delim.length() + 1];
-  std::strcpy(d, delim.c_str());
-  char *p = std::strtok(strs, d);
-  while (p) {
-    string s = p;
-    res.push_back(s);
-    p = std::strtok(NULL, d);
-  }
-  return res;
-}
-std::map<std::string, std::string> LoadConfigTxt(std::string config_path) {
-  auto config = ReadDict(config_path);
-  std::map<std::string, std::string> dict;
-  for (int i = 0; i < config.size(); i++) {
-    std::vector<std::string> res = split(config[i], " ");
-    dict[res[0]] = res[1];
-  }
-  return dict;
-}
-void PrintConfig(const std::map<std::string, std::string> &config) {
-  std::cout << "=======PaddleDetection lite demo config======" << std::endl;
-  for (auto iter = config.begin(); iter != config.end(); iter++) {
-    std::cout << iter->first << " : " << iter->second << std::endl;
-  }
-  std::cout << "===End of PaddleDetection lite demo config===" << std::endl;
-}
-// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
-void neon_mean_scale(const float* din,
-                     float* dout,
-                     int size,
-                     const std::vector<float> mean,
-                     const std::vector<float> scale) {
-  if (mean.size() != 3 || scale.size() != 3) {
-    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
-    exit(1);
-  }
-  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
-  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
-  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
-  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
-  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
-  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
-  float* dout_c0 = dout;
-  float* dout_c1 = dout + size;
-  float* dout_c2 = dout + size * 2;
-  int i = 0;
-  for (; i < size - 3; i += 4) {
-    float32x4x3_t vin3 = vld3q_f32(din);
-    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
-    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
-    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
-    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
-    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
-    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
-    vst1q_f32(dout_c0, vs0);
-    vst1q_f32(dout_c1, vs1);
-    vst1q_f32(dout_c2, vs2);
-    din += 12;
-    dout_c0 += 4;
-    dout_c1 += 4;
-    dout_c2 += 4;
-  }
-  for (; i < size; i++) {
-    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
-    *(dout_c0++) = (*(din++) - mean[1]) * scale[1];
-    *(dout_c0++) = (*(din++) - mean[2]) * scale[2];
-  }
-}
-std::vector<Object> visualize_result(
-                        const float* data,
-                        int count,
-                        float thresh,
-                        cv::Mat& image,
-                        const std::vector<std::string> &class_names) {
-  if (data == nullptr) {
-    std::cerr << "[ERROR] data can not be nullptr\n";
-    exit(1);
-  }
-  std::vector<Object> rect_out;
-  for (int iw = 0; iw < count; iw++) {
-    int oriw = image.cols;
-    int orih = image.rows;
-    if (data[1] > thresh) {
-      Object obj;
-      int x = static_cast<int>(data[2]);
-      int y = static_cast<int>(data[3]);
-      int w = static_cast<int>(data[4] - data[2] + 1);
-      int h = static_cast<int>(data[5] - data[3] + 1);
-      cv::Rect rec_clip =
-          cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows);
-      obj.class_id = static_cast<int>(data[0]);
-      obj.prob = data[1];
-      obj.rec = rec_clip;
-      if (w > 0 && h > 0 && obj.prob <= 1) {
-        rect_out.push_back(obj);
-        cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
-        std::string str_prob = std::to_string(obj.prob);
-        std::string text = std::string(class_names[obj.class_id]) + ": " +
-                           str_prob.substr(0, str_prob.find(".") + 4);
-        int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
-        double font_scale = 1.f;
-        int thickness = 1;
-        cv::Size text_size =
-            cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
-        float new_font_scale = w * 0.5 * font_scale / text_size.width;
-        text_size = cv::getTextSize(
-            text, font_face, new_font_scale, thickness, nullptr);
-        cv::Point origin;
-        origin.x = x + 3;
-        origin.y = y + text_size.height + 3;
-        cv::putText(image,
-                    text,
-                    origin,
-                    font_face,
-                    new_font_scale,
-                    cv::Scalar(0, 255, 255),
-                    thickness,
-                    cv::LINE_AA);
-        std::cout << "detection, image size: " << image.cols << ", "
-                  << image.rows
-                  << ", detect object: " << class_names[obj.class_id]
-                  << ", score: " << obj.prob << ", location: x=" << x
-                  << ", y=" << y << ", width=" << w << ", height=" << h
-                  << std::endl;
-      }
-    }
-    data += 6;
-  }
-  return rect_out;
-}
-// Load Model and create model predictor
-std::shared_ptr<PaddlePredictor> LoadModel(std::string model_file,
-                                           int num_theads) {
-  MobileConfig config;
-  config.set_threads(num_theads);
-  config.set_model_from_file(model_file);
-  std::shared_ptr<PaddlePredictor> predictor =
-      CreatePaddlePredictor<MobileConfig>(config);
-  return predictor;
-}
-ImageBlob prepare_imgdata(const cv::Mat& img,
-                          std::map<std::string,
-                          std::string> config) {
-  ImageBlob img_data;
-  std::vector<int> target_size_;
-  std::vector<std::string> size_str = split(config.at("Resize"), ",");
-  transform(size_str.begin(), size_str.end(), back_inserter(target_size_),
-            [](std::string const& s){return stoi(s);});
-  int width = target_size_[0];
-  int height = target_size_[1];
-  img_data.im_shape_ = {
-      static_cast<int>(target_size_[0]),
-      static_cast<int>(target_size_[1])
-  };
-  img_data.scale_factor_ = {
-    static_cast<float>(target_size_[0]) / static_cast<float>(img.rows),
-    static_cast<float>(target_size_[1]) / static_cast<float>(img.cols)
-  };
-  std::vector<float> mean_;
-  std::vector<float> scale_;
-  std::vector<std::string> mean_str = split(config.at("mean"), ",");
-  std::vector<std::string> std_str = split(config.at("std"), ",");
-  transform(mean_str.begin(), mean_str.end(), back_inserter(mean_),
-            [](std::string const& s){return stof(s);});
-  transform(std_str.begin(), std_str.end(), back_inserter(scale_),
-            [](std::string const& s){return stof(s);});
-  img_data.mean_ = mean_;
-  img_data.scale_ = scale_;
-  return img_data;
-}
-void preprocess(const cv::Mat& img, const ImageBlob img_data, float* data) {
-  cv::Mat rgb_img;
-  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
-  cv::resize(
-      rgb_img, rgb_img, cv::Size(img_data.im_shape_[0],img_data.im_shape_[1]),
-      0.f, 0.f, cv::INTER_CUBIC);
-  cv::Mat imgf;
-  rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
-  const float* dimg = reinterpret_cast<const float*>(imgf.data);
-  neon_mean_scale(
-    dimg, data, int(img_data.im_shape_[0] * img_data.im_shape_[1]),
-    img_data.mean_, img_data.scale_);
-}
-void RunModel(std::map<std::string, std::string> config,
-              std::string img_path,
-              const int repeats,
-              std::vector<double>* times) {
-  std::string model_file = config.at("model_file");
-  std::string label_path = config.at("label_path");
-  // Load Labels
-  std::vector<std::string> class_names = LoadLabels(label_path);
-  auto predictor = LoadModel(model_file, stoi(config.at("num_threads")));
-  cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
-  auto img_data = prepare_imgdata(img, config);
-  auto preprocess_start = std::chrono::steady_clock::now();
-  // 1. Prepare input data from image
-  // input 0
-  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
-  input_tensor0->Resize({1, 2});
-  auto* data0 = input_tensor0->mutable_data<float>();
-  data0[0] = img_data.im_shape_[0];
-  data0[1] = img_data.im_shape_[1];
-  // input1
-  std::unique_ptr<Tensor> input_tensor1(std::move(predictor->GetInput(1)));
-  input_tensor1->Resize({1, 3, img_data.im_shape_[0], img_data.im_shape_[1]});
-  auto* data1 = input_tensor1->mutable_data<float>();
-  preprocess(img, img_data, data1);
-  // input2
-  std::unique_ptr<Tensor> input_tensor2(std::move(predictor->GetInput(2)));
-  input_tensor2->Resize({1, 2});
-  auto* data2 = input_tensor2->mutable_data<float>();
-  data2[0] = img_data.scale_factor_[0];
-  data2[1] = img_data.scale_factor_[1];
-  auto preprocess_end = std::chrono::steady_clock::now();
-  // 2. Run predictor
-  // warm up
-  for (int i = 0; i < repeats / 2; i++)
-  {
-    predictor->Run();
-  }
-  auto inference_start = std::chrono::steady_clock::now();
-  for (int i = 0; i < repeats; i++)
-  {
-    predictor->Run();
-  }
-  auto inference_end = std::chrono::steady_clock::now();
-  // 3. Get output and post process
-  auto postprocess_start = std::chrono::steady_clock::now();
-  std::unique_ptr<const Tensor> output_tensor(
-      std::move(predictor->GetOutput(0)));
-  const float* outptr = output_tensor->data<float>();
-  auto shape_out = output_tensor->shape();
-  int64_t cnt = 1;
-  for (auto& i : shape_out) {
-    cnt *= i;
-  }
-  auto rec_out = visualize_result(
-      outptr, static_cast<int>(cnt / 6), 0.5f, img, class_names);
-  std::string result_name =
-      img_path.substr(0, img_path.find(".")) + "_result.jpg";
-  cv::imwrite(result_name, img);
-  auto postprocess_end = std::chrono::steady_clock::now();
-  std::chrono::duration<float> prep_diff = preprocess_end - preprocess_start;
-  times->push_back(double(prep_diff.count() * 1000));
-  std::chrono::duration<float> infer_diff = inference_end - inference_start;
-  times->push_back(double(infer_diff.count() / repeats * 1000));
-  std::chrono::duration<float> post_diff = postprocess_end - postprocess_start;
-  times->push_back(double(post_diff.count() * 1000));
-}
-int main(int argc, char** argv) {
-  if (argc < 3) {
-    std::cerr << "[ERROR] usage: " << argv[0] << " config_path image_path\n";
-    exit(1);
-  }
-  std::string config_path = argv[1];
-  std::string img_path = argv[2];
-  // load config
-  auto config = LoadConfigTxt(config_path);
-  PrintConfig(config);
-  bool enable_benchmark = bool(stoi(config.at("enable_benchmark")));
-  int repeats = enable_benchmark ? 50 : 1;
-  std::vector<double> det_times;
-  RunModel(config, img_path, repeats, &det_times);
-  PrintBenchmarkLog(det_times, config, 1);
-  return 0;
-}
--- a/deploy/lite/runtime_config.json
+++ b/deploy/lite/runtime_config.json
+{
+  "model_dir_det": "./model_det/",
+  "batch_size_det": 1,
+  "threshold_det": 0.5,
+  "model_dir_keypoint": "./model_keypoint/",
+  "batch_size_keypoint": 8,
+  "threshold_keypoint": 0.5,
+  "image_file": "",
+  "image_dir": "",
+  "run_benchmark": false,
+  "cpu_threads": 1
+}
--- a/deploy/lite/src/config_parser.cc
+++ b/deploy/lite/src/config_parser.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "include/config_parser.h"
+namespace PaddleDetection {
+void load_jsonf(std::string jsonfile, Json::Value &jsondata) {
+  std::ifstream ifs;
+  ifs.open(jsonfile);
+  Json::CharReaderBuilder builder;
+  builder["collectComments"] = true;
+  JSONCPP_STRING errs;
+  if (!parseFromStream(builder, ifs, &jsondata, &errs)) {
+    std::cout << errs << std::endl;
+    return;
+  }
+}
+}  // namespace PaddleDetection
--- a/deploy/lite/src/keypoint_detector.cc
+++ b/deploy/lite/src/keypoint_detector.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <sstream>
+// for setprecision
+#include <chrono>
+#include <iomanip>
+#include "include/keypoint_detector.h"
+namespace PaddleDetection {
+// Load Model and create model predictor
+void KeyPointDetector::LoadModel(std::string model_file, int num_theads) {
+  MobileConfig config;
+  config.set_threads(num_theads);
+  config.set_model_from_file(model_file + "/model.nb");
+  config.set_power_mode(LITE_POWER_HIGH);
+  predictor_ = std::move(CreatePaddlePredictor<MobileConfig>(config));
+}
+const int edge[][2] = {{0, 1},
+                       {0, 2},
+                       {1, 3},
+                       {2, 4},
+                       {3, 5},
+                       {4, 6},
+                       {5, 7},
+                       {6, 8},
+                       {7, 9},
+                       {8, 10},
+                       {5, 11},
+                       {6, 12},
+                       {11, 13},
+                       {12, 14},
+                       {13, 15},
+                       {14, 16},
+                       {11, 12}};
+// Visualiztion MaskDetector results
+cv::Mat VisualizeKptsResult(const cv::Mat& img,
+                            const std::vector<KeyPointResult>& results,
+                            const std::vector<int>& colormap) {
+  cv::Mat vis_img = img.clone();
+  for (int batchid = 0; batchid < results.size(); batchid++) {
+    for (int i = 0; i < results[batchid].num_joints; i++) {
+      if (results[batchid].keypoints[i * 3] > 0.5) {
+        int x_coord = int(results[batchid].keypoints[i * 3 + 1]);
+        int y_coord = int(results[batchid].keypoints[i * 3 + 2]);
+        cv::circle(vis_img,
+                   cv::Point2d(x_coord, y_coord),
+                   1,
+                   cv::Scalar(0, 0, 255),
+                   2);
+      }
+    }
+    for (int i = 0; i < results[batchid].num_joints; i++) {
+      int x_start = int(results[batchid].keypoints[edge[i][0] * 3 + 1]);
+      int y_start = int(results[batchid].keypoints[edge[i][0] * 3 + 2]);
+      int x_end = int(results[batchid].keypoints[edge[i][1] * 3 + 1]);
+      int y_end = int(results[batchid].keypoints[edge[i][1] * 3 + 2]);
+      cv::line(vis_img,
+               cv::Point2d(x_start, y_start),
+               cv::Point2d(x_end, y_end),
+               colormap[i],
+               1);
+    }
+  }
+  return vis_img;
+}
+void KeyPointDetector::Preprocess(const cv::Mat& ori_im) {
+  // Clone the image : keep the original mat for postprocess
+  cv::Mat im = ori_im.clone();
+  cv::cvtColor(im, im, cv::COLOR_BGR2RGB);
+  preprocessor_.Run(&im, &inputs_);
+}
+void KeyPointDetector::Postprocess(std::vector<float> output,
+                                   std::vector<int64_t> output_shape,
+                                   std::vector<int64_t> idxout,
+                                   std::vector<int64_t> idx_shape,
+                                   std::vector<KeyPointResult>* result,
+                                   std::vector<std::vector<float>>& center_bs,
+                                   std::vector<std::vector<float>>& scale_bs) {
+  float* preds = new float[output_shape[1] * 3]{0};
+  for (int batchid = 0; batchid < output_shape[0]; batchid++) {
+    get_final_preds(const_cast<float*>(output.data()),
+                    output_shape,
+                    idxout.data(),
+                    idx_shape,
+                    center_bs[batchid],
+                    scale_bs[batchid],
+                    preds,
+                    batchid);
+    KeyPointResult result_item;
+    result_item.num_joints = output_shape[1];
+    result_item.keypoints.clear();
+    for (int i = 0; i < output_shape[1]; i++) {
+      result_item.keypoints.emplace_back(preds[i * 3]);
+      result_item.keypoints.emplace_back(preds[i * 3 + 1]);
+      result_item.keypoints.emplace_back(preds[i * 3 + 2]);
+    }
+    result->push_back(result_item);
+  }
+  delete[] preds;
+}
+void KeyPointDetector::Predict(const std::vector<cv::Mat> imgs,
+                               std::vector<std::vector<float>>& center_bs,
+                               std::vector<std::vector<float>>& scale_bs,
+                               const double threshold,
+                               const int warmup,
+                               const int repeats,
+                               std::vector<KeyPointResult>* result,
+                               std::vector<double>* times) {
+  auto preprocess_start = std::chrono::steady_clock::now();
+  int batch_size = imgs.size();
+  // in_data_batch
+  std::vector<float> in_data_all;
+  // Preprocess image
+  for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
+    cv::Mat im = imgs.at(bs_idx);
+    Preprocess(im);
+    // TODO: reduce cost time
+    in_data_all.insert(
+        in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end());
+  }
+  // Prepare input tensor
+  auto input_names = predictor_->GetInputNames();
+  for (const auto& tensor_name : input_names) {
+    auto in_tensor = predictor_->GetInputByName(tensor_name);
+    if (tensor_name == "image") {
+      int rh = inputs_.in_net_shape_[0];
+      int rw = inputs_.in_net_shape_[1];
+      in_tensor->Resize({batch_size, 3, rh, rw});
+      auto* inptr = in_tensor->mutable_data<float>();
+      std::copy_n(in_data_all.data(), in_data_all.size(), inptr);
+    }
+  }
+  auto preprocess_end = std::chrono::steady_clock::now();
+  std::vector<int64_t> output_shape, idx_shape;
+  // Run predictor
+  // warmup
+  for (int i = 0; i < warmup; i++) {
+    predictor_->Run();
+    // Get output tensor
+    auto output_names = predictor_->GetOutputNames();
+    auto out_tensor = predictor_->GetTensor(output_names[0]);
+    auto idx_tensor = predictor_->GetTensor(output_names[1]);
+  }
+  auto inference_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < repeats; i++) {
+    predictor_->Run();
+    // Get output tensor
+    auto output_names = predictor_->GetOutputNames();
+    auto out_tensor = predictor_->GetTensor(output_names[0]);
+    output_shape = out_tensor->shape();
+    // Calculate output length
+    int output_size = 1;
+    for (int j = 0; j < output_shape.size(); ++j) {
+      output_size *= output_shape[j];
+    }
+    if (output_size < 6) {
+      std::cerr << "[WARNING] No object detected." << std::endl;
+    }
+    output_data_.resize(output_size);
+    std::copy_n(
+        out_tensor->mutable_data<float>(), output_size, output_data_.data());
+    auto idx_tensor = predictor_->GetTensor(output_names[1]);
+    idx_shape = idx_tensor->shape();
+    // Calculate output length
+    output_size = 1;
+    for (int j = 0; j < idx_shape.size(); ++j) {
+      output_size *= idx_shape[j];
+    }
+    idx_data_.resize(output_size);
+    std::copy_n(
+        idx_tensor->mutable_data<int64_t>(), output_size, idx_data_.data());
+  }
+  auto inference_end = std::chrono::steady_clock::now();
+  auto postprocess_start = std::chrono::steady_clock::now();
+  // Postprocessing result
+  Postprocess(output_data_,
+              output_shape,
+              idx_data_,
+              idx_shape,
+              result,
+              center_bs,
+              scale_bs);
+  auto postprocess_end = std::chrono::steady_clock::now();
+  std::chrono::duration<float> preprocess_diff =
+      preprocess_end - preprocess_start;
+  times->push_back(double(preprocess_diff.count() * 1000));
+  std::chrono::duration<float> inference_diff = inference_end - inference_start;
+  times->push_back(double(inference_diff.count() / repeats * 1000));
+  std::chrono::duration<float> postprocess_diff =
+      postprocess_end - postprocess_start;
+  times->push_back(double(postprocess_diff.count() * 1000));
+}
+}  // namespace PaddleDetection
--- a/deploy/lite/src/keypoint_postprocess.cc
+++ b/deploy/lite/src/keypoint_postprocess.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "include/keypoint_postprocess.h"
+cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) {
+  cv::Point2f direct{a.x - b.x, a.y - b.y};
+  return cv::Point2f(a.x - direct.y, a.y + direct.x);
+}
+std::vector<float> get_dir(float src_point_x,
+                           float src_point_y,
+                           float rot_rad) {
+  float sn = sin(rot_rad);
+  float cs = cos(rot_rad);
+  std::vector<float> src_result{0.0, 0.0};
+  src_result[0] = src_point_x * cs - src_point_y * sn;
+  src_result[1] = src_point_x * sn + src_point_y * cs;
+  return src_result;
+}
+void affine_tranform(
+    float pt_x, float pt_y, cv::Mat& trans, float* preds, int p) {
+  double new1[3] = {pt_x, pt_y, 1.0};
+  cv::Mat new_pt(3, 1, trans.type(), new1);
+  cv::Mat w = trans * new_pt;
+  preds[p * 3 + 1] = static_cast<float>(w.at<double>(0, 0));
+  preds[p * 3 + 2] = static_cast<float>(w.at<double>(1, 0));
+}
+void get_affine_transform(std::vector<float>& center,
+                          std::vector<float>& scale,
+                          float rot,
+                          std::vector<int>& output_size,
+                          cv::Mat& trans,
+                          int inv) {
+  float src_w = scale[0];
+  float dst_w = static_cast<float>(output_size[0]);
+  float dst_h = static_cast<float>(output_size[1]);
+  float rot_rad = rot * 3.1415926535 / 180;
+  std::vector<float> src_dir = get_dir(-0.5 * src_w, 0, rot_rad);
+  std::vector<float> dst_dir{-0.5 * dst_w, 0.0};
+  cv::Point2f srcPoint2f[3], dstPoint2f[3];
+  srcPoint2f[0] = cv::Point2f(center[0], center[1]);
+  srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]);
+  srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]);
+  dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
+  dstPoint2f[1] =
+      cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]);
+  dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]);
+  if (inv == 0) {
+    trans = cv::getAffineTransform(srcPoint2f, dstPoint2f);
+  } else {
+    trans = cv::getAffineTransform(dstPoint2f, srcPoint2f);
+  }
+}
+void transform_preds(float* coords,
+                     std::vector<float>& center,
+                     std::vector<float>& scale,
+                     std::vector<int>& output_size,
+                     std::vector<int64_t>& dim,
+                     float* target_coords) {
+  cv::Mat trans(2, 3, CV_64FC1);
+  get_affine_transform(center, scale, 0, output_size, trans, 1);
+  for (int p = 0; p < dim[1]; ++p) {
+    affine_tranform(coords[p * 2], coords[p * 2 + 1], trans, target_coords, p);
+  }
+}
+// only for batchsize == 1
+void get_max_preds(float* heatmap,
+                   std::vector<int>& dim,
+                   float* preds,
+                   float* maxvals,
+                   int batchid,
+                   int joint_idx) {
+  int num_joints = dim[1];
+  int width = dim[3];
+  std::vector<int> idx;
+  idx.resize(num_joints * 2);
+  for (int j = 0; j < dim[1]; j++) {
+    float* index = &(
+        heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]);
+    float* end = index + dim[2] * dim[3];
+    float* max_dis = std::max_element(index, end);
+    auto max_id = std::distance(index, max_dis);
+    maxvals[j] = *max_dis;
+    if (*max_dis > 0) {
+      preds[j * 2] = static_cast<float>(max_id % width);
+      preds[j * 2 + 1] = static_cast<float>(max_id / width);
+    }
+  }
+}
+void get_final_preds(float* heatmap,
+                     std::vector<int64_t>& dim,
+                     int64_t* idxout,
+                     std::vector<int64_t>& idxdim,
+                     std::vector<float>& center,
+                     std::vector<float> scale,
+                     float* preds,
+                     int batchid) {
+  std::vector<float> coords;
+  coords.resize(dim[1] * 2);
+  int heatmap_height = dim[2];
+  int heatmap_width = dim[3];
+  for (int j = 0; j < dim[1]; ++j) {
+    int index = (batchid * dim[1] + j) * dim[2] * dim[3];
+    int idx = idxout[batchid * dim[1] + j];
+    preds[j * 3] = heatmap[index + idx];
+    coords[j * 2] = idx % heatmap_width;
+    coords[j * 2 + 1] = idx / heatmap_width;
+    int px = int(coords[j * 2] + 0.5);
+    int py = int(coords[j * 2 + 1] + 0.5);
+    if (px > 1 && px < heatmap_width - 1) {
+      float diff_x = heatmap[index + py * dim[3] + px + 1] -
+                     heatmap[index + py * dim[3] + px - 1];
+      coords[j * 2] += diff_x > 0 ? 1 : -1 * 0.25;
+    }
+    if (py > 1 && py < heatmap_height - 1) {
+      float diff_y = heatmap[index + (py + 1) * dim[3] + px] -
+                     heatmap[index + (py - 1) * dim[3] + px];
+      coords[j * 2 + 1] += diff_y > 0 ? 1 : -1 * 0.25;
+    }
+  }
+  std::vector<int> img_size{heatmap_width, heatmap_height};
+  transform_preds(coords.data(), center, scale, img_size, dim, preds);
+}
\ No newline at end of file
--- a/deploy/lite/src/main.cc
+++ b/deploy/lite/src/main.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <math.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+#include "include/config_parser.h"
+#include "include/keypoint_detector.h"
+#include "include/object_detector.h"
+#include "include/preprocess_op.h"
+#include "json/json.h"
+Json::Value RT_Config;
+void PrintBenchmarkLog(std::vector<double> det_time, int img_num) {
+  std::cout << "----------------------- Config info -----------------------"
+            << std::endl;
+  std::cout << "num_threads: " << RT_Config["cpu_threads"].as<int>()
+            << std::endl;
+  std::cout << "----------------------- Data info -----------------------"
+            << std::endl;
+  std::cout << "batch_size_det: " << RT_Config["batch_size_det"].as<int>()
+            << std::endl;
+  std::cout << "batch_size_keypoint: "
+            << RT_Config["batch_size_keypoint"].as<int>() << std::endl;
+  std::cout << "----------------------- Model info -----------------------"
+            << std::endl;
+  RT_Config["model_dir_det"].as<std::string>().erase(
+      RT_Config["model_dir_det"].as<std::string>().find_last_not_of("/") + 1);
+  std::cout
+      << "detection model_name: "
+      << RT_Config["model_dir_det"].as<std::string>().substr(
+             RT_Config["model_dir_det"].as<std::string>().find_last_of('/') + 1)
+      << std::endl;
+  RT_Config["model_dir_keypoint"].as<std::string>().erase(
+      RT_Config["model_dir_keypoint"].as<std::string>().find_last_not_of("/") +
+      1);
+  std::cout
+      << "keypoint model_name: "
+      << RT_Config["model_dir_keypoint"].as<std::string>().substr(
+             RT_Config["model_dir_keypoint"].as<std::string>().find_last_of(
+                 '/') +
+             1)
+      << std::endl;
+  std::cout << "----------------------- Perf info ------------------------"
+            << std::endl;
+  std::cout << "Total number of predicted data: " << img_num
+            << " and total time spent(ms): "
+            << std::accumulate(det_time.begin(), det_time.end(), 0)
+            << std::endl;
+  std::cout << "preproce_time(ms): " << det_time[0] / img_num
+            << ", inference_time(ms): " << det_time[1] / img_num
+            << ", postprocess_time(ms): " << det_time[2] / img_num << std::endl;
+}
+static std::string DirName(const std::string& filepath) {
+  auto pos = filepath.rfind(OS_PATH_SEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+static bool PathExists(const std::string& path) {
+  struct stat buffer;
+  return (stat(path.c_str(), &buffer) == 0);
+}
+static void MkDir(const std::string& path) {
+  if (PathExists(path)) return;
+  int ret = 0;
+  ret = mkdir(path.c_str(), 0755);
+  if (ret != 0) {
+    std::string path_error(path);
+    path_error += " mkdir failed!";
+    throw std::runtime_error(path_error);
+  }
+}
+static void MkDirs(const std::string& path) {
+  if (path.empty()) return;
+  if (PathExists(path)) return;
+  MkDirs(DirName(path));
+  MkDir(path);
+}
+void PredictImage(const std::vector<std::string> all_img_paths,
+                  const int batch_size_det,
+                  const double threshold_det,
+                  const bool run_benchmark,
+                  PaddleDetection::ObjectDetector* det,
+                  PaddleDetection::KeyPointDetector* keypoint,
+                  const std::string& output_dir = "output") {
+  std::vector<double> det_t = {0, 0, 0};
+  int steps = ceil(float(all_img_paths.size()) / batch_size_det);
+  int kpts_imgs = 0;
+  std::vector<double> keypoint_t = {0, 0, 0};
+  for (int idx = 0; idx < steps; idx++) {
+    std::vector<cv::Mat> batch_imgs;
+    int left_image_cnt = all_img_paths.size() - idx * batch_size_det;
+    if (left_image_cnt > batch_size_det) {
+      left_image_cnt = batch_size_det;
+    }
+    for (int bs = 0; bs < left_image_cnt; bs++) {
+      std::string image_file_path = all_img_paths.at(idx * batch_size_det + bs);
+      cv::Mat im = cv::imread(image_file_path, 1);
+      batch_imgs.insert(batch_imgs.end(), im);
+    }
+    // Store all detected result
+    std::vector<PaddleDetection::ObjectResult> result;
+    std::vector<int> bbox_num;
+    std::vector<double> det_times;
+    // Store keypoint results
+    std::vector<PaddleDetection::KeyPointResult> result_kpts;
+    std::vector<cv::Mat> imgs_kpts;
+    std::vector<std::vector<float>> center_bs;
+    std::vector<std::vector<float>> scale_bs;
+    std::vector<int> colormap_kpts = PaddleDetection::GenerateColorMap(20);
+    bool is_rbox = false;
+    if (run_benchmark) {
+      det->Predict(
+          batch_imgs, threshold_det, 10, 10, &result, &bbox_num, &det_times);
+    } else {
+      det->Predict(batch_imgs, 0.5, 0, 1, &result, &bbox_num, &det_times);
+    }
+    // get labels and colormap
+    auto labels = det->GetLabelList();
+    auto colormap = PaddleDetection::GenerateColorMap(labels.size());
+    int item_start_idx = 0;
+    for (int i = 0; i < left_image_cnt; i++) {
+      cv::Mat im = batch_imgs[i];
+      std::vector<PaddleDetection::ObjectResult> im_result;
+      int detect_num = 0;
+      for (int j = 0; j < bbox_num[i]; j++) {
+        PaddleDetection::ObjectResult item = result[item_start_idx + j];
+        if (item.confidence < threshold_det || item.class_id == -1) {
+          continue;
+        }
+        detect_num += 1;
+        im_result.push_back(item);
+        if (item.rect.size() > 6) {
+          is_rbox = true;
+          printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
+                 item.class_id,
+                 item.confidence,
+                 item.rect[0],
+                 item.rect[1],
+                 item.rect[2],
+                 item.rect[3],
+                 item.rect[4],
+                 item.rect[5],
+                 item.rect[6],
+                 item.rect[7]);
+        } else {
+          printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
+                 item.class_id,
+                 item.confidence,
+                 item.rect[0],
+                 item.rect[1],
+                 item.rect[2],
+                 item.rect[3]);
+        }
+      }
+      std::cout << all_img_paths.at(idx * batch_size_det + i)
+                << " The number of detected box: " << detect_num << std::endl;
+      item_start_idx = item_start_idx + bbox_num[i];
+      std::vector<int> compression_params;
+      compression_params.push_back(cv::IMWRITE_JPEG_QUALITY);
+      compression_params.push_back(95);
+      std::string output_path(output_dir);
+      if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {
+        output_path += OS_PATH_SEP;
+      }
+      std::string image_file_path = all_img_paths.at(idx * batch_size_det + i);
+      if (keypoint) {
+        int imsize = im_result.size();
+        for (int i = 0; i < imsize; i++) {
+          auto item = im_result[i];
+          cv::Mat crop_img;
+          std::vector<double> keypoint_times;
+          std::vector<int> rect = {
+              item.rect[0], item.rect[1], item.rect[2], item.rect[3]};
+          std::vector<float> center;
+          std::vector<float> scale;
+          if (item.class_id == 0) {
+            PaddleDetection::CropImg(im, crop_img, rect, center, scale);
+            center_bs.emplace_back(center);
+            scale_bs.emplace_back(scale);
+            imgs_kpts.emplace_back(crop_img);
+            kpts_imgs += 1;
+          }
+          if (imgs_kpts.size() == RT_Config["batch_size_keypoint"].as<int>() ||
+              ((i == imsize - 1) && !imgs_kpts.empty())) {
+            if (run_benchmark) {
+              keypoint->Predict(imgs_kpts,
+                                center_bs,
+                                scale_bs,
+                                0.5,
+                                10,
+                                10,
+                                &result_kpts,
+                                &keypoint_times);
+            } else {
+              keypoint->Predict(imgs_kpts,
+                                center_bs,
+                                scale_bs,
+                                0.5,
+                                0,
+                                1,
+                                &result_kpts,
+                                &keypoint_times);
+            }
+            imgs_kpts.clear();
+            center_bs.clear();
+            scale_bs.clear();
+            keypoint_t[0] += keypoint_times[0];
+            keypoint_t[1] += keypoint_times[1];
+            keypoint_t[2] += keypoint_times[2];
+          }
+        }
+        std::string kpts_savepath =
+            output_path + "keypoint_" +
+            image_file_path.substr(image_file_path.find_last_of('/') + 1);
+        cv::Mat kpts_vis_img =
+            VisualizeKptsResult(im, result_kpts, colormap_kpts);
+        cv::imwrite(kpts_savepath, kpts_vis_img, compression_params);
+        printf("Visualized output saved as %s\n", kpts_savepath.c_str());
+      } else {
+        // Visualization result
+        cv::Mat vis_img = PaddleDetection::VisualizeResult(
+            im, im_result, labels, colormap, is_rbox);
+        std::string det_savepath =
+            output_path +
+            image_file_path.substr(image_file_path.find_last_of('/') + 1);
+        cv::imwrite(det_savepath, vis_img, compression_params);
+        printf("Visualized output saved as %s\n", det_savepath.c_str());
+      }
+    }
+    det_t[0] += det_times[0];
+    det_t[1] += det_times[1];
+    det_t[2] += det_times[2];
+  }
+  PrintBenchmarkLog(det_t, all_img_paths.size());
+  PrintBenchmarkLog(keypoint_t, kpts_imgs);
+}
+int main(int argc, char** argv) {
+  std::cout << "Usage: " << argv[0]
+            << " [config_path](option) [image_dir](option)\n";
+  std::string config_path = "runtime_config.json";
+  std::string img_path = "";
+  if (argc >= 2) {
+    config_path = argv[1];
+    if (argc >= 3) {
+      img_path = argv[2];
+    }
+  }
+  // Parsing command-line
+  PaddleDetection::load_jsonf(config_path, RT_Config);
+  if (RT_Config["model_dir_det"].as<std::string>().empty()) {
+    std::cout << "Please set [model_det_dir] in " << config_path << std::endl;
+    return -1;
+  }
+  if (RT_Config["image_file"].as<std::string>().empty() &&
+      RT_Config["image_dir"].as<std::string>().empty() && img_path.empty()) {
+    std::cout << "Please set [image_file] or [image_dir] in " << config_path
+              << " Or use command: <" << argv[0] << " [image_dir]>"
+              << std::endl;
+    return -1;
+  }
+  if (!img_path.empty()) {
+    std::cout << "Use image_dir in command line overide the path in config file"
+              << std::endl;
+    RT_Config["image_dir"] = img_path;
+    RT_Config["image_file"] = "";
+  }
+  // Load model and create a object detector
+  PaddleDetection::ObjectDetector det(
+      RT_Config["model_dir_det"].as<std::string>(),
+      RT_Config["cpu_threads"].as<int>(),
+      RT_Config["batch_size_det"].as<int>());
+  PaddleDetection::KeyPointDetector* keypoint = nullptr;
+  if (!RT_Config["model_dir_keypoint"].as<std::string>().empty()) {
+    keypoint = new PaddleDetection::KeyPointDetector(
+        RT_Config["model_dir_keypoint"].as<std::string>(),
+        RT_Config["cpu_threads"].as<int>(),
+        RT_Config["batch_size_keypoint"].as<int>());
+    RT_Config["batch_size_det"] = 1;
+    printf(
+        "batchsize of detection forced to be 1 while keypoint model is not "
+        "empty()");
+  }
+  // Do inference on input image
+  if (!RT_Config["image_file"].as<std::string>().empty() ||
+      !RT_Config["image_dir"].as<std::string>().empty()) {
+    if (!PathExists(RT_Config["output_dir"].as<std::string>())) {
+      MkDirs(RT_Config["output_dir"].as<std::string>());
+    }
+    std::vector<std::string> all_img_paths;
+    std::vector<cv::String> cv_all_img_paths;
+    if (!RT_Config["image_file"].as<std::string>().empty()) {
+      all_img_paths.push_back(RT_Config["image_file"].as<std::string>());
+      if (RT_Config["batch_size_det"].as<int>() > 1) {
+        std::cout << "batch_size_det should be 1, when set `image_file`."
+                  << std::endl;
+        return -1;
+      }
+    } else {
+      cv::glob(RT_Config["image_dir"].as<std::string>(), cv_all_img_paths);
+      for (const auto& img_path : cv_all_img_paths) {
+        all_img_paths.push_back(img_path);
+      }
+    }
+    PredictImage(all_img_paths,
+                 RT_Config["batch_size_det"].as<int>(),
+                 RT_Config["threshold_det"].as<float>(),
+                 RT_Config["run_benchmark"].as<bool>(),
+                 &det,
+                 keypoint,
+                 RT_Config["output_dir"].as<std::string>());
+  }
+  delete keypoint;
+  keypoint = nullptr;
+  return 0;
+}
--- a/deploy/lite/src/object_detector.cc
+++ b/deploy/lite/src/object_detector.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <sstream>
+// for setprecision
+#include <chrono>
+#include <iomanip>
+#include "include/object_detector.h"
+namespace PaddleDetection {
+// Load Model and create model predictor
+void ObjectDetector::LoadModel(std::string model_file, int num_theads) {
+  MobileConfig config;
+  config.set_threads(num_theads);
+  config.set_model_from_file(model_file + "/model.nb");
+  config.set_power_mode(LITE_POWER_HIGH);
+  predictor_ = CreatePaddlePredictor<MobileConfig>(config);
+}
+// Visualiztion MaskDetector results
+cv::Mat VisualizeResult(const cv::Mat& img,
+                        const std::vector<ObjectResult>& results,
+                        const std::vector<std::string>& lables,
+                        const std::vector<int>& colormap,
+                        const bool is_rbox = false) {
+  cv::Mat vis_img = img.clone();
+  for (int i = 0; i < results.size(); ++i) {
+    // Configure color and text size
+    std::ostringstream oss;
+    oss << std::setiosflags(std::ios::fixed) << std::setprecision(4);
+    oss << lables[results[i].class_id] << " ";
+    oss << results[i].confidence;
+    std::string text = oss.str();
+    int c1 = colormap[3 * results[i].class_id + 0];
+    int c2 = colormap[3 * results[i].class_id + 1];
+    int c3 = colormap[3 * results[i].class_id + 2];
+    cv::Scalar roi_color = cv::Scalar(c1, c2, c3);
+    int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
+    double font_scale = 0.5f;
+    float thickness = 0.5;
+    cv::Size text_size =
+        cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
+    cv::Point origin;
+    if (is_rbox) {
+      // Draw object, text, and background
+      for (int k = 0; k < 4; k++) {
+        cv::Point pt1 = cv::Point(results[i].rect[(k * 2) % 8],
+                                  results[i].rect[(k * 2 + 1) % 8]);
+        cv::Point pt2 = cv::Point(results[i].rect[(k * 2 + 2) % 8],
+                                  results[i].rect[(k * 2 + 3) % 8]);
+        cv::line(vis_img, pt1, pt2, roi_color, 2);
+      }
+    } else {
+      int w = results[i].rect[2] - results[i].rect[0];
+      int h = results[i].rect[3] - results[i].rect[1];
+      cv::Rect roi = cv::Rect(results[i].rect[0], results[i].rect[1], w, h);
+      // Draw roi object, text, and background
+      cv::rectangle(vis_img, roi, roi_color, 2);
+    }
+    origin.x = results[i].rect[0];
+    origin.y = results[i].rect[1];
+    // Configure text background
+    cv::Rect text_back = cv::Rect(results[i].rect[0],
+                                  results[i].rect[1] - text_size.height,
+                                  text_size.width,
+                                  text_size.height);
+    // Draw text, and background
+    cv::rectangle(vis_img, text_back, roi_color, -1);
+    cv::putText(vis_img,
+                text,
+                origin,
+                font_face,
+                font_scale,
+                cv::Scalar(255, 255, 255),
+                thickness);
+  }
+  return vis_img;
+}
+void ObjectDetector::Preprocess(const cv::Mat& ori_im) {
+  // Clone the image : keep the original mat for postprocess
+  cv::Mat im = ori_im.clone();
+  cv::cvtColor(im, im, cv::COLOR_BGR2RGB);
+  preprocessor_.Run(&im, &inputs_);
+}
+void ObjectDetector::Postprocess(const std::vector<cv::Mat> mats,
+                                 std::vector<ObjectResult>* result,
+                                 std::vector<int> bbox_num,
+                                 bool is_rbox = false) {
+  result->clear();
+  int start_idx = 0;
+  for (int im_id = 0; im_id < mats.size(); im_id++) {
+    cv::Mat raw_mat = mats[im_id];
+    int rh = 1;
+    int rw = 1;
+    if (config_.arch_ == "Face") {
+      rh = raw_mat.rows;
+      rw = raw_mat.cols;
+    }
+    for (int j = start_idx; j < start_idx + bbox_num[im_id]; j++) {
+      if (is_rbox) {
+        // Class id
+        int class_id = static_cast<int>(round(output_data_[0 + j * 10]));
+        // Confidence score
+        float score = output_data_[1 + j * 10];
+        int x1 = (output_data_[2 + j * 10] * rw);
+        int y1 = (output_data_[3 + j * 10] * rh);
+        int x2 = (output_data_[4 + j * 10] * rw);
+        int y2 = (output_data_[5 + j * 10] * rh);
+        int x3 = (output_data_[6 + j * 10] * rw);
+        int y3 = (output_data_[7 + j * 10] * rh);
+        int x4 = (output_data_[8 + j * 10] * rw);
+        int y4 = (output_data_[9 + j * 10] * rh);
+        ObjectResult result_item;
+        result_item.rect = {x1, y1, x2, y2, x3, y3, x4, y4};
+        result_item.class_id = class_id;
+        result_item.confidence = score;
+        result->push_back(result_item);
+      } else {
+        // Class id
+        int class_id = static_cast<int>(round(output_data_[0 + j * 6]));
+        // Confidence score
+        float score = output_data_[1 + j * 6];
+        int xmin = (output_data_[2 + j * 6] * rw);
+        int ymin = (output_data_[3 + j * 6] * rh);
+        int xmax = (output_data_[4 + j * 6] * rw);
+        int ymax = (output_data_[5 + j * 6] * rh);
+        int wd = xmax - xmin;
+        int hd = ymax - ymin;
+        ObjectResult result_item;
+        result_item.rect = {xmin, ymin, xmax, ymax};
+        result_item.class_id = class_id;
+        result_item.confidence = score;
+        result->push_back(result_item);
+      }
+    }
+    start_idx += bbox_num[im_id];
+  }
+}
+void ObjectDetector::Predict(const std::vector<cv::Mat>& imgs,
+                             const double threshold,
+                             const int warmup,
+                             const int repeats,
+                             std::vector<ObjectResult>* result,
+                             std::vector<int>* bbox_num,
+                             std::vector<double>* times) {
+  auto preprocess_start = std::chrono::steady_clock::now();
+  int batch_size = imgs.size();
+  // in_data_batch
+  std::vector<float> in_data_all;
+  std::vector<float> im_shape_all(batch_size * 2);
+  std::vector<float> scale_factor_all(batch_size * 2);
+  // Preprocess image
+  for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
+    cv::Mat im = imgs.at(bs_idx);
+    Preprocess(im);
+    im_shape_all[bs_idx * 2] = inputs_.im_shape_[0];
+    im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1];
+    scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0];
+    scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1];
+    // TODO: reduce cost time
+    in_data_all.insert(
+        in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end());
+  }
+  auto preprocess_end = std::chrono::steady_clock::now();
+  // Prepare input tensor
+  auto input_names = predictor_->GetInputNames();
+  for (const auto& tensor_name : input_names) {
+    auto in_tensor = predictor_->GetInputByName(tensor_name);
+    if (tensor_name == "image") {
+      int rh = inputs_.in_net_shape_[0];
+      int rw = inputs_.in_net_shape_[1];
+      in_tensor->Resize({batch_size, 3, rh, rw});
+      auto* inptr = in_tensor->mutable_data<float>();
+      std::copy_n(in_data_all.data(), in_data_all.size(), inptr);
+    } else if (tensor_name == "im_shape") {
+      in_tensor->Resize({batch_size, 2});
+      auto* inptr = in_tensor->mutable_data<float>();
+      std::copy_n(im_shape_all.data(), im_shape_all.size(), inptr);
+    } else if (tensor_name == "scale_factor") {
+      in_tensor->Resize({batch_size, 2});
+      auto* inptr = in_tensor->mutable_data<float>();
+      std::copy_n(scale_factor_all.data(), scale_factor_all.size(), inptr);
+    }
+  }
+  // Run predictor
+  // warmup
+  for (int i = 0; i < warmup; i++) {
+    predictor_->Run();
+    // Get output tensor
+    auto output_names = predictor_->GetOutputNames();
+    auto out_tensor = predictor_->GetTensor(output_names[0]);
+    auto out_bbox_num = predictor_->GetTensor(output_names[1]);
+  }
+  bool is_rbox = false;
+  auto inference_start = std::chrono::steady_clock::now();
+  for (int i = 0; i < repeats; i++) {
+    predictor_->Run();
+    // Get output tensor
+    auto output_names = predictor_->GetOutputNames();
+    auto output_tensor = predictor_->GetTensor(output_names[0]);
+    auto output_shape = output_tensor->shape();
+    auto out_bbox_num = predictor_->GetTensor(output_names[1]);
+    auto out_bbox_num_shape = out_bbox_num->shape();
+    // Calculate output length
+    int output_size = 1;
+    for (int j = 0; j < output_shape.size(); ++j) {
+      output_size *= output_shape[j];
+    }
+    is_rbox = output_shape[output_shape.size() - 1] % 10 == 0;
+    if (output_size < 6) {
+      std::cerr << "[WARNING] No object detected." << std::endl;
+    }
+    output_data_.resize(output_size);
+    std::copy_n(
+        output_tensor->mutable_data<float>(), output_size, output_data_.data());
+    int out_bbox_num_size = 1;
+    for (int j = 0; j < out_bbox_num_shape.size(); ++j) {
+      out_bbox_num_size *= out_bbox_num_shape[j];
+    }
+    out_bbox_num_data_.resize(out_bbox_num_size);
+    std::copy_n(out_bbox_num->mutable_data<int>(),
+                out_bbox_num_size,
+                out_bbox_num_data_.data());
+  }
+  auto inference_end = std::chrono::steady_clock::now();
+  auto postprocess_start = std::chrono::steady_clock::now();
+  // Postprocessing result
+  result->clear();
+  Postprocess(imgs, result, out_bbox_num_data_, is_rbox);
+  bbox_num->clear();
+  for (int k = 0; k < out_bbox_num_data_.size(); k++) {
+    int tmp = out_bbox_num_data_[k];
+    bbox_num->push_back(tmp);
+  }
+  auto postprocess_end = std::chrono::steady_clock::now();
+  std::chrono::duration<float> preprocess_diff =
+      preprocess_end - preprocess_start;
+  times->push_back(double(preprocess_diff.count() * 1000));
+  std::chrono::duration<float> inference_diff = inference_end - inference_start;
+  times->push_back(double(inference_diff.count() / repeats * 1000));
+  std::chrono::duration<float> postprocess_diff =
+      postprocess_end - postprocess_start;
+  times->push_back(double(postprocess_diff.count() * 1000));
+}
+std::vector<int> GenerateColorMap(int num_class) {
+  auto colormap = std::vector<int>(3 * num_class, 0);
+  for (int i = 0; i < num_class; ++i) {
+    int j = 0;
+    int lab = i;
+    while (lab) {
+      colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j));
+      colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j));
+      colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j));
+      ++j;
+      lab >>= 3;
+    }
+  }
+  return colormap;
+}
+}  // namespace PaddleDetection
--- a/deploy/lite/src/preprocess_op.cc
+++ b/deploy/lite/src/preprocess_op.cc
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <string>
+#include <thread>
+#include <vector>
+#include "include/preprocess_op.h"
+namespace PaddleDetection {
+void InitInfo::Run(cv::Mat* im, ImageBlob* data) {
+  data->im_shape_ = {static_cast<float>(im->rows),
+                     static_cast<float>(im->cols)};
+  data->scale_factor_ = {1., 1.};
+  data->in_net_shape_ = {static_cast<float>(im->rows),
+                         static_cast<float>(im->cols)};
+}
+void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) {
+  double e = 1.0;
+  if (is_scale_) {
+    e /= 255.0;
+  }
+  (*im).convertTo(*im, CV_32FC3, e);
+  for (int h = 0; h < im->rows; h++) {
+    for (int w = 0; w < im->cols; w++) {
+      im->at<cv::Vec3f>(h, w)[0] =
+          (im->at<cv::Vec3f>(h, w)[0] - mean_[0]) / scale_[0];
+      im->at<cv::Vec3f>(h, w)[1] =
+          (im->at<cv::Vec3f>(h, w)[1] - mean_[1]) / scale_[1];
+      im->at<cv::Vec3f>(h, w)[2] =
+          (im->at<cv::Vec3f>(h, w)[2] - mean_[2]) / scale_[2];
+    }
+  }
+}
+void Permute::Run(cv::Mat* im, ImageBlob* data) {
+  (*im).convertTo(*im, CV_32FC3);
+  int rh = im->rows;
+  int rw = im->cols;
+  int rc = im->channels();
+  (data->im_data_).resize(rc * rh * rw);
+  float* base = (data->im_data_).data();
+  for (int i = 0; i < rc; ++i) {
+    cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, base + i * rh * rw), i);
+  }
+}
+void Resize::Run(cv::Mat* im, ImageBlob* data) {
+  auto resize_scale = GenerateScale(*im);
+  data->im_shape_ = {static_cast<float>(im->cols * resize_scale.first),
+                     static_cast<float>(im->rows * resize_scale.second)};
+  data->in_net_shape_ = {static_cast<float>(im->cols * resize_scale.first),
+                         static_cast<float>(im->rows * resize_scale.second)};
+  cv::resize(
+      *im, *im, cv::Size(), resize_scale.first, resize_scale.second, interp_);
+  data->im_shape_ = {
+      static_cast<float>(im->rows), static_cast<float>(im->cols),
+  };
+  data->scale_factor_ = {
+      resize_scale.second, resize_scale.first,
+  };
+}
+std::pair<float, float> Resize::GenerateScale(const cv::Mat& im) {
+  std::pair<float, float> resize_scale;
+  int origin_w = im.cols;
+  int origin_h = im.rows;
+  if (keep_ratio_) {
+    int im_size_max = std::max(origin_w, origin_h);
+    int im_size_min = std::min(origin_w, origin_h);
+    int target_size_max =
+        *std::max_element(target_size_.begin(), target_size_.end());
+    int target_size_min =
+        *std::min_element(target_size_.begin(), target_size_.end());
+    float scale_min =
+        static_cast<float>(target_size_min) / static_cast<float>(im_size_min);
+    float scale_max =
+        static_cast<float>(target_size_max) / static_cast<float>(im_size_max);
+    float scale_ratio = std::min(scale_min, scale_max);
+    resize_scale = {scale_ratio, scale_ratio};
+  } else {
+    resize_scale.first =
+        static_cast<float>(target_size_[1]) / static_cast<float>(origin_w);
+    resize_scale.second =
+        static_cast<float>(target_size_[0]) / static_cast<float>(origin_h);
+  }
+  return resize_scale;
+}
+void PadStride::Run(cv::Mat* im, ImageBlob* data) {
+  if (stride_ <= 0) {
+    return;
+  }
+  int rc = im->channels();
+  int rh = im->rows;
+  int rw = im->cols;
+  int nh = (rh / stride_) * stride_ + (rh % stride_ != 0) * stride_;
+  int nw = (rw / stride_) * stride_ + (rw % stride_ != 0) * stride_;
+  cv::copyMakeBorder(
+      *im, *im, 0, nh - rh, 0, nw - rw, cv::BORDER_CONSTANT, cv::Scalar(0));
+  data->in_net_shape_ = {
+      static_cast<float>(im->rows), static_cast<float>(im->cols),
+  };
+}
+void TopDownEvalAffine::Run(cv::Mat* im, ImageBlob* data) {
+  cv::resize(*im, *im, cv::Size(trainsize_[0], trainsize_[1]), 0, 0, interp_);
+  // todo: Simd::ResizeBilinear();
+  data->in_net_shape_ = {
+      static_cast<float>(trainsize_[1]), static_cast<float>(trainsize_[0]),
+  };
+}
+// Preprocessor op running order
+const std::vector<std::string> Preprocessor::RUN_ORDER = {"InitInfo",
+                                                          "TopDownEvalAffine",
+                                                          "Resize",
+                                                          "NormalizeImage",
+                                                          "PadStride",
+                                                          "Permute"};
+void Preprocessor::Run(cv::Mat* im, ImageBlob* data) {
+  for (const auto& name : RUN_ORDER) {
+    if (ops_.find(name) != ops_.end()) {
+      ops_[name]->Run(im, data);
+    }
+  }
+}
+void CropImg(cv::Mat& img,
+             cv::Mat& crop_img,
+             std::vector<int>& area,
+             std::vector<float>& center,
+             std::vector<float>& scale,
+             float expandratio) {
+  int crop_x1 = std::max(0, area[0]);
+  int crop_y1 = std::max(0, area[1]);
+  int crop_x2 = std::min(img.cols - 1, area[2]);
+  int crop_y2 = std::min(img.rows - 1, area[3]);
+  int center_x = (crop_x1 + crop_x2) / 2.;
+  int center_y = (crop_y1 + crop_y2) / 2.;
+  int half_h = (crop_y2 - crop_y1) / 2.;
+  int half_w = (crop_x2 - crop_x1) / 2.;
+  if (half_h * 3 > half_w * 4) {
+    half_w = static_cast<int>(half_h * 0.75);
+  } else {
+    half_h = static_cast<int>(half_w * 4 / 3);
+  }
+  crop_x1 =
+      std::max(0, center_x - static_cast<int>(half_w * (1 + expandratio)));
+  crop_y1 =
+      std::max(0, center_y - static_cast<int>(half_h * (1 + expandratio)));
+  crop_x2 = std::min(img.cols - 1,
+                     static_cast<int>(center_x + half_w * (1 + expandratio)));
+  crop_y2 = std::min(img.rows - 1,
+                     static_cast<int>(center_y + half_h * (1 + expandratio)));
+  crop_img =
+      img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1));
+  center.clear();
+  center.emplace_back((crop_x1 + crop_x2) / 2);
+  center.emplace_back((crop_y1 + crop_y2) / 2);
+  scale.clear();
+  scale.emplace_back((crop_x2 - crop_x1));
+  scale.emplace_back((crop_y2 - crop_y1));
+}
+}  // namespace PaddleDetection