Commit 147fb450 authored by zhiboniu, committed by zhiboniu

add keypoint mnn support

del json config & preprocess
Parent 7762b906
......@@ -128,6 +128,11 @@ python deploy/python/mot_keypoint_unite_infer.py --mot_model_dir=output_inferenc
**Note:**
For the tracking model export tutorial, please refer to `configs/mot/README.md`.
### 4. Standalone Model Deployment
We provide deployment support for PaddleInference (server), PaddleLite (mobile), and third-party engines (MNN, OpenVINO). The corresponding subfolders under `deploy` contain complete, standalone deployment code with no dependency on the training code.
For details, see the [deployment documentation](../../deploy/README.md).
## Benchmark
We provide test results under different runtime environments for your reference when choosing models. For detailed data, see [Keypoint Inference Benchmark](./KeypointBenchmark.md).
......
......@@ -126,6 +126,10 @@ python deploy/python/mot_keypoint_unite_infer.py --mot_model_dir=output_inferenc
**Note:**
To export the MOT model, please refer to [Here](../../configs/mot/README_en.md).
### 4. Standalone Deployment
We provide standalone deployment via PaddleInference (server GPU), PaddleLite (mobile, ARM), and third-party engines (MNN, OpenVINO), independent of the training code. For details, see the [deployment docs](../../deploy/README_en.md).
## Benchmark
We provide benchmarks in different runtime environments for your reference when choosing models. See [Keypoint Inference Benchmark](./KeypointBenchmark.md) for details.
......
......@@ -51,7 +51,17 @@ python tools/export_model.py -c configs/yolov3/yolov3_mobilenet_v1_roadsign.yml
- For a detailed example, please refer to [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) for deployment. For more information, please refer to [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite).
## 4. Benchmark Testing
## 4. Third-Party Deployment (MNN, NCNN, OpenVINO)
- Third-party deployment provides PicoDet and TinyPose examples; other models can be adapted by referring to them.
| Third_Engine | MNN | NCNN | OPENVINO |
| ------------ | ---- | ----- | ---------- |
| PicoDet | [PicoDet_MNN](./third_engine/demo_mnn/README.md) | [PicoDet_NCNN](./third_engine/demo_ncnn/README.md) | [PicoDet_OPENVINO](./third_engine/demo_openvino/README.md) |
| TinyPose | [TinyPose_MNN](./third_engine/demo_mnn_kpts/README.md) | - | [TinyPose_OPENVINO](./third_engine/demo_openvino_kpts/README.md) |
## 5. Benchmark Testing
- Using the exported model, run the benchmark batch-test script:
```shell
sh deploy/benchmark/benchmark.sh {model_dir} {model_name}
......@@ -62,7 +72,7 @@ sh deploy/benchmark/benchmark.sh {model_dir} {model_name}
python deploy/benchmark/log_parser_excel.py --log_path=./output_pipeline --output_name=benchmark_excel.xlsx
```
## 5. FAQ
## 6. FAQ
- 1. Can a model trained with `Paddle 1.8.4` be deployed with `Paddle2.0`?
  Paddle 2.0 is compatible with Paddle 1.8.4, so this works. However, some models (such as SOLOv2) use OPs newly added in Paddle 2.0; such models cannot be deployed this way.
......
......@@ -51,7 +51,16 @@ For details on model export, please refer to the documentation [Tutorial on Padd
- For details, please refer to the [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) deployment examples. For more information, please refer to [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite).
## 4. Benchmark Test
## 4. Third-Engine Deployment (MNN, NCNN, OpenVINO)
- Third-engine deployment takes PicoDet and TinyPose as examples; other models can be deployed the same way.
| Third_Engine | MNN | NCNN | OPENVINO |
| ------------ | ------------------------------------------------------ | -------------------------------------------------- | ------------------------------------------------------------ |
| PicoDet | [PicoDet_MNN](./third_engine/demo_mnn/README.md) | [PicoDet_NCNN](./third_engine/demo_ncnn/README.md) | [PicoDet_OPENVINO](./third_engine/demo_openvino/README.md) |
| TinyPose | [TinyPose_MNN](./third_engine/demo_mnn_kpts/README.md) | - | [TinyPose_OPENVINO](./third_engine/demo_openvino_kpts/README.md) |
## 5. Benchmark Test
- Using the exported model, run the Benchmark batch test script:
```shell
sh deploy/benchmark/benchmark.sh {model_dir} {model_name}
......@@ -62,7 +71,7 @@ sh deploy/benchmark/benchmark.sh {model_dir} {model_name}
python deploy/benchmark/log_parser_excel.py --log_path=./output_pipeline --output_name=benchmark_excel.xlsx
```
## 5. FAQ
## 6. FAQ
- 1. Can models trained with `Paddle 1.8.4` be deployed with `Paddle2.0`?
  Paddle 2.0 is compatible with Paddle 1.8.4, so this works. However, some models (such as SOLOv2) use OPs newly added in Paddle 2.0, and these cannot be deployed this way.
......
cmake_minimum_required(VERSION 3.9)
project(tinypose-mnn)
set(CMAKE_CXX_STANDARD 17)
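# Point MNN_DIR at your local MNN directory (the value below is a placeholder).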
set(MNN_DIR {YOUR_MNN_DIR})
find_package(OpenCV REQUIRED)
include_directories(
${MNN_DIR}/include
${MNN_DIR}/include/MNN
${CMAKE_SOURCE_DIR}
)
link_directories(mnn/lib)
add_library(libMNN SHARED IMPORTED)
set_target_properties(
libMNN
PROPERTIES IMPORTED_LOCATION
${CMAKE_SOURCE_DIR}/mnn/lib/libMNN.so
)
add_executable(tinypose-mnn main.cpp picodet_mnn.cpp keypoint_detector.cpp keypoint_postprocess.cpp)
target_link_libraries(tinypose-mnn MNN ${OpenCV_LIBS} libMNN.so)
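# The following is the Android (arm64, NDK) variant of the CMake configuration.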
cmake_minimum_required(VERSION 3.9)
project(tinypose-mnn)
set(CMAKE_CXX_STANDARD 17)
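# Replace the MNN_DIR and NDK_ROOT placeholders below with your local paths.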
set(MNN_DIR {YOUR_MNN_DIR})
set(NDK_ROOT {YOUR_ANDROID_NDK_PATH})
set(LDFLAGS -latomic -pthread -ldl -llog -lz -static-libstdc++)
set(OpenCV_DIR ${CMAKE_SOURCE_DIR}/third/opencv4.1.0/arm64-v8a)
set(OpenCV_DEPS ${OpenCV_DIR}/libs/libopencv_imgcodecs.a
${OpenCV_DIR}/libs/libopencv_imgproc.a
${OpenCV_DIR}/libs/libopencv_core.a
${OpenCV_DIR}/3rdparty/libs/libtegra_hal.a
${OpenCV_DIR}/3rdparty/libs/liblibjpeg-turbo.a
${OpenCV_DIR}/3rdparty/libs/liblibwebp.a
${OpenCV_DIR}/3rdparty/libs/liblibpng.a
${OpenCV_DIR}/3rdparty/libs/liblibjasper.a
${OpenCV_DIR}/3rdparty/libs/liblibtiff.a
${OpenCV_DIR}/3rdparty/libs/libIlmImf.a
${OpenCV_DIR}/3rdparty/libs/libtbb.a
${OpenCV_DIR}/3rdparty/libs/libcpufeatures.a)
set(FLAGS "-pie -Wl,--gc-sections -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=21 -fexceptions -frtti -std=c++11 -O3 -DNDEBUG -fPIE -fopenmp")
set(CMAKE_CXX_FLAGS "--sysroot=${NDK_ROOT}/sysroot ${FLAGS}")
set(STDCXX ${NDK_ROOT}/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a
${NDK_ROOT}/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a
${NDK_ROOT}/platforms/android-21/arch-arm64/usr/lib/libstdc++.a)
set(SYS_INCS ${NDK_ROOT}/sysroot/usr/include/aarch64-linux-android/ ${NDK_ROOT}/sources/cxx-stl/llvm-libc++/include/ ${NDK_ROOT}/sources/cxx-stl/llvm-libc++abi/include/ ${NDK_ROOT}/sources/android/support/include/ ${NDK_ROOT}/sysroot/usr/include/)
include_directories(
${SYS_INCS}
${OpenCV_DIR}/include
${MNN_DIR}/include
${MNN_DIR}/include/MNN
${CMAKE_SOURCE_DIR}
)
link_directories(${NDK_ROOT}/platforms/android-21/arch-arm64)
link_directories(${MNN_DIR}/project/android/build_64)
add_executable(tinypose-mnn picodet_mnn.cpp keypoint_postprocess.cpp keypoint_detector.cpp main.cpp)
target_link_libraries(tinypose-mnn -lMNN ${OpenCV_DEPS} ${STDCXX} ${LDFLAGS})
# TinyPose MNN Demo
This folder provides PicoDet + TinyPose inference code using
[Alibaba's MNN framework](https://github.com/alibaba/MNN). Most of the implementation in
this folder is the same as *demo_ncnn*.
## Install MNN
### Python library
Just run:
``` shell
pip install MNN
```
### C++ library
Please follow the [official document](https://www.yuque.com/mnn/en/build_linux) to build MNN engine.
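A minimal sketch of the Linux build flow, assuming a fresh MNN checkout (the `MNN_BUILD_CONVERTER` flag is an optional assumption, only needed if you convert models with the C++ tools; see the official docs for authoritative, platform-specific steps):

```shell
git clone https://github.com/alibaba/MNN.git
cd MNN
./schema/generate.sh                  # generate flatbuffer schema sources
mkdir build && cd build
cmake .. -DMNN_BUILD_CONVERTER=ON     # converter optional if you use pip MNN
make -j4
```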
- Create `picodet_m_416_coco.onnx` and `tinypose256.onnx`
Example:
```shell
modelName=picodet_m_416_coco
# export model
python tools/export_model.py \
-c configs/picodet/${modelName}.yml \
-o weights=${modelName}.pdparams \
--output_dir=inference_model
# convert to onnx
paddle2onnx --model_dir inference_model/${modelName} \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--opset_version 11 \
--save_file ${modelName}.onnx
# onnxsim
python -m onnxsim ${modelName}.onnx ${modelName}_processed.onnx
```
- Convert the model
Example:
``` shell
python -m MNN.tools.mnnconvert -f ONNX --modelFile picodet-416.onnx --MNNModel picodet-416.mnn
```
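The keypoint model converts the same way (the `tinypose256.onnx` filename assumes the export step above):

```shell
python -m MNN.tools.mnnconvert -f ONNX --modelFile tinypose256.onnx --MNNModel tinypose256.mnn
```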
Here are the converted models:
[picodet_m_416](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416.mnn),
[tinypose256](https://paddledet.bj.bcebos.com/deploy/third_engine/tinypose256.mnn).
## Build
For the C++ code, replace `libMNN.so` under *./mnn/lib* with the one you just compiled, modify the OpenCV and MNN paths in the CMake file,
and run:
``` shell
mkdir build && cd build
cmake ..
make
```
Note that a flag in `main.cpp` controls whether to show the detection result or save it to a folder.
``` c++
#define __SAVE_RESULT__ // if defined, save drawn results to ../results; otherwise show them in a window
```
#### ARM Build
Prepare the OpenCV library [OpenCV_4_1](https://paddle-inference-dist.bj.bcebos.com/opencv4.1.0.tar.gz).
``` shell
mkdir third && cd third
wget https://paddle-inference-dist.bj.bcebos.com/opencv4.1.0.tar.gz
tar -zxvf opencv4.1.0.tar.gz
cd ..
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DANDROID_TOOLCHAIN=gcc ..
make
```
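MNN itself must also be cross-compiled for arm64. A minimal sketch following the MNN repository layout (the Android CMake configuration above expects the result under `${MNN_DIR}/project/android/build_64`):

```shell
cd ${MNN_DIR}/project/android
mkdir build_64 && cd build_64
../build_64.sh
```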
## Run
To detect images in a folder, run:
``` shell
./tinypose-mnn [mode] [image_file]
```
| param | detail |
| ---- | ---- |
| mode | input mode: 0 = camera, 1 = image, 2 = video, 3 = benchmark |
| image_file | input image path |
For example:
``` shell
./tinypose-mnn "1" "../imgs/test.jpg"
```
For speed benchmark:
``` shell
./tinypose-mnn "3" "0"
```
## Benchmark
Platform: Kirin 980
Model: [tinypose256](https://paddledet.bj.bcebos.com/deploy/third_engine/tinypose256.mnn)
| param | Min(s) | Max(s) | Avg(s) |
| -------- | ------ | ------ | ------ |
| Thread=4 | 0.018 | 0.021 | 0.019 |
| Thread=1 | 0.031 | 0.041 | 0.032 |
## Reference
[MNN](https://github.com/alibaba/MNN)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
// for setprecision
#include <chrono>
#include <iomanip>
#include "keypoint_detector.h"
namespace PaddleDetection {
// Visualize keypoint detection results
cv::Mat VisualizeKptsResult(const cv::Mat& img,
const std::vector<KeyPointResult>& results,
const std::vector<int>& colormap,
float threshold) {
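  // COCO 17-keypoint skeleton: each pair of joint indices is a bone to draw.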
const int edge[][2] = {{0, 1},
{0, 2},
{1, 3},
{2, 4},
{3, 5},
{4, 6},
{5, 7},
{6, 8},
{7, 9},
{8, 10},
{5, 11},
{6, 12},
{11, 13},
{12, 14},
{13, 15},
{14, 16},
{11, 12}};
cv::Mat vis_img = img.clone();
for (int batchid = 0; batchid < results.size(); batchid++) {
for (int i = 0; i < results[batchid].num_joints; i++) {
if (results[batchid].keypoints[i * 3] > threshold) {
int x_coord = int(results[batchid].keypoints[i * 3 + 1]);
int y_coord = int(results[batchid].keypoints[i * 3 + 2]);
cv::circle(vis_img,
cv::Point2d(x_coord, y_coord),
1,
cv::Scalar(0, 0, 255),
2);
}
}
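    // Draw skeleton bones; assumes the 17-joint COCO layout matching `edge`.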
for (int i = 0; i < results[batchid].num_joints; i++) {
if (results[batchid].keypoints[edge[i][0] * 3] > threshold &&
results[batchid].keypoints[edge[i][1] * 3] > threshold) {
int x_start = int(results[batchid].keypoints[edge[i][0] * 3 + 1]);
int y_start = int(results[batchid].keypoints[edge[i][0] * 3 + 2]);
int x_end = int(results[batchid].keypoints[edge[i][1] * 3 + 1]);
int y_end = int(results[batchid].keypoints[edge[i][1] * 3 + 2]);
cv::line(vis_img,
cv::Point2d(x_start, y_start),
cv::Point2d(x_end, y_end),
colormap[i],
1);
}
}
}
return vis_img;
}
void KeyPointDetector::Postprocess(std::vector<float>& output,
std::vector<int>& output_shape,
std::vector<int>& idxout,
std::vector<int>& idx_shape,
std::vector<KeyPointResult>* result,
std::vector<std::vector<float>>& center_bs,
std::vector<std::vector<float>>& scale_bs) {
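  // preds stores (conf, x, y) per joint; filled by get_final_preds below.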
std::vector<float> preds(output_shape[1] * 3, 0);
for (int batchid = 0; batchid < output_shape[0]; batchid++) {
get_final_preds(output,
output_shape,
idxout,
idx_shape,
center_bs[batchid],
scale_bs[batchid],
preds,
batchid,
this->use_dark());
KeyPointResult result_item;
result_item.num_joints = output_shape[1];
result_item.keypoints.clear();
for (int i = 0; i < output_shape[1]; i++) {
result_item.keypoints.emplace_back(preds[i * 3]);
result_item.keypoints.emplace_back(preds[i * 3 + 1]);
result_item.keypoints.emplace_back(preds[i * 3 + 2]);
}
result->push_back(result_item);
}
}
void KeyPointDetector::Predict(const std::vector<cv::Mat> imgs,
std::vector<std::vector<float>>& center_bs,
std::vector<std::vector<float>>& scale_bs,
std::vector<KeyPointResult>* result) {
int batch_size = imgs.size();
KeyPointDet_interpreter->resizeTensor(input_tensor,
{batch_size, 3, in_h, in_w});
KeyPointDet_interpreter->resizeSession(KeyPointDet_session);
auto insize = 3 * in_h * in_w;
// Preprocess image
cv::Mat resized_im;
for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
cv::Mat im = imgs.at(bs_idx);
cv::resize(im, resized_im, cv::Size(in_w, in_h));
std::shared_ptr<MNN::CV::ImageProcess> pretreat(
MNN::CV::ImageProcess::create(
MNN::CV::BGR, MNN::CV::RGB, mean_vals, 3, norm_vals, 3));
pretreat->convert(
resized_im.data, in_w, in_h, resized_im.step[0], input_tensor);
}
// Run predictor
auto inference_start = std::chrono::steady_clock::now();
KeyPointDet_interpreter->runSession(KeyPointDet_session);
// Get output tensor
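  // (output tensor names below are specific to the exported tinypose256 model)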
auto out_tensor = KeyPointDet_interpreter->getSessionOutput(
KeyPointDet_session, "conv2d_441.tmp_1");
auto nchwoutTensor = new Tensor(out_tensor, Tensor::CAFFE);
out_tensor->copyToHostTensor(nchwoutTensor);
auto output_shape = nchwoutTensor->shape();
// Calculate output length
int output_size = 1;
for (int j = 0; j < output_shape.size(); ++j) {
output_size *= output_shape[j];
}
output_data_.resize(output_size);
std::copy_n(nchwoutTensor->host<float>(), output_size, output_data_.data());
delete nchwoutTensor;
auto idx_tensor = KeyPointDet_interpreter->getSessionOutput(
KeyPointDet_session, "argmax_0.tmp_0");
auto idxhostTensor = new Tensor(idx_tensor, Tensor::CAFFE);
idx_tensor->copyToHostTensor(idxhostTensor);
auto idx_shape = idxhostTensor->shape();
// Calculate output length
output_size = 1;
for (int j = 0; j < idx_shape.size(); ++j) {
output_size *= idx_shape[j];
}
idx_data_.resize(output_size);
std::copy_n(idxhostTensor->host<int>(), output_size, idx_data_.data());
delete idxhostTensor;
auto inference_end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed = inference_end - inference_start;
printf("keypoint inference time: %f s\n", elapsed.count());
// Postprocessing result
Postprocess(output_data_,
output_shape,
idx_data_,
idx_shape,
result,
center_bs,
scale_bs);
}
} // namespace PaddleDetection
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ctime>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "Interpreter.hpp"
#include "ImageProcess.hpp"
#include "MNNDefine.h"
#include "Tensor.hpp"
#include "keypoint_postprocess.h"
using namespace MNN;
namespace PaddleDetection {
// Object KeyPoint Result
struct KeyPointResult {
// Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf
std::vector<float> keypoints;
int num_joints = -1;
};
// Visualize keypoint results
cv::Mat VisualizeKptsResult(const cv::Mat& img,
const std::vector<KeyPointResult>& results,
const std::vector<int>& colormap,
float threshold = 0.2);
class KeyPointDetector {
public:
explicit KeyPointDetector(const std::string& model_path,
int num_thread = 4,
int input_height = 256,
int input_width = 192,
float score_threshold = 0.3,
const int batch_size = 1,
bool use_dark = true) {
printf("config path: %s",
model_path.substr(0, model_path.find_last_of('/') + 1).c_str());
use_dark_ = use_dark;
in_w = input_width;
in_h = input_height;
threshold_ = score_threshold;
KeyPointDet_interpreter = std::shared_ptr<MNN::Interpreter>(
MNN::Interpreter::createFromFile(model_path.c_str()));
MNN::ScheduleConfig config;
config.type = MNN_FORWARD_CPU;
    /* numThread is the thread count for CPU inference, or the GPU mode when a GPU backend is used. */
config.numThread = num_thread;
    // Fall back to CPU if the requested backend type is unavailable.
config.backupType = MNN_FORWARD_CPU;
BackendConfig backendConfig;
backendConfig.precision = static_cast<MNN::BackendConfig::PrecisionMode>(1);
config.backendConfig = &backendConfig;
KeyPointDet_session = KeyPointDet_interpreter->createSession(config);
input_tensor =
KeyPointDet_interpreter->getSessionInput(KeyPointDet_session, nullptr);
}
~KeyPointDetector() {
KeyPointDet_interpreter->releaseModel();
KeyPointDet_interpreter->releaseSession(KeyPointDet_session);
}
// Load Paddle inference model
void LoadModel(std::string model_file, int num_theads);
// Run predictor
void Predict(const std::vector<cv::Mat> imgs,
std::vector<std::vector<float>>& center,
std::vector<std::vector<float>>& scale,
std::vector<KeyPointResult>* result = nullptr);
bool use_dark() { return this->use_dark_; }
inline float get_threshold() { return threshold_; };
// const float mean_vals[3] = { 103.53f, 116.28f, 123.675f };
// const float norm_vals[3] = { 0.017429f, 0.017507f, 0.017125f };
const float mean_vals[3] = {0.f, 0.f, 0.f};
const float norm_vals[3] = {1.f, 1.f, 1.f};
int in_w = 128;
int in_h = 256;
private:
// Postprocess result
void Postprocess(std::vector<float>& output,
std::vector<int>& output_shape,
std::vector<int>& idxout,
std::vector<int>& idx_shape,
std::vector<KeyPointResult>* result,
std::vector<std::vector<float>>& center,
std::vector<std::vector<float>>& scale);
std::vector<float> output_data_;
std::vector<int> idx_data_;
float threshold_;
bool use_dark_;
std::shared_ptr<MNN::Interpreter> KeyPointDet_interpreter;
MNN::Session* KeyPointDet_session = nullptr;
MNN::Tensor* input_tensor = nullptr;
};
} // namespace PaddleDetection
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "keypoint_postprocess.h"
#define PI 3.1415926535
#define HALF_CIRCLE_DEGREE 180
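// Third corner of the affine reference triangle: offset `a` by the 90-degree
// rotation of the vector (a - b).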
cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) {
cv::Point2f direct{a.x - b.x, a.y - b.y};
return cv::Point2f(a.x - direct.y, a.y + direct.x);
}
std::vector<float> get_dir(float src_point_x,
float src_point_y,
float rot_rad) {
float sn = sin(rot_rad);
float cs = cos(rot_rad);
std::vector<float> src_result{0.0, 0.0};
src_result[0] = src_point_x * cs - src_point_y * sn;
src_result[1] = src_point_x * sn + src_point_y * cs;
return src_result;
}
void affine_tranform(
float pt_x, float pt_y, cv::Mat& trans, std::vector<float>& preds, int p) {
double new1[3] = {pt_x, pt_y, 1.0};
cv::Mat new_pt(3, 1, trans.type(), new1);
cv::Mat w = trans * new_pt;
preds[p * 3 + 1] = static_cast<float>(w.at<double>(0, 0));
preds[p * 3 + 2] = static_cast<float>(w.at<double>(1, 0));
}
void get_affine_transform(std::vector<float>& center,
std::vector<float>& scale,
float rot,
std::vector<int>& output_size,
cv::Mat& trans,
int inv) {
float src_w = scale[0];
float dst_w = static_cast<float>(output_size[0]);
float dst_h = static_cast<float>(output_size[1]);
float rot_rad = rot * PI / HALF_CIRCLE_DEGREE;
std::vector<float> src_dir = get_dir(-0.5 * src_w, 0, rot_rad);
std::vector<float> dst_dir{static_cast<float>(-0.5) * dst_w, 0.0};
cv::Point2f srcPoint2f[3], dstPoint2f[3];
srcPoint2f[0] = cv::Point2f(center[0], center[1]);
srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]);
srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]);
dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
dstPoint2f[1] =
cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]);
dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]);
if (inv == 0) {
trans = cv::getAffineTransform(srcPoint2f, dstPoint2f);
} else {
trans = cv::getAffineTransform(dstPoint2f, srcPoint2f);
}
}
void transform_preds(std::vector<float>& coords,
std::vector<float>& center,
std::vector<float>& scale,
std::vector<int>& output_size,
std::vector<int>& dim,
std::vector<float>& target_coords) {
cv::Mat trans(2, 3, CV_64FC1);
get_affine_transform(center, scale, 0, output_size, trans, 1);
for (int p = 0; p < dim[1]; ++p) {
affine_tranform(coords[p * 2], coords[p * 2 + 1], trans, target_coords, p);
}
}
// only for batchsize == 1
void get_max_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& preds,
std::vector<float>& maxvals,
int batchid,
int joint_idx) {
int num_joints = dim[1];
int width = dim[3];
std::vector<int> idx;
idx.resize(num_joints * 2);
for (int j = 0; j < dim[1]; j++) {
float* index = &(
heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]);
float* end = index + dim[2] * dim[3];
float* max_dis = std::max_element(index, end);
auto max_id = std::distance(index, max_dis);
maxvals[j] = *max_dis;
if (*max_dis > 0) {
preds[j * 2] = static_cast<float>(max_id % width);
preds[j * 2 + 1] = static_cast<float>(max_id / width);
}
}
}
void dark_parse(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& coords,
int px,
int py,
int index,
int ch) {
  /* DARK postprocessing, Zhang et al., "Distribution-Aware Coordinate
  Representation for Human Pose Estimation" (CVPR 2020).
  1) offset = -hessian.inv() * derivative
  2) dx  = (heatmap[x+1] - heatmap[x-1]) / 2
  3) dxx = (dx[x+1] - dx[x-1]) / 2
  4) derivative = Mat([dx, dy])
  5) hessian = Mat([[dxx, dxy], [dxy, dyy]])
  */
std::vector<float>::const_iterator first1 = heatmap.begin() + index;
std::vector<float>::const_iterator last1 =
heatmap.begin() + index + dim[2] * dim[3];
std::vector<float> heatmap_ch(first1, last1);
cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0, dim[2]);
heatmap_mat.convertTo(heatmap_mat, CV_32FC1);
cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0);
heatmap_mat = heatmap_mat.reshape(1, 1);
heatmap_ch = std::vector<float>(heatmap_mat.reshape(1, 1));
float epsilon = 1e-10;
  // sample the (log) heatmap values around the target location
float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon));
float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon));
float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon));
float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon));
float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon));
float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon));
float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon));
float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon));
float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon));
float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon));
float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon));
float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon));
float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon));
// compute dx/dy and dxx/dyy with sampled values
float dx = 0.5 * (xr - xl);
float dy = 0.5 * (yu - yd);
float dxx = 0.25 * (xr2 - 2 * xy + xl2);
float dxy = 0.25 * (xryu - xryd - xlyu + xlyd);
float dyy = 0.25 * (yu2 - 2 * xy + yd2);
  // finally, obtain the sub-pixel offset from the derivative and the Hessian,
  // which are assembled from dx/dy and dxx/dyy
if (dxx * dyy - dxy * dxy != 0) {
float M[2][2] = {dxx, dxy, dxy, dyy};
float D[2] = {dx, dy};
cv::Mat hassian(2, 2, CV_32F, M);
cv::Mat derivative(2, 1, CV_32F, D);
cv::Mat offset = -hassian.inv() * derivative;
coords[ch * 2] += offset.at<float>(0, 0);
coords[ch * 2 + 1] += offset.at<float>(1, 0);
}
}
void get_final_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<int>& idxout,
std::vector<int>& idxdim,
std::vector<float>& center,
std::vector<float> scale,
std::vector<float>& preds,
int batchid,
bool DARK) {
std::vector<float> coords;
coords.resize(dim[1] * 2);
int heatmap_height = dim[2];
int heatmap_width = dim[3];
for (int j = 0; j < dim[1]; ++j) {
int index = (batchid * dim[1] + j) * dim[2] * dim[3];
int idx = idxout[batchid * dim[1] + j];
preds[j * 3] = heatmap[index + idx];
coords[j * 2] = idx % heatmap_width;
coords[j * 2 + 1] = idx / heatmap_width;
int px = int(coords[j * 2] + 0.5);
int py = int(coords[j * 2 + 1] + 0.5);
if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 &&
py < heatmap_height - 2) {
dark_parse(heatmap, dim, coords, px, py, index, j);
} else {
if (px > 0 && px < heatmap_width - 1) {
float diff_x = heatmap[index + py * dim[3] + px + 1] -
heatmap[index + py * dim[3] + px - 1];
        coords[j * 2] += diff_x > 0 ? 0.25 : -0.25;  // sign(diff_x) * 0.25
}
if (py > 0 && py < heatmap_height - 1) {
float diff_y = heatmap[index + (py + 1) * dim[3] + px] -
heatmap[index + (py - 1) * dim[3] + px];
        coords[j * 2 + 1] += diff_y > 0 ? 0.25 : -0.25;  // sign(diff_y) * 0.25
}
}
}
std::vector<int> img_size{heatmap_width, heatmap_height};
transform_preds(coords, center, scale, img_size, dim, preds);
}
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio) {
int crop_x1 = std::max(0, area[0]);
int crop_y1 = std::max(0, area[1]);
int crop_x2 = std::min(img.cols - 1, area[2]);
int crop_y2 = std::min(img.rows - 1, area[3]);
int center_x = (crop_x1 + crop_x2) / 2.;
int center_y = (crop_y1 + crop_y2) / 2.;
int half_h = (crop_y2 - crop_y1) / 2.;
int half_w = (crop_x2 - crop_x1) / 2.;
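  // Expand the shorter side so the crop matches the model's 3:4 (w:h) input
  // aspect ratio.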
if (half_h * 3 > half_w * 4) {
half_w = static_cast<int>(half_h * 0.75);
} else {
half_h = static_cast<int>(half_w * 4 / 3);
}
crop_x1 =
std::max(0, center_x - static_cast<int>(half_w * (1 + expandratio)));
crop_y1 =
std::max(0, center_y - static_cast<int>(half_h * (1 + expandratio)));
crop_x2 = std::min(img.cols - 1,
static_cast<int>(center_x + half_w * (1 + expandratio)));
crop_y2 = std::min(img.rows - 1,
static_cast<int>(center_y + half_h * (1 + expandratio)));
crop_img =
img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1));
center.clear();
center.emplace_back((crop_x1 + crop_x2) / 2);
center.emplace_back((crop_y1 + crop_y2) / 2);
scale.clear();
scale.emplace_back((crop_x2 - crop_x1));
scale.emplace_back((crop_y2 - crop_y1));
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <vector>
std::vector<float> get_3rd_point(std::vector<float>& a, std::vector<float>& b);
std::vector<float> get_dir(float src_point_x, float src_point_y, float rot_rad);
void affine_tranform(float pt_x,
float pt_y,
cv::Mat& trans,
std::vector<float>& x,
int p,
int num);
cv::Mat get_affine_transform(std::vector<float>& center,
std::vector<float>& scale,
float rot,
std::vector<int>& output_size,
int inv);
void transform_preds(std::vector<float>& coords,
std::vector<float>& center,
std::vector<float>& scale,
std::vector<int>& output_size,
std::vector<int>& dim,
std::vector<float>& target_coords);
void box_to_center_scale(std::vector<int>& box,
int width,
int height,
std::vector<float>& center,
std::vector<float>& scale);
void get_max_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& preds,
std::vector<float>& maxvals,
int batchid,
int joint_idx);
void get_final_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<int>& idxout,
std::vector<int>& idxdim,
std::vector<float>& center,
std::vector<float> scale,
std::vector<float>& preds,
int batchid,
bool DARK = true);
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio = 0.25);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "keypoint_detector.h"
#include "picodet_mnn.h"
#define __SAVE_RESULT__  // if defined, save drawn results to ../results;
                         // otherwise show them in a window
using namespace PaddleDetection;
struct object_rect {
int x;
int y;
int width;
int height;
};
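// Letterbox `src` into `dst_size`, preserving aspect ratio; `effect_area`
// receives the sub-rectangle of `dst` actually covered by the resized image.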
int resize_uniform(cv::Mat& src,
cv::Mat& dst,
cv::Size dst_size,
object_rect& effect_area) {
int w = src.cols;
int h = src.rows;
int dst_w = dst_size.width;
int dst_h = dst_size.height;
dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0));
float ratio_src = w * 1.0 / h;
float ratio_dst = dst_w * 1.0 / dst_h;
int tmp_w = 0;
int tmp_h = 0;
if (ratio_src > ratio_dst) {
tmp_w = dst_w;
tmp_h = floor((dst_w * 1.0 / w) * h);
} else if (ratio_src < ratio_dst) {
tmp_h = dst_h;
tmp_w = floor((dst_h * 1.0 / h) * w);
} else {
cv::resize(src, dst, dst_size);
effect_area.x = 0;
effect_area.y = 0;
effect_area.width = dst_w;
effect_area.height = dst_h;
return 0;
}
cv::Mat tmp;
cv::resize(src, tmp, cv::Size(tmp_w, tmp_h));
if (tmp_w != dst_w) {
int index_w = floor((dst_w - tmp_w) / 2.0);
for (int i = 0; i < dst_h; i++) {
memcpy(dst.data + i * dst_w * 3 + index_w * 3,
tmp.data + i * tmp_w * 3,
tmp_w * 3);
}
effect_area.x = index_w;
effect_area.y = 0;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else if (tmp_h != dst_h) {
int index_h = floor((dst_h - tmp_h) / 2.0);
memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3);
effect_area.x = 0;
effect_area.y = index_h;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else {
printf("error\n");
}
return 0;
}
const int color_list[80][3] = {
{216, 82, 24}, {236, 176, 31}, {125, 46, 141}, {118, 171, 47},
{76, 189, 237}, {238, 19, 46}, {76, 76, 76}, {153, 153, 153},
{255, 0, 0}, {255, 127, 0}, {190, 190, 0}, {0, 255, 0},
{0, 0, 255}, {170, 0, 255}, {84, 84, 0}, {84, 170, 0},
{84, 255, 0}, {170, 84, 0}, {170, 170, 0}, {170, 255, 0},
{255, 84, 0}, {255, 170, 0}, {255, 255, 0}, {0, 84, 127},
{0, 170, 127}, {0, 255, 127}, {84, 0, 127}, {84, 84, 127},
{84, 170, 127}, {84, 255, 127}, {170, 0, 127}, {170, 84, 127},
{170, 170, 127}, {170, 255, 127}, {255, 0, 127}, {255, 84, 127},
{255, 170, 127}, {255, 255, 127}, {0, 84, 255}, {0, 170, 255},
{0, 255, 255}, {84, 0, 255}, {84, 84, 255}, {84, 170, 255},
{84, 255, 255}, {170, 0, 255}, {170, 84, 255}, {170, 170, 255},
{170, 255, 255}, {255, 0, 255}, {255, 84, 255}, {255, 170, 255},
{42, 0, 0}, {84, 0, 0}, {127, 0, 0}, {170, 0, 0},
{212, 0, 0}, {255, 0, 0}, {0, 42, 0}, {0, 84, 0},
{0, 127, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0},
{0, 0, 42}, {0, 0, 84}, {0, 0, 127}, {0, 0, 170},
{0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36},
{72, 72, 72}, {109, 109, 109}, {145, 145, 145}, {182, 182, 182},
{218, 218, 218}, {0, 113, 188}, {80, 182, 188}, {127, 127, 0},
};
void draw_bboxes(const cv::Mat& bgr,
const std::vector<BoxInfo>& bboxes,
object_rect effect_roi,
std::string save_path = "None") {
static const char* class_names[] = {
"person", "bicycle", "car",
"motorcycle", "airplane", "bus",
"train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign",
"parking meter", "bench", "bird",
"cat", "dog", "horse",
"sheep", "cow", "elephant",
"bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag",
"tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup",
"fork", "knife", "spoon",
"bowl", "banana", "apple",
"sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza",
"donut", "cake", "chair",
"couch", "potted plant", "bed",
"dining table", "toilet", "tv",
"laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave",
"oven", "toaster", "sink",
"refrigerator", "book", "clock",
"vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
cv::Mat image = bgr.clone();
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
for (size_t i = 0; i < bboxes.size(); i++) {
const BoxInfo& bbox = bboxes[i];
cv::Scalar color = cv::Scalar(color_list[bbox.label][0],
color_list[bbox.label][1],
color_list[bbox.label][2]);
cv::rectangle(image,
cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio,
(bbox.y1 - effect_roi.y) * height_ratio),
cv::Point((bbox.x2 - effect_roi.x) * width_ratio,
(bbox.y2 - effect_roi.y) * height_ratio)),
color);
char text[256];
sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100);
int baseLine = 0;
cv::Size label_size =
cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = (bbox.x1 - effect_roi.x) * width_ratio;
int y =
(bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine;
if (y < 0) y = 0;
if (x + label_size.width > image.cols) x = image.cols - label_size.width;
cv::rectangle(
image,
cv::Rect(cv::Point(x, y),
cv::Size(label_size.width, label_size.height + baseLine)),
color,
-1);
cv::putText(image,
text,
cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX,
0.4,
cv::Scalar(255, 255, 255));
}
if (save_path == "None") {
cv::imshow("image", image);
} else {
cv::imwrite(save_path, image);
std::cout << save_path << std::endl;
}
}
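// Map boxes from the letterboxed detector input back to original image
// coordinates.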
std::vector<BoxInfo> coordsback(const cv::Mat image,
const object_rect effect_roi,
const std::vector<BoxInfo>& bboxes) {
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
std::vector<BoxInfo> bboxes_oimg;
for (int i = 0; i < bboxes.size(); i++) {
auto bbox = bboxes[i];
bbox.x1 = (bbox.x1 - effect_roi.x) * width_ratio;
bbox.y1 = (bbox.y1 - effect_roi.y) * height_ratio;
bbox.x2 = (bbox.x2 - effect_roi.x) * width_ratio;
bbox.y2 = (bbox.y2 - effect_roi.y) * height_ratio;
bboxes_oimg.emplace_back(bbox);
}
return bboxes_oimg;
}
void image_infer_kpts(KeyPointDetector* kpts_detector,
cv::Mat image,
const object_rect effect_roi,
const std::vector<BoxInfo>& results,
std::string img_name = "kpts_vis",
bool save_img = true) {
std::vector<cv::Mat> cropimgs;
std::vector<std::vector<float>> center_bs;
std::vector<std::vector<float>> scale_bs;
std::vector<KeyPointResult> kpts_results;
auto results_oimg = coordsback(image, effect_roi, results);
for (int i = 0; i < results_oimg.size(); i++) {
auto rect = results_oimg[i];
if (rect.label == 0) {
cv::Mat cropimg;
std::vector<float> center, scale;
std::vector<int> area = {static_cast<int>(rect.x1),
static_cast<int>(rect.y1),
static_cast<int>(rect.x2),
static_cast<int>(rect.y2)};
CropImg(image, cropimg, area, center, scale);
// cv::imwrite("./test_crop_"+std::to_string(i)+".jpg", cropimg);
cropimgs.emplace_back(cropimg);
center_bs.emplace_back(center);
scale_bs.emplace_back(scale);
}
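    // Run keypoint inference as soon as a crop is collected (the effective
    // batch size is 1).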
if (cropimgs.size() == 1 ||
(cropimgs.size() > 0 && i == results_oimg.size() - 1)) {
kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results);
cropimgs.clear();
center_bs.clear();
scale_bs.clear();
}
}
std::vector<int> compression_params;
compression_params.push_back(cv::IMWRITE_JPEG_QUALITY);
compression_params.push_back(95);
std::string kpts_savepath =
"keypoint_" + img_name.substr(img_name.find_last_of('/') + 1);
cv::Mat kpts_vis_img =
VisualizeKptsResult(image, kpts_results, {0, 255, 0}, 0.3);
if (save_img) {
cv::imwrite(kpts_savepath, kpts_vis_img, compression_params);
printf("Visualized output saved as %s\n", kpts_savepath.c_str());
} else {
cv::imshow("image", kpts_vis_img);
}
}
int image_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
const char* imagepath) {
std::vector<cv::String> filenames;
cv::glob(imagepath, filenames, false);
for (auto img_name : filenames) {
cv::Mat image = cv::imread(img_name);
if (image.empty()) {
fprintf(stderr, "cv::imread %s failed\n", img_name.c_str());
return -1;
}
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, img_name);
}
}
return 0;
}
int webcam_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
int cam_id) {
cv::Mat image;
cv::VideoCapture cap(cam_id);
while (true) {
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, "", false);
}
}
return 0;
}
int video_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
const char* path) {
cv::Mat image;
cv::VideoCapture cap(path);
while (true) {
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, "", false);
}
}
return 0;
}
int benchmark(KeyPointDetector* kpts_detector) {
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
cv::Mat image(256, 192, CV_8UC3, cv::Scalar(1, 1, 1));
std::vector<float> center = {128, 96};
std::vector<float> scale = {256, 192};
std::vector<cv::Mat> cropimgs = {image};
std::vector<std::vector<float>> center_bs = {center};
std::vector<std::vector<float>> scale_bs = {scale};
std::vector<KeyPointResult> kpts_results;
for (int i = 0; i < warm_up + loop_num; i++) {
auto start = std::chrono::steady_clock::now();
std::vector<BoxInfo> results;
kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results);
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed = end - start;
double time = elapsed.count();
if (i >= warm_up) {
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr,
"%20s min = %7.2f max = %7.2f avg = %7.2f\n",
"tinypose",
time_min,
time_max,
time_avg);
return 0;
}
int main(int argc, char** argv) {
if (argc != 3) {
fprintf(stderr,
"usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n "
"For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; "
"\n For benchmark, mode=3 path=0.\n",
argv[0]);
return -1;
}
PicoDet detector =
PicoDet("../weight/picodet_m_416.mnn", 416, 416, 4, 0.45, 0.3);
KeyPointDetector* kpts_detector =
new KeyPointDetector("../weight/tinypose256.mnn", 4, 256, 192);
int mode = atoi(argv[1]);
switch (mode) {
case 0: {
int cam_id = atoi(argv[2]);
webcam_demo(detector, kpts_detector, cam_id);
break;
}
case 1: {
const char* images = argv[2];
image_demo(detector, kpts_detector, images);
break;
}
case 2: {
const char* path = argv[2];
video_demo(detector, kpts_detector, path);
break;
}
case 3: {
benchmark(kpts_detector);
break;
}
default: {
fprintf(stderr,
"usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; "
"\n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, "
"mode=2; \n For benchmark, mode=3 path=0.\n",
argv[0]);
break;
}
}
delete kpts_detector;
kpts_detector = nullptr;
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#include "picodet_mnn.h"
using namespace std;
PicoDet::PicoDet(const std::string &mnn_path,
int input_width,
int input_length,
int num_thread_,
float score_threshold_,
float nms_threshold_) {
num_thread = num_thread_;
in_w = input_width;
in_h = input_length;
score_threshold = score_threshold_;
nms_threshold = nms_threshold_;
PicoDet_interpreter = std::shared_ptr<MNN::Interpreter>(
MNN::Interpreter::createFromFile(mnn_path.c_str()));
MNN::ScheduleConfig config;
config.numThread = num_thread;
MNN::BackendConfig backendConfig;
backendConfig.precision = (MNN::BackendConfig::PrecisionMode)2;
config.backendConfig = &backendConfig;
PicoDet_session = PicoDet_interpreter->createSession(config);
input_tensor = PicoDet_interpreter->getSessionInput(PicoDet_session, nullptr);
}
PicoDet::~PicoDet() {
PicoDet_interpreter->releaseModel();
PicoDet_interpreter->releaseSession(PicoDet_session);
}
int PicoDet::detect(cv::Mat &raw_image, std::vector<BoxInfo> &result_list) {
if (raw_image.empty()) {
std::cout << "image is empty ,please check!" << std::endl;
return -1;
}
image_h = raw_image.rows;
image_w = raw_image.cols;
cv::Mat image;
cv::resize(raw_image, image, cv::Size(in_w, in_h));
PicoDet_interpreter->resizeTensor(input_tensor, {1, 3, in_h, in_w});
PicoDet_interpreter->resizeSession(PicoDet_session);
std::shared_ptr<MNN::CV::ImageProcess> pretreat(MNN::CV::ImageProcess::create(
MNN::CV::BGR, MNN::CV::BGR, mean_vals, 3, norm_vals, 3));
pretreat->convert(image.data, in_w, in_h, image.step[0], input_tensor);
auto start = chrono::steady_clock::now();
// run network
PicoDet_interpreter->runSession(PicoDet_session);
// get output data
std::vector<std::vector<BoxInfo>> results;
results.resize(num_class);
for (const auto &head_info : heads_info) {
MNN::Tensor *tensor_scores = PicoDet_interpreter->getSessionOutput(
PicoDet_session, head_info.cls_layer.c_str());
MNN::Tensor *tensor_boxes = PicoDet_interpreter->getSessionOutput(
PicoDet_session, head_info.dis_layer.c_str());
MNN::Tensor tensor_scores_host(tensor_scores,
tensor_scores->getDimensionType());
tensor_scores->copyToHostTensor(&tensor_scores_host);
MNN::Tensor tensor_boxes_host(tensor_boxes,
tensor_boxes->getDimensionType());
tensor_boxes->copyToHostTensor(&tensor_boxes_host);
decode_infer(&tensor_scores_host,
&tensor_boxes_host,
head_info.stride,
score_threshold,
results);
}
auto end = chrono::steady_clock::now();
chrono::duration<double> elapsed = end - start;
cout << "inference time:" << elapsed.count() << " s, ";
for (int i = 0; i < (int)results.size(); i++) {
nms(results[i], nms_threshold);
for (auto box : results[i]) {
box.x1 = box.x1 / in_w * image_w;
box.x2 = box.x2 / in_w * image_w;
box.y1 = box.y1 / in_h * image_h;
box.y2 = box.y2 / in_h * image_h;
result_list.push_back(box);
}
}
cout << "detect " << result_list.size() << " objects." << std::endl;
;
return 0;
}
void PicoDet::decode_infer(MNN::Tensor *cls_pred,
MNN::Tensor *dis_pred,
int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results) {
int feature_h = in_h / stride;
int feature_w = in_w / stride;
for (int idx = 0; idx < feature_h * feature_w; idx++) {
const float *scores = cls_pred->host<float>() + (idx * num_class);
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < num_class; label++) {
if (scores[label] > score) {
score = scores[label];
cur_label = label;
}
}
if (score > threshold) {
const float *bbox_pred =
dis_pred->host<float>() + (idx * 4 * (reg_max + 1));
results[cur_label].push_back(
disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
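// GFL-style box decoding: each of the four distances is the expectation of a
// softmax distribution over reg_max + 1 bins, scaled by the stride.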
BoxInfo PicoDet::disPred2Bbox(
const float *&dfl_det, int label, float score, int x, int y, int stride) {
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
for (int i = 0; i < 4; i++) {
float dis = 0;
float *dis_after_sm = new float[reg_max + 1];
activation_function_softmax(
dfl_det + i * (reg_max + 1), dis_after_sm, reg_max + 1);
for (int j = 0; j < reg_max + 1; j++) {
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)in_w);
float ymax = (std::min)(ct_y + dis_pred[3], (float)in_h);
return BoxInfo{xmin, ymin, xmax, ymax, score, label};
}
void PicoDet::nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) {
return a.score > b.score;
});
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) *
(input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
} else {
j++;
}
}
}
}
string PicoDet::get_label_str(int label) { return labels[label]; }
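// Schraudolph-style fast exp approximation: constructs the float's bit
// pattern directly; used by the softmax below.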
inline float fast_exp(float x) {
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); }
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length) {
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{0};
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#ifndef __PicoDet_H__
#define __PicoDet_H__
#pragma once
#include "Interpreter.hpp"
#include <algorithm>
#include <chrono>
#include <iostream>
#include <memory>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "ImageProcess.hpp"
#include "MNNDefine.h"
#include "Tensor.hpp"
typedef struct HeadInfo_ {
std::string cls_layer;
std::string dis_layer;
int stride;
} HeadInfo;
typedef struct BoxInfo_ {
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet {
public:
PicoDet(const std::string &mnn_path,
int input_width,
int input_length,
int num_thread_ = 4,
float score_threshold_ = 0.5,
float nms_threshold_ = 0.3);
~PicoDet();
int detect(cv::Mat &img, std::vector<BoxInfo> &result_list);
std::string get_label_str(int label);
private:
void decode_infer(MNN::Tensor *cls_pred,
MNN::Tensor *dis_pred,
int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results);
BoxInfo disPred2Bbox(
const float *&dfl_det, int label, float score, int x, int y, int stride);
void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH);
private:
std::shared_ptr<MNN::Interpreter> PicoDet_interpreter;
MNN::Session *PicoDet_session = nullptr;
MNN::Tensor *input_tensor = nullptr;
int num_thread;
int image_w;
int image_h;
int in_w = 320;
int in_h = 320;
float score_threshold;
float nms_threshold;
const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
const int num_class = 80;
const int reg_max = 7;
std::vector<HeadInfo> heads_info{
// cls_pred|dis_pred|stride
{"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8},
{"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16},
{"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32},
{"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64},
};
std::vector<std::string> labels{
"person", "bicycle", "car",
"motorcycle", "airplane", "bus",
"train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign",
"parking meter", "bench", "bird",
"cat", "dog", "horse",
"sheep", "cow", "elephant",
"bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag",
"tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup",
"fork", "knife", "spoon",
"bowl", "banana", "apple",
"sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza",
"donut", "cake", "chair",
"couch", "potted plant", "bed",
"dining table", "toilet", "tv",
"laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave",
"oven", "toaster", "sink",
"refrigerator", "book", "clock",
"vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
};
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length);
inline float fast_exp(float x);
inline float sigmoid(float x);
#endif