Unverified commit 34654225 authored by qq_30618961, committed by GitHub

add 3rd infer engine (#4336)

* add 3rd infer engine
Parent 3ee7bde2
cmake_minimum_required(VERSION 3.9)
project(picodet-mnn)
set(CMAKE_CXX_STANDARD 17)
# find_package(OpenCV REQUIRED PATHS "/work/dependence/opencv/opencv-3.4.3/build")
find_package(OpenCV REQUIRED)
include_directories(
/path/to/MNN/include/MNN
/path/to/MNN/include
.
)
link_directories(mnn/lib)
add_library(libMNN SHARED IMPORTED)
set_target_properties(
libMNN
PROPERTIES IMPORTED_LOCATION
${CMAKE_SOURCE_DIR}/mnn/lib/libMNN.so
)
add_executable(picodet-mnn main.cpp picodet_mnn.cpp)
target_link_libraries(picodet-mnn libMNN ${OpenCV_LIBS})
# PicoDet MNN Demo
This folder provides PicoDet inference code using
[Alibaba's MNN framework](https://github.com/alibaba/MNN). Most of the implementation in
this folder is the same as *demo_ncnn*.
## Install MNN
### Python library
Just run:
``` shell
pip install MNN
```
### C++ library
Please follow the [official document](https://www.yuque.com/mnn/en/build_linux) to build MNN engine.
- Create picodet_m_416_coco.onnx
```shell
modelName=picodet_m_416_coco
# export model
python tools/export_model.py \
-c configs/picodet/${modelName}.yml \
-o weights=${modelName}.pdparams \
--output_dir=inference_model
# convert to onnx
paddle2onnx --model_dir inference_model/${modelName} \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--opset_version 11 \
--save_file ${modelName}.onnx
# onnxsim
python -m onnxsim ${modelName}.onnx ${modelName}_processed.onnx
```
- Convert the model (point `--modelFile` at the ONNX file you just exported)
``` shell
python -m MNN.tools.mnnconvert -f ONNX --modelFile picodet-416.onnx --MNNModel picodet-416.mnn
```
A converted model is available at this [download link](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416.mnn).
## Build
The Python script *demo_mnn.py* can run directly and independently, without the main PicoDet repo.
`PicoDetONNX` and `PicoDetTorch` are two classes used to check the similarity of MNN inference results
against the ONNX and PyTorch models. They can be removed with no side effects.
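For instance, a minimal cross-backend check might look like the sketch below (the file names and the 416 input shape are illustrative, and it assumes the classes are importable from *demo_mnn.py* with MNN and onnxruntime installed):
```python
import cv2
import numpy as np
from demo_mnn import PicoDetMNN, PicoDetONNX

img = cv2.imread("../imgs/test.jpg")
mnn_det = PicoDetMNN("../model/picodet-416.mnn", input_shape=[416, 416])
onnx_det = PicoDetONNX("../model/picodet-416.onnx", input_shape=[416, 416])

# Share one preprocessed input so any difference comes from the backends.
img_input, _ = mnn_det.preprocess(img)
scores_mnn, _ = mnn_det.infer_image(img_input)
scores_onnx, _ = onnx_det.infer_image(img_input)
for s_m, s_o in zip(scores_mnn, scores_onnx):
    print("max score diff:", np.abs(s_m - s_o).max())
```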
For the C++ code, replace `libMNN.so` under *./mnn/lib* with the one you just compiled, modify the OpenCV and MNN paths in the CMake file,
and run:
``` shell
mkdir build && cd build
cmake ..
make
```
Note that a flag in `main.cpp` controls whether to show the detection results or save them into a folder.
``` c++
#define __SAVE_RESULT__ // if defined, save drawn results to ../results; otherwise show them in a window
```
## Run
### Python
`demo_mnn.py` provides an inference class `PicoDetMNN` that combines preprocessing, postprocessing, and visualization.
It can also be used from the command line:
```shell
demo_mnn.py [-h] [--model_path MODEL_PATH] [--cfg_path CFG_PATH]
[--img_fold IMG_FOLD] [--result_fold RESULT_FOLD]
[--input_shape INPUT_SHAPE INPUT_SHAPE]
[--backend {MNN,ONNX,torch}]
```
For example:
``` shell
# run MNN 416 model
python ./demo_mnn.py --model_path ../model/picodet-416.mnn --img_fold ../imgs --result_fold ../results
# run MNN 320 model
python ./demo_mnn.py --model_path ../model/picodet-320.mnn --input_shape 320 320 --backend MNN
# run onnx model
python ./demo_mnn.py --model_path ../model/sim.onnx --backend ONNX
```
### C++
The C++ inference interface is the same as in the NCNN code. To detect images in a folder, run:
``` shell
./picodet-mnn "1" "../imgs/test.jpg"
```
For a speed benchmark:
``` shell
./picodet-mnn "3" "0"
```
## Reference
[MNN](https://github.com/alibaba/MNN)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#include "picodet_mnn.hpp"
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#define __SAVE_RESULT__ // if defined, save drawn results to ../results; otherwise show them in a window
struct object_rect {
int x;
int y;
int width;
int height;
};
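// Letterbox resize: scale src to fit dst_size while keeping its aspect ratio,
// center it on a black canvas, and record where the valid pixels land in effect_area.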
int resize_uniform(cv::Mat& src, cv::Mat& dst, cv::Size dst_size, object_rect& effect_area)
{
int w = src.cols;
int h = src.rows;
int dst_w = dst_size.width;
int dst_h = dst_size.height;
dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0));
float ratio_src = w * 1.0 / h;
float ratio_dst = dst_w * 1.0 / dst_h;
int tmp_w = 0;
int tmp_h = 0;
if (ratio_src > ratio_dst) {
tmp_w = dst_w;
tmp_h = floor((dst_w * 1.0 / w) * h);
}
else if (ratio_src < ratio_dst) {
tmp_h = dst_h;
tmp_w = floor((dst_h * 1.0 / h) * w);
}
else {
cv::resize(src, dst, dst_size);
effect_area.x = 0;
effect_area.y = 0;
effect_area.width = dst_w;
effect_area.height = dst_h;
return 0;
}
cv::Mat tmp;
cv::resize(src, tmp, cv::Size(tmp_w, tmp_h));
if (tmp_w != dst_w) {
int index_w = floor((dst_w - tmp_w) / 2.0);
for (int i = 0; i < dst_h; i++) {
memcpy(dst.data + i * dst_w * 3 + index_w * 3, tmp.data + i * tmp_w * 3, tmp_w * 3);
}
effect_area.x = index_w;
effect_area.y = 0;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
}
else if (tmp_h != dst_h) {
int index_h = floor((dst_h - tmp_h) / 2.0);
memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3);
effect_area.x = 0;
effect_area.y = index_h;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
}
else {
printf("error\n");
}
return 0;
}
const int color_list[80][3] =
{
{216 , 82 , 24},
{236 ,176 , 31},
{125 , 46 ,141},
{118 ,171 , 47},
{ 76 ,189 ,237},
{238 , 19 , 46},
{ 76 , 76 , 76},
{153 ,153 ,153},
{255 , 0 , 0},
{255 ,127 , 0},
{190 ,190 , 0},
{ 0 ,255 , 0},
{ 0 , 0 ,255},
{170 , 0 ,255},
{ 84 , 84 , 0},
{ 84 ,170 , 0},
{ 84 ,255 , 0},
{170 , 84 , 0},
{170 ,170 , 0},
{170 ,255 , 0},
{255 , 84 , 0},
{255 ,170 , 0},
{255 ,255 , 0},
{ 0 , 84 ,127},
{ 0 ,170 ,127},
{ 0 ,255 ,127},
{ 84 , 0 ,127},
{ 84 , 84 ,127},
{ 84 ,170 ,127},
{ 84 ,255 ,127},
{170 , 0 ,127},
{170 , 84 ,127},
{170 ,170 ,127},
{170 ,255 ,127},
{255 , 0 ,127},
{255 , 84 ,127},
{255 ,170 ,127},
{255 ,255 ,127},
{ 0 , 84 ,255},
{ 0 ,170 ,255},
{ 0 ,255 ,255},
{ 84 , 0 ,255},
{ 84 , 84 ,255},
{ 84 ,170 ,255},
{ 84 ,255 ,255},
{170 , 0 ,255},
{170 , 84 ,255},
{170 ,170 ,255},
{170 ,255 ,255},
{255 , 0 ,255},
{255 , 84 ,255},
{255 ,170 ,255},
{ 42 , 0 , 0},
{ 84 , 0 , 0},
{127 , 0 , 0},
{170 , 0 , 0},
{212 , 0 , 0},
{255 , 0 , 0},
{ 0 , 42 , 0},
{ 0 , 84 , 0},
{ 0 ,127 , 0},
{ 0 ,170 , 0},
{ 0 ,212 , 0},
{ 0 ,255 , 0},
{ 0 , 0 , 42},
{ 0 , 0 , 84},
{ 0 , 0 ,127},
{ 0 , 0 ,170},
{ 0 , 0 ,212},
{ 0 , 0 ,255},
{ 0 , 0 , 0},
{ 36 , 36 , 36},
{ 72 , 72 , 72},
{109 ,109 ,109},
{145 ,145 ,145},
{182 ,182 ,182},
{218 ,218 ,218},
{ 0 ,113 ,188},
{ 80 ,182 ,188},
{127 ,127 , 0},
};
void draw_bboxes(const cv::Mat& bgr, const std::vector<BoxInfo>& bboxes, object_rect effect_roi, std::string save_path="None")
{
static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus",
"train", "truck", "boat", "traffic light", "fire hydrant",
"stop sign", "parking meter", "bench", "bird", "cat", "dog",
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat",
"baseball glove", "skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
"banana", "apple", "sandwich", "orange", "broccoli", "carrot",
"hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop",
"mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
"toaster", "sink", "refrigerator", "book", "clock", "vase",
"scissors", "teddy bear", "hair drier", "toothbrush"
};
cv::Mat image = bgr.clone();
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
for (size_t i = 0; i < bboxes.size(); i++)
{
const BoxInfo& bbox = bboxes[i];
cv::Scalar color = cv::Scalar(color_list[bbox.label][0], color_list[bbox.label][1], color_list[bbox.label][2]);
cv::rectangle(image, cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio, (bbox.y1 - effect_roi.y) * height_ratio),
cv::Point((bbox.x2 - effect_roi.x) * width_ratio, (bbox.y2 - effect_roi.y) * height_ratio)), color);
char text[256];
sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100);
int baseLine = 0;
cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = (bbox.x1 - effect_roi.x) * width_ratio;
int y = (bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine;
if (y < 0)
y = 0;
if (x + label_size.width > image.cols)
x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
color, -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255));
}
if (save_path == "None")
{
cv::imshow("image", image);
}
else
{
cv::imwrite(save_path, image);
std::cout << save_path << std::endl;
}
}
int image_demo(PicoDet &detector, const char* imagepath)
{
std::vector<cv::String> filenames;
cv::glob(imagepath, filenames, false);
for (auto img_name : filenames)
{
cv::Mat image = cv::imread(img_name);
if (image.empty())
{
fprintf(stderr, "cv::imread %s failed\n", img_name.c_str());
return -1;
}
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
#ifdef __SAVE_RESULT__
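// NOTE: replace(3, 4, "results") assumes paths of the form "../imgs/...",
// swapping the 4-character folder name "imgs" for "results".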
std::string save_path = img_name;
draw_bboxes(image, results, effect_roi, save_path.replace(3, 4, "results"));
#else
draw_bboxes(image, results, effect_roi);
cv::waitKey(0);
#endif
}
return 0;
}
int webcam_demo(PicoDet& detector, int cam_id)
{
cv::Mat image;
cv::VideoCapture cap(cam_id);
while (true)
{
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
int video_demo(PicoDet& detector, const char* path)
{
cv::Mat image;
cv::VideoCapture cap(path);
while (true)
{
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
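// Benchmark: the first warm_up iterations are excluded from the statistics,
// then min/max/avg times (in seconds) are reported over loop_num runs.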
int benchmark(PicoDet& detector)
{
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
cv::Mat image(320, 320, CV_8UC3, cv::Scalar(1, 1, 1));
for (int i = 0; i < warm_up + loop_num; i++)
{
auto start = std::chrono::steady_clock::now();
std::vector<BoxInfo> results;
detector.detect(image, results);
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed = end - start;
double time = elapsed.count();
if (i >= warm_up)
{
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet", time_min, time_max, time_avg);
return 0;
}
int main(int argc, char** argv)
{
if (argc != 3)
{
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; \n For benchmark, mode=3 path=0.\n", argv[0]);
return -1;
}
PicoDet detector = PicoDet("../weight/picodet-416.mnn", 416, 416, 4, 0.45, 0.3);
int mode = atoi(argv[1]);
switch (mode)
{
case 0:{
int cam_id = atoi(argv[2]);
webcam_demo(detector, cam_id);
break;
}
case 1:{
const char* images = argv[2];
image_demo(detector, images);
break;
}
case 2:{
const char* path = argv[2];
video_demo(detector, path);
break;
}
case 3:{
benchmark(detector);
break;
}
default:{
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; \n For benchmark, mode=3 path=0.\n", argv[0]);
break;
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#include "picodet_mnn.hpp"
using namespace std;
PicoDet::PicoDet(const std::string &mnn_path,
int input_width, int input_length, int num_thread_,
float score_threshold_, float nms_threshold_)
{
num_thread = num_thread_;
in_w = input_width;
in_h = input_length;
score_threshold = score_threshold_;
nms_threshold = nms_threshold_;
PicoDet_interpreter = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(mnn_path.c_str()));
MNN::ScheduleConfig config;
config.numThread = num_thread;
MNN::BackendConfig backendConfig;
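// Precision mode 2 corresponds to MNN's Precision_Low (reduced precision where the backend supports it).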
backendConfig.precision = (MNN::BackendConfig::PrecisionMode) 2;
config.backendConfig = &backendConfig;
PicoDet_session = PicoDet_interpreter->createSession(config);
input_tensor = PicoDet_interpreter->getSessionInput(PicoDet_session, nullptr);
}
PicoDet::~PicoDet()
{
PicoDet_interpreter->releaseModel();
PicoDet_interpreter->releaseSession(PicoDet_session);
}
int PicoDet::detect(cv::Mat &raw_image, std::vector<BoxInfo> &result_list)
{
if (raw_image.empty()) {
std::cout << "image is empty ,please check!" << std::endl;
return -1;
}
image_h = raw_image.rows;
image_w = raw_image.cols;
cv::Mat image;
cv::resize(raw_image, image, cv::Size(in_w, in_h));
PicoDet_interpreter->resizeTensor(input_tensor, {1, 3, in_h, in_w});
PicoDet_interpreter->resizeSession(PicoDet_session);
std::shared_ptr<MNN::CV::ImageProcess> pretreat(
MNN::CV::ImageProcess::create(MNN::CV::BGR, MNN::CV::BGR, mean_vals, 3,
norm_vals, 3));
pretreat->convert(image.data, in_w, in_h, image.step[0], input_tensor);
auto start = chrono::steady_clock::now();
// run network
PicoDet_interpreter->runSession(PicoDet_session);
// get output data
std::vector<std::vector<BoxInfo>> results;
results.resize(num_class);
for (const auto &head_info : heads_info)
{
MNN::Tensor *tensor_scores = PicoDet_interpreter->getSessionOutput(PicoDet_session, head_info.cls_layer.c_str());
MNN::Tensor *tensor_boxes = PicoDet_interpreter->getSessionOutput(PicoDet_session, head_info.dis_layer.c_str());
MNN::Tensor tensor_scores_host(tensor_scores, tensor_scores->getDimensionType());
tensor_scores->copyToHostTensor(&tensor_scores_host);
MNN::Tensor tensor_boxes_host(tensor_boxes, tensor_boxes->getDimensionType());
tensor_boxes->copyToHostTensor(&tensor_boxes_host);
decode_infer(&tensor_scores_host, &tensor_boxes_host, head_info.stride, score_threshold, results);
}
auto end = chrono::steady_clock::now();
chrono::duration<double> elapsed = end - start;
cout << "inference time:" << elapsed.count() << " s, ";
for (int i = 0; i < (int)results.size(); i++)
{
nms(results[i], nms_threshold);
for (auto box : results[i])
{
box.x1 = box.x1 / in_w * image_w;
box.x2 = box.x2 / in_w * image_w;
box.y1 = box.y1 / in_h * image_h;
box.y2 = box.y2 / in_h * image_h;
result_list.push_back(box);
}
}
cout << "detect " << result_list.size() << " objects" << endl;
return 0;
}
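// For each feature-map cell, take the arg-max class score; cells above the
// score threshold are decoded into boxes and bucketed per class for later NMS.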
void PicoDet::decode_infer(MNN::Tensor *cls_pred, MNN::Tensor *dis_pred, int stride, float threshold, std::vector<std::vector<BoxInfo>> &results)
{
int feature_h = in_h / stride;
int feature_w = in_w / stride;
for (int idx = 0; idx < feature_h * feature_w; idx++)
{
const float *scores = cls_pred->host<float>() + (idx * num_class);
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < num_class; label++)
{
if (scores[label] > score)
{
score = scores[label];
cur_label = label;
}
}
if (score > threshold)
{
const float *bbox_pred = dis_pred->host<float>() + (idx * 4 * (reg_max + 1));
results[cur_label].push_back(disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
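// GFL/DFL decoding: each side's distance is the expectation of a discrete
// distribution over reg_max+1 bins (softmax weights times bin index), scaled by stride.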
BoxInfo PicoDet::disPred2Bbox(const float *&dfl_det, int label, float score, int x, int y, int stride)
{
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
for (int i = 0; i < 4; i++)
{
float dis = 0;
float *dis_after_sm = new float[reg_max + 1];
activation_function_softmax(dfl_det + i * (reg_max + 1), dis_after_sm, reg_max + 1);
for (int j = 0; j < reg_max + 1; j++)
{
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)in_w);
float ymax = (std::min)(ct_y + dis_pred[3], (float)in_h);
return BoxInfo{xmin, ymin, xmax, ymax, score, label};
}
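// Greedy IoU-based NMS: boxes are sorted by score, and any box overlapping a kept
// box with IoU >= NMS_THRESH is dropped (the +1 terms treat coordinates as inclusive pixels).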
void PicoDet::nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH)
{
std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i)
{
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i)
{
for (int j = i + 1; j < int(input_boxes.size());)
{
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH)
{
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
}
else
{
j++;
}
}
}
}
string PicoDet::get_label_str(int label)
{
return labels[label];
}
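// Fast exp approximation (Schraudolph's trick): constructs the IEEE-754 bit pattern
// of 2^(x/ln2) directly; a few percent of error, acceptable for sigmoid/softmax here.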
inline float fast_exp(float x)
{
union
{
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x)
{
return 1.0f / (1.0f + fast_exp(-x));
}
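// Numerically stable softmax: subtracting the max (alpha) before exponentiation avoids overflow.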
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length)
{
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{0};
for (int i = 0; i < length; ++i)
{
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i)
{
dst[i] /= denominator;
}
return 0;
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#ifndef __PicoDet_H__
#define __PicoDet_H__
#pragma once
#include "Interpreter.hpp"
#include "MNNDefine.h"
#include "Tensor.hpp"
#include "ImageProcess.hpp"
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include <memory>
#include <chrono>
typedef struct HeadInfo_
{
std::string cls_layer;
std::string dis_layer;
int stride;
} HeadInfo;
typedef struct BoxInfo_
{
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet {
public:
PicoDet(const std::string &mnn_path,
int input_width, int input_length, int num_thread_ = 4, float score_threshold_ = 0.5, float nms_threshold_ = 0.3);
~PicoDet();
int detect(cv::Mat &img, std::vector<BoxInfo> &result_list);
std::string get_label_str(int label);
private:
void decode_infer(MNN::Tensor *cls_pred, MNN::Tensor *dis_pred, int stride, float threshold, std::vector<std::vector<BoxInfo>> &results);
BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x, int y, int stride);
void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH);
private:
std::shared_ptr<MNN::Interpreter> PicoDet_interpreter;
MNN::Session *PicoDet_session = nullptr;
MNN::Tensor *input_tensor = nullptr;
int num_thread;
int image_w;
int image_h;
int in_w = 320;
int in_h = 320;
float score_threshold;
float nms_threshold;
const float mean_vals[3] = { 103.53f, 116.28f, 123.675f };
const float norm_vals[3] = { 0.017429f, 0.017507f, 0.017125f };
const int num_class = 80;
const int reg_max = 7;
std::vector<HeadInfo> heads_info{
// cls_pred|dis_pred|stride
{"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8},
{"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16},
{"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32},
{"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64},
};
std::vector<std::string>
labels{"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
};
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length);
inline float fast_exp(float x);
inline float sigmoid(float x);
#endif
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
# -*- coding: utf-8 -*-
import argparse
from abc import ABCMeta, abstractmethod
from pathlib import Path
import cv2
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import softmax
from tqdm import tqdm
_COLORS = (np.array([
0.000,
0.447,
0.741,
0.850,
0.325,
0.098,
0.929,
0.694,
0.125,
0.494,
0.184,
0.556,
0.466,
0.674,
0.188,
0.301,
0.745,
0.933,
0.635,
0.078,
0.184,
0.300,
0.300,
0.300,
0.600,
0.600,
0.600,
1.000,
0.000,
0.000,
1.000,
0.500,
0.000,
0.749,
0.749,
0.000,
0.000,
1.000,
0.000,
0.000,
0.000,
1.000,
0.667,
0.000,
1.000,
0.333,
0.333,
0.000,
0.333,
0.667,
0.000,
0.333,
1.000,
0.000,
0.667,
0.333,
0.000,
0.667,
0.667,
0.000,
0.667,
1.000,
0.000,
1.000,
0.333,
0.000,
1.000,
0.667,
0.000,
1.000,
1.000,
0.000,
0.000,
0.333,
0.500,
0.000,
0.667,
0.500,
0.000,
1.000,
0.500,
0.333,
0.000,
0.500,
0.333,
0.333,
0.500,
0.333,
0.667,
0.500,
0.333,
1.000,
0.500,
0.667,
0.000,
0.500,
0.667,
0.333,
0.500,
0.667,
0.667,
0.500,
0.667,
1.000,
0.500,
1.000,
0.000,
0.500,
1.000,
0.333,
0.500,
1.000,
0.667,
0.500,
1.000,
1.000,
0.500,
0.000,
0.333,
1.000,
0.000,
0.667,
1.000,
0.000,
1.000,
1.000,
0.333,
0.000,
1.000,
0.333,
0.333,
1.000,
0.333,
0.667,
1.000,
0.333,
1.000,
1.000,
0.667,
0.000,
1.000,
0.667,
0.333,
1.000,
0.667,
0.667,
1.000,
0.667,
1.000,
1.000,
1.000,
0.000,
1.000,
1.000,
0.333,
1.000,
1.000,
0.667,
1.000,
0.333,
0.000,
0.000,
0.500,
0.000,
0.000,
0.667,
0.000,
0.000,
0.833,
0.000,
0.000,
1.000,
0.000,
0.000,
0.000,
0.167,
0.000,
0.000,
0.333,
0.000,
0.000,
0.500,
0.000,
0.000,
0.667,
0.000,
0.000,
0.833,
0.000,
0.000,
1.000,
0.000,
0.000,
0.000,
0.167,
0.000,
0.000,
0.333,
0.000,
0.000,
0.500,
0.000,
0.000,
0.667,
0.000,
0.000,
0.833,
0.000,
0.000,
1.000,
0.000,
0.000,
0.000,
0.143,
0.143,
0.143,
0.286,
0.286,
0.286,
0.429,
0.429,
0.429,
0.571,
0.571,
0.571,
0.714,
0.714,
0.714,
0.857,
0.857,
0.857,
0.000,
0.447,
0.741,
0.314,
0.717,
0.741,
0.50,
0.5,
0,
]).astype(np.float32).reshape(-1, 3))
def get_resize_matrix(raw_shape, dst_shape, keep_ratio):
"""
Get resize matrix for resizing raw img to input size
:param raw_shape: (width, height) of raw image
:param dst_shape: (width, height) of input image
:param keep_ratio: whether to keep the original aspect ratio
:return: 3x3 Matrix
"""
r_w, r_h = raw_shape
d_w, d_h = dst_shape
Rs = np.eye(3)
if keep_ratio:
C = np.eye(3)
C[0, 2] = -r_w / 2
C[1, 2] = -r_h / 2
if r_w / r_h < d_w / d_h:
ratio = d_h / r_h
else:
ratio = d_w / r_w
Rs[0, 0] *= ratio
Rs[1, 1] *= ratio
T = np.eye(3)
T[0, 2] = 0.5 * d_w
T[1, 2] = 0.5 * d_h
return T @ Rs @ C
else:
Rs[0, 0] *= d_w / r_w
Rs[1, 1] *= d_h / r_h
return Rs
def warp_boxes(boxes, M, width, height):
"""Apply transform to boxes
Copy from picodet/data/transform/warp.py
"""
n = len(boxes)
if n:
# warp points
xy = np.ones((n * 4, 3))
xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
n * 4, 2) # x1y1, x2y2, x1y2, x2y1
xy = xy @ M.T # transform
xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale
# create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate(
(x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
# clip boxes
xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
return xy.astype(np.float32)
else:
return boxes
def overlay_bbox_cv(img, all_box, class_names):
"""Draw result boxes
Copy from picodet/util/visualization.py
"""
# all_box array of [label, x0, y0, x1, y1, score]
all_box.sort(key=lambda v: v[5])
for box in all_box:
label, x0, y0, x1, y1, score = box
color = (_COLORS[label] * 255).astype(np.uint8).tolist()
text = "{}:{:.1f}%".format(class_names[label], score * 100)
txt_color = (0, 0, 0) if np.mean(_COLORS[label]) > 0.5 else (255, 255, 255)
font = cv2.FONT_HERSHEY_SIMPLEX
txt_size = cv2.getTextSize(text, font, 0.5, 2)[0]
cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
cv2.rectangle(
img,
(x0, y0 - txt_size[1] - 1),
(x0 + txt_size[0] + txt_size[1], y0 - 1),
color,
-1, )
cv2.putText(img, text, (x0, y0 - 1), font, 0.5, txt_color, thickness=1)
return img
def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
"""
Args:
box_scores (N, 5): boxes in corner-form and probabilities.
iou_threshold: intersection over union threshold.
top_k: keep top_k results. If k <= 0, keep all the results.
candidate_size: only consider the candidates with the highest scores.
Returns:
picked box_scores (K, 5): the kept boxes with their scores.
"""
scores = box_scores[:, -1]
boxes = box_scores[:, :-1]
picked = []
indexes = np.argsort(scores)
indexes = indexes[-candidate_size:]
while len(indexes) > 0:
current = indexes[-1]
picked.append(current)
if 0 < top_k == len(picked) or len(indexes) == 1:
break
current_box = boxes[current, :]
indexes = indexes[:-1]
rest_boxes = boxes[indexes, :]
iou = iou_of(
rest_boxes,
np.expand_dims(
current_box, axis=0), )
indexes = indexes[iou <= iou_threshold]
return box_scores[picked, :]
def iou_of(boxes0, boxes1, eps=1e-5):
"""Return intersection-over-union (Jaccard index) of boxes.
Args:
boxes0 (N, 4): ground truth boxes.
boxes1 (N or 1, 4): predicted boxes.
eps: a small number to avoid 0 as denominator.
Returns:
iou (N): IoU values.
"""
overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2])
overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
overlap_area = area_of(overlap_left_top, overlap_right_bottom)
area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
return overlap_area / (area0 + area1 - overlap_area + eps)
def area_of(left_top, right_bottom):
"""Compute the areas of rectangles given two corners.
Args:
left_top (N, 2): left top corner.
right_bottom (N, 2): right bottom corner.
Returns:
area (N): return the area.
"""
hw = np.clip(right_bottom - left_top, 0.0, None)
return hw[..., 0] * hw[..., 1]
class PicoDetABC(metaclass=ABCMeta):
def __init__(
self,
input_shape=[416, 416],
reg_max=7,
strides=[8, 16, 32, 64],
prob_threshold=0.4,
iou_threshold=0.3,
num_candidate=1000,
top_k=-1, ):
self.strides = strides
self.input_shape = input_shape
self.reg_max = reg_max
self.prob_threshold = prob_threshold
self.iou_threshold = iou_threshold
self.num_candidate = num_candidate
self.top_k = top_k
self.img_mean = [103.53, 116.28, 123.675]
self.img_std = [57.375, 57.12, 58.395]
self.input_size = (self.input_shape[1], self.input_shape[0])
self.class_names = [
"person",
"bicycle",
"car",
"motorcycle",
"airplane",
"bus",
"train",
"truck",
"boat",
"traffic_light",
"fire_hydrant",
"stop_sign",
"parking_meter",
"bench",
"bird",
"cat",
"dog",
"horse",
"sheep",
"cow",
"elephant",
"bear",
"zebra",
"giraffe",
"backpack",
"umbrella",
"handbag",
"tie",
"suitcase",
"frisbee",
"skis",
"snowboard",
"sports_ball",
"kite",
"baseball_bat",
"baseball_glove",
"skateboard",
"surfboard",
"tennis_racket",
"bottle",
"wine_glass",
"cup",
"fork",
"knife",
"spoon",
"bowl",
"banana",
"apple",
"sandwich",
"orange",
"broccoli",
"carrot",
"hot_dog",
"pizza",
"donut",
"cake",
"chair",
"couch",
"potted_plant",
"bed",
"dining_table",
"toilet",
"tv",
"laptop",
"mouse",
"remote",
"keyboard",
"cell_phone",
"microwave",
"oven",
"toaster",
"sink",
"refrigerator",
"book",
"clock",
"vase",
"scissors",
"teddy_bear",
"hair_drier",
"toothbrush",
]
def preprocess(self, img):
# resize image
ResizeM = get_resize_matrix((img.shape[1], img.shape[0]),
self.input_size, True)
img_resize = cv2.warpPerspective(img, ResizeM, dsize=self.input_size)
# normalize image
img_input = img_resize.astype(np.float32) / 255
img_mean = np.array(
self.img_mean, dtype=np.float32).reshape(1, 1, 3) / 255
img_std = np.array(
self.img_std, dtype=np.float32).reshape(1, 1, 3) / 255
img_input = (img_input - img_mean) / img_std
# expand dims
img_input = np.transpose(img_input, [2, 0, 1])
img_input = np.expand_dims(img_input, axis=0)
return img_input, ResizeM
def postprocess(self, scores, raw_boxes, ResizeM, raw_shape):
# generate centers
decode_boxes = []
select_scores = []
for stride, box_distribute, score in zip(self.strides, raw_boxes,
scores):
# centers
fm_h = self.input_shape[0] / stride
fm_w = self.input_shape[1] / stride
h_range = np.arange(fm_h)
w_range = np.arange(fm_w)
ww, hh = np.meshgrid(w_range, h_range)
ct_row = (hh.flatten() + 0.5) * stride
ct_col = (ww.flatten() + 0.5) * stride
center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)
# box distribution to distance
reg_range = np.arange(self.reg_max + 1)
box_distance = box_distribute.reshape((-1, self.reg_max + 1))
box_distance = softmax(box_distance, axis=1)
box_distance = box_distance * np.expand_dims(reg_range, axis=0)
box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
box_distance = box_distance * stride
# top K candidate
topk_idx = np.argsort(score.max(axis=1))[::-1]
topk_idx = topk_idx[:self.num_candidate]
center = center[topk_idx]
score = score[topk_idx]
box_distance = box_distance[topk_idx]
# decode box
decode_box = center + [-1, -1, 1, 1] * box_distance
select_scores.append(score)
decode_boxes.append(decode_box)
# nms
bboxes = np.concatenate(decode_boxes, axis=0)
confidences = np.concatenate(select_scores, axis=0)
picked_box_probs = []
picked_labels = []
for class_index in range(0, confidences.shape[1]):
probs = confidences[:, class_index]
mask = probs > self.prob_threshold
probs = probs[mask]
if probs.shape[0] == 0:
continue
subset_boxes = bboxes[mask, :]
box_probs = np.concatenate(
[subset_boxes, probs.reshape(-1, 1)], axis=1)
box_probs = hard_nms(
box_probs,
iou_threshold=self.iou_threshold,
top_k=self.top_k, )
picked_box_probs.append(box_probs)
picked_labels.extend([class_index] * box_probs.shape[0])
if not picked_box_probs:
return np.array([]), np.array([]), np.array([])
picked_box_probs = np.concatenate(picked_box_probs)
# resize output boxes
picked_box_probs[:, :4] = warp_boxes(picked_box_probs[:, :4],
np.linalg.inv(ResizeM),
raw_shape[1], raw_shape[0])
return (
picked_box_probs[:, :4].astype(np.int32),
np.array(picked_labels),
picked_box_probs[:, 4], )
@abstractmethod
def infer_image(self, img_input):
pass
def detect(self, img):
raw_shape = img.shape
img_input, ResizeM = self.preprocess(img)
scores, raw_boxes = self.infer_image(img_input)
if scores[0].ndim == 1: # handling num_classes=1 case
scores = [x[:, None] for x in scores]
bbox, label, score = self.postprocess(scores, raw_boxes, ResizeM,
raw_shape)
print(bbox, score)
return bbox, label, score
def draw_box(self, raw_img, bbox, label, score):
img = raw_img.copy()
all_box = [[x, ] + y + [z, ]
for x, y, z in zip(label, bbox.tolist(), score)]
img_draw = overlay_bbox_cv(img, all_box, self.class_names)
return img_draw
def detect_folder(self, img_fold, result_path):
img_fold = Path(img_fold)
result_path = Path(result_path)
result_path.mkdir(parents=True, exist_ok=True)
img_name_list = filter(
lambda x: str(x).endswith(".png") or str(x).endswith(".jpg"),
img_fold.iterdir(), )
img_name_list = list(img_name_list)
print(f"find {len(img_name_list)} images")
for img_path in tqdm(img_name_list):
img = cv2.imread(str(img_path))
bbox, label, score = self.detect(img)
img_draw = self.draw_box(img, bbox, label, score)
save_path = str(result_path / img_path.name.replace(".png", ".jpg"))
cv2.imwrite(save_path, img_draw)
class PicoDetMNN(PicoDetABC):
import MNN as MNNlib
def __init__(self, model_path, *args, **kwargs):
super(PicoDetMNN, self).__init__(*args, **kwargs)
print("Using MNN as inference backend")
print(f"Using weight: {model_path}")
# load model
self.model_path = model_path
self.interpreter = self.MNNlib.Interpreter(self.model_path)
self.session = self.interpreter.createSession()
self.input_tensor = self.interpreter.getSessionInput(self.session)
def infer_image(self, img_input):
tmp_input = self.MNNlib.Tensor(
(1, 3, self.input_size[1], self.input_size[0]),
self.MNNlib.Halide_Type_Float,
img_input,
self.MNNlib.Tensor_DimensionType_Caffe, )
self.input_tensor.copyFrom(tmp_input)
self.interpreter.runSession(self.session)
score_out_name = [
"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_1.tmp_1",
"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_3.tmp_1"
]
scores = [
self.interpreter.getSessionOutput(self.session, x).getData()
for x in score_out_name
]
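# Each score head flattens to (num_points, 80) class scores; each box head
# below flattens to (num_points, 32) = 4 sides x (reg_max + 1 = 8) bins.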
scores = [np.reshape(x, (-1, 80)) for x in scores]
boxes_out_name = [
"save_infer_model/scale_4.tmp_1", "save_infer_model/scale_5.tmp_1",
"save_infer_model/scale_6.tmp_1", "save_infer_model/scale_7.tmp_1"
]
raw_boxes = [
self.interpreter.getSessionOutput(self.session, x).getData()
for x in boxes_out_name
]
raw_boxes = [np.reshape(x, (-1, 32)) for x in raw_boxes]
return scores, raw_boxes
class PicoDetONNX(PicoDetABC):
import onnxruntime as ort
def __init__(self, model_path, *args, **kwargs):
super(PicoDetONNX, self).__init__(*args, **kwargs)
print("Using ONNX as inference backend")
print(f"Using weight: {model_path}")
# load model
self.model_path = model_path
self.ort_session = self.ort.InferenceSession(self.model_path)
self.input_name = self.ort_session.get_inputs()[0].name
def infer_image(self, img_input):
inference_results = self.ort_session.run(None,
{self.input_name: img_input})
scores = [np.squeeze(x) for x in inference_results[:4]] # 4 score heads
raw_boxes = [np.squeeze(x) for x in inference_results[4:]] # 4 box heads
return scores, raw_boxes
class PicoDetTorch(PicoDetABC):
import torch
def __init__(self, model_path, cfg_path, *args, **kwargs):
from picodet.model.arch import build_model
from picodet.util import Logger, cfg, load_config, load_model_weight
super(PicoDetTorch, self).__init__(*args, **kwargs)
print("Using PyTorch as inference backend")
print(f"Using weight: {model_path}")
# load model
self.model_path = model_path
self.cfg_path = cfg_path
load_config(cfg, cfg_path)
self.logger = Logger(-1, cfg.save_dir, False)
self.model = build_model(cfg.model)
checkpoint = self.torch.load(
model_path, map_location=lambda storage, loc: storage)
load_model_weight(self.model, checkpoint, self.logger)
def infer_image(self, img_input):
self.model.train(False)
with self.torch.no_grad():
inference_results = self.model(self.torch.from_numpy(img_input))
scores = [
x.permute(0, 2, 3, 1).reshape((-1, 80)).sigmoid().detach().numpy()
for x in inference_results[0]
]
raw_boxes = [
x.permute(0, 2, 3, 1).reshape((-1, 32)).detach().numpy()
for x in inference_results[1]
]
return scores, raw_boxes
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_path",
dest="model_path",
type=str,
default="../model/picodet-320.mnn")
parser.add_argument(
"--cfg_path", dest="cfg_path", type=str, default="config/picodet-m.yml")
parser.add_argument(
"--img_fold", dest="img_fold", type=str, default="../imgs")
parser.add_argument(
"--result_fold", dest="result_fold", type=str, default="../results")
parser.add_argument(
"--input_shape",
dest="input_shape",
nargs=2,
type=int,
default=[320, 320])
parser.add_argument(
"--backend", choices=["MNN", "ONNX", "torch"], default="MNN")
args = parser.parse_args()
print(f"Detecting {args.img_fold}")
# load detector
if args.backend == "MNN":
detector = PicoDetMNN(args.model_path, input_shape=args.input_shape)
elif args.backend == "ONNX":
detector = PicoDetONNX(args.model_path, input_shape=args.input_shape)
elif args.backend == "torch":
detector = PicoDetTorch(
args.model_path, args.cfg_path, input_shape=args.input_shape)
else:
raise ValueError
# detect folder
detector.detect_folder(args.img_fold, args.result_fold)
def test_one():
detector = PicoDetMNN("../weight/picodet-416.mnn")
img = cv2.imread("../imgs/000252.jpg")
bbox, label, score = detector.detect(img)
img_draw = detector.draw_box(img, bbox, label, score)
cv2.imwrite('picodet_infer.jpg', img_draw)
if __name__ == "__main__":
# main()
test_one()
cmake_minimum_required(VERSION 3.4.1)
set(CMAKE_CXX_STANDARD 17)
project(picodet_demo)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND)
message("OPENMP FOUND")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
find_package(OpenCV REQUIRED)
find_package(ncnn REQUIRED)
if(NOT TARGET ncnn)
message(WARNING "ncnn NOT FOUND! Please set ncnn_DIR environment variable")
else()
message("ncnn FOUND ")
endif()
include_directories(
${OpenCV_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
)
add_executable(picodet_demo main.cpp picodet.cpp)
target_link_libraries(
picodet_demo
ncnn
${OpenCV_LIBS}
)
# PicoDet NCNN Demo
This project provides PicoDet image inference, webcam inference, and benchmarking using
[Tencent's NCNN framework](https://github.com/Tencent/ncnn).
# How to build
## Windows
### Step1.
Download and install Visual Studio from https://visualstudio.microsoft.com/vs/community/
### Step2.
Download and install OpenCV from https://github.com/opencv/opencv/releases
### Step3(Optional).
Download and install Vulkan SDK from https://vulkan.lunarg.com/sdk/home
### Step4.
Clone NCNN repository
``` shell script
git clone --recursive https://github.com/Tencent/ncnn.git
```
Build NCNN following this tutorial: [Build for Windows x64 using VS2017](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-visual-studio-community-2017)
### Step5.
Add `ncnn_DIR` = `YOUR_NCNN_PATH/build/install/lib/cmake/ncnn` to system environment variables.
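If you only need it for the current shell session, setting it inline also works (a cmd example; adjust the path to your NCNN install):
``` cmd
set ncnn_DIR=YOUR_NCNN_PATH/build/install/lib/cmake/ncnn
```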
Build project: Open x64 Native Tools Command Prompt for VS 2019 or 2017
``` cmd
cd <this-folder>
mkdir build
cd build
cmake ..
msbuild picodet_demo.vcxproj /p:configuration=release /p:platform=x64
```
## Linux
### Step1.
Build and install OpenCV from https://github.com/opencv/opencv
### Step2(Optional).
Download Vulkan SDK from https://vulkan.lunarg.com/sdk/home
### Step3.
Clone NCNN repository
``` shell script
git clone --recursive https://github.com/Tencent/ncnn.git
```
Build NCNN following this tutorial: [Build for Linux / NVIDIA Jetson / Raspberry Pi](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux)
### Step4.
Set environment variables. Run:
``` shell script
export ncnn_DIR=YOUR_NCNN_PATH/build/install/lib/cmake/ncnn
```
Build project
``` shell script
cd <this-folder>
mkdir build
cd build
cmake ..
make
```
# Run demo
Download PicoDet ncnn model.
* [PicoDet ncnn model download link](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_ncnn.zip)
## Webcam
```shell script
picodet_demo 0 0
```
## Inference images
```shell script
picodet_demo 1 IMAGE_FOLDER/*.jpg
```
## Inference video
```shell script
picodet_demo 2 VIDEO_PATH
```
## Benchmark
```shell script
picodet_demo 3 0
result: picodet min = 17.74 max = 22.71 avg = 18.16
```
****
Notice:
If the benchmark runs slowly, try limiting the number of OpenMP threads.
Linux:
```shell script
export OMP_THREAD_LIMIT=4
```
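The Windows (cmd) equivalent would be:
```shell script
set OMP_THREAD_LIMIT=4
```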
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <net.h>
#include "picodet.h"
#include <benchmark.h>
struct object_rect {
int x;
int y;
int width;
int height;
};
int resize_uniform(cv::Mat& src, cv::Mat& dst, cv::Size dst_size, object_rect& effect_area)
{
int w = src.cols;
int h = src.rows;
int dst_w = dst_size.width;
int dst_h = dst_size.height;
dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0));
float ratio_src = w * 1.0 / h;
float ratio_dst = dst_w * 1.0 / dst_h;
int tmp_w = 0;
int tmp_h = 0;
if (ratio_src > ratio_dst) {
tmp_w = dst_w;
tmp_h = floor((dst_w * 1.0 / w) * h);
}
else if (ratio_src < ratio_dst) {
tmp_h = dst_h;
tmp_w = floor((dst_h * 1.0 / h) * w);
}
else {
cv::resize(src, dst, dst_size);
effect_area.x = 0;
effect_area.y = 0;
effect_area.width = dst_w;
effect_area.height = dst_h;
return 0;
}
cv::Mat tmp;
cv::resize(src, tmp, cv::Size(tmp_w, tmp_h));
if (tmp_w != dst_w) {
int index_w = floor((dst_w - tmp_w) / 2.0);
for (int i = 0; i < dst_h; i++) {
memcpy(dst.data + i * dst_w * 3 + index_w * 3, tmp.data + i * tmp_w * 3, tmp_w * 3);
}
effect_area.x = index_w;
effect_area.y = 0;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
}
else if (tmp_h != dst_h) {
int index_h = floor((dst_h - tmp_h) / 2.0);
memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3);
effect_area.x = 0;
effect_area.y = index_h;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
}
else {
printf("error\n");
}
return 0;
}
const int color_list[80][3] =
{
{216 , 82 , 24},
{236 ,176 , 31},
{125 , 46 ,141},
{118 ,171 , 47},
{ 76 ,189 ,237},
{238 , 19 , 46},
{ 76 , 76 , 76},
{153 ,153 ,153},
{255 , 0 , 0},
{255 ,127 , 0},
{190 ,190 , 0},
{ 0 ,255 , 0},
{ 0 , 0 ,255},
{170 , 0 ,255},
{ 84 , 84 , 0},
{ 84 ,170 , 0},
{ 84 ,255 , 0},
{170 , 84 , 0},
{170 ,170 , 0},
{170 ,255 , 0},
{255 , 84 , 0},
{255 ,170 , 0},
{255 ,255 , 0},
{ 0 , 84 ,127},
{ 0 ,170 ,127},
{ 0 ,255 ,127},
{ 84 , 0 ,127},
{ 84 , 84 ,127},
{ 84 ,170 ,127},
{ 84 ,255 ,127},
{170 , 0 ,127},
{170 , 84 ,127},
{170 ,170 ,127},
{170 ,255 ,127},
{255 , 0 ,127},
{255 , 84 ,127},
{255 ,170 ,127},
{255 ,255 ,127},
{ 0 , 84 ,255},
{ 0 ,170 ,255},
{ 0 ,255 ,255},
{ 84 , 0 ,255},
{ 84 , 84 ,255},
{ 84 ,170 ,255},
{ 84 ,255 ,255},
{170 , 0 ,255},
{170 , 84 ,255},
{170 ,170 ,255},
{170 ,255 ,255},
{255 , 0 ,255},
{255 , 84 ,255},
{255 ,170 ,255},
{ 42 , 0 , 0},
{ 84 , 0 , 0},
{127 , 0 , 0},
{170 , 0 , 0},
{212 , 0 , 0},
{255 , 0 , 0},
{ 0 , 42 , 0},
{ 0 , 84 , 0},
{ 0 ,127 , 0},
{ 0 ,170 , 0},
{ 0 ,212 , 0},
{ 0 ,255 , 0},
{ 0 , 0 , 42},
{ 0 , 0 , 84},
{ 0 , 0 ,127},
{ 0 , 0 ,170},
{ 0 , 0 ,212},
{ 0 , 0 ,255},
{ 0 , 0 , 0},
{ 36 , 36 , 36},
{ 72 , 72 , 72},
{109 ,109 ,109},
{145 ,145 ,145},
{182 ,182 ,182},
{218 ,218 ,218},
{ 0 ,113 ,188},
{ 80 ,182 ,188},
{127 ,127 , 0},
};
void draw_bboxes(const cv::Mat& bgr, const std::vector<BoxInfo>& bboxes, object_rect effect_roi)
{
static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus",
"train", "truck", "boat", "traffic light", "fire hydrant",
"stop sign", "parking meter", "bench", "bird", "cat", "dog",
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat",
"baseball glove", "skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
"banana", "apple", "sandwich", "orange", "broccoli", "carrot",
"hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop",
"mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
"toaster", "sink", "refrigerator", "book", "clock", "vase",
"scissors", "teddy bear", "hair drier", "toothbrush"
};
cv::Mat image = bgr.clone();
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
for (size_t i = 0; i < bboxes.size(); i++)
{
const BoxInfo& bbox = bboxes[i];
cv::Scalar color = cv::Scalar(color_list[bbox.label][0], color_list[bbox.label][1], color_list[bbox.label][2]);
cv::rectangle(image, cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio, (bbox.y1 - effect_roi.y) * height_ratio),
cv::Point((bbox.x2 - effect_roi.x) * width_ratio, (bbox.y2 - effect_roi.y) * height_ratio)), color);
char text[256];
sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100);
int baseLine = 0;
cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = (bbox.x1 - effect_roi.x) * width_ratio;
int y = (bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine;
if (y < 0)
y = 0;
if (x + label_size.width > image.cols)
x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
color, -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255));
}
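// NOTE: every call writes to the same fixed path, so earlier results are
// overwritten; pass a per-image path if you need to keep them all.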
cv::imwrite("../result/test_picodet.jpg", image);
printf("************infer image success!!!**********\n");
}
int image_demo(PicoDet &detector, const char* imagepath)
{
std::vector<cv::String> filenames;
cv::glob(imagepath, filenames, false);
for (auto img_name : filenames)
{
cv::Mat image = cv::imread(img_name);
if (image.empty())
{
fprintf(stderr, "cv::imread %s failed\n", img_name);
return -1;
}
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
cv::waitKey(0);
}
return 0;
}
int webcam_demo(PicoDet& detector, int cam_id)
{
cv::Mat image;
cv::VideoCapture cap(cam_id);
while (true)
{
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
int video_demo(PicoDet& detector, const char* path)
{
cv::Mat image;
cv::VideoCapture cap(path);
while (true)
{
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
int benchmark(PicoDet& detector)
{
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
ncnn::Mat input = ncnn::Mat(320, 320, 3);
input.fill(0.01f);
for (int i = 0; i < warm_up + loop_num; i++)
{
double start = ncnn::get_current_time();
ncnn::Extractor ex = detector.Net->create_extractor();
ex.input("image", input); // picodet
for (const auto& head_info : detector.heads_info)
{
ncnn::Mat dis_pred;
ncnn::Mat cls_pred;
ex.extract(head_info.dis_layer.c_str(), dis_pred);
ex.extract(head_info.cls_layer.c_str(), cls_pred);
}
double end = ncnn::get_current_time();
double time = end - start;
if (i >= warm_up)
{
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet", time_min, time_max, time_avg);
return 0;
}
int main(int argc, char** argv)
{
if (argc != 3)
{
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; \n For benchmark, mode=3 path=0.\n", argv[0]);
return -1;
}
PicoDet detector = PicoDet("../weight/picodet_m_416.param", "../weight/picodet_m_416.bin", true);
int mode = atoi(argv[1]);
switch (mode)
{
case 0:{
int cam_id = atoi(argv[2]);
webcam_demo(detector, cam_id);
break;
}
case 1:{
const char* images = argv[2];
image_demo(detector, images);
break;
}
case 2:{
const char* path = argv[2];
video_demo(detector, path);
break;
}
case 3:{
benchmark(detector);
break;
}
default:{
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; \n For benchmark, mode=3 path=0.\n", argv[0]);
break;
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn
#include "picodet.h"
#include <benchmark.h>
#include <iostream>
inline float fast_exp(float x)
{
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x)
{
return 1.0f / (1.0f + fast_exp(-x));
}
template<typename _Tp>
int activation_function_softmax(const _Tp* src, _Tp* dst, int length)
{
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{ 0 };
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
bool PicoDet::hasGPU = false;
PicoDet* PicoDet::detector = nullptr;
PicoDet::PicoDet(const char* param, const char* bin, bool useGPU)
{
this->Net = new ncnn::Net();
#if NCNN_VULKAN
this->hasGPU = ncnn::get_gpu_count() > 0;
#endif
this->Net->opt.use_vulkan_compute = this->hasGPU && useGPU;
this->Net->opt.use_fp16_arithmetic = true;
this->Net->load_param(param);
this->Net->load_model(bin);
}
PicoDet::~PicoDet()
{
delete this->Net;
}
void PicoDet::preprocess(cv::Mat& image, ncnn::Mat& in)
{
int img_w = image.cols;
int img_h = image.rows;
in = ncnn::Mat::from_pixels(image.data, ncnn::Mat::PIXEL_BGR, img_w, img_h);
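// norm_vals are 1/std for the ImageNet BGR std {57.375, 57.12, 58.395};
// ncnn computes (pixel - mean) * norm in substract_mean_normalize.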
const float mean_vals[3] = { 103.53f, 116.28f, 123.675f };
const float norm_vals[3] = { 0.017429f, 0.017507f, 0.017125f };
in.substract_mean_normalize(mean_vals, norm_vals);
}
std::vector<BoxInfo> PicoDet::detect(cv::Mat image, float score_threshold, float nms_threshold)
{
ncnn::Mat input;
preprocess(image, input);
auto ex = this->Net->create_extractor();
ex.set_light_mode(false);
ex.set_num_threads(4);
#if NCNN_VULKAN
ex.set_vulkan_compute(this->hasGPU);
#endif
ex.input("image", input); //picodet
std::vector<std::vector<BoxInfo>> results;
results.resize(this->num_class);
for (const auto& head_info : this->heads_info)
{
ncnn::Mat dis_pred;
ncnn::Mat cls_pred;
ex.extract(head_info.dis_layer.c_str(), dis_pred);
ex.extract(head_info.cls_layer.c_str(), cls_pred);
this->decode_infer(cls_pred, dis_pred, head_info.stride, score_threshold, results);
}
std::vector<BoxInfo> dets;
for (int i = 0; i < (int)results.size(); i++)
{
this->nms(results[i], nms_threshold);
for (auto box : results[i])
{
dets.push_back(box);
}
}
return dets;
}
void PicoDet::decode_infer(ncnn::Mat& cls_pred, ncnn::Mat& dis_pred, int stride, float threshold, std::vector<std::vector<BoxInfo>>& results)
{
int feature_h = this->input_size[1] / stride;
int feature_w = this->input_size[0] / stride;
for (int idx = 0; idx < feature_h * feature_w; idx++)
{
const float* scores = cls_pred.row(idx);
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < this->num_class; label++)
{
if (scores[label] > score)
{
score = scores[label];
cur_label = label;
}
}
if (score > threshold)
{
const float* bbox_pred = dis_pred.row(idx);
results[cur_label].push_back(this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
BoxInfo PicoDet::disPred2Bbox(const float*& dfl_det, int label, float score, int x, int y, int stride)
{
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
for (int i = 0; i < 4; i++)
{
float dis = 0;
float* dis_after_sm = new float[this->reg_max + 1];
activation_function_softmax(dfl_det + i * (this->reg_max + 1), dis_after_sm, this->reg_max + 1);
for (int j = 0; j < this->reg_max + 1; j++)
{
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)this->input_size[0]);
float ymax = (std::min)(ct_y + dis_pred[3], (float)this->input_size[1]);
return BoxInfo { xmin, ymin, xmax, ymax, score, label };
}
void PicoDet::nms(std::vector<BoxInfo>& input_boxes, float NMS_THRESH)
{
std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
* (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
}
else {
j++;
}
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn
#ifndef PICODET_H
#define PICODET_H
#include <opencv2/core/core.hpp>
#include <net.h>
typedef struct HeadInfo
{
std::string cls_layer;
std::string dis_layer;
int stride;
} HeadInfo;
typedef struct BoxInfo
{
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet
{
public:
PicoDet(const char* param, const char* bin, bool useGPU);
~PicoDet();
static PicoDet* detector;
ncnn::Net* Net;
static bool hasGPU;
std::vector<HeadInfo> heads_info{
// cls_pred|dis_pred|stride
{"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8},
{"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16},
{"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32},
{"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64},
};
std::vector<BoxInfo> detect(cv::Mat image, float score_threshold, float nms_threshold);
std::vector<std::string> labels{ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
"hair drier", "toothbrush" };
private:
void preprocess(cv::Mat& image, ncnn::Mat& in);
void decode_infer(ncnn::Mat& cls_pred, ncnn::Mat& dis_pred, int stride, float threshold, std::vector<std::vector<BoxInfo>>& results);
BoxInfo disPred2Bbox(const float*& dfl_det, int label, float score, int x, int y, int stride);
static void nms(std::vector<BoxInfo>& result, float nms_threshold);
int input_size[2] = {320, 320};
int num_class = 80;
int reg_max = 7;
};
#endif
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn
# -*- coding: utf-8 -*-
import argparse
from abc import ABCMeta, abstractmethod
from pathlib import Path
import cv2
import numpy as np
from scipy.special import softmax
from tqdm import tqdm
_COLORS = (np.array([
    0.000, 0.447, 0.741,
    0.850, 0.325, 0.098,
    0.929, 0.694, 0.125,
    0.494, 0.184, 0.556,
    0.466, 0.674, 0.188,
    0.301, 0.745, 0.933,
    0.635, 0.078, 0.184,
    0.300, 0.300, 0.300,
    0.600, 0.600, 0.600,
    1.000, 0.000, 0.000,
    1.000, 0.500, 0.000,
    0.749, 0.749, 0.000,
    0.000, 1.000, 0.000,
    0.000, 0.000, 1.000,
    0.667, 0.000, 1.000,
    0.333, 0.333, 0.000,
    0.333, 0.667, 0.000,
    0.333, 1.000, 0.000,
    0.667, 0.333, 0.000,
    0.667, 0.667, 0.000,
    0.667, 1.000, 0.000,
    1.000, 0.333, 0.000,
    1.000, 0.667, 0.000,
    1.000, 1.000, 0.000,
    0.000, 0.333, 0.500,
    0.000, 0.667, 0.500,
    0.000, 1.000, 0.500,
    0.333, 0.000, 0.500,
    0.333, 0.333, 0.500,
    0.333, 0.667, 0.500,
    0.333, 1.000, 0.500,
    0.667, 0.000, 0.500,
    0.667, 0.333, 0.500,
    0.667, 0.667, 0.500,
    0.667, 1.000, 0.500,
    1.000, 0.000, 0.500,
    1.000, 0.333, 0.500,
    1.000, 0.667, 0.500,
    1.000, 1.000, 0.500,
    0.000, 0.333, 1.000,
    0.000, 0.667, 1.000,
    0.000, 1.000, 1.000,
    0.333, 0.000, 1.000,
    0.333, 0.333, 1.000,
    0.333, 0.667, 1.000,
    0.333, 1.000, 1.000,
    0.667, 0.000, 1.000,
    0.667, 0.333, 1.000,
    0.667, 0.667, 1.000,
    0.667, 1.000, 1.000,
    1.000, 0.000, 1.000,
    1.000, 0.333, 1.000,
    1.000, 0.667, 1.000,
    0.333, 0.000, 0.000,
    0.500, 0.000, 0.000,
    0.667, 0.000, 0.000,
    0.833, 0.000, 0.000,
    1.000, 0.000, 0.000,
    0.000, 0.167, 0.000,
    0.000, 0.333, 0.000,
    0.000, 0.500, 0.000,
    0.000, 0.667, 0.000,
    0.000, 0.833, 0.000,
    0.000, 1.000, 0.000,
    0.000, 0.000, 0.167,
    0.000, 0.000, 0.333,
    0.000, 0.000, 0.500,
    0.000, 0.000, 0.667,
    0.000, 0.000, 0.833,
    0.000, 0.000, 1.000,
    0.000, 0.000, 0.000,
    0.143, 0.143, 0.143,
    0.286, 0.286, 0.286,
    0.429, 0.429, 0.429,
    0.571, 0.571, 0.571,
    0.714, 0.714, 0.714,
    0.857, 0.857, 0.857,
    0.000, 0.447, 0.741,
    0.314, 0.717, 0.741,
    0.50, 0.5, 0,
]).astype(np.float32).reshape(-1, 3))
def get_resize_matrix(raw_shape, dst_shape, keep_ratio):
"""
Get resize matrix for resizing raw img to input size
:param raw_shape: (width, height) of raw image
:param dst_shape: (width, height) of input image
    :param keep_ratio: whether to keep the original aspect ratio
:return: 3x3 Matrix
"""
r_w, r_h = raw_shape
d_w, d_h = dst_shape
Rs = np.eye(3)
if keep_ratio:
C = np.eye(3)
C[0, 2] = -r_w / 2
C[1, 2] = -r_h / 2
if r_w / r_h < d_w / d_h:
ratio = d_h / r_h
else:
ratio = d_w / r_w
Rs[0, 0] *= ratio
Rs[1, 1] *= ratio
T = np.eye(3)
T[0, 2] = 0.5 * d_w
T[1, 2] = 0.5 * d_h
        return T @ Rs @ C
else:
Rs[0, 0] *= d_w / r_w
Rs[1, 1] *= d_h / r_h
return Rs
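# In the keep_ratio branch above, M = T @ Rs @ C composes three transforms:
# C moves the image center to the origin, Rs applies the uniform scale, and
# T re-centers the result on the destination canvas.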
def warp_boxes(boxes, M, width, height):
"""Apply transform to boxes
Copy from picodet/data/transform/warp.py
"""
n = len(boxes)
if n:
# warp points
xy = np.ones((n * 4, 3))
xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
n * 4, 2) # x1y1, x2y2, x1y2, x2y1
        xy = xy @ M.T  # transform
xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale
# create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate(
(x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
# clip boxes
xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
return xy.astype(np.float32)
else:
return boxes
def overlay_bbox_cv(img, all_box, class_names):
"""Draw result boxes
Copy from picodet/util/visualization.py
"""
all_box.sort(key=lambda v: v[5])
for box in all_box:
label, x0, y0, x1, y1, score = box
color = (_COLORS[label] * 255).astype(np.uint8).tolist()
text = "{}:{:.1f}%".format(class_names[label], score * 100)
txt_color = (0, 0, 0) if np.mean(_COLORS[label]) > 0.5 else (255, 255,
255)
font = cv2.FONT_HERSHEY_SIMPLEX
txt_size = cv2.getTextSize(text, font, 0.5, 2)[0]
cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
cv2.rectangle(
img,
(x0, y0 - txt_size[1] - 1),
(x0 + txt_size[0] + txt_size[1], y0 - 1),
color,
-1, )
cv2.putText(img, text, (x0, y0 - 1), font, 0.5, txt_color, thickness=1)
return img
def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
"""
Args:
box_scores (N, 5): boxes in corner-form and probabilities.
iou_threshold: intersection over union threshold.
top_k: keep top_k results. If k <= 0, keep all the results.
candidate_size: only consider the candidates with the highest scores.
Returns:
picked: a list of indexes of the kept boxes
"""
scores = box_scores[:, -1]
boxes = box_scores[:, :-1]
picked = []
indexes = np.argsort(scores)
indexes = indexes[-candidate_size:]
while len(indexes) > 0:
current = indexes[-1]
picked.append(current)
if 0 < top_k == len(picked) or len(indexes) == 1:
break
current_box = boxes[current, :]
indexes = indexes[:-1]
rest_boxes = boxes[indexes, :]
iou = iou_of(
rest_boxes,
np.expand_dims(
current_box, axis=0), )
indexes = indexes[iou <= iou_threshold]
return box_scores[picked, :]
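# Example (hypothetical values): hard_nms takes an (N, 5) array of
# [x1, y1, x2, y2, score] rows and returns the kept rows, e.g.
#   boxes = np.array([[0., 0., 10., 10., 0.9], [1., 1., 11., 11., 0.8]])
#   kept = hard_nms(boxes, iou_threshold=0.5)
# keeps only the 0.9-score row, since the two boxes overlap with IoU ~0.68.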
def iou_of(boxes0, boxes1, eps=1e-5):
"""Return intersection-over-union (Jaccard index) of boxes.
Args:
boxes0 (N, 4): ground truth boxes.
boxes1 (N or 1, 4): predicted boxes.
eps: a small number to avoid 0 as denominator.
Returns:
iou (N): IoU values.
"""
overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2])
overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
overlap_area = area_of(overlap_left_top, overlap_right_bottom)
area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
return overlap_area / (area0 + area1 - overlap_area + eps)
def area_of(left_top, right_bottom):
"""Compute the areas of rectangles given two corners.
Args:
left_top (N, 2): left top corner.
right_bottom (N, 2): right bottom corner.
Returns:
area (N): return the area.
"""
hw = np.clip(right_bottom - left_top, 0.0, None)
return hw[..., 0] * hw[..., 1]
class picodetABC(metaclass=ABCMeta):
def __init__(
self,
input_shape=[320, 320],
reg_max=7,
strides=[8, 16, 32],
prob_threshold=0.4,
iou_threshold=0.3,
num_candidate=1000,
top_k=-1, ):
self.strides = strides
self.input_shape = input_shape
self.reg_max = reg_max
self.prob_threshold = prob_threshold
self.iou_threshold = iou_threshold
self.num_candidate = num_candidate
self.top_k = top_k
self.img_mean = [103.53, 116.28, 123.675]
self.img_std = [57.375, 57.12, 58.395]
self.input_size = (self.input_shape[1], self.input_shape[0])
self.class_names = [
"person",
"bicycle",
"car",
"motorcycle",
"airplane",
"bus",
"train",
"truck",
"boat",
"traffic_light",
"fire_hydrant",
"stop_sign",
"parking_meter",
"bench",
"bird",
"cat",
"dog",
"horse",
"sheep",
"cow",
"elephant",
"bear",
"zebra",
"giraffe",
"backpack",
"umbrella",
"handbag",
"tie",
"suitcase",
"frisbee",
"skis",
"snowboard",
"sports_ball",
"kite",
"baseball_bat",
"baseball_glove",
"skateboard",
"surfboard",
"tennis_racket",
"bottle",
"wine_glass",
"cup",
"fork",
"knife",
"spoon",
"bowl",
"banana",
"apple",
"sandwich",
"orange",
"broccoli",
"carrot",
"hot_dog",
"pizza",
"donut",
"cake",
"chair",
"couch",
"potted_plant",
"bed",
"dining_table",
"toilet",
"tv",
"laptop",
"mouse",
"remote",
"keyboard",
"cell_phone",
"microwave",
"oven",
"toaster",
"sink",
"refrigerator",
"book",
"clock",
"vase",
"scissors",
"teddy_bear",
"hair_drier",
"toothbrush",
]
def preprocess(self, img):
# resize image
ResizeM = get_resize_matrix((img.shape[1], img.shape[0]),
self.input_size, True)
img_resize = cv2.warpPerspective(img, ResizeM, dsize=self.input_size)
# normalize image
img_input = img_resize.astype(np.float32) / 255
img_mean = np.array(
self.img_mean, dtype=np.float32).reshape(1, 1, 3) / 255
img_std = np.array(
self.img_std, dtype=np.float32).reshape(1, 1, 3) / 255
img_input = (img_input - img_mean) / img_std
# expand dims
img_input = np.transpose(img_input, [2, 0, 1])
img_input = np.expand_dims(img_input, axis=0)
return img_input, ResizeM
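    # postprocess mirrors the C++ decode path: build per-stride center grids,
    # turn each DFL distribution into a distance via its softmax expectation,
    # keep the top candidates, then run per-class hard NMS; boxes are finally
    # warped back to the raw image with the inverse of ResizeM.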
def postprocess(self, scores, raw_boxes, ResizeM, raw_shape):
# generate centers
decode_boxes = []
select_scores = []
for stride, box_distribute, score in zip(self.strides, raw_boxes,
scores):
# centers
fm_h = self.input_shape[0] / stride
fm_w = self.input_shape[1] / stride
h_range = np.arange(fm_h)
w_range = np.arange(fm_w)
ww, hh = np.meshgrid(w_range, h_range)
ct_row = (hh.flatten() + 0.5) * stride
ct_col = (ww.flatten() + 0.5) * stride
center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)
# box distribution to distance
reg_range = np.arange(self.reg_max + 1)
box_distance = box_distribute.reshape((-1, self.reg_max + 1))
box_distance = softmax(box_distance, axis=1)
box_distance = box_distance * np.expand_dims(reg_range, axis=0)
box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
box_distance = box_distance * stride
# top K candidate
topk_idx = np.argsort(score.max(axis=1))[::-1]
topk_idx = topk_idx[:self.num_candidate]
center = center[topk_idx]
score = score[topk_idx]
box_distance = box_distance[topk_idx]
# decode box
decode_box = center + [-1, -1, 1, 1] * box_distance
select_scores.append(score)
decode_boxes.append(decode_box)
# nms
bboxes = np.concatenate(decode_boxes, axis=0)
confidences = np.concatenate(select_scores, axis=0)
picked_box_probs = []
picked_labels = []
for class_index in range(0, confidences.shape[1]):
probs = confidences[:, class_index]
mask = probs > self.prob_threshold
probs = probs[mask]
if probs.shape[0] == 0:
continue
subset_boxes = bboxes[mask, :]
box_probs = np.concatenate(
[subset_boxes, probs.reshape(-1, 1)], axis=1)
box_probs = hard_nms(
box_probs,
iou_threshold=self.iou_threshold,
top_k=self.top_k, )
picked_box_probs.append(box_probs)
picked_labels.extend([class_index] * box_probs.shape[0])
if not picked_box_probs:
return np.array([]), np.array([]), np.array([])
picked_box_probs = np.concatenate(picked_box_probs)
# resize output boxes
picked_box_probs[:, :4] = warp_boxes(picked_box_probs[:, :4],
np.linalg.inv(ResizeM),
raw_shape[1], raw_shape[0])
return (
picked_box_probs[:, :4].astype(np.int32),
np.array(picked_labels),
picked_box_probs[:, 4], )
@abstractmethod
def infer_image(self, img_input):
pass
def detect(self, img):
raw_shape = img.shape
img_input, ResizeM = self.preprocess(img)
scores, raw_boxes = self.infer_image(img_input)
if scores[0].ndim == 1: # handling num_classes=1 case
scores = [x[:, None] for x in scores]
bbox, label, score = self.postprocess(scores, raw_boxes, ResizeM,
raw_shape)
return bbox, label, score
def draw_box(self, raw_img, bbox, label, score):
img = raw_img.copy()
all_box = [[x, ] + y + [z, ]
for x, y, z in zip(label, bbox.tolist(), score)]
img_draw = overlay_bbox_cv(img, all_box, self.class_names)
return img_draw
def detect_folder(self, img_fold, result_path):
img_fold = Path(img_fold)
result_path = Path(result_path)
result_path.mkdir(parents=True, exist_ok=True)
img_name_list = filter(
lambda x: str(x).endswith(".png") or str(x).endswith(".jpg"),
img_fold.iterdir(), )
img_name_list = list(img_name_list)
print(f"find {len(img_name_list)} images")
for img_path in tqdm(img_name_list):
img = cv2.imread(str(img_path))
bbox, label, score = self.detect(img)
img_draw = self.draw_box(img, bbox, label, score)
save_path = str(result_path / img_path.name.replace(".png", ".jpg"))
cv2.imwrite(save_path, img_draw)
class picodetONNX(picodetABC):
def __init__(self, model_path, *args, **kwargs):
import onnxruntime as ort
super(picodetONNX, self).__init__(*args, **kwargs)
print("Using ONNX as inference backend")
print(f"Using weight: {model_path}")
# load model
self.model_path = model_path
self.ort_session = ort.InferenceSession(self.model_path)
self.input_name = self.ort_session.get_inputs()[0].name
def infer_image(self, img_input):
inference_results = self.ort_session.run(None,
{self.input_name: img_input})
scores = [np.squeeze(x) for x in inference_results[:3]]
raw_boxes = [np.squeeze(x) for x in inference_results[3:]]
return scores, raw_boxes
class picodetTorch(picodetABC):
def __init__(self, model_path, cfg_path, *args, **kwargs):
import torch
from picodet.model.arch import build_model
from picodet.util import Logger, cfg, load_config, load_model_weight
super(picodetTorch, self).__init__(*args, **kwargs)
print("Using PyTorch as inference backend")
print(f"Using weight: {model_path}")
# load model
self.model_path = model_path
self.cfg_path = cfg_path
load_config(cfg, cfg_path)
self.logger = Logger(-1, cfg.save_dir, False)
self.model = build_model(cfg.model)
checkpoint = torch.load(
model_path, map_location=lambda storage, loc: storage)
load_model_weight(self.model, checkpoint, self.logger)
def infer_image(self, img_input):
import torch
self.model.train(False)
with torch.no_grad():
inference_results = self.model(torch.from_numpy(img_input))
scores = [
x.permute(0, 2, 3, 1).reshape((-1, 80)).sigmoid().detach().numpy()
for x in inference_results[0]
]
raw_boxes = [
x.permute(0, 2, 3, 1).reshape((-1, 32)).detach().numpy()
for x in inference_results[1]
]
return scores, raw_boxes
class picodetNCNN(picodetABC):
def __init__(self, model_param, model_bin, *args, **kwargs):
import ncnn
super(picodetNCNN, self).__init__(*args, **kwargs)
print("Using ncnn as inference backend")
print(f"Using param: {model_param}, bin: {model_bin}")
# load model
self.model_param = model_param
self.model_bin = model_bin
self.net = ncnn.Net()
self.net.load_param(model_param)
self.net.load_model(model_bin)
self.input_name = "input.1"
def infer_image(self, img_input):
import ncnn
mat_in = ncnn.Mat(img_input.squeeze())
ex = self.net.create_extractor()
ex.input(self.input_name, mat_in)
score_out_name = [
"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_1.tmp_1",
"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_3.tmp_1"
]
scores = [np.array(ex.extract(x)[1]) for x in score_out_name]
scores = [np.reshape(x, (-1, 80)) for x in scores]
boxes_out_name = [
"save_infer_model/scale_4.tmp_1", "save_infer_model/scale_5.tmp_1",
"save_infer_model/scale_6.tmp_1", "save_infer_model/scale_7.tmp_1"
]
raw_boxes = [np.array(ex.extract(x)[1]) for x in boxes_out_name]
raw_boxes = [np.reshape(x, (-1, 32)) for x in raw_boxes]
return scores, raw_boxes
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_path",
dest="model_path",
type=str,
default="../model/picodet.param")
parser.add_argument(
"--model_bin",
dest="model_bin",
type=str,
default="../model/picodet.bin")
parser.add_argument(
"--cfg_path", dest="cfg_path", type=str, default="config/picodet.yml")
parser.add_argument(
"--img_fold", dest="img_fold", type=str, default="../imgs")
parser.add_argument(
"--result_fold", dest="result_fold", type=str, default="../results")
parser.add_argument(
"--input_shape",
dest="input_shape",
nargs=2,
type=int,
default=[320, 320])
parser.add_argument(
"--backend", choices=["ncnn", "ONNX", "torch"], default="ncnn")
args = parser.parse_args()
print(f"Detecting {args.img_fold}")
# load detector
if args.backend == "ncnn":
detector = picodetNCNN(
args.model_path, args.model_bin, input_shape=args.input_shape)
elif args.backend == "ONNX":
detector = picodetONNX(args.model_path, input_shape=args.input_shape)
elif args.backend == "torch":
detector = picodetTorch(
args.model_path, args.cfg_path, input_shape=args.input_shape)
else:
        raise ValueError(f"unsupported backend: {args.backend}")
# detect folder
detector.detect_folder(args.img_fold, args.result_fold)
def test_one():
detector = picodetNCNN("../weight/picodet_m_416.param",
"../weight/picodet_m_416.bin")
img = cv2.imread("../000000000102.jpg")
bbox, label, score = detector.detect(img)
img_draw = detector.draw_box(img, bbox, label, score)
    # draw_box returns a BGR image, which is what cv2.imwrite expects;
    # flipping the channels before saving would produce color-swapped output.
    cv2.imwrite('python_version.jpg', img_draw)
if __name__ == "__main__":
# main()
test_one()
cmake_minimum_required(VERSION 3.4.1)
set(CMAKE_CXX_STANDARD 14)
project(picodet_demo)
find_package(OpenCV REQUIRED)
find_package(InferenceEngine REQUIRED)
find_package(ngraph REQUIRED)
include_directories(
${OpenCV_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
)
add_executable(picodet_demo main.cpp picodet_openvino.cpp)
target_link_libraries(
picodet_demo
${InferenceEngine_LIBRARIES}
${NGRAPH_LIBRARIES}
${OpenCV_LIBS}
)
# PicoDet OpenVINO Demo
This folder provides PicoDet inference code using
[Intel's OpenVINO Toolkit](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html). Most of the implementation in this folder is the same as *demo_ncnn*.
**Recommendation:** install OpenVINO from the xxx.tar.gz archive rather than building from the GitHub sources.
## Install OpenVINO Toolkit
Go to [OpenVINO HomePage](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html)
Download a suitable version and install.
Follow the official Get Started Guides: https://docs.openvinotoolkit.org/latest/get_started_guides.html
## Set the Environment Variables
### Windows:
Run this command in cmd (every time before using OpenVINO):
```cmd
<INSTALL_DIR>\openvino_2021\bin\setupvars.bat
```
Or set the system environment variables once and for all:
Name |Value
:--------------------:|:--------:
INTEL_OPENVINO_DIR | <INSTALL_DIR>\openvino_2021
INTEL_CVSDK_DIR | %INTEL_OPENVINO_DIR%
InferenceEngine_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\share
HDDL_INSTALL_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\hddl
ngraph_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\ngraph\cmake
And add the following to ```Path```:
```
%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Debug;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Release;%HDDL_INSTALL_DIR%\bin;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\tbb\bin;%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\lib
```
### Linux
Run this command in your shell (every time before using OpenVINO):
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
Or edit .bashrc
```shell
vi ~/.bashrc
```
Add this line to the end of the file
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
## Convert model
Convert to OpenVINO
``` shell
cd <INSTALL_DIR>/openvino_2021/deployment_tools/model_optimizer
```
Install requirements for convert tool
```shell
cd ./install_prerequisites
sudo ./install_prerequisites_onnx.sh
```
Then convert the model. Note: `mean_values` and `scale_values` must match the normalization settings used for training in your YAML config file.
```shell
python3 mo_onnx.py --input_model <ONNX_MODEL> --mean_values [103.53,116.28,123.675] --scale_values [57.375,57.12,58.395]
```
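To double-check that correspondence, here is a minimal sketch (assuming the mean/scale values shown above; the pixel value is hypothetical) of the normalization the Model Optimizer bakes into the converted graph, i.e. `(pixel - mean) / scale` per channel:
``` python
import numpy as np

# values passed to mo_onnx.py above (BGR channel order)
mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
scale = np.array([57.375, 57.12, 58.395], dtype=np.float32)

pixel = np.array([128, 128, 128], dtype=np.float32)  # an arbitrary BGR pixel
print((pixel - mean) / scale)  # the tensor values the network actually sees
```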
## Build
### Windows
```cmd
<OPENVINO_INSTALL_DIR>\openvino_2021\bin\setupvars.bat
mkdir build
cd build
cmake ..
msbuild picodet_demo.vcxproj /p:configuration=release /p:platform=x64
```
### Linux
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
mkdir build
cd build
cmake ..
make
```
## Run demo
Download the PicoDet OpenVINO model: [PicoDet OpenVINO model download link](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_openvino.zip).
Move the PicoDet OpenVINO model files to the demo's *weight* folder, then run these commands:
### Webcam
```shell
picodet_demo 0 0
```
### Inference images
```shell
picodet_demo 1 IMAGE_FOLDER/*.jpg
```
### Inference video
```shell
picodet_demo 2 VIDEO_PATH
```
### Benchmark
```shell
picodet_demo 3 0
```
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet
#include "picodet_openvino.h"
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
struct object_rect {
int x;
int y;
int width;
int height;
};
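// resize_uniform letterboxes src into dst_size while preserving the aspect
// ratio; effect_area records where the resized image sits inside the padded
// canvas so draw_bboxes can map detections back to source coordinates.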
int resize_uniform(cv::Mat& src, cv::Mat& dst, cv::Size dst_size, object_rect& effect_area)
{
int w = src.cols;
int h = src.rows;
int dst_w = dst_size.width;
int dst_h = dst_size.height;
dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0));
float ratio_src = w * 1.0 / h;
float ratio_dst = dst_w * 1.0 / dst_h;
int tmp_w = 0;
int tmp_h = 0;
if (ratio_src > ratio_dst) {
tmp_w = dst_w;
tmp_h = floor((dst_w * 1.0 / w) * h);
}
else if (ratio_src < ratio_dst) {
tmp_h = dst_h;
tmp_w = floor((dst_h * 1.0 / h) * w);
}
else {
cv::resize(src, dst, dst_size);
effect_area.x = 0;
effect_area.y = 0;
effect_area.width = dst_w;
effect_area.height = dst_h;
return 0;
}
cv::Mat tmp;
cv::resize(src, tmp, cv::Size(tmp_w, tmp_h));
if (tmp_w != dst_w) {
int index_w = floor((dst_w - tmp_w) / 2.0);
for (int i = 0; i < dst_h; i++) {
memcpy(dst.data + i * dst_w * 3 + index_w * 3, tmp.data + i * tmp_w * 3, tmp_w * 3);
}
effect_area.x = index_w;
effect_area.y = 0;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
}
else if (tmp_h != dst_h) {
int index_h = floor((dst_h - tmp_h) / 2.0);
memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3);
effect_area.x = 0;
effect_area.y = index_h;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
}
else {
printf("error\n");
}
return 0;
}
const int color_list[80][3] =
{
{216 , 82 , 24},
{236 ,176 , 31},
{125 , 46 ,141},
{118 ,171 , 47},
{ 76 ,189 ,237},
{238 , 19 , 46},
{ 76 , 76 , 76},
{153 ,153 ,153},
{255 , 0 , 0},
{255 ,127 , 0},
{190 ,190 , 0},
{ 0 ,255 , 0},
{ 0 , 0 ,255},
{170 , 0 ,255},
{ 84 , 84 , 0},
{ 84 ,170 , 0},
{ 84 ,255 , 0},
{170 , 84 , 0},
{170 ,170 , 0},
{170 ,255 , 0},
{255 , 84 , 0},
{255 ,170 , 0},
{255 ,255 , 0},
{ 0 , 84 ,127},
{ 0 ,170 ,127},
{ 0 ,255 ,127},
{ 84 , 0 ,127},
{ 84 , 84 ,127},
{ 84 ,170 ,127},
{ 84 ,255 ,127},
{170 , 0 ,127},
{170 , 84 ,127},
{170 ,170 ,127},
{170 ,255 ,127},
{255 , 0 ,127},
{255 , 84 ,127},
{255 ,170 ,127},
{255 ,255 ,127},
{ 0 , 84 ,255},
{ 0 ,170 ,255},
{ 0 ,255 ,255},
{ 84 , 0 ,255},
{ 84 , 84 ,255},
{ 84 ,170 ,255},
{ 84 ,255 ,255},
{170 , 0 ,255},
{170 , 84 ,255},
{170 ,170 ,255},
{170 ,255 ,255},
{255 , 0 ,255},
{255 , 84 ,255},
{255 ,170 ,255},
{ 42 , 0 , 0},
{ 84 , 0 , 0},
{127 , 0 , 0},
{170 , 0 , 0},
{212 , 0 , 0},
{255 , 0 , 0},
{ 0 , 42 , 0},
{ 0 , 84 , 0},
{ 0 ,127 , 0},
{ 0 ,170 , 0},
{ 0 ,212 , 0},
{ 0 ,255 , 0},
{ 0 , 0 , 42},
{ 0 , 0 , 84},
{ 0 , 0 ,127},
{ 0 , 0 ,170},
{ 0 , 0 ,212},
{ 0 , 0 ,255},
{ 0 , 0 , 0},
{ 36 , 36 , 36},
{ 72 , 72 , 72},
{109 ,109 ,109},
{145 ,145 ,145},
{182 ,182 ,182},
{218 ,218 ,218},
{ 0 ,113 ,188},
{ 80 ,182 ,188},
{127 ,127 , 0},
};
void draw_bboxes(const cv::Mat& bgr, const std::vector<BoxInfo>& bboxes, object_rect effect_roi)
{
static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus",
"train", "truck", "boat", "traffic light", "fire hydrant",
"stop sign", "parking meter", "bench", "bird", "cat", "dog",
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat",
"baseball glove", "skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
"banana", "apple", "sandwich", "orange", "broccoli", "carrot",
"hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop",
"mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
"toaster", "sink", "refrigerator", "book", "clock", "vase",
"scissors", "teddy bear", "hair drier", "toothbrush"
};
cv::Mat image = bgr.clone();
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
for (size_t i = 0; i < bboxes.size(); i++)
{
const BoxInfo& bbox = bboxes[i];
cv::Scalar color = cv::Scalar(color_list[bbox.label][0], color_list[bbox.label][1], color_list[bbox.label][2]);
cv::rectangle(image, cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio, (bbox.y1 - effect_roi.y) * height_ratio),
cv::Point((bbox.x2 - effect_roi.x) * width_ratio, (bbox.y2 - effect_roi.y) * height_ratio)), color);
char text[256];
sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100);
int baseLine = 0;
cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = (bbox.x1 - effect_roi.x) * width_ratio;
int y = (bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine;
if (y < 0)
y = 0;
if (x + label_size.width > image.cols)
x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
color, -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255));
}
cv::imshow("image", image);
}
int image_demo(PicoDet& detector, const char* imagepath)
{
std::vector<std::string> filenames;
cv::glob(imagepath, filenames, false);
    for (const auto& img_name : filenames)
{
cv::Mat image = cv::imread(img_name);
        if (image.empty())
        {
            fprintf(stderr, "cv::imread %s failed\n", img_name.c_str());
            return -1;
        }
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
        draw_bboxes(image, results, effect_roi);
        cv::waitKey(0);  // draw_bboxes only calls imshow; wait for a key per image
}
return 0;
}
int webcam_demo(PicoDet& detector, int cam_id)
{
cv::Mat image;
cv::VideoCapture cap(cam_id);
while (true)
{
        cap >> image;
        if (image.empty())
            break;  // camera stream ended or failed
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
int video_demo(PicoDet& detector, const char* path)
{
cv::Mat image;
cv::VideoCapture cap(path);
while (true)
{
        cap >> image;
        if (image.empty())
            break;  // end of video
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
int benchmark(PicoDet& detector)
{
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
cv::Mat image(320, 320, CV_8UC3, cv::Scalar(1, 1, 1));
for (int i = 0; i < warm_up + loop_num; i++)
{
auto start = std::chrono::steady_clock::now();
std::vector<BoxInfo> results;
results = detector.detect(image, 0.4, 0.5);
auto end = std::chrono::steady_clock::now();
double time = std::chrono::duration<double, std::milli>(end - start).count();
if (i >= warm_up)
{
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet", time_min, time_max, time_avg);
return 0;
}
int main(int argc, char** argv)
{
if (argc != 3)
{
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; \n For benchmark, mode=3 path=0.\n", argv[0]);
return -1;
}
std::cout<<"start init model"<<std::endl;
auto detector = PicoDet("../weight/picodet_m_416.xml");
std::cout<<"success"<<std::endl;
int mode = atoi(argv[1]);
switch (mode)
{
case 0:{
int cam_id = atoi(argv[2]);
webcam_demo(detector, cam_id);
break;
}
case 1:{
const char* images = argv[2];
image_demo(detector, images);
break;
}
case 2:{
const char* path = argv[2];
video_demo(detector, path);
break;
}
case 3:{
benchmark(detector);
break;
}
default:{
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; \n For benchmark, mode=3 path=0.\n", argv[0]);
break;
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino
#include "picodet_openvino.h"
inline float fast_exp(float x)
{
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x)
{
return 1.0f / (1.0f + fast_exp(-x));
}
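// Stable softmax: subtracting the max element keeps fast_exp's argument in a
// safe range before normalizing by the sum of exponentials.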
template<typename _Tp>
int activation_function_softmax(const _Tp* src, _Tp* dst, int length)
{
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{ 0 };
for (int i = 0; i < length; ++i)
{
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i)
{
dst[i] /= denominator;
}
return 0;
}
PicoDet::PicoDet(const char* model_path)
{
InferenceEngine::Core ie;
InferenceEngine::CNNNetwork model = ie.ReadNetwork(model_path);
// prepare input settings
InferenceEngine::InputsDataMap inputs_map(model.getInputsInfo());
input_name_ = inputs_map.begin()->first;
InferenceEngine::InputInfo::Ptr input_info = inputs_map.begin()->second;
//prepare output settings
InferenceEngine::OutputsDataMap outputs_map(model.getOutputsInfo());
for (auto &output_info : outputs_map)
{
output_info.second->setPrecision(InferenceEngine::Precision::FP32);
}
//get network
network_ = ie.LoadNetwork(model, "CPU");
infer_request_ = network_.CreateInferRequest();
}
PicoDet::~PicoDet()
{
}
void PicoDet::preprocess(cv::Mat& image, InferenceEngine::Blob::Ptr& blob)
{
int img_w = image.cols;
int img_h = image.rows;
int channels = 3;
InferenceEngine::MemoryBlob::Ptr mblob = InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
if (!mblob)
{
        THROW_IE_EXCEPTION << "Expected the input blob to inherit from MemoryBlob, "
                           << "but the cast in PicoDet::preprocess failed";
}
auto mblobHolder = mblob->wmap();
float *blob_data = mblobHolder.as<float *>();
for (size_t c = 0; c < channels; c++)
{
for (size_t h = 0; h < img_h; h++)
{
for (size_t w = 0; w < img_w; w++)
{
blob_data[c * img_w * img_h + h * img_w + w] =
(float)image.at<cv::Vec3b>(h, w)[c];
}
}
}
}
std::vector<BoxInfo> PicoDet::detect(cv::Mat image, float score_threshold, float nms_threshold)
{
InferenceEngine::Blob::Ptr input_blob = infer_request_.GetBlob(input_name_);
preprocess(image, input_blob);
// do inference
infer_request_.Infer();
// get output
std::vector<std::vector<BoxInfo>> results;
results.resize(this->num_class_);
for (const auto& head_info : this->heads_info_)
{
const InferenceEngine::Blob::Ptr dis_pred_blob = infer_request_.GetBlob(head_info.dis_layer);
const InferenceEngine::Blob::Ptr cls_pred_blob = infer_request_.GetBlob(head_info.cls_layer);
auto mdis_pred = InferenceEngine::as<InferenceEngine::MemoryBlob>(dis_pred_blob);
auto mdis_pred_holder = mdis_pred->rmap();
const float *dis_pred = mdis_pred_holder.as<const float *>();
auto mcls_pred = InferenceEngine::as<InferenceEngine::MemoryBlob>(cls_pred_blob);
auto mcls_pred_holder = mcls_pred->rmap();
const float *cls_pred = mcls_pred_holder.as<const float *>();
this->decode_infer(cls_pred, dis_pred, head_info.stride, score_threshold, results);
}
std::vector<BoxInfo> dets;
for (int i = 0; i < (int)results.size(); i++)
{
this->nms(results[i], nms_threshold);
for (auto& box : results[i])
{
dets.push_back(box);
}
}
return dets;
}
void PicoDet::decode_infer(const float*& cls_pred, const float*& dis_pred, int stride, float threshold, std::vector<std::vector<BoxInfo>>& results)
{
int feature_h = input_size_ / stride;
int feature_w = input_size_ / stride;
for (int idx = 0; idx < feature_h * feature_w; idx++)
{
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < num_class_; label++)
{
            if (cls_pred[idx * num_class_ + label] > score)
{
score = cls_pred[idx * num_class_ + label];
cur_label = label;
}
}
if (score > threshold)
{
const float* bbox_pred = dis_pred + idx * (reg_max_ + 1) * 4;
results[cur_label].push_back(this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
BoxInfo PicoDet::disPred2Bbox(const float*& dfl_det, int label, float score, int x, int y, int stride)
{
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
for (int i = 0; i < 4; i++)
{
float dis = 0;
float* dis_after_sm = new float[reg_max_ + 1];
activation_function_softmax(dfl_det + i * (reg_max_ + 1), dis_after_sm, reg_max_ + 1);
for (int j = 0; j < reg_max_ + 1; j++)
{
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)this->input_size_);
float ymax = (std::min)(ct_y + dis_pred[3], (float)this->input_size_);
return BoxInfo { xmin, ymin, xmax, ymax, score, label };
}
void PicoDet::nms(std::vector<BoxInfo>& input_boxes, float NMS_THRESH)
{
std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i)
{
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
* (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i)
{
for (int j = i + 1; j < int(input_boxes.size());)
{
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH)
{
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
}
else
{
j++;
}
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino
#ifndef _PICODET_OPENVINO_H_
#define _PICODET_OPENVINO_H_
#include <string>
#include <opencv2/core.hpp>
#include <inference_engine.hpp>
typedef struct HeadInfo
{
std::string cls_layer;
std::string dis_layer;
int stride;
} HeadInfo;
typedef struct BoxInfo
{
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet
{
public:
    PicoDet(const char* model_path);
~PicoDet();
InferenceEngine::ExecutableNetwork network_;
InferenceEngine::InferRequest infer_request_;
// static bool hasGPU;
std::vector<HeadInfo> heads_info_{
// cls_pred|dis_pred|stride
{"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8},
{"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16},
{"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32},
{"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64},
};
std::vector<BoxInfo> detect(cv::Mat image, float score_threshold, float nms_threshold);
private:
void preprocess(cv::Mat& image, InferenceEngine::Blob::Ptr& blob);
void decode_infer(const float*& cls_pred, const float*& dis_pred, int stride, float threshold, std::vector<std::vector<BoxInfo>>& results);
BoxInfo disPred2Bbox(const float*& dfl_det, int label, float score, int x, int y, int stride);
static void nms(std::vector<BoxInfo>& result, float nms_threshold);
std::string input_name_;
int input_size_ = 320;
int num_class_ = 80;
int reg_max_ = 7;
};
#endif