Unverified commit d7cb0f91, authored by zhiboniu, committed by GitHub

cherry-pick 2 PRs from develop (#4674)

* add deploy keypoint infer save results (#4480)

* fix cpp infer deploy visualize
Parent f3b4c238
@@ -14,72 +14,96 @@
#include <glog/logging.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#ifdef _WIN32
#include <direct.h>
#include <io.h>
#elif LINUX
#include <stdarg.h>
#include <sys/stat.h>
#endif
#include <gflags/gflags.h>
#include "include/keypoint_detector.h"
#include "include/object_detector.h"
#include "include/preprocess_op.h"

DEFINE_string(model_dir, "", "Path of object detector inference model");
DEFINE_string(model_dir_keypoint,
              "",
              "Path of keypoint detector inference model");
DEFINE_string(image_file, "", "Path of input image");
DEFINE_string(image_dir,
              "",
              "Dir of input image, `image_file` has a higher priority.");
DEFINE_int32(batch_size, 1, "batch_size of object detector");
DEFINE_int32(batch_size_keypoint, 8, "batch_size of keypoint detector");
DEFINE_string(
    video_file,
    "",
    "Path of input video, `video_file` or `camera_id` has a highest priority.");
DEFINE_int32(camera_id, -1, "Device id of camera to predict");
DEFINE_bool(
    use_gpu,
    false,
    "Deprecated, please use `--device` to set the device you want to run.");
DEFINE_string(device,
              "CPU",
              "Choose the device you want to run, it can be: CPU/GPU/XPU, "
              "default is CPU.");
DEFINE_double(threshold, 0.5, "Threshold of score.");
DEFINE_double(threshold_keypoint, 0.5, "Threshold of score.");
DEFINE_string(output_dir, "output", "Directory of output visualization files.");
DEFINE_string(run_mode,
              "fluid",
              "Mode of running(fluid/trt_fp32/trt_fp16/trt_int8)");
DEFINE_int32(gpu_id, 0, "Device id of GPU to execute");
DEFINE_bool(run_benchmark,
            false,
            "Whether to predict a image_file repeatedly for benchmark");
DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU");
DEFINE_int32(cpu_threads, 1, "Num of threads with CPU");
DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI");
DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI");
DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI");
DEFINE_bool(trt_calib_mode,
            false,
            "If the model is produced by TRT offline quantitative calibration, "
            "trt_calib_mode need to set True");
DEFINE_bool(use_dark, true, "Whether use dark decode in keypoint postprocess");

void PrintBenchmarkLog(std::vector<double> det_time, int img_num) {
  LOG(INFO) << "----------------------- Config info -----------------------";
  LOG(INFO) << "runtime_device: " << FLAGS_device;
  LOG(INFO) << "ir_optim: "
            << "True";
  LOG(INFO) << "enable_memory_optim: "
            << "True";
  int has_trt = FLAGS_run_mode.find("trt");
  if (has_trt >= 0) {
    LOG(INFO) << "enable_tensorrt: "
              << "True";
    std::string precision = FLAGS_run_mode.substr(4, 8);
    LOG(INFO) << "precision: " << precision;
  } else {
    LOG(INFO) << "enable_tensorrt: "
              << "False";
    LOG(INFO) << "precision: "
              << "fp32";
  }
  LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? "True" : "False");
  LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_threads;
  LOG(INFO) << "----------------------- Data info -----------------------";
  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
  LOG(INFO) << "input_shape: "
            << "dynamic shape";
  LOG(INFO) << "----------------------- Model info -----------------------";
  FLAGS_model_dir.erase(FLAGS_model_dir.find_last_not_of("/") + 1);
  LOG(INFO) << "model_name: " << FLAGS_model_dir;
@@ -93,11 +117,12 @@ void PrintBenchmarkLog(std::vector<double> det_time, int img_num) {
            << ", postprocess_time(ms): " << det_time[2] / img_num;
}

void PrintKptsBenchmarkLog(std::vector<double> det_time, int img_num) {
  LOG(INFO) << "----------------------- Data info -----------------------";
  LOG(INFO) << "batch_size_keypoint: " << FLAGS_batch_size_keypoint;
  LOG(INFO) << "----------------------- Model info -----------------------";
  FLAGS_model_dir_keypoint.erase(
      FLAGS_model_dir_keypoint.find_last_not_of("/") + 1);
  LOG(INFO) << "keypoint_model_name: " << FLAGS_model_dir_keypoint;
  LOG(INFO) << "----------------------- Perf info ------------------------";
  LOG(INFO) << "Total number of predicted data: " << img_num
@@ -110,7 +135,7 @@ void PrintKptsBenchmarkLog(std::vector<double> det_time, int img_num) {
            << ", postprocess_time(ms): " << det_time[2] / img_num;
}

static std::string DirName(const std::string& filepath) {
  auto pos = filepath.rfind(OS_PATH_SEP);
  if (pos == std::string::npos) {
    return "";
@@ -118,7 +143,7 @@ static std::string DirName(const std::string& filepath) {
  return filepath.substr(0, pos);
}

static bool PathExists(const std::string& path) {
#ifdef _WIN32
  struct _stat buffer;
  return (_stat(path.c_str(), &buffer) == 0);
@@ -158,11 +183,12 @@ void PredictVideo(const std::string& video_path,
  // Open video
  cv::VideoCapture capture;
  std::string video_out_name = "output.mp4";
  if (FLAGS_camera_id != -1) {
    capture.open(FLAGS_camera_id);
  } else {
    capture.open(video_path.c_str());
    video_out_name =
        video_path.substr(video_path.find_last_of(OS_PATH_SEP) + 1);
  }
  if (!capture.isOpened()) {
    printf("can not open video : %s\n", video_path.c_str());
@@ -173,7 +199,8 @@ void PredictVideo(const std::string& video_path,
  int video_width = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_WIDTH));
  int video_height = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_HEIGHT));
  int video_fps = static_cast<int>(capture.get(CV_CAP_PROP_FPS));
  int video_frame_count =
      static_cast<int>(capture.get(CV_CAP_PROP_FRAME_COUNT));
  printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count);
  // Create VideoWriter for output
@@ -199,7 +226,6 @@ void PredictVideo(const std::string& video_path,
  auto labels = det->GetLabelList();
  auto colormap = PaddleDetection::GenerateColorMap(labels.size());
  std::vector<PaddleDetection::KeyPointResult> result_kpts;
  std::vector<cv::Mat> imgs_kpts;
@@ -221,55 +247,61 @@ void PredictVideo(const std::string& video_path,
    std::vector<PaddleDetection::ObjectResult> out_result;
    for (const auto& item : result) {
      if (item.confidence < FLAGS_threshold || item.class_id == -1) {
        continue;
      }
      out_result.push_back(item);
      if (item.rect.size() > 6) {
        is_rbox = true;
        printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
               item.class_id,
               item.confidence,
               item.rect[0],
               item.rect[1],
               item.rect[2],
               item.rect[3],
               item.rect[4],
               item.rect[5],
               item.rect[6],
               item.rect[7]);
      } else {
        printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
               item.class_id,
               item.confidence,
               item.rect[0],
               item.rect[1],
               item.rect[2],
               item.rect[3]);
      }
    }
    if (keypoint) {
      result_kpts.clear();
      int imsize = out_result.size();
      for (int i = 0; i < imsize; i++) {
        auto item = out_result[i];
        cv::Mat crop_img;
        std::vector<double> keypoint_times;
        std::vector<int> rect = {
            item.rect[0], item.rect[1], item.rect[2], item.rect[3]};
        std::vector<float> center;
        std::vector<float> scale;
        if (item.class_id == 0) {
          PaddleDetection::CropImg(frame, crop_img, rect, center, scale);
          center_bs.emplace_back(center);
          scale_bs.emplace_back(scale);
          imgs_kpts.emplace_back(crop_img);
        }
        if (imgs_kpts.size() == FLAGS_batch_size_keypoint ||
            ((i == imsize - 1) && !imgs_kpts.empty())) {
          keypoint->Predict(imgs_kpts,
                            center_bs,
                            scale_bs,
                            FLAGS_threshold,
                            0,
                            1,
                            &result_kpts,
                            &keypoint_times);
          imgs_kpts.clear();
          center_bs.clear();
          scale_bs.clear();
@@ -277,8 +309,7 @@ void PredictVideo(const std::string& video_path,
      }
      cv::Mat out_im = VisualizeKptsResult(frame, result_kpts, colormap_kpts);
      video_out.write(out_im);
    } else {
      // Visualization result
      cv::Mat out_im = PaddleDetection::VisualizeResult(
          frame, out_result, labels, colormap, is_rbox);
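The keypoint branch above batches person crops before inference: crops accumulate until FLAGS_batch_size_keypoint is reached, with a final flush on the last detection so leftovers are never dropped. A minimal standalone sketch of that accumulate-and-flush pattern, with a hypothetical Process() standing in for KeyPointDetector::Predict:

#include <cstdio>
#include <vector>

// Stand-in for the batched inference call.
void Process(const std::vector<int>& batch) {
  std::printf("processing batch of %zu items\n", batch.size());
}

void RunBatched(const std::vector<int>& items, size_t batch_size) {
  std::vector<int> buffer;
  for (size_t i = 0; i < items.size(); ++i) {
    buffer.push_back(items[i]);
    // Flush when the buffer is full, or on the last item with leftovers.
    if (buffer.size() == batch_size ||
        (i == items.size() - 1 && !buffer.empty())) {
      Process(buffer);
      buffer.clear();
    }
  }
}

int main() {
  RunBatched({1, 2, 3, 4, 5, 6, 7}, 3);  // batches of 3, 3, then 1
  return 0;
}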
@@ -299,11 +330,13 @@ void PredictImage(const std::vector<std::string> all_img_paths,
                  PaddleDetection::KeyPointDetector* keypoint,
                  const std::string& output_dir = "output") {
  std::vector<double> det_t = {0, 0, 0};
  int steps = ceil(static_cast<float>(all_img_paths.size()) / batch_size);
  int kpts_imgs = 0;
  std::vector<double> keypoint_t = {0, 0, 0};
  printf("total images = %d, batch_size = %d, total steps = %d\n",
         all_img_paths.size(),
         batch_size,
         steps);
  for (int idx = 0; idx < steps; idx++) {
    std::vector<cv::Mat> batch_imgs;
    int left_image_cnt = all_img_paths.size() - idx * batch_size;
@@ -311,11 +344,11 @@ void PredictImage(const std::vector<std::string> all_img_paths,
      left_image_cnt = batch_size;
    }
    for (int bs = 0; bs < left_image_cnt; bs++) {
      std::string image_file_path = all_img_paths.at(idx * batch_size + bs);
      cv::Mat im = cv::imread(image_file_path, 1);
      batch_imgs.insert(batch_imgs.end(), im);
    }
    // Store all detected result
    std::vector<PaddleDetection::ObjectResult> result;
    std::vector<int> bbox_num;
@@ -330,7 +363,8 @@ void PredictImage(const std::vector<std::string> all_img_paths,
    bool is_rbox = false;
    if (run_benchmark) {
      det->Predict(
          batch_imgs, threshold, 10, 10, &result, &bbox_num, &det_times);
    } else {
      det->Predict(batch_imgs, threshold, 0, 1, &result, &bbox_num, &det_times);
    }
@@ -349,33 +383,33 @@ void PredictImage(const std::vector<std::string> all_img_paths,
        }
        detect_num += 1;
        im_result.push_back(item);
        if (item.rect.size() > 6) {
          is_rbox = true;
          printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
                 item.class_id,
                 item.confidence,
                 item.rect[0],
                 item.rect[1],
                 item.rect[2],
                 item.rect[3],
                 item.rect[4],
                 item.rect[5],
                 item.rect[6],
                 item.rect[7]);
        } else {
          printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
                 item.class_id,
                 item.confidence,
                 item.rect[0],
                 item.rect[1],
                 item.rect[2],
                 item.rect[3]);
        }
      }
      std::cout << all_img_paths.at(idx * batch_size + i)
                << " The number of detected box: " << detect_num << std::endl;
      item_start_idx = item_start_idx + bbox_num[i];
      std::vector<int> compression_params;
      compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
      compression_params.push_back(95);
@@ -384,18 +418,17 @@ void PredictImage(const std::vector<std::string> all_img_paths,
        output_path += OS_PATH_SEP;
      }
      std::string image_file_path = all_img_paths.at(idx * batch_size + i);
      if (keypoint) {
        int imsize = im_result.size();
        for (int i = 0; i < imsize; i++) {
          auto item = im_result[i];
          cv::Mat crop_img;
          std::vector<double> keypoint_times;
          std::vector<int> rect = {
              item.rect[0], item.rect[1], item.rect[2], item.rect[3]};
          std::vector<float> center;
          std::vector<float> scale;
          if (item.class_id == 0) {
            PaddleDetection::CropImg(im, crop_img, rect, center, scale);
            center_bs.emplace_back(center);
            scale_bs.emplace_back(scale);
@@ -403,13 +436,26 @@ void PredictImage(const std::vector<std::string> all_img_paths,
            kpts_imgs += 1;
          }
          if (imgs_kpts.size() == FLAGS_batch_size_keypoint ||
              ((i == imsize - 1) && !imgs_kpts.empty())) {
            if (run_benchmark) {
              keypoint->Predict(imgs_kpts,
                                center_bs,
                                scale_bs,
                                0.5,
                                10,
                                10,
                                &result_kpts,
                                &keypoint_times);
            } else {
              keypoint->Predict(imgs_kpts,
                                center_bs,
                                scale_bs,
                                0.5,
                                0,
                                1,
                                &result_kpts,
                                &keypoint_times);
            }
            imgs_kpts.clear();
            center_bs.clear();
@@ -419,21 +465,25 @@ void PredictImage(const std::vector<std::string> all_img_paths,
            keypoint_t[2] += keypoint_times[2];
          }
        }
        std::string kpts_savepath =
            output_path + "keypoint_" +
            image_file_path.substr(image_file_path.find_last_of('/') + 1);
        cv::Mat kpts_vis_img =
            VisualizeKptsResult(im, result_kpts, colormap_kpts);
        cv::imwrite(kpts_savepath, kpts_vis_img, compression_params);
        printf("Visualized output saved as %s\n", kpts_savepath.c_str());
      } else {
        // Visualization result
        cv::Mat vis_img = PaddleDetection::VisualizeResult(
            im, im_result, labels, colormap, is_rbox);
        std::string det_savepath =
            output_path +
            image_file_path.substr(image_file_path.find_last_of('/') + 1);
        cv::imwrite(det_savepath, vis_img, compression_params);
        printf("Visualized output saved as %s\n", det_savepath.c_str());
      }
    }
    det_t[0] += det_times[0];
    det_t[1] += det_times[1];
    det_t[2] += det_times[2];
@@ -447,43 +497,65 @@ void PredictImage(const std::vector<std::string> all_img_paths,
int main(int argc, char** argv) {
  // Parsing command-line
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_model_dir.empty() ||
      (FLAGS_image_file.empty() && FLAGS_image_dir.empty() &&
       FLAGS_video_file.empty())) {
    std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ "
                 "(--model_dir_keypoint=/PATH/TO/INFERENCE_MODEL/)"
              << "--image_file=/PATH/TO/INPUT/IMAGE/" << std::endl;
    return -1;
  }
  if (!(FLAGS_run_mode == "fluid" || FLAGS_run_mode == "trt_fp32" ||
        FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) {
    std::cout
        << "run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'.";
    return -1;
  }
  transform(FLAGS_device.begin(),
            FLAGS_device.end(),
            FLAGS_device.begin(),
            ::toupper);
  if (!(FLAGS_device == "CPU" || FLAGS_device == "GPU" ||
        FLAGS_device == "XPU")) {
    std::cout << "device should be 'CPU', 'GPU' or 'XPU'.";
    return -1;
  }
  if (FLAGS_use_gpu) {
    std::cout << "Deprecated, please use `--device` to set the device you want "
                 "to run.";
    return -1;
  }
  // Load model and create a object detector
  PaddleDetection::ObjectDetector det(FLAGS_model_dir,
                                      FLAGS_device,
                                      FLAGS_use_mkldnn,
                                      FLAGS_cpu_threads,
                                      FLAGS_run_mode,
                                      FLAGS_batch_size,
                                      FLAGS_gpu_id,
                                      FLAGS_trt_min_shape,
                                      FLAGS_trt_max_shape,
                                      FLAGS_trt_opt_shape,
                                      FLAGS_trt_calib_mode);
  PaddleDetection::KeyPointDetector* keypoint = nullptr;
  if (!FLAGS_model_dir_keypoint.empty()) {
    keypoint = new PaddleDetection::KeyPointDetector(FLAGS_model_dir_keypoint,
                                                     FLAGS_device,
                                                     FLAGS_use_mkldnn,
                                                     FLAGS_cpu_threads,
                                                     FLAGS_run_mode,
                                                     FLAGS_batch_size_keypoint,
                                                     FLAGS_gpu_id,
                                                     FLAGS_trt_min_shape,
                                                     FLAGS_trt_max_shape,
                                                     FLAGS_trt_opt_shape,
                                                     FLAGS_trt_calib_mode,
                                                     FLAGS_use_dark);
  }
  // Do inference on input video or image
  if (!PathExists(FLAGS_output_dir)) {
    MkDirs(FLAGS_output_dir);
  }
  if (!FLAGS_video_file.empty() || FLAGS_camera_id != -1) {
    PredictVideo(FLAGS_video_file, &det, keypoint, FLAGS_output_dir);
@@ -493,17 +565,23 @@ int main(int argc, char** argv) {
    if (!FLAGS_image_file.empty()) {
      all_img_paths.push_back(FLAGS_image_file);
      if (FLAGS_batch_size > 1) {
        std::cout << "batch_size should be 1, when set `image_file`."
                  << std::endl;
        return -1;
      }
    } else {
      cv::glob(FLAGS_image_dir, cv_all_img_paths);
      for (const auto& img_path : cv_all_img_paths) {
        all_img_paths.push_back(img_path);
      }
    }
    PredictImage(all_img_paths,
                 FLAGS_batch_size,
                 FLAGS_threshold,
                 FLAGS_run_benchmark,
                 &det,
                 keypoint,
                 FLAGS_output_dir);
  }
  delete keypoint;
  keypoint = nullptr;
......
@@ -13,7 +13,7 @@
# limitations under the License.
import os
import json
import cv2
import math
import numpy as np
@@ -80,7 +80,7 @@ def predict_with_given_det(image, det_res, keypoint_detector,
    keypoint_res = {}
    keypoint_res['keypoint'] = [
        np.vstack(keypoint_vector).tolist(), np.vstack(score_vector).tolist()
    ] if len(keypoint_vector) > 0 else [[], []]
    keypoint_res['bbox'] = rect_vector
    return keypoint_res
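The switch from np.vstack(...) to np.vstack(...).tolist() above is what makes the results JSON-serializable for the new save path below: numpy arrays cannot be passed to json.dump directly. A quick illustration:

import json
import numpy as np

kpts = np.zeros((17, 3))
# json.dumps(kpts) would raise TypeError: ndarray is not JSON serializable
print(json.dumps(kpts.tolist())[:24])  # nested plain lists serialize fine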
@@ -89,8 +89,10 @@ def predict_with_given_det(image, det_res, keypoint_detector,
def topdown_unite_predict(detector,
                          topdown_keypoint_detector,
                          image_list,
                          keypoint_batch_size=1,
                          save_res=False):
    det_timer = detector.get_timer()
    store_res = []
    for i, img_file in enumerate(image_list):
        # Decode image in advance in det + pose prediction
        det_timer.preprocess_time_s.start()
@@ -114,6 +116,11 @@ def topdown_unite_predict(detector,
            image, results, topdown_keypoint_detector, keypoint_batch_size,
            FLAGS.det_threshold, FLAGS.keypoint_threshold, FLAGS.run_benchmark)
        if save_res:
            store_res.append([
                i, keypoint_res['bbox'],
                [keypoint_res['keypoint'][0], keypoint_res['keypoint'][1]]
            ])
        if FLAGS.run_benchmark:
            cm, gm, gu = get_current_memory_mb()
            topdown_keypoint_detector.cpu_mem += cm
@@ -127,12 +134,23 @@ def topdown_unite_predict(detector,
                keypoint_res,
                visual_thread=FLAGS.keypoint_threshold,
                save_dir=FLAGS.output_dir)
    if save_res:
        """
        1) store_res: a list of image_data
        2) image_data: [imageid, rects, [keypoints, scores]]
        3) rects: list of rect [xmin, ymin, xmax, ymax]
        4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list
        5) scores: mean of all joint conf
        """
        with open("det_keypoint_unite_image_results.json", 'w') as wf:
            json.dump(store_res, wf, indent=4)
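The dumped file mirrors the structure documented in the docstring above; the video variant below writes det_keypoint_unite_video_results.json with the same layout keyed by frame id. A minimal sketch of reading the results back, assuming a prior run with save_res enabled produced the file:

import json
import numpy as np

with open("det_keypoint_unite_image_results.json") as rf:
    store_res = json.load(rf)

for image_id, rects, (keypoints, scores) in store_res:
    # keypoints: one 17x[x, y, conf] list per detected person
    kpts = np.array(keypoints)
    print("image %s: %d boxes, keypoint array %s" %
          (image_id, len(rects), kpts.shape))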
def topdown_unite_predict_video(detector,
                                topdown_keypoint_detector,
                                camera_id,
                                keypoint_batch_size=1,
                                save_res=False):
    video_name = 'output.mp4'
    if camera_id != -1:
        capture = cv2.VideoCapture(camera_id)
@@ -150,9 +168,10 @@ def topdown_unite_predict_video(detector,
    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)
    out_path = os.path.join(FLAGS.output_dir, video_name)
    fourcc = cv2.VideoWriter_fourcc(* 'mp4v')
    writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    index = 0
    store_res = []
    while (1):
        ret, frame = capture.read()
        if not ret:
@@ -172,6 +191,11 @@ def topdown_unite_predict_video(detector,
            keypoint_res,
            visual_thread=FLAGS.keypoint_threshold,
            returnimg=True)
        if save_res:
            store_res.append([
                index, keypoint_res['bbox'],
                [keypoint_res['keypoint'][0], keypoint_res['keypoint'][1]]
            ])
        writer.write(im)
        if camera_id != -1:
@@ -179,6 +203,16 @@ def topdown_unite_predict_video(detector,
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    writer.release()
    if save_res:
        """
        1) store_res: a list of frame_data
        2) frame_data: [frameid, rects, [keypoints, scores]]
        3) rects: list of rect [xmin, ymin, xmax, ymax]
        4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list
        5) scores: mean of all joint conf
        """
        with open("det_keypoint_unite_video_results.json", 'w') as wf:
            json.dump(store_res, wf, indent=4)
def main():
@@ -219,12 +253,13 @@ def main():
    # predict from video file or camera video stream
    if FLAGS.video_file is not None or FLAGS.camera_id != -1:
        topdown_unite_predict_video(detector, topdown_keypoint_detector,
                                    FLAGS.camera_id, FLAGS.keypoint_batch_size,
                                    FLAGS.save_res)
    else:
        # predict from image
        img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
        topdown_unite_predict(detector, topdown_keypoint_detector, img_list,
                              FLAGS.keypoint_batch_size, FLAGS.save_res)
    if not FLAGS.run_benchmark:
        detector.det_times.info(average=True)
        topdown_keypoint_detector.det_times.info(average=True)
......
@@ -115,5 +115,15 @@ def argsparser():
        type=bool,
        default=True,
        help='whether to use darkpose to get better keypoint position predict ')
    parser.add_argument(
        '--save_res',
        type=bool,
        default=False,
        help=(
            "whether to save predict results to json file"
            "1) store_res: a list of image_data"
            "2) image_data: [imageid, rects, [keypoints, scores]]"
            "3) rects: list of rect [xmin, ymin, xmax, ymax]"
            "4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list"
            "5) scores: mean of all joint conf"))
    return parser
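One caveat worth noting about the new flag: argparse's type=bool converts via bool(str), and any non-empty string is truthy, so --save_res=False on the command line would still enable saving. A hypothetical alternative (not what this commit does) sidesteps that:

# bool("False") is True, so a type=bool flag cannot be switched off by value.
# A store_true action defaults to False and flips only when the flag is present:
parser.add_argument(
    '--save_res',
    action='store_true',
    help='whether to save predict results to json file')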
@@ -240,6 +240,7 @@ def draw_pose(imgfile,
        raise e
    skeletons, scores = results['keypoint']
    skeletons = np.array(skeletons)
    kpt_nums = 17
    if len(skeletons) > 0:
        kpt_nums = skeletons.shape[1]
......