diff --git a/deploy/cpp/docs/linux_build.md b/deploy/cpp/docs/linux_build.md
index 70788adfd1aa1fdfe8ef0da85794b8fe5773276e..321ec564569e8eb491f842439c222726bb32e30c 100644
--- a/deploy/cpp/docs/linux_build.md
+++ b/deploy/cpp/docs/linux_build.md
@@ -85,7 +85,7 @@ make
 ```shell
 sh ./scripts/build.sh
 ```
-
+**Note**: OpenCV depends on OpenBLAS. Ubuntu users should check whether `libopenblas.so` is already present on the system; if not, install it with `apt-get install libopenblas-dev`.
 ### Step5: Prediction and Visualization

 After a successful build, the prediction entry program is `build/main`; its main command-line arguments are described below:
@@ -95,9 +95,10 @@
 | image_path | Path of the image file to predict |
 | video_path | Path of the video file to predict |
 | use_gpu | Whether to use GPU for prediction; supported values are 0 or 1 (default: 0) |
+| gpu_id | GPU device id used for inference (default: 0) |
 | --run_mode | When using GPU, defaults to fluid; options: fluid/trt_fp32/trt_fp16 |

-**Note**:If both `video_path` and `image_path` are set, the program only predicts `video_path`.
+**Note**: If both `video_path` and `image_path` are set, the program only predicts `video_path`.

 `Example 1`:
@@ -111,7 +112,7 @@
 `Example 2`:

 ```shell
-# Predict the video `/root/projects/videos/test.avi` with `GPU`
-./build/main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.avi --use_gpu=1
+# Predict the video `/root/projects/videos/test.mp4` with `GPU`
+./build/main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/videos/test.mp4 --use_gpu=1
 ```
-The `visualized prediction result` of the video is saved as `output.avi` in the current directory.
+Only `.mp4` videos are currently supported for prediction; the `visualized prediction result` is saved as `output.mp4` in the current directory.
diff --git a/deploy/cpp/docs/windows_vs2019_build.md b/deploy/cpp/docs/windows_vs2019_build.md
index 7f23a983ce0f7f50545d5b4dd7b63746955aa209..18870d21dadecca65e42d4472be1f3aaab5f2d99 100644
--- a/deploy/cpp/docs/windows_vs2019_build.md
+++ b/deploy/cpp/docs/windows_vs2019_build.md
@@ -96,6 +96,7 @@ cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release
 | image_path | Path of the image file to predict |
 | video_path | Path of the video file to predict |
 | use_gpu | Whether to use GPU for prediction; supported values are 0 or 1 (default: 0) |
+| gpu_id | GPU device id used for inference (default: 0) |

 **Note**: If both `video_path` and `image_path` are set, the program only predicts `video_path`.

@@ -111,8 +112,8 @@
 `Example 2`:

 ```shell
-# Test the video `D:\\videos\\test.avi` with `GPU`
-.\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.jpeg --use_gpu=1
+# Test the video `D:\\videos\\test.mp4` with `GPU`
+.\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.mp4 --use_gpu=1
 ```
-The `visualized prediction result` of the video is saved as `output.avi` in the current directory.
+Only `.mp4` videos are currently supported for prediction; the `visualized prediction result` is saved as `output.mp4` in the current directory.
diff --git a/deploy/cpp/include/object_detector.h b/deploy/cpp/include/object_detector.h
index 328dd458efb4e165d13974415535a6bf3a5c2f65..82d860f8d24a92c965010d84988ceba62454b2cd 100644
--- a/deploy/cpp/include/object_detector.h
+++ b/deploy/cpp/include/object_detector.h
@@ -54,12 +54,14 @@ cv::Mat VisualizeResult(const cv::Mat& img,

 class ObjectDetector {
  public:
-  explicit ObjectDetector(const std::string& model_dir, bool use_gpu = false,
-                          const std::string& run_mode = "fluid") {
+  explicit ObjectDetector(const std::string& model_dir,
+                          bool use_gpu = false,
+                          const std::string& run_mode = "fluid",
+                          const int gpu_id = 0) {
     config_.load_config(model_dir);
     threshold_ = config_.draw_threshold_;
     preprocessor_.Init(config_.preprocess_info_, config_.arch_);
-    LoadModel(model_dir, use_gpu, config_.min_subgraph_size_, 1, run_mode);
+    LoadModel(model_dir, use_gpu, config_.min_subgraph_size_, 1, run_mode, gpu_id);
   }

   // Load Paddle inference model
@@ -68,7 +70,8 @@ class ObjectDetector {
       bool use_gpu,
       const int min_subgraph_size,
       const int batch_size = 1,
-      const std::string& run_mode = "fluid");
+      const std::string& run_mode = "fluid",
+      const int gpu_id = 0);

   // Run predictor
   void Predict(
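The header change above threads the new `gpu_id` argument from the `ObjectDetector` constructor into `LoadModel`, which ultimately selects the CUDA device via `config.EnableUseGpu(100, gpu_id)` (see `object_detector.cc` below). For readers working from the Python side, here is a minimal sketch of the same device selection using the `fluid.core.AnalysisConfig` API that `deploy/python/infer.py` already relies on; the model paths and device id are placeholders:

```python
# Sketch: select a GPU device id with AnalysisConfig, mirroring the C++
# EnableUseGpu(100, gpu_id) call. Paths and device id are illustrative only.
from paddle import fluid

config = fluid.core.AnalysisConfig('model/__model__', 'model/__params__')
config.enable_use_gpu(100, 1)  # 100 MB initial GPU workspace, device id 1
predictor = fluid.core.create_paddle_predictor(config)
```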
diff --git a/deploy/cpp/scripts/bootstrap.sh b/deploy/cpp/scripts/bootstrap.sh
index f9fc1d1edc327370f7b5d8e7494cb88d4fd4d12c..65e30cbd8286505175bf512bae82a1053996bd1e 100644
--- a/deploy/cpp/scripts/bootstrap.sh
+++ b/deploy/cpp/scripts/bootstrap.sh
@@ -1,10 +1,9 @@
 # download pre-compiled opencv lib
-OPENCV_URL=https://paddleseg.bj.bcebos.com/deploy/docker/opencv3gcc4.8.tar.bz2
-if [ ! -d "./deps/opencv3gcc4.8" ]; then
+OPENCV_URL=https://bj.bcebos.com/paddleseg/deploy/opencv3.4.6gcc4.8ffmpeg.tar.gz2
+if [ ! -d "./deps/opencv3.4.6gcc4.8ffmpeg/" ]; then
     mkdir -p deps
     cd deps
     wget -c ${OPENCV_URL}
-    tar xvfj opencv3gcc4.8.tar.bz2
-    rm -rf opencv3gcc4.8.tar.bz2
+    tar xvfj opencv3.4.6gcc4.8ffmpeg.tar.gz2
     cd ..
 fi
diff --git a/deploy/cpp/scripts/build.sh b/deploy/cpp/scripts/build.sh
index 0cfd8ceb5fce986b2f5fecd9ee5e7c029308ea0a..626ab9f9850341e87763bcfe77cb5e85745b88d0 100644
--- a/deploy/cpp/scripts/build.sh
+++ b/deploy/cpp/scripts/build.sh
@@ -18,7 +18,7 @@ CUDNN_LIB=/path/to/cudnn/lib/

 # OpenCV path; no change needed if using the bundled pre-compiled version
 sh $(pwd)/scripts/bootstrap.sh  # download the pre-compiled OpenCV
-OPENCV_DIR=$(pwd)/deps/opencv3gcc4.8/
+OPENCV_DIR=$(pwd)/deps/opencv3.4.6gcc4.8ffmpeg/

 # No changes needed below this line
 rm -rf build
diff --git a/deploy/cpp/src/main.cc b/deploy/cpp/src/main.cc
index 63cd99f0105992c43cf91bbf87769b9f0f1864eb..45e46e2487d0d17f222dd390af3c379b330159bf 100644
--- a/deploy/cpp/src/main.cc
+++ b/deploy/cpp/src/main.cc
@@ -25,13 +25,14 @@ DEFINE_string(model_dir, "", "Path of inference model");
 DEFINE_string(image_path, "", "Path of input image");
 DEFINE_string(video_path, "", "Path of input video");
 DEFINE_bool(use_gpu, false, "Inferring with GPU or CPU");
-DEFINE_string(run_mode, "fluid", "mode of running(fluid/trt_fp32/trt_fp16)");
+DEFINE_string(run_mode, "fluid", "Mode of running(fluid/trt_fp32/trt_fp16)");
+DEFINE_int32(gpu_id, 0, "Device id of GPU to execute");

 void PredictVideo(const std::string& video_path,
                   PaddleDetection::ObjectDetector* det) {
   // Open video
   cv::VideoCapture capture;
-  capture.open(video_path.c_str());
+  capture.open(video_path);
   if (!capture.isOpened()) {
     printf("can not open video : %s\n", video_path.c_str());
     return;
@@ -44,9 +46,9 @@ void PredictVideo(const std::string& video_path,

   // Create VideoWriter for output
   cv::VideoWriter video_out;
-  std::string video_out_path = "output.avi";
+  std::string video_out_path = "output.mp4";
   video_out.open(video_out_path.c_str(),
-                 CV_FOURCC('M', 'J', 'P', 'G'),
+                 0x00000021,
                  video_fps,
                  cv::Size(video_width, video_height),
                  true);
@@ -60,6 +62,7 @@ void PredictVideo(const std::string& video_path,
   auto colormap = PaddleDetection::GenerateColorMap(labels.size());
   // Capture all frames and do inference
   cv::Mat frame;
+  int frame_id = 0;
   while (capture.read(frame)) {
     if (frame.empty()) {
       break;
@@ -67,7 +70,18 @@ void PredictVideo(const std::string& video_path,
     det->Predict(frame, &result);
     cv::Mat out_im = PaddleDetection::VisualizeResult(
         frame, result, labels, colormap);
+    for (const auto& item : result) {
+      printf("Frame %d: detected class=%d confidence=%.2f rect=[%d %d %d %d]\n",
+             frame_id,
+             item.class_id,
+             item.confidence,
+             item.rect[0],
+             item.rect[1],
+             item.rect[2],
+             item.rect[3]);
+    }
     video_out.write(out_im);
+    frame_id += 1;
   }
   capture.release();
   video_out.release();
@@ -97,7 +111,7 @@ void PredictImage(const std::string& image_path,
   std::vector<int> compression_params;
   compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
   compression_params.push_back(95);
-  cv::imwrite("output.jpeg", vis_img, compression_params);
-  printf("Visualized output saved as output.jpeg\n");
+  cv::imwrite("output.jpg", vis_img, compression_params);
+  printf("Visualized output saved as output.jpg\n");
 }

@@ -118,7 +132,7 @@ int main(int argc, char** argv) {

   // Load model and create a object detector
   PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_use_gpu,
-                                      FLAGS_run_mode);
+                                      FLAGS_run_mode, FLAGS_gpu_id);
   // Do inference on input video or image
   if (!FLAGS_video_path.empty()) {
     PredictVideo(FLAGS_video_path, &det);
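Two notes on the `main.cc` hunks: `cv::VideoCapture::open` accepts a `std::string` directly, so the `.c_str()` call was dropped, and the writer's fourcc changed from MJPG (an `.avi` codec) to the numeric tag `0x00000021`, which OpenCV's FFmpeg backend accepts for MP4 output. A minimal Python sketch of the same read-infer-write loop, with placeholder file names:

```python
# Sketch: MP4 in, MP4 out with OpenCV, using the same numeric fourcc tag
# (0x00000021) the C++ code now passes to cv::VideoWriter.
# 'input.mp4' is a placeholder path.
import cv2

cap = cv2.VideoCapture('input.mp4')
fps = cap.get(cv2.CAP_PROP_FPS)
size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
writer = cv2.VideoWriter('output.mp4', 0x00000021, fps, size)
while True:
    ok, frame = cap.read()
    if not ok:
        break
    # run detection and visualization on `frame` here
    writer.write(frame)
cap.release()
writer.release()
```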
diff --git a/deploy/cpp/src/object_detector.cc b/deploy/cpp/src/object_detector.cc
index c6522f4f113927ea7964320fa462f2b6ba21045e..7b660127d9433714c65a52245f4bf38006d06720 100644
--- a/deploy/cpp/src/object_detector.cc
+++ b/deploy/cpp/src/object_detector.cc
@@ -21,13 +21,14 @@ void ObjectDetector::LoadModel(const std::string& model_dir,
                                bool use_gpu,
                                const int min_subgraph_size,
                                const int batch_size,
-                               const std::string& run_mode) {
+                               const std::string& run_mode,
+                               const int gpu_id) {
   paddle::AnalysisConfig config;
   std::string prog_file = model_dir + OS_PATH_SEP + "__model__";
   std::string params_file = model_dir + OS_PATH_SEP + "__params__";
   config.SetModel(prog_file, params_file);
   if (use_gpu) {
-    config.EnableUseGpu(100, 0);
+    config.EnableUseGpu(100, gpu_id);
     if (run_mode != "fluid") {
       auto precision = paddle::AnalysisConfig::Precision::kFloat32;
       if (run_mode == "trt_fp16") {
@@ -182,7 +183,11 @@ void ObjectDetector::Predict(const cv::Mat& im,
   // Calculate output length
   int output_size = 1;
   for (int j = 0; j < output_shape.size(); ++j) {
     output_size *= output_shape[j];
   }
+
+  if (output_size < 6) {
+    std::cerr << "[WARNING] No object detected." << std::endl;
+  }
   output_data_.resize(output_size);
   out_tensor->copy_to_cpu(output_data_.data());
diff --git a/deploy/python/README.md b/deploy/python/README.md
index 105f6285228a04afac4369f33a1fa25d27350bf9..9c810ae1de2c21db787ec0a9055b455f886b496c 100644
--- a/deploy/python/README.md
+++ b/deploy/python/README.md
@@ -48,6 +48,7 @@ python deploy/python/infer.py --model_dir=/path/to/models --image_file=/path/to/
 | --run_mode | No | When using GPU, defaults to fluid; options: fluid/trt_fp32/trt_fp16 |
 | --threshold | No | Score threshold for prediction (default: 0.5) |
 | --output_dir | No | Root directory for saving visualized results (default: output/) |
+| --run_benchmark | No | Whether to run a benchmark; --image_file must also be specified |

 Notes:
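Both the C++ `output_size < 6` check above and the Python `reduce(...) < 6` check below rest on the same layout: each detection occupies six values, which, judging from the fields printed in `main.cc`, appear to be `[class_id, score, xmin, ymin, xmax, ymax]`, so fewer than six output elements means nothing was detected. A minimal sketch with a fabricated one-row output (the column order is an assumption, not confirmed by this patch):

```python
# Sketch: decoding the flat Nx6 detection output. The column order is
# assumed from the fields printed in main.cc; np_boxes is fabricated.
import numpy as np

np_boxes = np.array([[0, 0.92, 10, 20, 110, 220]])  # placeholder output
if np_boxes.size < 6:
    print('[WARNING] No object detected.')
else:
    for class_id, score, xmin, ymin, xmax, ymax in np_boxes:
        print('class=%d confidence=%.2f rect=[%d %d %d %d]'
              % (class_id, score, xmin, ymin, xmax, ymax))
```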
diff --git a/deploy/python/infer.py b/deploy/python/infer.py
index 77d10bf4e41a5fc0c4fc7599e9913a7ede8ffbee..091fb724d0a4c82c4f0391c9c4d99b2f60ffcd8e 100644
--- a/deploy/python/infer.py
+++ b/deploy/python/infer.py
@@ -16,6 +16,8 @@ import os
 import argparse
 import time
 import yaml
+import ast
+from functools import reduce

 from PIL import Image
 import cv2
@@ -286,6 +288,7 @@ class Config():
         self.mask_resolution = None
         if 'mask_resolution' in yml_conf:
             self.mask_resolution = yml_conf['mask_resolution']
+        self.print_config()

     def check_model(self, yml_conf):
         """
@@ -299,6 +302,15 @@ class Config():
                 "Unsupported arch: {}, expect SSD, YOLO, RetinaNet, RCNN and Face".
                 format(yml_conf['arch']))

+    def print_config(self):
+        print('----------- Model Configuration -----------')
+        print('%s: %s' % ('Model Arch', self.arch))
+        print('%s: %s' % ('Use Paddle Executor', self.use_python_inference))
+        print('%s: ' % ('Transform Order'))
+        for op_info in self.preprocess_infos:
+            print('--%s: %s' % ('transform op', op_info['type']))
+        print('--------------------------------------------')
+

 def load_predictor(model_dir,
                    run_mode='fluid',
@@ -322,6 +334,7 @@ def load_predictor(model_dir,
         raise ValueError("TensorRT int8 mode is not supported now, "
                          "please use trt_fp32 or trt_fp16 instead.")
     precision_map = {
+        'trt_int8': fluid.core.AnalysisConfig.Precision.Int8,
        'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
        'trt_fp16': fluid.core.AnalysisConfig.Precision.Half
     }
@@ -450,7 +463,7 @@ class Detector():
             results['masks'] = np_masks
         return results

-    def predict(self, image, threshold=0.5):
+    def predict(self, image, threshold=0.5, warmup=0, repeats=1):
         '''
         Args:
             image (str/np.ndarray): path of image/ np.ndarray read by cv2
@@ -464,13 +477,19 @@ class Detector():
         inputs, im_info = self.preprocess(image)
         np_boxes, np_masks = None, None
         if self.config.use_python_inference:
+            for i in range(warmup):
+                outs = self.executor.run(self.program,
+                                         feed=inputs,
+                                         fetch_list=self.fecth_targets,
+                                         return_numpy=False)
             t1 = time.time()
-            outs = self.executor.run(self.program,
-                                     feed=inputs,
-                                     fetch_list=self.fecth_targets,
-                                     return_numpy=False)
+            for i in range(repeats):
+                outs = self.executor.run(self.program,
+                                         feed=inputs,
+                                         fetch_list=self.fecth_targets,
+                                         return_numpy=False)
             t2 = time.time()
-            ms = (t2 - t1) * 1000.0
+            ms = (t2 - t1) * 1000.0 / repeats
             print("Inference: {} ms per batch image".format(ms))

             np_boxes = np.array(outs[0])
@@ -481,35 +500,55 @@ class Detector():
             for i in range(len(inputs)):
                 input_tensor = self.predictor.get_input_tensor(input_names[i])
                 input_tensor.copy_from_cpu(inputs[input_names[i]])
-            t1 = time.time()
-            self.predictor.zero_copy_run()
-            t2 = time.time()
-            output_names = self.predictor.get_output_names()
-            boxes_tensor = self.predictor.get_output_tensor(output_names[0])
-            np_boxes = boxes_tensor.copy_to_cpu()
-            if self.config.mask_resolution is not None:
-                masks_tensor = self.predictor.get_output_tensor(output_names[1])
-                np_masks = masks_tensor.copy_to_cpu()
+            for i in range(warmup):
+                self.predictor.zero_copy_run()
+                output_names = self.predictor.get_output_names()
+                boxes_tensor = self.predictor.get_output_tensor(output_names[0])
+                np_boxes = boxes_tensor.copy_to_cpu()
+                if self.config.mask_resolution is not None:
+                    masks_tensor = self.predictor.get_output_tensor(
+                        output_names[1])
+                    np_masks = masks_tensor.copy_to_cpu()

-            ms = (t2 - t1) * 1000.0
+            t1 = time.time()
+            for i in range(repeats):
+                self.predictor.zero_copy_run()
+                output_names = self.predictor.get_output_names()
+                boxes_tensor = self.predictor.get_output_tensor(output_names[0])
+                np_boxes = boxes_tensor.copy_to_cpu()
+                if self.config.mask_resolution is not None:
+                    masks_tensor = self.predictor.get_output_tensor(
+                        output_names[1])
+                    np_masks = masks_tensor.copy_to_cpu()
+            t2 = time.time()
+            ms = (t2 - t1) * 1000.0 / repeats
             print("Inference: {} ms per batch image".format(ms))

-        results = self.postprocess(
-            np_boxes, np_masks, im_info, threshold=threshold)
+        if reduce(lambda x, y: x * y, np_boxes.shape) < 6:
+            print('[WARNING] No object detected.')
+            results = {'boxes': np.array([])}
+        else:
+            results = self.postprocess(
+                np_boxes, np_masks, im_info, threshold=threshold)
+
         return results


 def predict_image():
     detector = Detector(
         FLAGS.model_dir, use_gpu=FLAGS.use_gpu, run_mode=FLAGS.run_mode)
-    results = detector.predict(FLAGS.image_file, FLAGS.threshold)
-    visualize(
-        FLAGS.image_file,
-        results,
-        detector.config.labels,
-        mask_resolution=detector.config.mask_resolution,
-        output_dir=FLAGS.output_dir)
+    if FLAGS.run_benchmark:
+        detector.predict(
+            FLAGS.image_file, FLAGS.threshold, warmup=100, repeats=100)
+    else:
+        results = detector.predict(FLAGS.image_file, FLAGS.threshold)
+        visualize(
+            FLAGS.image_file,
+            results,
+            detector.config.labels,
+            mask_resolution=detector.config.mask_resolution,
+            output_dir=FLAGS.output_dir)


 def predict_video():
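The restructured `predict` separates untimed warm-up iterations (letting memory pools and, in the trt modes, TensorRT engines settle) from the timed loop, then averages over `repeats`. The same pattern in isolation, with `run_once` standing in as a placeholder for either executor path:

```python
# Sketch of the warmup/repeats timing pattern used in predict():
# discard `warmup` untimed runs, then average the next `repeats` runs.
# `run_once` is a placeholder for one inference call.
import time

def benchmark(run_once, warmup=100, repeats=100):
    for _ in range(warmup):
        run_once()
    t1 = time.time()
    for _ in range(repeats):
        run_once()
    ms = (time.time() - t1) * 1000.0 / repeats
    print('Inference: {} ms per batch image'.format(ms))
    return ms
```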
@@ -543,6 +582,13 @@
     writer.release()


+def print_arguments(args):
+    print('----------- Running Arguments -----------')
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------')
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
@@ -562,7 +608,15 @@ if __name__ == '__main__':
         default='fluid',
         help="mode of running(fluid/trt_fp32/trt_fp16)")
     parser.add_argument(
-        "--use_gpu", default=False, help="Whether to predict with GPU.")
+        "--use_gpu",
+        type=ast.literal_eval,
+        default=False,
+        help="Whether to predict with GPU.")
+    parser.add_argument(
+        "--run_benchmark",
+        type=ast.literal_eval,
+        default=False,
+        help="Whether to predict an image_file repeatedly for benchmarking.")
     parser.add_argument(
         "--threshold", type=float, default=0.5, help="Threshold of score.")
     parser.add_argument(
@@ -572,6 +626,8 @@ if __name__ == '__main__':
         help="Directory of output visualization files.")

     FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+
     if FLAGS.image_file != '' and FLAGS.video_file != '':
         assert "Cannot predict image and video at the same time"
     if FLAGS.image_file != '':
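A closing note on `type=ast.literal_eval` for `--use_gpu` and `--run_benchmark`: argparse's `type=bool` would treat any non-empty string, including `"False"`, as `True`, whereas `ast.literal_eval` parses the literals `True`/`False` into real booleans. A self-contained demonstration of the pitfall being avoided:

```python
# Sketch: why the boolean flags use ast.literal_eval instead of type=bool.
import argparse
import ast

parser = argparse.ArgumentParser()
parser.add_argument('--use_gpu', type=ast.literal_eval, default=False)
print(parser.parse_args(['--use_gpu', 'False']).use_gpu)  # -> False
print(bool('False'))  # -> True: the behavior type=bool would give
```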