diff --git a/deploy/cpp/demo/classifier.cpp b/deploy/cpp/demo/classifier.cpp
index 29e61b18da7313dc8f574e9d5bae9d90f37779e2..a4b3bfa3c5695f663a49862d49da9848ae86c4b0 100644
--- a/deploy/cpp/demo/classifier.cpp
+++ b/deploy/cpp/demo/classifier.cpp
@@ -21,6 +21,7 @@
 #include <string>
 #include <vector>
 #include <utility>
+#include <omp.h>
 #include "include/paddlex/paddlex.h"
 
 using namespace std::chrono;
diff --git a/deploy/cpp/demo/detector.cpp b/deploy/cpp/demo/detector.cpp
index 428505e30f7073ea3e66450dacb62e7f83962ea7..5068a35ad49791564f625fa140a2a156ecf58ed8 100644
--- a/deploy/cpp/demo/detector.cpp
+++ b/deploy/cpp/demo/detector.cpp
@@ -21,6 +21,7 @@
 #include <string>
 #include <vector>
 #include <utility>
+#include <omp.h>
 
 #include "include/paddlex/paddlex.h"
 #include "include/paddlex/visualize.h"
@@ -90,8 +91,8 @@ int main(int argc, char** argv) {
       total_running_time_s += double(duration.count()) * microseconds::period::num / microseconds::period::den;
       //输出结果目标框
       for(int j = 0; j < im_vec_size - i; ++j) {
-        std::cout << "image file: " << image_paths[i + j] << std::endl;          
         for(int k = 0; k < results[j].boxes.size(); ++k) {
+          std::cout << "image file: " << image_paths[i + j] << ", ";// << std::endl;          
           std::cout << "predict label: " << results[j].boxes[k].category
                     << ", label_id:" << results[j].boxes[k].category_id
                     << ", score: " << results[j].boxes[k].score << ", box(xmin, ymin, w, h):("
diff --git a/deploy/cpp/demo/segmenter.cpp b/deploy/cpp/demo/segmenter.cpp
index fb49cd2c642b679afa00f79f4482cc092de52a34..23bdd1f955e089a477d9bc029b292a59d9434475 100644
--- a/deploy/cpp/demo/segmenter.cpp
+++ b/deploy/cpp/demo/segmenter.cpp
@@ -21,6 +21,7 @@
 #include <string>
 #include <vector>
 #include <utility>
+#include <omp.h>
 #include "include/paddlex/paddlex.h"
 #include "include/paddlex/visualize.h"
 
diff --git a/deploy/cpp/include/paddlex/paddlex.h b/deploy/cpp/include/paddlex/paddlex.h
index b4d518bc1325e91eabfef1cfc05874a665e5688e..bcddd87845f8edff135047a0edbf9706c8579316 100644
--- a/deploy/cpp/include/paddlex/paddlex.h
+++ b/deploy/cpp/include/paddlex/paddlex.h
@@ -75,10 +75,6 @@ class Model {
 
   bool predict(const std::vector<cv::Mat> &im_batch, std::vector<SegResult> &result);
   
-  bool postprocess(SegResult* result);
-
-  bool postprocess(DetResult* result);
-
   std::string type;
   std::string name;
   std::map<int, std::string> labels;
diff --git a/deploy/cpp/include/paddlex/transforms.h b/deploy/cpp/include/paddlex/transforms.h
index 7c5892ece5569f598b83704c2122f7cf2fda2b9e..ed809164733dda8b7e84eda6281b4862d52d0e1a 100644
--- a/deploy/cpp/include/paddlex/transforms.h
+++ b/deploy/cpp/include/paddlex/transforms.h
@@ -58,7 +58,6 @@ class Transform {
  public:
   virtual void Init(const YAML::Node& item) = 0;
   virtual bool Run(cv::Mat* im, ImageBlob* data) = 0;
-  virtual void SetPaddingSize(int max_h, int max_w) {}
 };
 
 class Normalize : public Transform {
@@ -170,13 +169,10 @@ class Padding : public Transform {
     }
   }
   virtual bool Run(cv::Mat* im, ImageBlob* data);
-  virtual void SetPaddingSize(int max_h, int max_w);
  private:
   int coarsest_stride_ = -1;
   int width_ = 0;
   int height_ = 0;
-  int max_height_ = 0;
-  int max_width_ = 0;
 };
 
 class Transforms {
@@ -184,12 +180,9 @@ class Transforms {
   void Init(const YAML::Node& node, bool to_rgb = true);
   std::shared_ptr<Transform> CreateTransform(const std::string& name);
   bool Run(cv::Mat* im, ImageBlob* data);
-  void SetPaddingSize(int max_h, int max_w);
  private:
   std::vector<std::shared_ptr<Transform>> transforms_;
   bool to_rgb_ = true;
-  int max_h_ = 0;
-  int max_w_ = 0;
 };
 
 }  // namespace PaddleX
diff --git a/deploy/cpp/src/paddlex.cpp b/deploy/cpp/src/paddlex.cpp
index 99757a5b0981ca55640dd44f02262eff1ae44585..575b9b8304353ad5c582e91ec3a7f8ae2c518ecd 100644
--- a/deploy/cpp/src/paddlex.cpp
+++ b/deploy/cpp/src/paddlex.cpp
@@ -14,7 +14,7 @@
 #include <algorithm>
 #include <omp.h>
 #include "include/paddlex/paddlex.h"
-#include <fstream>
+#include <cstring>
 namespace PaddleX {
 
 void Model::create_predictor(const std::string& model_dir,
@@ -103,7 +103,6 @@ bool Model::preprocess(const cv::Mat& input_im, ImageBlob* blob) {
   cv::Mat im = input_im.clone();
   int max_h = im.rows;
   int max_w = im.cols;
-  transforms_.SetPaddingSize(max_h, max_w);
   if (!transforms_.Run(&im, blob)) {
     return false;
   }
@@ -116,11 +115,6 @@ bool Model::preprocess(const std::vector<cv::Mat> &input_im_batch, std::vector<I
   bool success = true;
   int max_h = -1;
   int max_w = -1;
-  for(int i = 0; i < input_im_batch.size(); ++i) {
-    max_h = std::max(max_h, input_im_batch[i].rows);
-    max_w = std::max(max_w, input_im_batch[i].cols);
-  }
-  transforms_.SetPaddingSize(max_h, max_w);
   #pragma omp parallel for num_threads(batch_size)
   for(int i = 0; i < input_im_batch.size(); ++i) {
     cv::Mat im = input_im_batch[i].clone();
@@ -254,14 +248,9 @@ bool Model::predict(const cv::Mat& im, DetResult* result) {
   im_tensor->Reshape({1, 3, h, w});
   im_tensor->copy_from_cpu(inputs_.im_data_.data());
 
-  std::ofstream fout("test_single.dat", std::ios::out);
   if (name == "YOLOv3") {
     auto im_size_tensor = predictor_->GetInputTensor("im_size");
     im_size_tensor->Reshape({1, 2});
-    for(int i = 0; i < inputs_.ori_im_size_.size(); ++i) {
-      fout << inputs_.ori_im_size_[i] << " ";
-    }
-    fout << std::endl;
     im_size_tensor->copy_from_cpu(inputs_.ori_im_size_.data());
   } else if (name == "FasterRCNN" || name == "MaskRCNN") {
     auto im_info_tensor = predictor_->GetInputTensor("im_info");
@@ -294,9 +283,6 @@ bool Model::predict(const cv::Mat& im, DetResult* result) {
     std::cerr << "[WARNING] There's no object detected." << std::endl;
     return true;
   }
-  for(int i = 0; i < output_box.size(); ++i) {
-    fout << output_box[i] << " ";
-  }
   int num_boxes = size / 6;
   // 解析预测框box
   for (int i = 0; i < num_boxes; ++i) {
@@ -353,12 +339,47 @@ bool Model::predict(const std::vector<cv::Mat> &im_batch, std::vector<DetResult>
     return false;
   }
 
+  int batch_size = im_batch.size();
   // 处理输入图像
   if (!preprocess(im_batch, inputs_batch_)) {
     std::cerr << "Preprocess failed!" << std::endl;
     return false;
   }
-  int batch_size = im_batch.size();
+  // 对RCNN类模型做批量padding
+  if (batch_size > 1) {
+    if (name == "FasterRCNN" || name == "MaskRCNN") {
+      int max_h = -1;
+      int max_w = -1;
+      for(int i = 0; i < inputs_batch_.size(); ++i) {
+        max_h = std::max(max_h, inputs_batch_[i].new_im_size_[0]);
+        max_w = std::max(max_w, inputs_batch_[i].new_im_size_[1]);
+        std::cout << "(" << inputs_batch_[i].new_im_size_[0] 
+                  << ", " << inputs_batch_[i].new_im_size_[1] 
+                  <<  ")" << std::endl;
+      }
+      #pragma omp parallel for num_threads(batch_size)
+      for(int i = 0; i < inputs_batch_.size(); ++i) {
+        int h = inputs_batch_[i].new_im_size_[0];
+        int w = inputs_batch_[i].new_im_size_[1];
+        int c = im_batch[i].channels();
+        if(max_h != h || max_w != w) {
+          std::vector<float> temp_buffer(c * max_h * max_w);
+          float *temp_ptr = temp_buffer.data();
+          float *ptr = inputs_batch_[i].im_data_.data();
+          for(int cur_channel = c - 1; cur_channel >= 0; --cur_channel) {
+            int ori_pos = cur_channel * h * w + (h - 1) * w;
+            int des_pos = cur_channel * max_h * max_w + (h - 1) * max_w;
+            for(int start_pos = ori_pos; start_pos >= cur_channel * h * w; start_pos -= w, des_pos -= max_w) {
+              memcpy(temp_ptr + des_pos, ptr + start_pos, w * sizeof(float));
+            }
+          }
+          inputs_batch_[i].im_data_.swap(temp_buffer);
+          inputs_batch_[i].new_im_size_[0] = max_h;
+          inputs_batch_[i].new_im_size_[1] = max_w; 
+        }
+      }
+    }
+  }
   int h = inputs_batch_[0].new_im_size_[0];
   int w = inputs_batch_[0].new_im_size_[1];
   auto im_tensor = predictor_->GetInputTensor("image");
@@ -368,7 +389,6 @@ bool Model::predict(const std::vector<cv::Mat> &im_batch, std::vector<DetResult>
     std::copy(inputs_batch_[i].im_data_.begin(), inputs_batch_[i].im_data_.end(), inputs_data.begin() + i * 3 * h * w);
   }
   im_tensor->copy_from_cpu(inputs_data.data());
-  std::ofstream fout("test_batch.dat", std::ios::out);
   if (name == "YOLOv3") {
     auto im_size_tensor = predictor_->GetInputTensor("im_size");
     im_size_tensor->Reshape({batch_size, 2});
@@ -376,10 +396,6 @@ bool Model::predict(const std::vector<cv::Mat> &im_batch, std::vector<DetResult>
     for(int i = 0; i < inputs_batch_.size(); ++i){
       std::copy(inputs_batch_[i].ori_im_size_.begin(), inputs_batch_[i].ori_im_size_.end(), inputs_data_size.begin() + 2 * i);
     }
-    for(int i = 0; i < inputs_data_size.size(); ++i) {
-      fout << inputs_data_size[i] << " ";
-    }
-    fout << std::endl;
     im_size_tensor->copy_from_cpu(inputs_data_size.data());
   } else if (name == "FasterRCNN" || name == "MaskRCNN") {
     auto im_info_tensor = predictor_->GetInputTensor("im_info");
@@ -422,9 +438,6 @@ bool Model::predict(const std::vector<cv::Mat> &im_batch, std::vector<DetResult>
     std::cerr << "[WARNING] There's no object detected." << std::endl;
     return true;
   }
-  for(int i = 0; i < output_box.size(); ++i) {
-    fout << output_box[i] << " ";
-  }
   auto lod_vector = output_box_tensor->lod();
   int num_boxes = size / 6;
   // 解析预测框box
diff --git a/deploy/cpp/src/transforms.cpp b/deploy/cpp/src/transforms.cpp
index 27977ef44fa79b8486ef717c4ee0a8f4f653f62e..99a73ee7345bbc8cc672d1c42627a9326ded0cf7 100644
--- a/deploy/cpp/src/transforms.cpp
+++ b/deploy/cpp/src/transforms.cpp
@@ -96,10 +96,12 @@ bool Padding::Run(cv::Mat* im, ImageBlob* data) {
     padding_w = width_ - im->cols;
     padding_h = height_ - im->rows;
   } else if (coarsest_stride_ >= 1) {
+    int h = im->rows;
+    int w = im->cols;
     padding_h =
-        ceil(max_height_ * 1.0 / coarsest_stride_) * coarsest_stride_ - im->rows;
+        ceil(h * 1.0 / coarsest_stride_) * coarsest_stride_ - im->rows;
     padding_w =
-        ceil(max_width_ * 1.0 / coarsest_stride_) * coarsest_stride_ - im->cols;
+        ceil(w * 1.0 / coarsest_stride_) * coarsest_stride_ - im->cols;
   }
 
   if (padding_h < 0 || padding_w < 0) {
@@ -115,11 +117,6 @@ bool Padding::Run(cv::Mat* im, ImageBlob* data) {
   return true;
 }
 
-void Padding::SetPaddingSize(int max_h, int max_w) {
-  max_height_ = max_h;
-  max_width_ = max_w;
-}
-
 bool ResizeByLong::Run(cv::Mat* im, ImageBlob* data) {
   if (long_size_ <= 0) {
     std::cerr << "[ResizeByLong] long_size should be greater than 0"
@@ -206,7 +203,6 @@ bool Transforms::Run(cv::Mat* im, ImageBlob* data) {
   data->new_im_size_[0] = im->rows;
   data->new_im_size_[1] = im->cols;
   for (int i = 0; i < transforms_.size(); ++i) {
-    transforms_[i]->SetPaddingSize(max_h_, max_w_);
     if (!transforms_[i]->Run(im, data)) {
       std::cerr << "Apply transforms to image failed!" << std::endl;
       return false;
@@ -226,9 +222,4 @@ bool Transforms::Run(cv::Mat* im, ImageBlob* data) {
   return true;
 }
 
-void Transforms::SetPaddingSize(int max_h, int max_w) {
-  max_h_ = max_h;
-  max_w_ = max_w;
-}
-
 }  // namespace PaddleX