diff --git a/core/predictor/framework/bsf-inl.h b/core/predictor/framework/bsf-inl.h
old mode 100755
new mode 100644
index ca5fc0f8db743a25c8e0937a9589f7d651cced19..eb0a1a94e5d53d69a4d2932da19d16b1a86e3937
--- a/core/predictor/framework/bsf-inl.h
+++ b/core/predictor/framework/bsf-inl.h
@@ -86,14 +86,15 @@ bool Task<InItemT, OutItemT>::task_fetch_create(BatchTasks<TaskT>& batchTask) {
         // the lod is empty at this point.
         tensor_out.lod = batchTask._batch_out[fetchvar_index].lod;
         // resize all batch memory at one time
-
+
         size_t databuf_size = fetchvar_batch * fetchvar_bytesize_index;
-
-        void* databuf_data = MempoolWrapper::instance().malloc(databuf_size,memoryPtr);
+
+        void* databuf_data =
+            MempoolWrapper::instance().malloc(databuf_size, memoryPtr);
         paddle::PaddleBuf paddleBuf(databuf_data, databuf_size);
         tensor_out.data = paddleBuf;
-
-        //tensor_out.data.Resize(databuf_size);
+
+        // tensor_out.data.Resize(databuf_size);
       } else {
         // When taskmeta_num = 1, only one taskMeta operates on the task at a time,
         // so there is no thread-safety concern and taskMeta->task can resize->copy directly.
@@ -213,7 +214,8 @@ void TaskExecutor<TaskT>::stop() {
 template <typename TaskT>
 TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
     const void* inVectorT_ptr,
-    void* outVectorT_ptr, MempoolRegion* memoryPtr) {  // NOLINT
+    void* outVectorT_ptr,
+    MempoolRegion* memoryPtr) {  // NOLINT
   TaskT* task = butil::get_object<TaskT>();
   if (!task) {
     LOG(ERROR) << "Failed get TaskT from object pool";
@@ -240,7 +242,7 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
   task->write_fd = fds[1];
   task->owner_tid = ::syscall(SYS_gettid);
   task->memoryPtr = memoryPtr;
-  //task->_bspec_key = _bspec_key;
+  // task->_bspec_key = _bspec_key;
   task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr;
   task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr;
   if (!task->task_init()) {
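A note on the completion signaling that `schedule()` wires up above: each task carries a pipe pair (`fds[0]`/`fds[1]`), so the scheduling thread can block on the read end until a worker thread finishes the batch. Below is a minimal standalone sketch of that pattern; `TaskNotifier` and its helper names are ours, only the pipe-pair idea comes from the code above:

```cpp
#include <unistd.h>

// Hypothetical sketch of pipe-based completion signaling; not part of bsf.h.
struct TaskNotifier {
  int read_fd = -1;
  int write_fd = -1;

  bool init() {
    int fds[2];
    if (pipe(fds) != 0) return false;
    read_fd = fds[0];   // the scheduling thread blocks on this end
    write_fd = fds[1];  // the worker signals on this end
    return true;
  }

  // Worker side: one byte means "results are ready".
  void notify_done() {
    char flag = 1;
    (void)write(write_fd, &flag, sizeof(flag));
  }

  // Caller side: blocks until the worker writes.
  bool wait_done() {
    char flag = 0;
    return read(read_fd, &flag, sizeof(flag)) == sizeof(flag);
  }
};
```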
@@ -309,7 +311,7 @@ bool TaskExecutor<TaskT>::move_task_to_batch(
     }

    // combine_task_valid decides whether two tasks can be merged.
-    // Apart from the outermost shape, the inner shapes must match to merge.
+    // Apart from the outermost shape, the inner shapes must match, or padding must be allowed, to merge.
    // Otherwise break out of the loop and put the task into the next batchTask.
    // This guarantees that the tasks passed to batch.append_task(task) share the same inner shape.

    // For a feedvar with shape[0] = 1 (not equal to batch), merging keeps only one of the values,
    // so that feedvar must be equal across the tasks for the merge to happen.
    // Otherwise break out of the loop and put the task into the next batchTask.
    // PaddleTensor and PaddleBuf do not overload operator==, so raw memory is compared instead.
-    // TODO(HexToString): consider supporting AutoPadding later.
     if (previous_task != nullptr) {
-      if (!task->combine_task_valid(previous_task)) {
+      if (task->combine_task_valid(previous_task) == 0) {
         break;
       }
     }
+
+    if (batchTask.padding(task) != 2) {
+      break;
+    }
     size_t rem = batchTask.append_task(task);
     previous_task = task;
     if (task->rem <= 0) {
@@ -407,10 +412,11 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
 }
 
 template <typename InItemT, typename OutItemT>
-bool TaskManager<InItemT, OutItemT>::schedule(const void* in,
-                                              void* out, MempoolRegion* memoryPtr) {  // NOLINT
+bool TaskManager<InItemT, OutItemT>::schedule(
+    const void* in, void* out, MempoolRegion* memoryPtr) {  // NOLINT
   TaskHandler<TaskT> handler =
-      TaskExecutorVector<TaskT>::instance()[_model_index].schedule(in, out, memoryPtr);
+      TaskExecutorVector<TaskT>::instance()[_model_index].schedule(
+          in, out, memoryPtr);
 
   if (handler.valid()) {
     _task_owned = handler;
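With this change the batching loop gates each candidate task on two checks before `append_task`: the tri-state `combine_task_valid` result and the new `BatchTasks::padding` verdict. A condensed sketch of that control flow, assuming the `Task`/`BatchTasks` types from `bsf.h`; the function name and loop skeleton are ours, and the real `move_task_to_batch` also handles overrun and request splitting:

```cpp
#include <deque>

#include "core/predictor/framework/bsf.h"  // Task, BatchTasks (this repo)

// Condensed from TaskExecutor::move_task_to_batch; queue bookkeeping,
// overrun, and request splitting are omitted.
template <typename TaskT>
void fill_batch(im::bsf::BatchTasks<TaskT>& batchTask,
                std::deque<TaskT*>& pending) {
  TaskT* previous_task = nullptr;
  while (!pending.empty()) {
    TaskT* task = pending.front();
    // combine_task_valid: 0 = a shape[0]==1 feedvar differs in value, stop;
    // 1 = shapes identical; 2 = inner shapes differ, defer to padding().
    if (previous_task != nullptr &&
        task->combine_task_valid(previous_task) == 0) {
      break;
    }
    // padding: 2 = merging (padded if necessary) is worthwhile; anything
    // else means stop and leave the task for the next batch.
    if (batchTask.padding(task) != 2) {
      break;
    }
    batchTask.append_task(task);
    previous_task = task;
    pending.pop_front();
  }
}
```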
diff --git a/core/predictor/framework/bsf.h b/core/predictor/framework/bsf.h
index ed8415ccef3105f6e776a3102cae2a0c568db6d1..5af31600cde109ede056a5a6483d8aa604640dd8 100755
--- a/core/predictor/framework/bsf.h
+++ b/core/predictor/framework/bsf.h
@@ -20,7 +20,9 @@
 #include <errno.h>
 #include <deque>
 #include <vector>
-
+#include <algorithm>
+#include <functional>
+#include <numeric>
 #ifdef BCLOUD
 #include "base/atomicops.h"
 #else
@@ -38,6 +40,8 @@
 namespace im {
 namespace bsf {
 
 static const size_t DEFAULT_BATCH_SIZE = 100;
+static const size_t ABSOLUTE_ERROR = 1024;
+static const float RELATIVE_ERROR = 0.5;
 
 typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper;
 typedef baidu::paddle_serving::predictor::MempoolRegion MempoolRegion;
@@ -124,7 +128,7 @@ struct Task {
     outLodTensorVector.clear();
   }
 
-  void clear(){
+  void clear() {
     read_fd = -1;
     write_fd = -1;
     owner_tid = -1;
@@ -158,13 +162,18 @@ struct Task {
     return 1;
   }
 
-  bool combine_task_valid(Task* other_task) {
-    // TODO(HexToString): auto-padding
-    // Apart from the outermost shape, the inner shapes must match to merge.
+  int combine_task_valid(Task* other_task) {
+    // Apart from the outermost shape, the inner shapes must match, or padding
+    // must be allowed, for the tasks to merge.
     // Otherwise break out of the loop and put the task into the next batchTask.
+    // When the inner shapes differ, padding is not decided here; return 2 and
+    // let the batchTask level decide.
     // This guarantees the tasks in batch.append_task(task) share an inner shape.
+
+    // return 0: a feedvar with shape[0] = 1 (not equal to batch) holds
+    // different values in the two tasks, so they cannot be merged.
+    // return 1: the shapes match exactly; merge directly.
+    // return 2: the shapes do not fully match; a further check decides.
     if (other_task->feedvar_shape_nobatch() != feedvar_shape_nobatch()) {
-      return false;
+      return 2;
     }
 
     // For a feedvar with shape[0] = 1 (not equal to batch), merging keeps one of the values,
@@ -177,9 +186,9 @@ struct Task {
           std::memcmp((*inVectorT_ptr)[feedvar_index].data.data(),
                       (*(other_task->inVectorT_ptr))[feedvar_index].data.data(),
                       (*inVectorT_ptr)[feedvar_index].data.length());
-      if (result != 0) return false;
+      if (result != 0) return 0;
     }
-    return true;
+    return 1;
   }
 
   size_t feedvar_batch_size(size_t feedvar_index) {
@@ -373,11 +382,12 @@ struct Task {
       // Grow the PaddleTensor's data and lod in one go.
       paddle::PaddleTensor& fetchVarTensor = (*outVectorT_ptr)[feedvar_index];
       fetchVarTensor.shape[0] = total_shape0;
-      void* databuf_data = MempoolWrapper::instance().malloc(data_length,memoryPtr);
+      void* databuf_data =
+          MempoolWrapper::instance().malloc(data_length, memoryPtr);
       paddle::PaddleBuf paddleBuf(databuf_data, data_length);
       fetchVarTensor.data = paddleBuf;
-
-      //fetchVarTensor.data.Resize(data_length);
+
+      // fetchVarTensor.data.Resize(data_length);
       // pad the task's lod with a leading 0
       if (fetchVarTensor.lod.size() <= 0) {
         fetchVarTensor.lod.push_back({0});
@@ -393,7 +403,7 @@ struct Task {
       size_t once_lod_length = 0;
       for (size_t taskmeta_index = 0; taskmeta_index < total_taskmeta_num;
            ++taskmeta_index) {
-        //process data
+        // process data
         void* dst_ptr = fetchVarTensor.data.data() + data_length_offset;
         void* source_ptr =
             outLodTensorVector[taskmeta_index][index].data.data();
@@ -401,7 +411,7 @@ struct Task {
         size_t once_data_length =
             outLodTensorVector[taskmeta_index][index].data.length();
         memcpy(dst_ptr, source_ptr, once_data_length);
         data_length_offset += once_data_length;
-        //process lod
+        // process lod
         size_t last_lod_value = fetchVarTensor.lod[0][lod_length_offset];
         once_lod_length =
             outLodTensorVector[taskmeta_index][index].lod[0].size();
@@ -412,7 +422,6 @@ struct Task {
               outLodTensorVector[taskmeta_index][index].lod[0][once_index];
           lod_length_offset++;
         }
-
       }
     }
   }
@@ -496,11 +505,15 @@ class BatchTasks {
   explicit BatchTasks(size_t batch_size,
                       bool overrun = false,
-                      bool allow_split_request = true)
+                      bool allow_split_request = true,
+                      bool auto_padding = true,
+                      int padding_value = 0)
       : _batch_size(batch_size),
         _rem_size(batch_size),
         _overrun(overrun),
-        _allow_split_request(allow_split_request) {
+        _allow_split_request(allow_split_request),
+        _auto_padding(auto_padding),
+        _padding_value(padding_value) {
     _batch_in.clear();
     _batch_in_offset.clear();
     _total_shape0_batch_in.clear();
@@ -530,6 +543,71 @@ class BatchTasks {
     vector_fetch_lod_index.clear();
   }
 
+  // return 0: the number of feedvars differs, or some feedvar's shape rank
+  // differs; the batch cannot be merged.
+  // return 1: merging the batch is not worth it.
+  // return 2: merging the batch is worth it.
+  int padding(TaskT* task) {
+    const VectorOfShapeVector& task_vector_shape =
+        task->feedvar_shape_nobatch();
+    int return_value = 2;
+
+    // When the batchTask is empty, the first task added defines the
+    // batchTask's shapes.
+    if (vector_of_max_shape.size() == 0) {
+      vector_of_max_shape = task_vector_shape;
+      return 2;
+    }
+
+    if (vector_of_max_shape.size() != task_vector_shape.size()) {
+      return 0;
+    }
+
+    // When the two shapes are identical there is nothing to update or
+    // compute, and no padding is needed.
+    if (vector_of_max_shape == task_vector_shape) {
+      return 2;
+    }
+
+    std::vector<size_t> multiplies_1(vector_of_max_shape.size());
+    std::vector<size_t> multiplies_2(vector_of_max_shape.size());
+    std::vector<size_t> temp_multiplies(vector_of_max_shape.size());
+    VectorOfShapeVector temp_vector_max_shape(vector_of_max_shape.size());
+    for (size_t i = 0; i < vector_of_max_shape.size(); ++i) {
+      if (vector_of_max_shape[i].size() != task_vector_shape[i].size())
+        return 0;
+      for (size_t j = 0; j < vector_of_max_shape[i].size(); ++j) {
+        temp_vector_max_shape[i].push_back(
+            std::max(vector_of_max_shape[i][j], task_vector_shape[i][j]));
+      }
+      temp_multiplies[i] = std::accumulate(temp_vector_max_shape[i].begin(),
+                                           temp_vector_max_shape[i].end(),
+                                           size_t(1),
+                                           std::multiplies<size_t>());
+      multiplies_1[i] = std::accumulate(vector_of_max_shape[i].begin(),
+                                        vector_of_max_shape[i].end(),
+                                        size_t(1),
+                                        std::multiplies<size_t>());
+      multiplies_2[i] = std::accumulate(task_vector_shape[i].begin(),
+                                        task_vector_shape[i].end(),
+                                        size_t(1),
+                                        std::multiplies<size_t>());
+      // temp_multiplies[i] is the largest of the three products, so the
+      // differences below cannot underflow; the ratios need a float cast so
+      // that integer division does not make them trivially >= 1.
+      if ((temp_multiplies[i] - multiplies_1[i] <= ABSOLUTE_ERROR &&
+           temp_multiplies[i] - multiplies_2[i] <= ABSOLUTE_ERROR) ||
+          (static_cast<float>(multiplies_1[i]) / temp_multiplies[i] >=
+               RELATIVE_ERROR &&
+           static_cast<float>(multiplies_2[i]) / temp_multiplies[i] >=
+               RELATIVE_ERROR)) {
+        continue;
+      } else {
+        return_value = 1;
+      }
+    }
+
+    // When the batch is merged, the batchTask's max shapes must be updated;
+    // padding is then applied when this batchTask finally merges its tasks.
+    if (return_value == 2) {
+      vector_of_max_shape = temp_vector_max_shape;
+    }
+    return return_value;
+  }
 
   // synchronized operation
   // because Upper level callers of this function have already locked.
   // Every task that reaches this function is of the same kind; this was guaranteed before the call.
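The cost model in `padding()` reduces to simple volume arithmetic. Here is a standalone sketch under the same two thresholds (the function and constant names are ours): two inner shapes are padded to their element-wise maximum, and the merge is accepted if the wasted volume stays within 1024 elements, or if each original volume is at least half of the padded volume. For example, [400, 400] against [500, 500] pads to [500, 500], and 160000 / 250000 = 0.64 >= 0.5, so that merge is accepted.

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

static const size_t kAbsoluteError = 1024;  // mirrors ABSOLUTE_ERROR
static const float kRelativeError = 0.5f;   // mirrors RELATIVE_ERROR

// Decide whether two inner shapes (batch dim excluded) are worth padding to
// a common maximum shape and merging into one batch.
bool merge_is_worthwhile(const std::vector<size_t>& a,
                         const std::vector<size_t>& b) {
  if (a.size() != b.size()) return false;  // ranks must match
  std::vector<size_t> padded(a.size());
  for (size_t j = 0; j < a.size(); ++j) padded[j] = std::max(a[j], b[j]);
  auto volume = [](const std::vector<size_t>& s) {
    return std::accumulate(s.begin(), s.end(), size_t(1),
                           std::multiplies<size_t>());
  };
  size_t va = volume(a), vb = volume(b), vp = volume(padded);
  // Criterion 2: absolute waste bounded by kAbsoluteError elements.
  bool absolute_ok = (vp - va <= kAbsoluteError) && (vp - vb <= kAbsoluteError);
  // Criterion 1: each shape keeps at least kRelativeError of the padded volume.
  bool relative_ok = static_cast<float>(va) / vp >= kRelativeError &&
                     static_cast<float>(vb) / vp >= kRelativeError;
  return absolute_ok || relative_ok;
}
```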
@@ -545,8 +623,9 @@ class BatchTasks {
     TaskMetaT tm(task, start_index, add, task->taskmeta_num);
     task->rem -= add;
     _rem_size -= add;
-    if(task->taskmeta_num == 0){
-      task->total_taskmeta_num = 1 + (task->rem + _batch_size - 1)/_batch_size;
+    if (task->taskmeta_num == 0) {
+      task->total_taskmeta_num =
+          1 + (task->rem + _batch_size - 1) / _batch_size;
     }
     task->taskmeta_num += 1;
     _taskmeta_vector.push_back(tm);
@@ -619,7 +698,7 @@ class BatchTasks {
 
   // batch.merge_tasks() is thread-safe function
   // cause batch is a local variable and Task is just read, not written.
   void merge_tasks() {
     if (_taskmeta_vector.size() <= 0) {
       return;
     }
@@ -631,19 +710,28 @@ class BatchTasks {
       const paddle::PaddleTensor& feedVarTensor =
           (*tm.task->inVectorT_ptr)[feedvar_index];
       size_t feedvar_bytesize = tm.task->feedvar_bytesize(feedvar_index);
+      const ShapeVector& feedvar_max_shape_vector =
+          vector_of_max_shape[feedvar_index];
+      size_t feedvar_max_num =
+          std::accumulate(feedvar_max_shape_vector.begin(),
+                          feedvar_max_shape_vector.end(),
+                          size_t(1),
+                          std::multiplies<size_t>());
+      size_t feedvar_element_bytesize =
+          tm.task->feedvar_element_bytesize(feedvar_index);
+      size_t feedvar_max_bytes = feedvar_element_bytesize * feedvar_max_num;
 
       if (ti == 0) {
         // Create the entire tensor at once
-        // for now, we assume that every task feedvar_bytesize is the same.
-        // which means we dont support auto embedding.
-        // but for different feedvar, it is different.
         paddle::PaddleTensor paddleTensor;
         paddleTensor.dtype = feedVarTensor.dtype;
         paddleTensor.name = feedVarTensor.name;
         paddleTensor.lod = _batch_in_lod[feedvar_index];
-        paddleTensor.shape = feedVarTensor.shape;
-        paddleTensor.shape[0] = _total_shape0_batch_in[feedvar_index];
-        size_t databuf_size = feedvar_bytesize * _total_shape0_batch_in[feedvar_index];
+        paddleTensor.shape = feedvar_max_shape_vector;
+        paddleTensor.shape.insert(paddleTensor.shape.begin(),
+                                  _total_shape0_batch_in[feedvar_index]);
+        size_t databuf_size =
+            feedvar_max_bytes * _total_shape0_batch_in[feedvar_index];
         void* databuf_data = MempoolWrapper::instance().malloc(databuf_size);
         paddle::PaddleBuf paddleBuf(databuf_data, databuf_size);
         paddleTensor.data = paddleBuf;
@@ -656,12 +744,243 @@ class BatchTasks {
           feedVarTensor.data.data() +
           feedvar_bytesize * tm.feed_shape0_range[feedvar_index][0];
       size_t length =
-          feedvar_bytesize * (tm.feed_shape0_range[feedvar_index][1] -
-                              tm.feed_shape0_range[feedvar_index][0]);
-      memcpy(dst_ptr, source_ptr, length);
+          feedvar_max_bytes * (tm.feed_shape0_range[feedvar_index][1] -
+                               tm.feed_shape0_range[feedvar_index][0]);
+
+      // When no padding is needed the memory is contiguous: memcpy directly.
+      // Comparing byte sizes here is sufficient, because padding() above has
+      // already made every dimension in vector_of_max_shape the element-wise
+      // maximum. E.g. for shape-1 = [8000, 20000] and shape-2 = [20000, 8000]
+      // the max shape is [20000, 20000], so feedvar_max_bytes ==
+      // feedvar_bytesize can only hold when the shapes are identical.
+      if (feedvar_max_bytes == feedvar_bytesize) {
+        memcpy(dst_ptr, source_ptr, length);
+      } else {
+        memset(dst_ptr, 0, length);
+        size_t old_index = 0;
+        size_t new_index = 0;
+        // i_0 counts rows relative to this taskmeta's range: source_ptr and
+        // dst_ptr already point at the start of the range.
+        size_t rows = tm.feed_shape0_range[feedvar_index][1] -
+                      tm.feed_shape0_range[feedvar_index][0];
+
+        switch (feedvar_max_shape_vector.size()) {
+          case 5:
+            for (size_t i_0 = 0; i_0 < rows; ++i_0) {
+              for (int i_1 = 0; i_1 < feedVarTensor.shape[1]; ++i_1) {
+                for (int i_2 = 0; i_2 < feedVarTensor.shape[2]; ++i_2) {
+                  for (int i_3 = 0; i_3 < feedVarTensor.shape[3]; ++i_3) {
+                    for (int i_4 = 0; i_4 < feedVarTensor.shape[4]; ++i_4) {
+                      for (int i_5 = 0; i_5 < feedVarTensor.shape[5]; ++i_5) {
+                        old_index = i_0 * feedVarTensor.shape[1] *
+                                        feedVarTensor.shape[2] *
+                                        feedVarTensor.shape[3] *
+                                        feedVarTensor.shape[4] *
+                                        feedVarTensor.shape[5] +
+                                    i_1 * feedVarTensor.shape[2] *
+                                        feedVarTensor.shape[3] *
+                                        feedVarTensor.shape[4] *
+                                        feedVarTensor.shape[5] +
+                                    i_2 * feedVarTensor.shape[3] *
+                                        feedVarTensor.shape[4] *
+                                        feedVarTensor.shape[5] +
+                                    i_3 * feedVarTensor.shape[4] *
+                                        feedVarTensor.shape[5] +
+                                    i_4 * feedVarTensor.shape[5] + i_5;
+                        new_index = i_0 * feedvar_max_shape_vector[0] *
+                                        feedvar_max_shape_vector[1] *
+                                        feedvar_max_shape_vector[2] *
+                                        feedvar_max_shape_vector[3] *
+                                        feedvar_max_shape_vector[4] +
+                                    i_1 * feedvar_max_shape_vector[1] *
+                                        feedvar_max_shape_vector[2] *
+                                        feedvar_max_shape_vector[3] *
+                                        feedvar_max_shape_vector[4] +
+                                    i_2 * feedvar_max_shape_vector[2] *
+                                        feedvar_max_shape_vector[3] *
+                                        feedvar_max_shape_vector[4] +
+                                    i_3 * feedvar_max_shape_vector[3] *
+                                        feedvar_max_shape_vector[4] +
+                                    i_4 * feedvar_max_shape_vector[4] + i_5;
+                        if (feedVarTensor.dtype ==
+                            paddle::PaddleDType::INT64) {
+                          *((int64_t*)dst_ptr + new_index) =
+                              *((int64_t*)source_ptr + old_index);
+                        } else if (feedVarTensor.dtype ==
+                                   paddle::PaddleDType::FLOAT32) {
+                          *((float*)dst_ptr + new_index) =
+                              *((float*)source_ptr + old_index);
+                        } else if (feedVarTensor.dtype ==
+                                   paddle::PaddleDType::INT32) {
+                          *((int32_t*)dst_ptr + new_index) =
+                              *((int32_t*)source_ptr + old_index);
+                        } else if (feedVarTensor.dtype ==
+                                   paddle::PaddleDType::UINT8) {
+                          *((char*)dst_ptr + new_index) =
+                              *((char*)source_ptr + old_index);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+            break;
+          case 4:
+            for (size_t i_0 = 0; i_0 < rows; ++i_0) {
+              for (int i_1 = 0; i_1 < feedVarTensor.shape[1]; ++i_1) {
+                for (int i_2 = 0; i_2 < feedVarTensor.shape[2]; ++i_2) {
+                  for (int i_3 = 0; i_3 < feedVarTensor.shape[3]; ++i_3) {
+                    for (int i_4 = 0; i_4 < feedVarTensor.shape[4]; ++i_4) {
+                      old_index = i_0 * feedVarTensor.shape[1] *
+                                      feedVarTensor.shape[2] *
+                                      feedVarTensor.shape[3] *
+                                      feedVarTensor.shape[4] +
+                                  i_1 * feedVarTensor.shape[2] *
+                                      feedVarTensor.shape[3] *
+                                      feedVarTensor.shape[4] +
+                                  i_2 * feedVarTensor.shape[3] *
+                                      feedVarTensor.shape[4] +
+                                  i_3 * feedVarTensor.shape[4] + i_4;
+                      new_index = i_0 * feedvar_max_shape_vector[0] *
+                                      feedvar_max_shape_vector[1] *
+                                      feedvar_max_shape_vector[2] *
+                                      feedvar_max_shape_vector[3] +
+                                  i_1 * feedvar_max_shape_vector[1] *
+                                      feedvar_max_shape_vector[2] *
+                                      feedvar_max_shape_vector[3] +
+                                  i_2 * feedvar_max_shape_vector[2] *
+                                      feedvar_max_shape_vector[3] +
+                                  i_3 * feedvar_max_shape_vector[3] + i_4;
+                      if (feedVarTensor.dtype == paddle::PaddleDType::INT64) {
+                        *((int64_t*)dst_ptr + new_index) =
+                            *((int64_t*)source_ptr + old_index);
+                      } else if (feedVarTensor.dtype ==
+                                 paddle::PaddleDType::FLOAT32) {
+                        *((float*)dst_ptr + new_index) =
+                            *((float*)source_ptr + old_index);
+                      } else if (feedVarTensor.dtype ==
+                                 paddle::PaddleDType::INT32) {
+                        *((int32_t*)dst_ptr + new_index) =
+                            *((int32_t*)source_ptr + old_index);
+                      } else if (feedVarTensor.dtype ==
+                                 paddle::PaddleDType::UINT8) {
+                        *((char*)dst_ptr + new_index) =
+                            *((char*)source_ptr + old_index);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+            break;
+          case 3:
+            for (size_t i_0 = 0; i_0 < rows; ++i_0) {
+              for (int i_1 = 0; i_1 < feedVarTensor.shape[1]; ++i_1) {
+                for (int i_2 = 0; i_2 < feedVarTensor.shape[2]; ++i_2) {
+                  for (int i_3 = 0; i_3 < feedVarTensor.shape[3]; ++i_3) {
+                    old_index = i_0 * feedVarTensor.shape[1] *
+                                    feedVarTensor.shape[2] *
+                                    feedVarTensor.shape[3] +
+                                i_1 * feedVarTensor.shape[2] *
+                                    feedVarTensor.shape[3] +
+                                i_2 * feedVarTensor.shape[3] + i_3;
+                    new_index = i_0 * feedvar_max_shape_vector[0] *
+                                    feedvar_max_shape_vector[1] *
+                                    feedvar_max_shape_vector[2] +
+                                i_1 * feedvar_max_shape_vector[1] *
+                                    feedvar_max_shape_vector[2] +
+                                i_2 * feedvar_max_shape_vector[2] + i_3;
+                    if (feedVarTensor.dtype == paddle::PaddleDType::INT64) {
+                      *((int64_t*)dst_ptr + new_index) =
+                          *((int64_t*)source_ptr + old_index);
+                    } else if (feedVarTensor.dtype ==
+                               paddle::PaddleDType::FLOAT32) {
+                      *((float*)dst_ptr + new_index) =
+                          *((float*)source_ptr + old_index);
+                    } else if (feedVarTensor.dtype ==
+                               paddle::PaddleDType::INT32) {
+                      *((int32_t*)dst_ptr + new_index) =
+                          *((int32_t*)source_ptr + old_index);
+                    } else if (feedVarTensor.dtype ==
+                               paddle::PaddleDType::UINT8) {
+                      *((char*)dst_ptr + new_index) =
+                          *((char*)source_ptr + old_index);
+                    }
+                  }
+                }
+              }
+            }
+            break;
+          case 2:
+            for (size_t i_0 = 0; i_0 < rows; ++i_0) {
+              for (int i_1 = 0; i_1 < feedVarTensor.shape[1]; ++i_1) {
+                for (int i_2 = 0; i_2 < feedVarTensor.shape[2]; ++i_2) {
+                  old_index =
+                      i_0 * feedVarTensor.shape[1] * feedVarTensor.shape[2] +
+                      i_1 * feedVarTensor.shape[2] + i_2;
+                  new_index = i_0 * feedvar_max_shape_vector[0] *
+                                  feedvar_max_shape_vector[1] +
+                              i_1 * feedvar_max_shape_vector[1] + i_2;
+                  if (feedVarTensor.dtype == paddle::PaddleDType::INT64) {
+                    *((int64_t*)dst_ptr + new_index) =
+                        *((int64_t*)source_ptr + old_index);
+                  } else if (feedVarTensor.dtype ==
+                             paddle::PaddleDType::FLOAT32) {
+                    *((float*)dst_ptr + new_index) =
+                        *((float*)source_ptr + old_index);
+                  } else if (feedVarTensor.dtype ==
+                             paddle::PaddleDType::INT32) {
+                    *((int32_t*)dst_ptr + new_index) =
+                        *((int32_t*)source_ptr + old_index);
+                  } else if (feedVarTensor.dtype ==
+                             paddle::PaddleDType::UINT8) {
+                    *((char*)dst_ptr + new_index) =
+                        *((char*)source_ptr + old_index);
+                  }
+                }
+              }
+            }
+            break;
+          case 1:
+            for (size_t i_0 = 0; i_0 < rows; ++i_0) {
+              for (int i_1 = 0; i_1 < feedVarTensor.shape[1]; ++i_1) {
+                old_index = i_0 * feedVarTensor.shape[1] + i_1;
+                new_index = i_0 * feedvar_max_shape_vector[0] + i_1;
+                if (feedVarTensor.dtype == paddle::PaddleDType::INT64) {
+                  *((int64_t*)dst_ptr + new_index) =
+                      *((int64_t*)source_ptr + old_index);
+                } else if (feedVarTensor.dtype ==
+                           paddle::PaddleDType::FLOAT32) {
+                  *((float*)dst_ptr + new_index) =
+                      *((float*)source_ptr + old_index);
+                } else if (feedVarTensor.dtype ==
+                           paddle::PaddleDType::INT32) {
+                  *((int32_t*)dst_ptr + new_index) =
+                      *((int32_t*)source_ptr + old_index);
+                } else if (feedVarTensor.dtype ==
+                           paddle::PaddleDType::UINT8) {
+                  *((char*)dst_ptr + new_index) =
+                      *((char*)source_ptr + old_index);
+                }
+              }
+            }
+            break;
+          default:
+            break;
+        }
+      }
+
       // nobatch-type feedvars do not accumulate.
-      if (tm.feedvar_type[feedvar_index] != 3)
+      if (tm.feedvar_type[feedvar_index] != 3) {
         _batch_in_offset[feedvar_index] += length;
+      }
     }
   }
 }
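The switch above unrolls the pad-copy for ranks 1 through 5 with explicit index sums. For reference, the same effect can be obtained rank-generically by computing row-major strides once; the sums in each case are exactly the sum of i_d times stride_d. A sketch (the helper name and signature are ours, not part of bsf.h):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <vector>

// Copy `rows` rows shaped `src_shape` (per row, batch dim excluded) into a
// buffer laid out for `dst_shape`, where dst_shape[d] >= src_shape[d]. The
// padded tail of each dimension is assumed pre-zeroed (memset, as above).
inline void pad_copy_rows(const char* src,
                          char* dst,
                          size_t rows,
                          const std::vector<size_t>& src_shape,
                          const std::vector<size_t>& dst_shape,
                          size_t element_bytes) {
  size_t rank = src_shape.size();
  if (rank == 0 || rows == 0) return;
  // Row-major strides, in elements, within one row.
  std::vector<size_t> src_stride(rank, 1);
  std::vector<size_t> dst_stride(rank, 1);
  for (size_t d = rank - 1; d-- > 0;) {
    src_stride[d] = src_stride[d + 1] * src_shape[d + 1];
    dst_stride[d] = dst_stride[d + 1] * dst_shape[d + 1];
  }
  size_t src_row = src_stride[0] * src_shape[0];  // elements per source row
  size_t dst_row = dst_stride[0] * dst_shape[0];  // elements per padded row
  std::vector<size_t> idx(rank, 0);
  for (size_t r = 0; r < rows; ++r) {
    std::fill(idx.begin(), idx.end(), 0);
    bool done = false;
    while (!done) {
      // old_index / new_index in the switch above are these two sums.
      size_t soff = 0;
      size_t doff = 0;
      for (size_t d = 0; d < rank; ++d) {
        soff += idx[d] * src_stride[d];
        doff += idx[d] * dst_stride[d];
      }
      std::memcpy(dst + (r * dst_row + doff) * element_bytes,
                  src + (r * src_row + soff) * element_bytes,
                  element_bytes);
      // Odometer-style increment over the source shape.
      done = true;
      for (size_t d = rank; d-- > 0;) {
        if (++idx[d] < src_shape[d]) {
          done = false;
          break;
        }
        idx[d] = 0;
      }
    }
  }
}
```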
@@ -753,25 +1072,26 @@ class BatchTasks {
     // At this point it cannot be told whether the fetchvar is naturally nobatch, so set_fetch_nobatch_index may miss it.
     // Hopefully the two cases can be distinguished elsewhere later.
     if (fetchvar_batch_size(fetchvar_index) != _total_fetch_batch) {
-      if(fetchvar_batch_size(fetchvar_index) <= 0){
+      if (fetchvar_batch_size(fetchvar_index) <= 0) {
         // which means error.
         return false;
-      }else if(fetchvar_batch_size(fetchvar_index) == 1){
+      } else if (fetchvar_batch_size(fetchvar_index) == 1) {
         // which means fetchvar shape[0] = 1.
         // shape[0] does not change with batch
         set_fetch_nobatch_index.insert(fetchvar_index);
         _total_fetch_batch =
             std::max(fetchvar_batch_size(fetchvar_index), _total_fetch_batch);
-      }else if(_total_fetch_batch == 1){
-        // This means every previous fetchvar had shape[0] = 1,
-        // while the current fetchvar has shape[0] > 1,
-        // so all of the previous fetchvars are no_batch.
-        for(size_t temp_index = fetchvar_index-1; temp_index >= 0; --temp_index){
-          set_fetch_nobatch_index.insert(fetchvar_index);
-        }
+      } else if (_total_fetch_batch == 1) {
+        // This means every previous fetchvar had shape[0] = 1,
+        // while the current fetchvar has shape[0] > 1,
+        // so all of the previous fetchvars are no_batch.
+        for (size_t temp_index = 0; temp_index < fetchvar_index;
+             ++temp_index) {
+          set_fetch_nobatch_index.insert(temp_index);
+        }
         _total_fetch_batch =
             std::max(fetchvar_batch_size(fetchvar_index), _total_fetch_batch);
-      }else{
+      } else {
         // which means error.
         return false;
       }
@@ -856,10 +1176,11 @@ class BatchTasks {
           fetchVarTensor.shape[0] = shape0_length;
           fetch_lod_index++;
 
-          void* databuf_data = MempoolWrapper::instance().malloc(length,task->memoryPtr);
+          void* databuf_data =
+              MempoolWrapper::instance().malloc(length, task->memoryPtr);
           paddle::PaddleBuf paddleBuf(databuf_data, length);
           fetchVarTensor.data = paddleBuf;
-          //fetchVarTensor.data.Resize(length);
+          // fetchVarTensor.data.Resize(length);
           void* dst_ptr = fetchVarTensor.data.data();
           void* source_ptr = _batch_out[fetchvar_index].data.data() +
                              shape0_index_start * fetchvar_bytesize_index;
@@ -885,12 +1206,13 @@ class BatchTasks {
               (*task->outVectorT_ptr)[fetchvar_index];
           size_t length = fetchvar_bytesize_index * shape0_length;
           fetchVarTensor.shape[0] = shape0_length;
-
-          void* databuf_data = MempoolWrapper::instance().malloc(length,task->memoryPtr);
+
+          void* databuf_data =
+              MempoolWrapper::instance().malloc(length, task->memoryPtr);
           paddle::PaddleBuf paddleBuf(databuf_data, length);
           fetchVarTensor.data = paddleBuf;
-
-          //fetchVarTensor.data.Resize(length);
+
+          // fetchVarTensor.data.Resize(length);
           void* dst_ptr = fetchVarTensor.data.data();
           void* source_ptr = _batch_out[fetchvar_index].data.data() +
                              shape0_index_start * fetchvar_bytesize_index;
@@ -979,10 +1301,26 @@ class BatchTasks {
   std::set<size_t> set_fetch_nobatch_index;
   std::vector<size_t> vector_fetch_lod_index;
 
   size_t _rem_size;
   size_t _batch_size;
   bool _overrun;
   bool _allow_split_request;
+
+  // The current per-feedvar maximum shapes in this BatchTask.
+  VectorOfShapeVector vector_of_max_shape;
+  // AutoPadding compares this against the TaskMeta of the next batch to be
+  // merged to decide whether merging is worthwhile. There are two criteria,
+  // and satisfying either one allows the merge:
+  // 1. the product of the per-dimension similarity ratios is at least 50%;
+  // 2. the absolute difference is less than 1024 elements.
+  // E.g. Shape-1 = [batch, 500, 500] and Shape-2 = [batch, 400, 400]:
+  // the absolute difference is 90000 elements and the similarity is
+  // 0.8 * 0.8 = 0.64, so criterion 1 holds and criterion 2 does not.
+  // E.g. Shape-1 = [batch, 1, 1] and Shape-2 = [batch, 2, 2]:
+  // the absolute difference is 3 elements and the similarity is
+  // 0.5 * 0.5 = 0.25, so criterion 2 holds and criterion 1 does not.
+  // AutoPadding applies in both cases.
+  bool _auto_padding;
+  int _padding_value;
 };
 
 // BSF task handle
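The two worked examples in the member comments above reduce to plain arithmetic against ABSOLUTE_ERROR (1024 elements) and RELATIVE_ERROR (0.5). A tiny standalone sanity check (constant names are ours):

```cpp
#include <cassert>

// Re-checking the two AutoPadding examples from the comments above.
int main() {
  const int absolute_error = 1024;   // mirrors ABSOLUTE_ERROR
  const float relative_error = 0.5f; // mirrors RELATIVE_ERROR

  // [500, 500] vs [400, 400]: padded volume 250000, smaller volume 160000.
  assert(250000 - 160000 == 90000);                 // waste is 90000 elements
  assert(250000 - 160000 > absolute_error);         // absolute bound fails
  assert(160000.0f / 250000.0f >= relative_error);  // 0.64 >= 0.5 holds

  // [1, 1] vs [2, 2]: padded volume 4, smaller volume 1.
  assert(4 - 1 <= absolute_error);                  // 3 <= 1024 holds
  assert(1.0f / 4.0f < relative_error);             // 0.25 < 0.5 fails
  return 0;
}
```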
diff --git a/doc/C++_Serving/2+_model.md b/doc/C++_Serving/2+_model.md
index 25065b818fdea818a129744646ddc02931d7f3f9..226f4973e2494b570f8710c75388d2e458fc024e 100755
--- a/doc/C++_Serving/2+_model.md
+++ b/doc/C++_Serving/2+_model.md
@@ -220,4 +220,4 @@ python3 自定义.py ocr_det_client ocr_rec_client
 #ocr_det_client is the relative path of the first model's client-side proto directory
 #ocr_rec_client is the relative path of the second model's client-side proto directory
 ```
-At this point, the server's input data format matches the definition in the first model's client-side proto, and its output data format matches the last model's client-side proto file. Normally you do not need to worry about this; for the detailed [proto definition, see here](../Serving_Configure_CN.md).
+At this point, the server's input data format matches the definition in the first model's client-side proto, and its output data format matches the last model's client-side proto file. Normally you do not need to worry about this; for the detailed [proto definition, see here](./Serving_Configure_CN.md).
diff --git a/doc/C++_Serving/DAG_CN.md b/doc/C++_Serving/DAG_CN.md
index dd076cd80ab24a0f33383eb341c4a95cd38a2675..fd4d043bf8ac513cb2d418111833403ff896d321 100755
--- a/doc/C++_Serving/DAG_CN.md
+++ b/doc/C++_Serving/DAG_CN.md
@@ -40,7 +40,7 @@
 op_seq_maker.add_op(general_infer_op)
 op_seq_maker.add_op(general_response_op)
 ```
 
-If you start the C++ server with `the command line + a configuration file`, you only need to [edit the configuration file](../Serving_Configure_CN.md); none of the 👆 code needs to change.
+If you start the C++ server with `the command line + a configuration file`, you only need to [edit the configuration file](./Serving_Configure_CN.md); none of the 👆 code needs to change.
 
 For simple serial logic, we simplify it to a `Sequence` and build it with `OpSeqMaker`. You may omit each node's predecessor; by default the predecessor follows the order in which ops join `OpSeqMaker`.
diff --git a/doc/C++_Serving/DAG_EN.md b/doc/C++_Serving/DAG_EN.md
index a45707344f3e1cd181a0deab2cf976a3f2c15971..6f8b9c2543b6d3488466787e13a6223fcf3e2ff3 100755
--- a/doc/C++_Serving/DAG_EN.md
+++ b/doc/C++_Serving/DAG_EN.md
@@ -39,7 +39,7 @@
 op_seq_maker.add_op(general_infer_op)
 op_seq_maker.add_op(general_response_op)
 ```
 
-If you use `the command line + configuration file method to start C++ server`, you only need to modify [the configuration file](../Serving_Configure_CN.md), don`t need to change any line of 👆 code.
+If you use `the command line + configuration file method to start C++ server`, you only need to modify [the configuration file](./Serving_Configure_CN.md) and don't need to change any line of the 👆 code.
 
 For simple serial logic, we simplify it and build it with `OpSeqMaker`. If you do not specify each node's predecessor, it defaults to the order in which the ops join `OpSeqMaker`.