diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100755
new mode 100644
index 4cb661faf996bc32424f88103f238088efd08520..cad0bb5bc638e08bd05a573fe548c7a81323435c
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,7 @@ find_package(Threads REQUIRED)
 find_package(CUDA QUIET)
 
 include(simd)
-
+# SET(CMAKE_BUILD_TYPE "Debug")
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
diff --git a/README.md b/README.md
index ab6b1c0148315f2d19838b67a84cc732f175c944..6c6d0924bf44137dc463fb68599713835d4cb0f2 100644
--- a/README.md
+++ b/README.md
@@ -175,9 +175,12 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
 
 | Argument                                       | Type | Default | Description                                           |
 | ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
-| `thread`                                       | int  | `4`     | Concurrency of current service                        |
+| `thread`                                       | int  | `2`     | Number of brpc service threads                        |
+| `op_num`                                       | int[]| `0`     | Number of threads for each model in asynchronous mode |
+| `op_max_batch`                                 | int[]| `32`    | Batch size for each model in asynchronous mode        |
+| `gpu_ids`                                      | str[]| `"-1"`  | GPU card IDs for each model                           |
 | `port`                                         | int  | `9292`  | Exposed port of current service to users              |
-| `model`                                        | str  | `""`    | Path of paddle model directory to be served           |
+| `model`                                        | str[]| `""`    | Paths of paddle model directories to be served        |
 | `mem_optim_off`                                | -    | -       | Disable memory / graphic memory optimization          |
 | `ir_optim`                                     | bool | False   | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version)               | -    | -       | Run inference with MKL                                |
@@ -186,7 +189,24 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
 | `use_xpu`                                      | -    | -       | Run PaddleLite inference with Baidu Kunlun XPU        |
 | `precision`                                    | str  | FP32    | Precision Mode, support FP32, FP16, INT8              |
 | `use_calib`                                    | bool | False   | Only for deployment with TensorRT                     |
-
+| `gpu_multi_stream`                             | bool | False   | Enable GPU multi-stream to achieve higher QPS         |
+
+#### Description of asynchronous mode
+
+Asynchronous mode is suitable for two scenarios: 1) the number of requests is very large, and 2) multiple models are chained together and you want to specify the concurrency of each model separately.
+
+Asynchronous mode helps to improve the throughput (QPS) of the service, but the latency of a single request increases slightly.
+
+In asynchronous mode, each model starts the number of threads you specify, and each thread holds one model instance. In other words, each model is equivalent to a thread pool with N threads, and tasks are taken from the pool's task queue for execution. Each RPC server thread is only responsible for putting requests into the task queue of the model's thread pool; once a task has been executed, the completed task is removed from the queue.
+
+In the table above, `--thread` specifies the number of RPC server threads (default 2). `--op_num` specifies the number of threads in each model's thread pool; the default value is 0, which means asynchronous mode is disabled. `--op_max_batch` specifies the batch size for each model; the default value is 32, and it only takes effect when `--op_num` is not 0.
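+
+For example, a minimal sketch of serving a single model in asynchronous mode (reusing the `uci_housing_model` from the quick start above; the thread, GPU and batch values are only illustrative):
+
+```shell
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0 --op_num 8 --op_max_batch 32
+```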
+#### When you want one model to use multiple GPU cards
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2
+#### When you want to serve two models
+python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292
+#### When you want two models, each using multiple GPU cards
+python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2
+#### When a service contains two models, each using multiple GPU cards and asynchronous mode with a different concurrency per model
+python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8
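+#### When you want one GPU card to use multiple CUDA streams
+A minimal sketch of the `gpu_multi_stream` option from the table above (assuming it is passed as a plain boolean switch, as its `bool` type and `False` default suggest):
+```shell
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0 --gpu_multi_stream
+```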
 
 
 ```python
diff --git a/README_CN.md b/README_CN.md
index d728071dbd80ae2400a6e95b5ccb06010fd7ef06..a1bb9f9e7c513a3d772cce2d56d0bcd76e3548f9 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -172,19 +172,40 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
 ```
 
 
-| Argument                                       | Type | Default | Description                                            |
-| ---------------------------------------------- | ---- | ------- | ------------------------------------------------------ |
-| `thread`                                       | int  | `4`     | Concurrency of current service                         |
-| `port`                                         | int  | `9292`  | Exposed port of current service to users               |
-| `name`                                         | str  | `""`    | Service name, can be used to generate HTTP request url |
-| `model`                                        | str  | `""`    | Path of paddle model directory to be served            |
-| `mem_optim_off`                                | -    | -       | Disable memory optimization                            |
-| `ir_optim`                                     | bool | False   | Enable analysis and optimization of calculation graph  |
-| `use_mkl` (Only for cpu version)               | -    | -       | Run inference with MKL                                 |
-| `use_trt` (Only for Cuda>=10.1 version)        | -    | -       | Run inference with TensorRT                            |
-| `use_lite` (Only for Intel x86 CPU or ARM CPU) | -    | -       | Run PaddleLite inference                               |
-| `use_xpu`                                      | -    | -       | Run PaddleLite inference with Baidu Kunlun XPU         |
-| `precision`                                    | str  | FP32    | Precision Mode, support FP32, FP16, INT8               |
+| Argument                                       | Type | Default | Description                                           |
+| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
+| `thread`                                       | int  | `2`     | Number of brpc service threads                        |
+| `op_num`                                       | int[]| `0`     | Number of threads for each model in asynchronous mode |
+| `op_max_batch`                                 | int[]| `32`    | Batch size for each model in asynchronous mode        |
+| `gpu_ids`                                      | str[]| `"-1"`  | GPU card IDs for each model                           |
+| `port`                                         | int  | `9292`  | Exposed port of current service to users              |
+| `model`                                        | str[]| `""`    | Paths of paddle model directories to be served        |
+| `mem_optim_off`                                | -    | -       | Disable memory / graphic memory optimization          |
+| `ir_optim`                                     | bool | False   | Enable analysis and optimization of calculation graph |
+| `use_mkl` (Only for cpu version)               | -    | -       | Run inference with MKL                                |
+| `use_trt` (Only for trt version)               | -    | -       | Run inference with TensorRT                           |
+| `use_lite` (Only for Intel x86 CPU or ARM CPU) | -    | -       | Run PaddleLite inference                              |
+| `use_xpu`                                      | -    | -       | Run PaddleLite inference with Baidu Kunlun XPU        |
+| `precision`                                    | str  | FP32    | Precision Mode, support FP32, FP16, INT8              |
+| `use_calib`                                    | bool | False   | Only for deployment with TensorRT                     |
+| `gpu_multi_stream`                             | bool | False   | Enable GPU multi-stream to achieve higher QPS         |
+
+#### Description of asynchronous mode
+
+Asynchronous mode is suitable for two scenarios: 1) the number of requests is very large, and 2) multiple models are chained together and you want to specify the concurrency of each model separately.
+
+Asynchronous mode helps to improve the throughput (QPS) of the service, but the latency of a single request increases slightly.
+
+In asynchronous mode, each model starts the number of threads you specify, and each thread holds one model instance; each model is therefore equivalent to a thread pool with N threads, and tasks are taken from the pool's task queue for execution. Each RPC server thread only puts requests into the task queue of the model's thread pool; once a task has been executed, the completed task is removed from the queue.
+
+In the table above, `--thread 10` specifies the number of RPC server threads (default 2). `--op_num` specifies the number of threads N in each model's thread pool; the default value is 0, which means asynchronous mode is not used. `--op_max_batch` specifies the batch size for each model; the default value is 32, and it only takes effect when `--op_num` is not 0.
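+
+For example, a minimal sketch of serving a single model in asynchronous mode (reusing the `uci_housing_model` from the quick start above; the thread, GPU and batch values are only illustrative):
+
+```shell
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0 --op_num 8 --op_max_batch 32
+```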
+#### When you want one model to use multiple GPU cards
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2
+#### When one service contains two models
+python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292
+#### When one service contains two models, each using multiple GPU cards
+python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2
+#### When one service contains two models, each using multiple GPU cards and asynchronous mode with a different concurrency per model
+python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8
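+#### When you want one GPU card to use multiple CUDA streams
+A minimal sketch of the `gpu_multi_stream` option from the table above (assuming it is passed as a plain boolean switch, as its `bool` type and `False` default suggest):
+```shell
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0 --gpu_multi_stream
+```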
 
 
 
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
old mode 100755
new mode 100644
diff --git a/core/configure/CMakeLists.txt b/core/configure/CMakeLists.txt
index 32534fee141ee5b4b0b7b1eed580e1769deb5cff..a9eb6d7e36e40292a8ab50caa93be29fd19dcada 100644
--- a/core/configure/CMakeLists.txt
+++ b/core/configure/CMakeLists.txt
@@ -33,9 +33,7 @@ if (WITH_PYTHON)
   add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
   add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
   
-  py_grpc_proto_compile(multi_lang_general_model_service_py_proto SRCS proto/multi_lang_general_model_service.proto)
-  add_custom_target(multi_lang_general_model_service_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-  add_dependencies(multi_lang_general_model_service_py_proto multi_lang_general_model_service_py_proto_init)
+
   
   if (CLIENT)
     py_proto_compile(sdk_configure_py_proto SRCS proto/sdk_configure.proto)
@@ -53,11 +51,7 @@ if (WITH_PYTHON)
                     COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
                     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     
-    add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                    COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                    COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
   endif()
   
   if (APP)
@@ -84,11 +78,6 @@ if (WITH_PYTHON)
     		COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
     		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     
-    add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-                    COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-                    COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 
 endif()
diff --git a/core/configure/proto/multi_lang_general_model_service.proto b/core/configure/proto/multi_lang_general_model_service.proto
deleted file mode 100755
index 18fbcf760647e1694e738c0832fe45f4f7d9934f..0000000000000000000000000000000000000000
--- a/core/configure/proto/multi_lang_general_model_service.proto
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-syntax = "proto2";
-
-package baidu.paddle_serving.multi_lang;
-
-option java_multiple_files = true;
-option java_package = "io.paddle.serving.grpc";
-option java_outer_classname = "ServingProto";
-
-message Tensor {
-  optional bytes data = 1;
-  repeated int32 int_data = 2;
-  repeated int64 int64_data = 3;
-  repeated float float_data = 4;
-  optional int32 elem_type = 5;
-  repeated int32 shape = 6;
-  repeated int32 lod = 7; // only for fetch tensor currently
-};
-
-message FeedInst { repeated Tensor tensor_array = 1; };
-
-message FetchInst { repeated Tensor tensor_array = 1; };
-
-message InferenceRequest {
-  repeated FeedInst insts = 1;
-  repeated string feed_var_names = 2;
-  repeated string fetch_var_names = 3;
-  required bool is_python = 4 [ default = false ];
-  required uint64 log_id = 5 [ default = 0 ];
-};
-
-message InferenceResponse {
-  repeated ModelOutput outputs = 1;
-  optional string tag = 2;
-  required int32 err_code = 3;
-};
-
-message ModelOutput {
-  repeated FetchInst insts = 1;
-  optional string engine_name = 2;
-}
-
-message SetTimeoutRequest { required int32 timeout_ms = 1; }
-
-message SimpleResponse { required int32 err_code = 1; }
-
-message GetClientConfigRequest {}
-
-message GetClientConfigResponse { required string client_config_str = 1; }
-
-service MultiLangGeneralModelService {
-  rpc Inference(InferenceRequest) returns (InferenceResponse) {}
-  rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {}
-  rpc GetClientConfig(GetClientConfigRequest)
-      returns (GetClientConfigResponse) {}
-};
diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
old mode 100755
new mode 100644
index 24fb62806476effdcf453cb7b4047122731106ea..5cace06420e29e1590218f63777c85bbcf504b29
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -21,11 +21,12 @@ message EngineDesc {
   required string reloadable_meta = 3;
   required string reloadable_type = 4;
   required string model_dir = 5;
-  required int32 runtime_thread_num = 6;
-  required int32 batch_infer_size = 7;
-  required int32 enable_batch_align = 8;
-  optional string version_file = 9;
-  optional string version_type = 10;
+  repeated int32 gpu_ids = 6;
+  required int32 runtime_thread_num = 7;
+  required int32 batch_infer_size = 8;
+  required int32 enable_batch_align = 9;
+  optional string version_file = 10;
+  optional string version_type = 11;
 
   /*
    * Sparse Parameter Service type. Valid types are:
@@ -38,16 +39,17 @@ message EngineDesc {
     LOCAL = 1;
     REMOTE = 2;
   }
-  optional SparseParamServiceType sparse_param_service_type = 11;
-  optional string sparse_param_service_table_name = 12;
-  optional bool enable_memory_optimization = 13;
-  optional bool enable_ir_optimization = 14;
-  optional bool use_trt = 15;
-  optional bool use_lite = 16;
-  optional bool use_xpu = 17;
-  optional bool use_gpu = 18;
-  optional bool combined_model = 19;
-  optional bool encrypted_model = 20;
+  optional SparseParamServiceType sparse_param_service_type = 12;
+  optional string sparse_param_service_table_name = 13;
+  optional bool enable_memory_optimization = 14;
+  optional bool enable_ir_optimization = 15;
+  optional bool use_trt = 16;
+  optional bool use_lite = 17;
+  optional bool use_xpu = 18;
+  optional bool use_gpu = 19;
+  optional bool combined_model = 20;
+  optional bool encrypted_model = 21;
+  optional bool gpu_multi_stream = 22;
 };
 
 // model_toolkit conf
diff --git a/core/cube/cube-agent/src/agent/http.go b/core/cube/cube-agent/src/agent/http.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-agent/src/agent/http_get.go b/core/cube/cube-agent/src/agent/http_get.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-agent/src/agent/http_post.go b/core/cube/cube-agent/src/agent/http_post.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-builder/CMakeLists.txt b/core/cube/cube-builder/CMakeLists.txt
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/conf/transfer.conf b/core/cube/cube-transfer/conf/transfer.conf
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/cube-transfer.go b/core/cube/cube-transfer/src/cube-transfer.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/builder.go b/core/cube/cube-transfer/src/transfer/builder.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/config.go b/core/cube/cube-transfer/src/transfer/config.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/deployer.go b/core/cube/cube-transfer/src/transfer/deployer.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/dict/cube_agent_server.go b/core/cube/cube-transfer/src/transfer/dict/cube_agent_server.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/dict/define.go b/core/cube/cube-transfer/src/transfer/dict/define.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_info.go b/core/cube/cube-transfer/src/transfer/dict/dict_info.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_instance_status.go b/core/cube/cube-transfer/src/transfer/dict/dict_instance_status.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_shard_info.go b/core/cube/cube-transfer/src/transfer/dict/dict_shard_info.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_version_info.go b/core/cube/cube-transfer/src/transfer/dict/dict_version_info.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/global.go b/core/cube/cube-transfer/src/transfer/global.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/http.go b/core/cube/cube-transfer/src/transfer/http.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/http_get.go b/core/cube/cube-transfer/src/transfer/http_get.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/transfer.go b/core/cube/cube-transfer/src/transfer/transfer.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/trigger.go b/core/cube/cube-transfer/src/transfer/trigger.go
old mode 100755
new mode 100644
diff --git a/core/cube/cube-transfer/src/transfer/util.go b/core/cube/cube-transfer/src/transfer/util.go
old mode 100755
new mode 100644
diff --git a/core/general-client/include/general_model.h b/core/general-client/include/general_model.h
old mode 100755
new mode 100644
index b1c4f71f5602bed4eded49822d7afe7caac6e242..7c80500d03b482c8bbaa0515b0484d72d518434e
--- a/core/general-client/include/general_model.h
+++ b/core/general-client/include/general_model.h
@@ -207,7 +207,7 @@ class PredictorClient {
 
   void init_gflags(std::vector argv);
 
-  int init(const std::vector &client_conf);
+  int init(const std::vector& client_conf);
 
   void set_predictor_conf(const std::string& conf_path,
                           const std::string& conf_file);
@@ -218,23 +218,22 @@ class PredictorClient {
 
   int destroy_predictor();
 
-  int numpy_predict(
-      const std::vector>>& float_feed_batch,
-      const std::vector& float_feed_name,
-      const std::vector>& float_shape,
-      const std::vector>& float_lod_slot_batch,
-      const std::vector>>& int_feed_batch,
-      const std::vector& int_feed_name,
-      const std::vector>& int_shape,
-      const std::vector>& int_lod_slot_batch,
-      const std::vector>& string_feed_batch,
-      const std::vector& string_feed_name,
-      const std::vector>& string_shape,
-      const std::vector>& string_lod_slot_batch,
-      const std::vector& fetch_name,
-      PredictorRes& predict_res_batch,  // NOLINT
-      const int& pid,
-      const uint64_t log_id);
+  int numpy_predict(const std::vector>& float_feed,
+                    const std::vector& float_feed_name,
+                    const std::vector>& float_shape,
+                    const std::vector>& float_lod_slot_batch,
+                    const std::vector>& int_feed,
+                    const std::vector& int_feed_name,
+                    const std::vector>& int_shape,
+                    const std::vector>& int_lod_slot_batch,
+                    const std::vector& string_feed,
+                    const std::vector& string_feed_name,
+                    const std::vector>& string_shape,
+                    const std::vector>& string_lod_slot_batch,
+                    const std::vector& fetch_name,
+                    PredictorRes& predict_res_batch,  // NOLINT
+                    const int& pid,
+                    const uint64_t log_id);
 
  private:
   PredictorApi _api;
@@ -243,6 +242,7 @@ class PredictorClient {
   std::string _predictor_path;
   std::string _conf_file;
   std::map _feed_name_to_idx;
+  std::vector _feed_name;
   std::map _fetch_name_to_idx;
   std::map _fetch_name_to_var_name;
   std::map _fetch_name_to_type;
diff --git a/core/general-client/src/general_model.cpp b/core/general-client/src/general_model.cpp
index 0ade573de6ac2da59156ba82f5ff3e04f1b7f6b2..cf85048b0f4a43659801f58df963a8597e0c2aba 100644
--- a/core/general-client/src/general_model.cpp
+++ b/core/general-client/src/general_model.cpp
@@ -25,8 +25,6 @@ using baidu::paddle_serving::Timer;
 using baidu::paddle_serving::predictor::general_model::Request;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Tensor;
-using baidu::paddle_serving::predictor::general_model::FeedInst;
-using baidu::paddle_serving::predictor::general_model::FetchInst;
 enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
 std::once_flag gflags_init_flag;
 namespace py = pybind11;
@@ -68,9 +66,13 @@ int PredictorClient::init(const std::vector &conf_file) {
     _fetch_name_to_idx.clear();
     _shape.clear();
     int feed_var_num = model_config.feed_var_size();
+    _feed_name.clear();
     VLOG(2) << "feed var num: " << feed_var_num;
     for (int i = 0; i < feed_var_num; ++i) {
       _feed_name_to_idx[model_config.feed_var(i).alias_name()] = i;
+      VLOG(2) << "feed [" << i << "]"
+              << " name: " << model_config.feed_var(i).name();
+      _feed_name.push_back(model_config.feed_var(i).name());
       VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name()
               << " index: " << i;
       std::vector tmp_feed_shape;
@@ -146,15 +148,15 @@ int PredictorClient::create_predictor() {
 }
 
 int PredictorClient::numpy_predict(
-    const std::vector>> &float_feed_batch,
+    const std::vector> &float_feed,
     const std::vector &float_feed_name,
     const std::vector> &float_shape,
     const std::vector> &float_lod_slot_batch,
-    const std::vector>> &int_feed_batch,
+    const std::vector> &int_feed,
     const std::vector &int_feed_name,
     const std::vector> &int_shape,
     const std::vector> &int_lod_slot_batch,
-    const std::vector> &string_feed_batch,
+    const std::vector &string_feed,
     const std::vector &string_feed_name,
     const std::vector> &string_shape,
     const std::vector> &string_lod_slot_batch,
@@ -162,10 +164,6 @@ int PredictorClient::numpy_predict(
     PredictorRes &predict_res_batch,
     const int &pid,
     const uint64_t log_id) {
-  int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
-  batch_size = batch_size > string_feed_batch.size() ? batch_size
-                                                     : string_feed_batch.size();
-  VLOG(2) << "batch size: " << batch_size;
   predict_res_batch.clear();
   Timer timeline;
   int64_t preprocess_start = timeline.TimeStampUS();
@@ -188,134 +186,122 @@ int PredictorClient::numpy_predict(
   }
 
   int vec_idx = 0;
-  for (int bi = 0; bi < batch_size; bi++) {
-    VLOG(2) << "prepare batch " << bi;
-    std::vector tensor_vec;
-    FeedInst *inst = req.add_insts();
-    std::vector> float_feed = float_feed_batch[bi];
-    std::vector> int_feed = int_feed_batch[bi];
-    std::vector string_feed = string_feed_batch[bi];
-    for (auto &name : float_feed_name) {
-      tensor_vec.push_back(inst->add_tensor_array());
-    }
-
-    for (auto &name : int_feed_name) {
-      tensor_vec.push_back(inst->add_tensor_array());
-    }
+  // The batch is already packed inside each Tensor, so no per-sample loop is needed.
+  std::vector tensor_vec;
 
-    for (auto &name : string_feed_name) {
-      tensor_vec.push_back(inst->add_tensor_array());
-    }
+  for (auto &name : float_feed_name) {
+    tensor_vec.push_back(req.add_tensor());
+  }
 
-    VLOG(2) << "batch [" << bi << "] "
-            << "prepared";
+  for (auto &name : int_feed_name) {
+    tensor_vec.push_back(req.add_tensor());
+  }
 
-    vec_idx = 0;
-    for (auto &name : float_feed_name) {
-      int idx = _feed_name_to_idx[name];
-      if (idx >= tensor_vec.size()) {
-        LOG(ERROR) << "idx > tensor_vec.size()";
-        return -1;
-      }
-      int nbytes = float_feed[vec_idx].nbytes();
-      void *rawdata_ptr = (void *)(float_feed[vec_idx].data(0));
-      int total_number = float_feed[vec_idx].size();
-      Tensor *tensor = tensor_vec[idx];
-
-      VLOG(2) << "prepare float feed " << name << " shape size "
-              << float_shape[vec_idx].size();
-      for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
-        tensor->add_shape(float_shape[vec_idx][j]);
-      }
-      for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) {
-        tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
-      }
-      tensor->set_elem_type(P_FLOAT32);
+  for (auto &name : string_feed_name) {
+    tensor_vec.push_back(req.add_tensor());
+  }
 
-      tensor->mutable_float_data()->Resize(total_number, 0);
-      memcpy(tensor->mutable_float_data()->mutable_data(), rawdata_ptr, nbytes);
-      vec_idx++;
+  vec_idx = 0;
+  for (auto &name : float_feed_name) {
+    int idx = _feed_name_to_idx[name];
+    if (idx >= tensor_vec.size()) {
+      LOG(ERROR) << "idx > tensor_vec.size()";
+      return -1;
+    }
+    VLOG(2) << "prepare float feed " << name << " idx " << idx;
+    int nbytes = float_feed[vec_idx].nbytes();
+    void *rawdata_ptr = (void *)(float_feed[vec_idx].data(0));
+    int total_number = float_feed[vec_idx].size();
+    Tensor *tensor = tensor_vec[idx];
+
+    VLOG(2) << "prepare float feed " << name << " shape size "
+            << float_shape[vec_idx].size();
+    for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
+      tensor->add_shape(float_shape[vec_idx][j]);
+    }
+    for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) {
+      tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
     }
+    tensor->set_elem_type(P_FLOAT32);
 
-    VLOG(2) << "batch [" << bi << "] "
-            << "float feed value prepared";
+    tensor->set_name(_feed_name[idx]);
+    tensor->set_alias_name(name);
 
-    vec_idx = 0;
-    for (auto &name : int_feed_name) {
-      int idx = _feed_name_to_idx[name];
-      if (idx >= tensor_vec.size()) {
-        LOG(ERROR) << "idx > tensor_vec.size()";
-        return -1;
-      }
-      Tensor *tensor = tensor_vec[idx];
-      int nbytes = int_feed[vec_idx].nbytes();
-      void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0));
-      int total_number = int_feed[vec_idx].size();
+    tensor->mutable_float_data()->Resize(total_number, 0);
+    memcpy(tensor->mutable_float_data()->mutable_data(), rawdata_ptr, nbytes);
+    vec_idx++;
+  }
 
-      for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
-        tensor->add_shape(int_shape[vec_idx][j]);
-      }
-      for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) {
-        tensor->add_lod(int_lod_slot_batch[vec_idx][j]);
-      }
-      tensor->set_elem_type(_type[idx]);
-
-      if (_type[idx] == P_INT64) {
-        tensor->mutable_int64_data()->Resize(total_number, 0);
-        memcpy(
-            tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
-      } else {
-        tensor->mutable_int_data()->Resize(total_number, 0);
-        memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
-      }
-      vec_idx++;
+  vec_idx = 0;
+  for (auto &name : int_feed_name) {
+    int idx = _feed_name_to_idx[name];
+    if (idx >= tensor_vec.size()) {
+      LOG(ERROR) << "idx > tensor_vec.size()";
+      return -1;
     }
+    Tensor *tensor = tensor_vec[idx];
+    int nbytes = int_feed[vec_idx].nbytes();
+    void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0));
+    int total_number = int_feed[vec_idx].size();
 
-    VLOG(2) << "batch [" << bi << "] "
-            << "int feed value prepared";
+    for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
+      tensor->add_shape(int_shape[vec_idx][j]);
+    }
+    for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) {
+      tensor->add_lod(int_lod_slot_batch[vec_idx][j]);
+    }
+    tensor->set_elem_type(_type[idx]);
+    tensor->set_name(_feed_name[idx]);
+    tensor->set_alias_name(name);
+
+    if (_type[idx] == P_INT64) {
+      tensor->mutable_int64_data()->Resize(total_number, 0);
+      memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
+    } else {
+      tensor->mutable_int_data()->Resize(total_number, 0);
+      memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
+    }
+    vec_idx++;
+  }
 
-    vec_idx = 0;
-    for (auto &name : string_feed_name) {
-      int idx = _feed_name_to_idx[name];
-      if (idx >= tensor_vec.size()) {
-        LOG(ERROR) << "idx > tensor_vec.size()";
-        return -1;
-      }
-      Tensor *tensor = tensor_vec[idx];
+  vec_idx = 0;
+  for (auto &name : string_feed_name) {
+    int idx = _feed_name_to_idx[name];
+    if (idx >= tensor_vec.size()) {
+      LOG(ERROR) << "idx > tensor_vec.size()";
+      return -1;
+    }
+    Tensor *tensor = tensor_vec[idx];
 
-      for (uint32_t j = 0; j < string_shape[vec_idx].size(); ++j) {
-        tensor->add_shape(string_shape[vec_idx][j]);
-      }
-      for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) {
-        tensor->add_lod(string_lod_slot_batch[vec_idx][j]);
-      }
-      tensor->set_elem_type(P_STRING);
-
-      const int string_shape_size = string_shape[vec_idx].size();
-      // string_shape[vec_idx] = [1];cause numpy has no datatype of string.
-      // we pass string via vector >.
-      if (string_shape_size != 1) {
-        LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
-                   << string_shape_size;
-        return -1;
-      }
-      switch (string_shape_size) {
-        case 1: {
-          tensor->add_data(string_feed[vec_idx]);
-          break;
-        }
+    for (uint32_t j = 0; j < string_shape[vec_idx].size(); ++j) {
+      tensor->add_shape(string_shape[vec_idx][j]);
+    }
+    for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) {
+      tensor->add_lod(string_lod_slot_batch[vec_idx][j]);
+    }
+    tensor->set_elem_type(P_STRING);
+    tensor->set_name(_feed_name[idx]);
+    tensor->set_alias_name(name);
+
+    const int string_shape_size = string_shape[vec_idx].size();
+    // string_shape[vec_idx] = [1], because numpy has no string dtype;
+    // strings are passed as a vector of strings instead.
+    if (string_shape_size != 1) {
+      LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
+                 << string_shape_size;
+      return -1;
+    }
+    switch (string_shape_size) {
+      case 1: {
+        tensor->add_data(string_feed[vec_idx]);
+        break;
       }
-      vec_idx++;
     }
-
-    VLOG(2) << "batch [" << bi << "] "
-            << "string feed value prepared";
+    vec_idx++;
   }
 
   int64_t preprocess_end = timeline.TimeStampUS();
-
   int64_t client_infer_start = timeline.TimeStampUS();
-
   Response res;
 
   int64_t client_infer_end = 0;
@@ -347,19 +333,18 @@ int PredictorClient::numpy_predict(
       int idx = 0;
       for (auto &name : fetch_name) {
         // int idx = _fetch_name_to_idx[name];
-        int shape_size = output.insts(0).tensor_array(idx).shape_size();
+        int shape_size = output.tensor(idx).shape_size();
         VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
                 << shape_size;
         model._shape_map[name].resize(shape_size);
         for (int i = 0; i < shape_size; ++i) {
-          model._shape_map[name][i] =
-              output.insts(0).tensor_array(idx).shape(i);
+          model._shape_map[name][i] = output.tensor(idx).shape(i);
         }
-        int lod_size = output.insts(0).tensor_array(idx).lod_size();
+        int lod_size = output.tensor(idx).lod_size();
         if (lod_size > 0) {
           model._lod_map[name].resize(lod_size);
           for (int i = 0; i < lod_size; ++i) {
-            model._lod_map[name][i] = output.insts(0).tensor_array(idx).lod(i);
+            model._lod_map[name][i] = output.tensor(idx).lod(i);
           }
         }
         idx += 1;
@@ -371,22 +356,22 @@ int PredictorClient::numpy_predict(
         // int idx = _fetch_name_to_idx[name];
         if (_fetch_name_to_type[name] == P_INT64) {
           VLOG(2) << "ferch var " << name << "type int64";
-          int size = output.insts(0).tensor_array(idx).int64_data_size();
+          int size = output.tensor(idx).int64_data_size();
           model._int64_value_map[name] = std::vector(
-              output.insts(0).tensor_array(idx).int64_data().begin(),
-              output.insts(0).tensor_array(idx).int64_data().begin() + size);
+              output.tensor(idx).int64_data().begin(),
+              output.tensor(idx).int64_data().begin() + size);
         } else if (_fetch_name_to_type[name] == P_FLOAT32) {
           VLOG(2) << "fetch var " << name << "type float";
-          int size = output.insts(0).tensor_array(idx).float_data_size();
+          int size = output.tensor(idx).float_data_size();
           model._float_value_map[name] = std::vector(
-              output.insts(0).tensor_array(idx).float_data().begin(),
-              output.insts(0).tensor_array(idx).float_data().begin() + size);
+              output.tensor(idx).float_data().begin(),
+              output.tensor(idx).float_data().begin() + size);
         } else if (_fetch_name_to_type[name] == P_INT32) {
           VLOG(2) << "fetch var " << name << "type int32";
-          int size = output.insts(0).tensor_array(idx).int_data_size();
+          int size = output.tensor(idx).int_data_size();
           model._int32_value_map[name] = std::vector(
-              output.insts(0).tensor_array(idx).int_data().begin(),
-              output.insts(0).tensor_array(idx).int_data().begin() + size);
+              output.tensor(idx).int_data().begin(),
+              output.tensor(idx).int_data().begin() + size);
         }
         idx += 1;
       }
diff --git a/core/general-client/src/pybind_general_model.cpp b/core/general-client/src/pybind_general_model.cpp
old mode 100755
new mode 100644
index 499f0856ad8b7ffae5f3f037142036ac486cc035..d5c95d1af55e962db40e347823c5c491216851bb
--- a/core/general-client/src/pybind_general_model.cpp
+++ b/core/general-client/src/pybind_general_model.cpp
@@ -97,33 +97,31 @@ PYBIND11_MODULE(serving_client, m) {
            [](PredictorClient &self) { self.destroy_predictor(); })
       .def("numpy_predict",
            [](PredictorClient &self,
-              const std::vector>>
-                  &float_feed_batch,
+              const std::vector> &float_feed,
               const std::vector &float_feed_name,
               const std::vector> &float_shape,
               const std::vector> &float_lod_slot_batch,
-              const std::vector>>
-                  &int_feed_batch,
+              const std::vector> &int_feed,
               const std::vector &int_feed_name,
               const std::vector> &int_shape,
               const std::vector> &int_lod_slot_batch,
-              const std::vector>& string_feed_batch,
-              const std::vector& string_feed_name,
-              const std::vector>& string_shape,
-              const std::vector>& string_lod_slot_batch,
+              const std::vector &string_feed,
+              const std::vector &string_feed_name,
+              const std::vector> &string_shape,
+              const std::vector> &string_lod_slot_batch,
               const std::vector &fetch_name,
               PredictorRes &predict_res_batch,
               const int &pid,
               const uint64_t log_id) {
-             return self.numpy_predict(float_feed_batch,
+             return self.numpy_predict(float_feed,
                                        float_feed_name,
                                        float_shape,
                                        float_lod_slot_batch,
-                                       int_feed_batch,
+                                       int_feed,
                                        int_feed_name,
                                        int_shape,
                                        int_lod_slot_batch,
-                                       string_feed_batch,
+                                       string_feed,
                                        string_feed_name,
                                        string_shape,
                                        string_lod_slot_batch,
diff --git a/core/general-server/CMakeLists.txt b/core/general-server/CMakeLists.txt
old mode 100755
new mode 100644
diff --git a/core/general-server/op/CMakeLists.txt b/core/general-server/op/CMakeLists.txt
old mode 100755
new mode 100644
diff --git a/core/general-server/op/general_copy_op.cpp b/core/general-server/op/general_copy_op.cpp
deleted file mode 100644
index 0391a98bcb7f471c0a0687dd9deb7b404a15a2bf..0000000000000000000000000000000000000000
--- a/core/general-server/op/general_copy_op.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "core/general-server/op/general_copy_op.h"
-#include 
-#include 
-#include 
-#include 
-#include "core/general-server/op/general_infer_helper.h"
-#include "core/predictor/framework/infer.h"
-#include "core/predictor/framework/memory.h"
-#include "core/util/include/timer.h"
-
-namespace baidu {
-namespace paddle_serving {
-namespace serving {
-
-using baidu::paddle_serving::Timer;
-using baidu::paddle_serving::predictor::MempoolWrapper;
-using baidu::paddle_serving::predictor::general_model::Tensor;
-using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FeedInst;
-using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
-
-int GeneralCopyOp::inference() {
-  // reade request from client
-  const std::vector pre_node_names = pre_names();
-  if (pre_node_names.size() != 1) {
-    LOG(ERROR) << "This op(" << op_name()
-               << ") can only have one predecessor op, but received "
-               << pre_node_names.size();
-    return -1;
-  }
-  const std::string pre_name = pre_node_names[0];
-
-  const GeneralBlob *input_blob = get_depend_argument(pre_name);
-  uint64_t log_id = input_blob->GetLogId();
-
-  VLOG(2) << "(logid=" << log_id << ") precedent name: " << pre_name;
-  const TensorVector *in = &input_blob->tensor_vector;
-  VLOG(2) << "(logid=" << log_id << ") input size: " << in->size();
-  int batch_size = input_blob->GetBatchSize();
-  int input_var_num = 0;
-
-  GeneralBlob *res = mutable_data();
-  res->SetLogId(log_id);
-  TensorVector *out = &res->tensor_vector;
-
-  VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
-  res->SetBatchSize(batch_size);
-
-  if (!res) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed get op tls reader object output";
-  }
-
-  Timer timeline;
-  int64_t start = timeline.TimeStampUS();
-
-  VLOG(2) << "(logid=" << log_id << ") Going to init lod tensor";
-  for (int i = 0; i < in->size(); ++i) {
-    paddle::PaddleTensor lod_tensor;
-    CopyLod(&in->at(i), &lod_tensor);
-    lod_tensor.dtype = in->at(i).dtype;
-    lod_tensor.name = in->at(i).name;
-    VLOG(2) << "(logid=" << log_id << ") lod tensor [" << i
-            << "].name = " << lod_tensor.name;
-    out->push_back(lod_tensor);
-  }
-
-  VLOG(2) << "(logid=" << log_id << ") pack done.";
-
-  for (int i = 0; i < out->size(); ++i) {
-    int64_t *src_ptr = static_cast(in->at(i).data.data());
-    out->at(i).data.Resize(out->at(i).lod[0].back() * sizeof(int64_t));
-    out->at(i).shape = {out->at(i).lod[0].back(), 1};
-    int64_t *tgt_ptr = static_cast(out->at(i).data.data());
-    for (int j = 0; j < out->at(i).lod[0].back(); ++j) {
-      tgt_ptr[j] = src_ptr[j];
-    }
-  }
-
-  VLOG(2) << "(logid=" << log_id << ") output done.";
-
-  timeline.Pause();
-  int64_t end = timeline.TimeStampUS();
-  CopyBlobInfo(input_blob, res);
-  AddBlobInfo(res, start);
-  AddBlobInfo(res, end);
-
-  VLOG(2) << "(logid=" << log_id << ") read data from client success";
-  return 0;
-}
-
-DEFINE_OP(GeneralCopyOp);
-}  // namespace serving
-}  // namespace paddle_serving
-}  // namespace baidu
diff --git a/core/general-server/op/general_copy_op.h b/core/general-server/op/general_copy_op.h
deleted file mode 100644
index 9b4caadc6a82f1f1a601ab66394b3f629af703ff..0000000000000000000000000000000000000000
--- a/core/general-server/op/general_copy_op.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include 
-#include 
-#include "core/general-server/general_model_service.pb.h"
-#include "core/general-server/op/general_infer_helper.h"
-#include "core/predictor/framework/resource.h"
-#include "paddle_inference_api.h"  // NOLINT
-
-namespace baidu {
-namespace paddle_serving {
-namespace serving {
-
-class GeneralCopyOp
-    : public baidu::paddle_serving::predictor::OpWithChannel {
- public:
-  typedef std::vector TensorVector;
-
-  DECLARE_OP(GeneralCopyOp);
-
-  int inference();
-};
-
-}  // namespace serving
-}  // namespace paddle_serving
-}  // namespace baidu
diff --git a/core/general-server/op/general_detection_op.cpp b/core/general-server/op/general_detection_op.cpp
old mode 100755
new mode 100644
index 7c33ec8efa8c6e89a7a778def6342415d19ffa94..46f5ddf1b508681661b69c60a25b6d7d000e6d4e
--- a/core/general-server/op/general_detection_op.cpp
+++ b/core/general-server/op/general_detection_op.cpp
@@ -36,7 +36,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FetchInst;
 using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
 
diff --git a/core/general-server/op/general_detection_op.h b/core/general-server/op/general_detection_op.h
old mode 100755
new mode 100644
diff --git a/core/general-server/op/general_dist_kv_infer_op.cpp b/core/general-server/op/general_dist_kv_infer_op.cpp
index 2f966f46204197fc1ab2ee822fb519893e05daae..f6001ef3fa4d2575bbdc964b3dda30e813ec02a6 100644
--- a/core/general-server/op/general_dist_kv_infer_op.cpp
+++ b/core/general-server/op/general_dist_kv_infer_op.cpp
@@ -34,7 +34,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FetchInst;
 using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
 
diff --git a/core/general-server/op/general_dist_kv_quant_infer_op.cpp b/core/general-server/op/general_dist_kv_quant_infer_op.cpp
old mode 100755
new mode 100644
index 756b83d625d04b9d2c6c6faf1ab42eecf5a19073..77036c35519d9355fa5100e57e99b8b1d2916c44
--- a/core/general-server/op/general_dist_kv_quant_infer_op.cpp
+++ b/core/general-server/op/general_dist_kv_quant_infer_op.cpp
@@ -35,7 +35,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FetchInst;
 using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
 
@@ -117,9 +116,6 @@ int GeneralDistKVQuantInferOp::inference() {
   std::unordered_map in_out_map;
   baidu::paddle_serving::predictor::Resource &resource =
       baidu::paddle_serving::predictor::Resource::instance();
-  //TODO:Temporary addition, specific details to be studied by HexToString
-  std::shared_ptr model_config =
-      resource.get_general_model_config()[0];
   int cube_quant_bits = resource.get_cube_quant_bits();
   size_t EMBEDDING_SIZE = 0;
   if (cube_quant_bits == 0) {
@@ -146,7 +142,7 @@ int GeneralDistKVQuantInferOp::inference() {
     sparse_out[sparse_idx].shape.push_back(
         sparse_out[sparse_idx].lod[0].back());
     sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
-    sparse_out[sparse_idx].name = model_config->_feed_name[i];
+    sparse_out[sparse_idx].name = in->at(i).name;
     sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
                                        EMBEDDING_SIZE * sizeof(float));
     // END HERE
diff --git a/core/general-server/op/general_infer_op.cpp b/core/general-server/op/general_infer_op.cpp
old mode 100755
new mode 100644
index 46038e1fe20d5659d3061e3d7490af65f6d54092..00c408a0c5fbe6d886fc3a62285b92ff486aa154
--- a/core/general-server/op/general_infer_op.cpp
+++ b/core/general-server/op/general_infer_op.cpp
@@ -31,7 +31,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FetchInst;
 using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
 
@@ -49,7 +48,7 @@ int GeneralInferOp::inference() {
   const GeneralBlob *input_blob = get_depend_argument(pre_name);
   if (!input_blob) {
     LOG(ERROR) << "input_blob is nullptr,error";
-      return -1;
+    return -1;
   }
   uint64_t log_id = input_blob->GetLogId();
   VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
@@ -57,7 +56,7 @@ int GeneralInferOp::inference() {
   GeneralBlob *output_blob = mutable_data();
   if (!output_blob) {
     LOG(ERROR) << "output_blob is nullptr,error";
-      return -1;
+    return -1;
   }
   output_blob->SetLogId(log_id);
 
diff --git a/core/general-server/op/general_reader_op.cpp b/core/general-server/op/general_reader_op.cpp
index 3e1091dd844f0afd71c8556586f82aafc42c5097..af77df553837c594789b0e9943790fc37fc01c95 100644
--- a/core/general-server/op/general_reader_op.cpp
+++ b/core/general-server/op/general_reader_op.cpp
@@ -30,42 +30,8 @@ using baidu::paddle_serving::Timer;
 using baidu::paddle_serving::predictor::MempoolWrapper;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FeedInst;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
 enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
-int conf_check(const Request *req,
-               const std::shared_ptr &model_config) {
-  int var_num = req->insts(0).tensor_array_size();
-  if (var_num != model_config->_feed_type.size()) {
-    LOG(ERROR) << "feed var number not match: model config["
-               << model_config->_feed_type.size() << "] vs. actual[" << var_num
-               << "]";
-    return -1;
-  }
-
-  VLOG(2) << "fetch var num in reader op: " << req->fetch_var_names_size();
-
-  for (int i = 0; i < var_num; ++i) {
-    const Tensor &tensor = req->insts(0).tensor_array(i);
-    if (model_config->_feed_type[i] != tensor.elem_type()) {
-      LOG(ERROR) << "feed type not match.";
-      return -1;
-    }
-    if (model_config->_feed_shape[i].size() == tensor.shape_size()) {
-      for (int j = 0; j < model_config->_feed_shape[i].size(); ++j) {
-        tensor.shape(j);
-        if (model_config->_feed_shape[i][j] != tensor.shape(j)) {
-          LOG(ERROR) << "feed shape not match.";
-          return -1;
-        }
-      }
-    } else {
-      LOG(ERROR) << "feed shape not match.";
-      return -1;
-    }
-  }
-  return 0;
-}
 
 int GeneralReaderOp::inference() {
   // read request from client
@@ -93,7 +59,8 @@ int GeneralReaderOp::inference() {
   res->SetLogId(log_id);
   Timer timeline;
   int64_t start = timeline.TimeStampUS();
-  int var_num = req->insts(0).tensor_array_size();
+  // var_num is the number of feed variables in the request.
+  int var_num = req->tensor_size();
 
   VLOG(2) << "(logid=" << log_id << ") var num: " << var_num
           << ") start to call load general model_conf op";
@@ -102,19 +69,7 @@ int GeneralReaderOp::inference() {
       baidu::paddle_serving::predictor::Resource::instance();
 
   VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
-  // get the first InferOP's model_config as ReaderOp's model_config by default.
-  std::shared_ptr model_config =
-      resource.get_general_model_config().front();
 
-  // TODO(guru4elephant): how to do conditional check?
-  /*
-  int ret = conf_check(req, model_config);
-  if (ret != 0) {
-    LOG(ERROR) << "model conf of server:";
-    resource.print_general_model_config(model_config);
-    return 0;
-  }
-  */
   // package tensor
   // prepare basic information for input
   // specify the memory needed for output tensor_vector
@@ -125,7 +80,7 @@ int GeneralReaderOp::inference() {
   int64_t databuf_size = 0;
   for (int i = 0; i < var_num; ++i) {
     paddle::PaddleTensor paddleTensor;
-    const Tensor &tensor = req->insts(0).tensor_array(i);
+    const Tensor &tensor = req->tensor(i);
     data_len = 0;
     elem_type = 0;
     elem_size = 0;
@@ -172,13 +127,16 @@ int GeneralReaderOp::inference() {
       VLOG(2) << "(logid=" << log_id << ") shape for var[" << i << "]: " << dim;
       paddleTensor.shape.push_back(dim);
     }
-    paddleTensor.name = model_config->_feed_name[i];
+    paddleTensor.name = tensor.name();
     out->push_back(paddleTensor);
 
     VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
             << "]: " << data_len;
     databuf_size = data_len * elem_size;
-    out->at(i).data.Resize(databuf_size);
+    void *databuf_char = MempoolWrapper::instance().malloc(databuf_size);
+    paddle::PaddleBuf paddleBuf(databuf_char, databuf_size);
+    out->at(i).data = paddleBuf;
+    // out->at(i).data.Resize(databuf_size);
     if (out->at(i).lod.size() > 0) {
       VLOG(2) << "(logid=" << log_id << ") var[" << i
               << "] has lod_tensor and len=" << out->at(i).lod[0].back();
diff --git a/core/general-server/op/general_response_op.cpp b/core/general-server/op/general_response_op.cpp
old mode 100755
new mode 100644
index d8fece0f7e25a967a6a72f41a9090b0977bf252a..161e291117b8893703844ab07ec93a891fc46f27
--- a/core/general-server/op/general_response_op.cpp
+++ b/core/general-server/op/general_response_op.cpp
@@ -34,7 +34,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FetchInst;
 using baidu::paddle_serving::predictor::general_model::ModelOutput;
 using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
@@ -49,7 +48,6 @@ int GeneralResponseOp::inference() {
       get_depend_argument(pre_node_names[0])->GetLogId();
 
   const Request *req = dynamic_cast(get_request_message());
-  // response inst with only fetch_var_names
   Response *res = mutable_data();
 
   Timer timeline;
@@ -63,7 +61,8 @@ int GeneralResponseOp::inference() {
       baidu::paddle_serving::predictor::Resource::instance();
 
   VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
-  //get the last InferOP's model_config as ResponseOp's model_config by default.
+  // get the last InferOP's model_config as ResponseOp's model_config by
+  // default.
   std::shared_ptr model_config =
       resource.get_general_model_config().back();
 
@@ -71,6 +70,10 @@ int GeneralResponseOp::inference() {
           << ") max body size : " << brpc::fLU64::FLAGS_max_body_size;
 
   std::vector fetch_index;
+  // This relies on GetOutPutNames() being an ordered map, so the order of
+  // the outputs matches the FetchVar order in the prototxt. Otherwise the
+  // outputs could only be matched through the name -- alias_name mapping.
   fetch_index.resize(req->fetch_var_names_size());
   for (int i = 0; i < req->fetch_var_names_size(); ++i) {
     fetch_index[i] =
@@ -95,40 +98,41 @@ int GeneralResponseOp::inference() {
     ModelOutput *output = res->add_outputs();
     // To get the order of model return values
     output->set_engine_name(pre_name);
-    FetchInst *fetch_inst = output->add_insts();
 
+    var_idx = 0;
+    // `idx` is the real index into FetchVar, not the index into the FetchList.
+    // `fetch_index` maps FetchList positions to FetchVar indices.
+    // For example, with FetchVar = {0:A, 1:B, 2:C} and FetchList = {0:C, 1:A},
+    // fetch_index = [2, 0]: C's index is 2 and A's index is 0.
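+    // Minimal sketch of how the two indices are used below (a restatement of
+    // the example above, nothing new is assumed): for FetchList = {0:C, 1:A}
+    // the loop visits idx = 2 then idx = 0, while var_idx counts 0, 1 and
+    // selects output->tensor(0) for C and output->tensor(1) for A.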
     for (auto &idx : fetch_index) {
-      Tensor *tensor = fetch_inst->add_tensor_array();
-      //tensor->set_elem_type(1);
-      if (model_config->_is_lod_fetch[idx]) {
-        VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
-                << model_config->_fetch_name[idx] << " is lod_tensor";
-        for (int k = 0; k < in->at(idx).shape.size(); ++k) {
-          VLOG(2) << "(logid=" << log_id << ") shape[" << k
-                  << "]: " << in->at(idx).shape[k];
-          tensor->add_shape(in->at(idx).shape[k]);
-        }
-      } else {
-        VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
-                << model_config->_fetch_name[idx] << " is tensor";
-        for (int k = 0; k < in->at(idx).shape.size(); ++k) {
-          VLOG(2) << "(logid=" << log_id << ") shape[" << k
-                  << "]: " << in->at(idx).shape[k];
-          tensor->add_shape(in->at(idx).shape[k]);
+      Tensor *tensor = output->add_tensor();
+      tensor->set_name(in->at(idx).name);
+      tensor->set_alias_name(model_config->_fetch_alias_name[idx]);
+      for (int k = 0; k < in->at(idx).shape.size(); ++k) {
+        VLOG(2) << "(logid=" << log_id << ") shape[" << k
+                << "]: " << in->at(idx).shape[k];
+        tensor->add_shape(in->at(idx).shape[k]);
+      }
+      std::string str_tensor_type = "is tensor";
+      if (model_config->_is_lod_fetch[idx] && in->at(idx).lod.size() > 0) {
+        str_tensor_type = "is lod_tensor";
+        for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
+          tensor->add_lod(in->at(idx).lod[0][j]);
         }
       }
-    }
+      VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
+              << model_config->_fetch_name[idx] << " " << str_tensor_type;
 
-    var_idx = 0;
-    for (auto &idx : fetch_index) {
       cap = 1;
       for (int j = 0; j < in->at(idx).shape.size(); ++j) {
         cap *= in->at(idx).shape[j];
       }
 
-      FetchInst *fetch_p = output->mutable_insts(0);
       auto dtype = in->at(idx).dtype;
       if (dtype == paddle::PaddleDType::INT64) {
+        tensor->set_elem_type(0);
         VLOG(2) << "(logid=" << log_id << ") Prepare int64 var ["
                 << model_config->_fetch_name[idx] << "].";
         int64_t *data_ptr = static_cast(in->at(idx).data.data());
@@ -137,35 +141,24 @@ int GeneralResponseOp::inference() {
         // `Swap` method is faster than `{}` method.
         google::protobuf::RepeatedField tmp_data(data_ptr,
                                                           data_ptr + cap);
-        fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap(
-            &tmp_data);
+        output->mutable_tensor(var_idx)->mutable_int64_data()->Swap(&tmp_data);
       } else if (dtype == paddle::PaddleDType::FLOAT32) {
+        tensor->set_elem_type(1);
         VLOG(2) << "(logid=" << log_id << ") Prepare float var ["
                 << model_config->_fetch_name[idx] << "].";
-        
+
         float *data_ptr = static_cast(in->at(idx).data.data());
         google::protobuf::RepeatedField tmp_data(data_ptr,
                                                         data_ptr + cap);
-        fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap(
-            &tmp_data);
+        output->mutable_tensor(var_idx)->mutable_float_data()->Swap(&tmp_data);
       } else if (dtype == paddle::PaddleDType::INT32) {
-
+        tensor->set_elem_type(2);
         VLOG(2) << "(logid=" << log_id << ")Prepare int32 var ["
                 << model_config->_fetch_name[idx] << "].";
         int32_t *data_ptr = static_cast(in->at(idx).data.data());
         google::protobuf::RepeatedField tmp_data(data_ptr,
                                                           data_ptr + cap);
-        fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap(
-            &tmp_data);
-      }
-
-      if (model_config->_is_lod_fetch[idx]) {
-        if (in->at(idx).lod.size() > 0) {
-          for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
-            fetch_p->mutable_tensor_array(var_idx)->add_lod(
-                in->at(idx).lod[0][j]);
-          }
-        }
+        output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data);
       }
 
       VLOG(2) << "(logid=" << log_id << ") fetch var ["
@@ -205,4 +198,4 @@ DEFINE_OP(GeneralResponseOp);
 
 }  // namespace serving
 }  // namespace paddle_serving
-}  // namespace baidu
\ No newline at end of file
+}  // namespace baidu
diff --git a/core/general-server/op/general_text_reader_op.cpp b/core/general-server/op/general_text_reader_op.cpp
deleted file mode 100755
index 6c305c18c0cb56bc5dd841c9c6a09807c6dbf518..0000000000000000000000000000000000000000
--- a/core/general-server/op/general_text_reader_op.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "core/general-server/op/general_text_reader_op.h"
-#include 
-#include 
-#include 
-#include 
-#include "core/predictor/framework/infer.h"
-#include "core/predictor/framework/memory.h"
-#include "core/util/include/timer.h"
-
-namespace baidu {
-namespace paddle_serving {
-namespace serving {
-
-using baidu::paddle_serving::Timer;
-using baidu::paddle_serving::predictor::MempoolWrapper;
-using baidu::paddle_serving::predictor::general_model::Tensor;
-using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FeedInst;
-using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
-
-int GeneralTextReaderOp::inference() {
-  // reade request from client
-  const Request *req = dynamic_cast(get_request_message());
-  uint64_t log_id = req->log_id();
-
-  int batch_size = req->insts_size();
-  int input_var_num = 0;
-
-  std::vector elem_type;
-  std::vector elem_size;
-  std::vector capacity;
-
-  GeneralBlob *res = mutable_data();
-
-  if (!res) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed get op tls reader object output";
-  }
-
-  TensorVector *out = &res->tensor_vector;
-  res->SetBatchSize(batch_size);
-  res->SetLogId(log_id);
-
-  if (batch_size <= 0) {
-    LOG(ERROR) << "(logid=" << log_id << ") Batch size < 0";
-    return -1;
-  }
-
-  Timer timeline;
-  int64_t start = timeline.TimeStampUS();
-
-  int var_num = req->insts(0).tensor_array_size();
-  VLOG(2) << "(logid=" << log_id << ") var num: " << var_num;
-
-  VLOG(2) << "(logid=" << log_id
-          << ") start to call load general model_conf op";
-  baidu::paddle_serving::predictor::Resource &resource =
-      baidu::paddle_serving::predictor::Resource::instance();
-
-  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
-  std::shared_ptr model_config =
-      resource.get_general_model_config()[0];
-
-  VLOG(2) << "(logid=" << log_id << ") print general model config done.";
-
-  elem_type.resize(var_num);
-  elem_size.resize(var_num);
-  capacity.resize(var_num);
-  for (int i = 0; i < var_num; ++i) {
-    paddle::PaddleTensor lod_tensor;
-    elem_type[i] = req->insts(0).tensor_array(i).elem_type();
-    VLOG(2) << "(logid=" << log_id << ") var[" << i
-            << "] has elem type: " << elem_type[i];
-    if (elem_type[i] == 0) {  // int64
-      elem_size[i] = sizeof(int64_t);
-      lod_tensor.dtype = paddle::PaddleDType::INT64;
-    } else {
-      elem_size[i] = sizeof(float);
-      lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
-    }
-
-    if (req->insts(0).tensor_array(i).shape(0) == -1) {
-      lod_tensor.lod.resize(1);
-      lod_tensor.lod[0].push_back(0);
-      VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
-    } else {
-      lod_tensor.shape.push_back(batch_size);
-      capacity[i] = 1;
-      for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
-        int dim = req->insts(0).tensor_array(i).shape(k);
-        VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
-                << "]: " << dim;
-        capacity[i] *= dim;
-        lod_tensor.shape.push_back(dim);
-      }
-      VLOG(2) << "(logid=" << log_id << ") var[" << i
-              << "] is tensor, capacity: " << capacity[i];
-    }
-    lod_tensor.name = model_config->_feed_name[i];
-    out->push_back(lod_tensor);
-  }
-
-  for (int i = 0; i < var_num; ++i) {
-    if (out->at(i).lod.size() == 1) {
-      for (int j = 0; j < batch_size; ++j) {
-        const Tensor &tensor = req->insts(j).tensor_array(i);
-        int data_len = tensor.int_data_size();
-        int cur_len = out->at(i).lod[0].back();
-        out->at(i).lod[0].push_back(cur_len + data_len);
-      }
-      out->at(i).data.Resize(out->at(i).lod[0].back() * elem_size[i]);
-      out->at(i).shape = {out->at(i).lod[0].back(), 1};
-      VLOG(2) << "(logid=" << log_id << ") var[" << i
-              << "] is lod_tensor and len=" << out->at(i).lod[0].back();
-    } else {
-      out->at(i).data.Resize(batch_size * capacity[i] * elem_size[i]);
-      VLOG(2) << "(logid=" << log_id << ") var[" << i
-              << "] is tensor and capacity=" << batch_size * capacity[i];
-    }
-  }
-
-  for (int i = 0; i < var_num; ++i) {
-    if (elem_type[i] == 0) {
-      int64_t *dst_ptr = static_cast(out->at(i).data.data());
-      int offset = 0;
-      for (int j = 0; j < batch_size; ++j) {
-        for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size();
-             ++k) {
-          dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
-        }
-        if (out->at(i).lod.size() == 1) {
-          offset = out->at(i).lod[0][j + 1];
-        } else {
-          offset += capacity[i];
-        }
-      }
-    } else {
-      float *dst_ptr = static_cast(out->at(i).data.data());
-      int offset = 0;
-      for (int j = 0; j < batch_size; ++j) {
-        for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size();
-             ++k) {
-          dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
-        }
-        if (out->at(i).lod.size() == 1) {
-          offset = out->at(i).lod[0][j + 1];
-        } else {
-          offset += capacity[i];
-        }
-      }
-    }
-  }
-
-  int64_t end = timeline.TimeStampUS();
-  res->p_size = 0;
-  AddBlobInfo(res, start);
-  AddBlobInfo(res, end);
-
-  VLOG(2) << "(logid=" << log_id << ") read data from client success";
-  return 0;
-}
-DEFINE_OP(GeneralTextReaderOp);
-}  // namespace serving
-}  // namespace paddle_serving
-}  // namespace baidu
diff --git a/core/general-server/op/general_text_reader_op.h b/core/general-server/op/general_text_reader_op.h
deleted file mode 100644
index af822993dc37fae23c1fa584d640cbfe8d9950c8..0000000000000000000000000000000000000000
--- a/core/general-server/op/general_text_reader_op.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include 
-#include 
-#include "core/general-server/general_model_service.pb.h"
-#include "core/general-server/load_general_model_service.pb.h"
-#include "core/general-server/op/general_infer_helper.h"
-#include "core/predictor/framework/resource.h"
-#include "paddle_inference_api.h"  // NOLINT
-
-namespace baidu {
-namespace paddle_serving {
-namespace serving {
-
-class GeneralTextReaderOp
-    : public baidu::paddle_serving::predictor::OpWithChannel {
- public:
-  typedef std::vector TensorVector;
-
-  DECLARE_OP(GeneralTextReaderOp);
-
-  int inference();
-};
-
-}  // namespace serving
-}  // namespace paddle_serving
-}  // namespace baidu
diff --git a/core/general-server/op/general_text_response_op.cpp b/core/general-server/op/general_text_response_op.cpp
deleted file mode 100755
index 03ab08cd361ea9eb8060c4ba5372d319a34df1f6..0000000000000000000000000000000000000000
--- a/core/general-server/op/general_text_response_op.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "core/general-server/op/general_text_response_op.h"
-#include 
-#include 
-#include 
-#include 
-#include "core/predictor/framework/infer.h"
-#include "core/predictor/framework/memory.h"
-#include "core/predictor/framework/resource.h"
-#include "core/util/include/timer.h"
-
-namespace baidu {
-namespace paddle_serving {
-namespace serving {
-
-using baidu::paddle_serving::Timer;
-using baidu::paddle_serving::predictor::MempoolWrapper;
-using baidu::paddle_serving::predictor::general_model::Tensor;
-using baidu::paddle_serving::predictor::general_model::Response;
-using baidu::paddle_serving::predictor::general_model::Request;
-using baidu::paddle_serving::predictor::general_model::FetchInst;
-using baidu::paddle_serving::predictor::general_model::ModelOutput;
-using baidu::paddle_serving::predictor::InferManager;
-using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
-
-int GeneralTextResponseOp::inference() {
-  VLOG(2) << "Going to run inference";
-  const std::vector pre_node_names = pre_names();
-  VLOG(2) << "pre node names size: " << pre_node_names.size();
-  const GeneralBlob *input_blob;
-  uint64_t log_id =
-      get_depend_argument(pre_node_names[0])->GetLogId();
-
-  const Request *req = dynamic_cast(get_request_message());
-  // response inst with only fetch_var_names
-  Response *res = mutable_data();
-
-  Timer timeline;
-  int64_t start = timeline.TimeStampUS();
-
-  VLOG(2) << "(logid=" << log_id
-          << ") start to call load general model_conf op";
-  baidu::paddle_serving::predictor::Resource &resource =
-      baidu::paddle_serving::predictor::Resource::instance();
-
-  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
-  std::shared_ptr model_config =
-      resource.get_general_model_config().back();
-
-  std::vector fetch_index;
-  fetch_index.resize(req->fetch_var_names_size());
-  for (int i = 0; i < req->fetch_var_names_size(); ++i) {
-    fetch_index[i] =
-        model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
-  }
-
-  for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
-    const std::string &pre_name = pre_node_names[pi];
-    VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name
-            << " (" << pre_node_names.size() << ")";
-    input_blob = get_depend_argument(pre_name);
-    if (!input_blob) {
-      LOG(ERROR) << "(logid=" << log_id
-                 << ") Failed mutable depended argument, op: " << pre_name;
-      return -1;
-    }
-
-    const TensorVector *in = &input_blob->tensor_vector;
-    int batch_size = input_blob->GetBatchSize();
-    VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
-
-    ModelOutput *output = res->add_outputs();
-    output->set_engine_name(
-        pre_name);  // To get the order of model return values
-    for (int i = 0; i < batch_size; ++i) {
-      FetchInst *fetch_inst = output->add_insts();
-      for (auto &idx : fetch_index) {
-        Tensor *tensor = fetch_inst->add_tensor_array();
-        // currently only response float tensor or lod_tensor
-        tensor->set_elem_type(1);
-        if (model_config->_is_lod_fetch[idx]) {
-          VLOG(2) << "(logid=" << log_id << ") out[" << idx << " is lod_tensor";
-          tensor->add_shape(-1);
-        } else {
-          VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] is tensor";
-          for (int k = 1; k < in->at(idx).shape.size(); ++k) {
-            VLOG(2) << "(logid=" << log_id << ") shape[" << k - 1
-                    << "]: " << in->at(idx).shape[k];
-            tensor->add_shape(in->at(idx).shape[k]);
-          }
-        }
-      }
-    }
-
-    int var_idx = 0;
-    for (auto &idx : fetch_index) {
-      float *data_ptr = static_cast(in->at(idx).data.data());
-      int cap = 1;
-      for (int j = 1; j < in->at(idx).shape.size(); ++j) {
-        cap *= in->at(idx).shape[j];
-      }
-      if (model_config->_is_lod_fetch[idx]) {
-        for (int j = 0; j < batch_size; ++j) {
-          for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1];
-               k++) {
-            output->mutable_insts(j)
-                ->mutable_tensor_array(var_idx)
-                ->add_float_data(data_ptr[k]);
-          }
-        }
-      } else {
-        for (int j = 0; j < batch_size; ++j) {
-          for (int k = j * cap; k < (j + 1) * cap; ++k) {
-            output->mutable_insts(j)
-                ->mutable_tensor_array(var_idx)
-                ->add_float_data(data_ptr[k]);
-          }
-        }
-      }
-      var_idx++;
-    }
-  }
-
-  if (req->profile_server()) {
-    int64_t end = timeline.TimeStampUS();
-    // TODO(barriery): multi-model profile_time.
-    // At present, only the response_op is multi-input, so here we get
-    // the profile_time by hard coding. It needs to be replaced with
-    // a more elegant way.
-    for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
-      input_blob = get_depend_argument(pre_node_names[pi]);
-      VLOG(2) << "(logid=" << log_id
-              << ") p size for input blob: " << input_blob->p_size;
-      int profile_time_idx = -1;
-      if (pi == 0) {
-        profile_time_idx = 0;
-      } else {
-        profile_time_idx = input_blob->p_size - 2;
-      }
-      for (; profile_time_idx < input_blob->p_size; ++profile_time_idx) {
-        res->add_profile_time(input_blob->time_stamp[profile_time_idx]);
-      }
-    }
-    // TODO(guru4elephant): find more elegant way to do this
-    res->add_profile_time(start);
-    res->add_profile_time(end);
-  }
-
-  return 0;
-}
-DEFINE_OP(GeneralTextResponseOp);
-
-}  // namespace serving
-}  // namespace paddle_serving
-}  // namespace baidu
diff --git a/core/general-server/op/general_text_response_op.h b/core/general-server/op/general_text_response_op.h
deleted file mode 100644
index 334d98476e67f745635f7d66d7b8682de62da355..0000000000000000000000000000000000000000
--- a/core/general-server/op/general_text_response_op.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include 
-#include 
-#include "core/general-server/general_model_service.pb.h"
-#include "core/general-server/op/general_infer_helper.h"
-#include "paddle_inference_api.h"  // NOLINT
-
-namespace baidu {
-namespace paddle_serving {
-namespace serving {
-
-class GeneralTextResponseOp
-    : public baidu::paddle_serving::predictor::OpWithChannel<
-          baidu::paddle_serving::predictor::general_model::Response> {
- public:
-  typedef std::vector TensorVector;
-
-  DECLARE_OP(GeneralTextResponseOp);
-
-  int inference();
-};
-
-}  // namespace serving
-}  // namespace paddle_serving
-}  // namespace baidu
diff --git a/core/general-server/proto/general_model_service.proto b/core/general-server/proto/general_model_service.proto
index e7dd5fccf54be43db8e65a9ed1112ceaece93700..8e1c172a754ad28d57b1222ffb0b2e64b0f93bf1 100644
--- a/core/general-server/proto/general_model_service.proto
+++ b/core/general-server/proto/general_model_service.proto
@@ -24,17 +24,16 @@ message Tensor {
   repeated int32 int_data = 2;
   repeated int64 int64_data = 3;
   repeated float float_data = 4;
-  optional int32 elem_type = 5;
-  repeated int32 shape = 6;
-  repeated int32 lod = 7; // only for fetch tensor currently
+  optional int32 elem_type =
+      5; // 0 means int64, 1 means float32, 2 means int32, 3 means bytes(string)
+  repeated int32 shape = 6;       // shape should include batch
+  repeated int32 lod = 7;         // only for fetch tensor currently
+  optional string name = 8;       // get from the Model prototxt
+  optional string alias_name = 9; // get from the Model prototxt
 };
 
-message FeedInst { repeated Tensor tensor_array = 1; };
-
-message FetchInst { repeated Tensor tensor_array = 1; };
-
 message Request {
-  repeated FeedInst insts = 1;
+  repeated Tensor tensor = 1;
   repeated string fetch_var_names = 2;
   optional bool profile_server = 3 [ default = false ];
   required uint64 log_id = 4 [ default = 0 ];
@@ -46,7 +45,7 @@ message Response {
 };
 
 message ModelOutput {
-  repeated FetchInst insts = 1;
+  repeated Tensor tensor = 1;
   optional string engine_name = 2;
 }
 
diff --git a/core/pdcodegen/src/pdcodegen.cpp b/core/pdcodegen/src/pdcodegen.cpp
index c505ca66385dd363ad0a76470012f07a925bcd17..a99828ee3466a32d45dcabb61a2700f9362539d4 100644
--- a/core/pdcodegen/src/pdcodegen.cpp
+++ b/core/pdcodegen/src/pdcodegen.cpp
@@ -280,6 +280,7 @@ class PdsCodeGenerator : public CodeGenerator {
             "  baidu::rpc::ClosureGuard done_guard(done);\n"
             "  baidu::rpc::Controller* cntl = \n"
             "        static_cast(cntl_base);\n"
+            "  cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
             "  uint64_t log_id = request->log_id();\n"
             "  cntl->set_log_id(log_id);\n"
             "  ::baidu::paddle_serving::predictor::InferService* svr = \n"
@@ -322,6 +323,7 @@ class PdsCodeGenerator : public CodeGenerator {
             "  baidu::rpc::ClosureGuard done_guard(done);\n"
             "  baidu::rpc::Controller* cntl = \n"
             "        static_cast(cntl_base);\n"
+            "  cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
             "  uint64_t log_id = equest->log_id();\n"
             "  cntl->set_log_id(log_id);\n"
             "  ::baidu::paddle_serving::predictor::InferService* svr = \n"
@@ -1023,6 +1025,7 @@ class PdsCodeGenerator : public CodeGenerator {
             "  brpc::ClosureGuard done_guard(done);\n"
             "  brpc::Controller* cntl = \n"
             "        static_cast(cntl_base);\n"
+            "  cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
             "  uint64_t log_id = request->log_id();\n"
             "  cntl->set_log_id(log_id);\n"
             "  ::baidu::paddle_serving::predictor::InferService* svr = \n"
@@ -1067,6 +1070,7 @@ class PdsCodeGenerator : public CodeGenerator {
             "  brpc::ClosureGuard done_guard(done);\n"
             "  brpc::Controller* cntl = \n"
             "        static_cast(cntl_base);\n"
+            "  cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
             "  uint64_t log_id = request->log_id();\n"
             "  cntl->set_log_id(log_id);\n"
             "  ::baidu::paddle_serving::predictor::InferService* svr = \n"
diff --git a/core/predictor/framework/bsf-inl-tensor.h b/core/predictor/framework/bsf-inl-tensor.h
deleted file mode 100644
index b7c725b443281f355addffb8f2fcb36651b6d9b6..0000000000000000000000000000000000000000
--- a/core/predictor/framework/bsf-inl-tensor.h
+++ /dev/null
@@ -1,373 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef BCLOUD
-#include 
-#else
-#include 
-#endif
-
-#include 
-#include 
-#include 
-#include 
-#include "core/predictor/common/inner_common.h"
-#include "core/predictor/framework/infer_data.h"
-#include "core/predictor/framework/memory.h"
-
-#include 
-
-namespace im {
-namespace bsf {
-
-template <>
-struct Task {
-  typedef Task
-      TaskT;
-  typedef baidu::paddle_serving::predictor::Tensor Tensor;
-  typedef baidu::paddle_serving::predictor::Tensor InType;
-  typedef baidu::paddle_serving::predictor::Tensor OutType;
-  typedef baidu::paddle_serving::predictor::BatchTensor BatchTensor;
-  typedef baidu::paddle_serving::predictor::BatchTensor InArrayT;
-  typedef baidu::paddle_serving::predictor::BatchTensor OutArrayT;
-
-  struct Segment {
-    Segment(void* p, size_t b, size_t s) : ptr(p), begin(b), size(s) {}
-    void* ptr;
-    size_t begin;
-    size_t size;
-  };
-
-  int read_fd;
-  int write_fd;
-
-  pid_t owner_tid;
-
-  const InArrayT* in;
-  OutArrayT* out;
-
-  size_t rem;
-  size_t size;
-
-  butil::atomic index;
-
-  const BatchTensor* get(bool is_in) const {
-    if (is_in) {
-      return in;
-    } else {
-      return out;
-    }
-  }
-
-  BatchTensor* get(bool is_in) {
-    if (is_in) {
-      return const_cast(in);
-    } else {
-      return out;
-    }
-  }
-
-  Task() {
-    read_fd = -1;
-    write_fd = -1;
-    owner_tid = -1;
-    in = NULL;
-    out = NULL;
-    rem = -1;
-    size = -1;
-    index.store(0, butil::memory_order_relaxed);
-  }
-};
-
-template <>
-class BatchTasks> {
- public:
-  typedef baidu::paddle_serving::predictor::Tensor Tensor;
-  typedef baidu::paddle_serving::predictor::Tensor InType;
-  typedef baidu::paddle_serving::predictor::Tensor OutType;
-  typedef baidu::paddle_serving::predictor::DataBuf DataBuf;
-  typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper;
-
-  typedef Task
-      TaskT;
-  typedef TaskMeta TaskMetaT;
-  typedef TaskT::InArrayT InArrayT;
-  typedef TaskT::OutArrayT OutArrayT;
-
-  explicit BatchTasks(size_t batch_size, bool batch_align = false)
-      : _batch_size(batch_size),
-        _rem_size(batch_size),
-        _batch_align(batch_align) {
-    _batch_in.clear();
-    _batch_out.clear();
-    _tasks.clear();
-  }
-
-  ~BatchTasks() {
-    _batch_in.clear();
-    _batch_out.clear();
-    _tasks.clear();
-  }
-
-  static bool check_valid(const InArrayT& in,
-                          OutArrayT& out,  // NOLINT
-                          bool align) {    // NOLINT
-    if (align) {
-      if (out.count() <= 0 || out.size() <= 0) {
-        LOG(ERROR) << "Out tensor is empty, when aligned";
-        return false;
-      }
-
-      if (out.size() != in.size()) {
-        LOG(ERROR) << "In/Out tensor size not eq: " << out.size()
-                   << "!=" << in.size();
-        return false;
-      }
-
-      for (size_t fi = 0, shape0 = 0; fi < out.count(); ++fi) {
-        if (!out[fi].valid()) {
-          LOG(ERROR) << "Out[" << fi << "] tensor not valid";
-          return false;
-        }
-
-        if (out.size() != out[fi].shape0()) {
-          LOG(ERROR) << "Shape0 not consistency, " << out.size()
-                     << "!=" << out[fi].shape0() << ", " << fi;
-          return false;
-        }
-      }
-    }
-
-    return true;
-  }
-
-  size_t append_task(TaskT* task) {
-    size_t add = std::min(task->rem, _rem_size);
-    if (!_batch_align) {
-      add = task->rem;
-    }
-    TaskMetaT tm(task, task->in->size() - task->rem, add);
-    _tasks.push_back(tm);
-
-    task->rem -= add;
-    _rem_size -= add;
-    return _rem_size;
-  }
-
-  void merge_tasks() {
-    merge_input();
-    merge_output();
-  }
-
-  void merge_input() {
-    if (_tasks.size() <= 0 || _tasks[0].task->in->count() <= 0) {
-      return;
-    }
-
-    if (_tasks.size() == 1 && !_batch_align) {
-      TaskMetaT& tm = _tasks[0];
-      _batch_in = *(tm.task->in);
-      return;
-    }
-
-    merge_tensor(true);
-  }
-
-  void merge_output() {
-    if (_batch_align) {
-      if (_tasks.size() <= 0 || _tasks[0].task->out->count() <= 0) {
-        return;
-      }
-    }
-
-    if (_tasks.size() <= 0 || _tasks[0].task->out->count() <= 0) {
-      return;
-    }
-
-    TaskMetaT& tm = _tasks[0];
-    if (_tasks.size() == 1 && !_batch_align) {
-      _batch_out = *(tm.task->out);
-      return;
-    }
-
-    if (tm.task->out->size() <= 0) {
-      // shape is empty
-      _batch_out = *(tm.task->out);
-      return;
-    }
-
-    if ((*tm.task->out)[0].data.data() == 0 ||
-        (*tm.task->out)[0].data.size() == 0) {
-      _batch_out = *(tm.task->out);
-      return;
-    }
-
-    merge_tensor(false);
-  }
-
-  void merge_tensor(bool is_in) {
-    // accumulate batch size from fetched tasks
-    size_t batch_size = 0;
-    for (size_t ti = 0; ti < _tasks.size(); ++ti) {
-      TaskMetaT& tm = _tasks[ti];
-      size_t add = tm.end - tm.begin;
-      batch_size += add;
-    }
-
-    // merge all instanses in each tensor data
-    size_t tensor_count = _tasks[0].task->get(is_in)->count();
-    for (size_t fi = 0; fi < tensor_count; ++fi) {
-      const Tensor& head = (*(_tasks[0].task->get(is_in)))[fi];
-      Tensor batch_tensor;
-      batch_tensor.name = head.name;
-      batch_tensor.type = head.type;
-      batch_tensor.shape.push_back(batch_size);
-
-      size_t ins_ele_count = 1;
-      for (size_t si = 1; si < head.shape.size(); ++si) {
-        batch_tensor.shape.push_back(head.shape[si]);
-        ins_ele_count *= head.shape[si];
-      }
-
-      size_t tensor_ele_count = ins_ele_count * batch_size;
-      size_t ins_byte = ins_ele_count * head.ele_byte();
-
-      size_t tensor_byte = tensor_ele_count * head.ele_byte();
-      void* data_buf = MempoolWrapper::instance().malloc(tensor_byte);
-      if (!data_buf) {
-        LOG(ERROR) << "Malloc failed, size: " << tensor_byte;
-        return;
-      }
-
-      size_t data_byte = 0;
-      for (size_t ti = 0; ti < _tasks.size(); ++ti) {
-        TaskMetaT& tm = _tasks[ti];
-        size_t acc_byte = ins_byte * (tm.end - tm.begin);
-        if (data_byte + acc_byte > tensor_byte) {
-          LOG(ERROR) << "Invalid bytes: " << data_byte << " + " << acc_byte
-                     << " >= " << tensor_byte;
-          return;
-        }
-
-        const Tensor& tensor = (*(tm.task->get(is_in)))[fi];
-        memcpy(
-            reinterpret_cast(data_buf) + data_byte,
-            reinterpret_cast(tensor.data.data()) + tm.begin * ins_byte,
-            acc_byte);
-        data_byte += acc_byte;
-      }
-
-      if (data_byte != tensor_byte) {
-        LOG(ERROR) << "Invalid tensor byte: " << data_byte
-                   << " != " << tensor_byte;
-        return;
-      }
-
-      batch_tensor.data =
-          DataBuf(reinterpret_cast(data_buf), tensor_byte);
-      if (is_in) {
-        _batch_in.push_back(batch_tensor);
-      } else {
-        _batch_out.push_back(batch_tensor);
-      }
-    }
-
-    LOG(INFO) << "merge input(" << is_in << ") samples: " << batch_size
-              << " from " << _tasks.size() << " pvs";
-  }
-
-  void notify_tasks() {
-    if (_batch_out.size() != _batch_in.size()) {
-      LOG(ERROR) << "batch size not consistency: " << _batch_out.size()
-                 << " != " << _batch_in.size();
-      return;
-    }
-
-    size_t tensor_count = _batch_out.count();
-    size_t batch_size = _batch_out.size();
-    for (size_t fi = 0; fi < tensor_count; ++fi) {
-      const Tensor& tensor = _batch_out[fi];
-      size_t ins_byte = tensor.ele_byte();
-      for (size_t si = 1; si < tensor.shape.size(); ++si) {
-        ins_byte *= tensor.shape[si];
-      }
-
-      for (size_t ti = 0, bi = 0, add = 0; ti < _tasks.size();
-           ++ti, bi += add) {
-        OutArrayT* dst = _tasks[ti].task->out;
-        add = _tasks[ti].end - _tasks[ti].begin;
-        size_t offset_src = ins_byte * bi;
-        size_t add_byte = add * ins_byte;
-
-        if (_batch_align) {  // merge all batchs
-          size_t offset_dst = ins_byte * _tasks[ti].begin;
-          void* ptr = const_cast((*dst)[fi].data.data());
-          memcpy(
-              reinterpret_cast(ptr) + offset_dst,
-              reinterpret_cast(_batch_out[fi].data.data()) + offset_src,
-              add_byte);
-        } else {  // overwrite
-          if (dst->count() <= 0) {
-            dst->push_back(_batch_out[fi]);
-          } else {
-            (*dst)[fi] = _batch_out[fi];
-          }
-
-          (*dst)[fi].shape[0] = add;
-          (*dst)[fi].data = DataBuf(
-              reinterpret_cast(_batch_out[fi].data.data()) + offset_src,
-              add_byte);
-        }
-      }
-    }
-
-    for (size_t ti = 0; ti < _tasks.size(); ++ti) {
-      TaskT* task = _tasks[ti].task;
-      size_t begin = _tasks[ti].begin;
-      size_t end = _tasks[ti].end;
-      size_t add = end - begin;
-
-      size_t index = task->index.fetch_add(add);
-      if ((index + add) >= task->in->size()) {
-        char c = 0;
-        while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) {
-        }
-        butil::return_object(task);
-      }
-    }
-  }
-
-  const typename TaskT::InArrayT& in() const { return _batch_in; }
-
-  typename TaskT::OutArrayT& out() { return _batch_out; }
-
-  size_t task_size() { return _tasks.size(); }
-
- private:
-  std::vector _tasks;
-  InArrayT _batch_in;
-  OutArrayT _batch_out;
-  size_t _batch_size;
-  size_t _rem_size;
-  bool _batch_align;
-};
-
-}  // namespace bsf
-}  // namespace im
diff --git a/core/predictor/framework/bsf-inl.h b/core/predictor/framework/bsf-inl.h
index 1193ce4860e595598b738adab738c7af9664cc26..1f5d272d2875ee878f09ac2882364afe9fd899fb 100644
--- a/core/predictor/framework/bsf-inl.h
+++ b/core/predictor/framework/bsf-inl.h
@@ -24,6 +24,7 @@
 #include 
 
 #include "core/predictor/common/inner_common.h"
+#include "core/predictor/framework/memory.h"
 
 namespace im {
 namespace bsf {
@@ -35,7 +36,7 @@ void* TaskExecutor::thread_entry(void* args) {
       static_cast*>(context->executor);
   executor->work(context);
 
-  return NULL;
+  return nullptr;
 }
 
 template 
@@ -70,7 +71,7 @@ int TaskExecutor::start(uint32_t thread_num, uint32_t init_timeout_sec) {
     _thread_contexts.push_back(&contexts[i]);
   }
 
-  int init_timeout = init_timeout_sec * 1000 * 1000;
+  size_t init_timeout = init_timeout_sec * 1000 * 1000;
   bool has_error = false;
 
   bool has_timeout = true;
@@ -102,7 +103,7 @@ int TaskExecutor::start(uint32_t thread_num, uint32_t init_timeout_sec) {
     }
 
     // 100ms
-    const int sleep_interval = 100 * 1000;
+    const size_t sleep_interval = 100 * 1000;
     usleep(sleep_interval);
     init_timeout -= sleep_interval;
   }
@@ -125,18 +126,21 @@ void TaskExecutor::stop() {
 }
 
 template 
-TaskHandler TaskExecutor::schedule(const InArrayT& in,
-                                                 OutArrayT& out) {  // NOLINT
+TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
+    const void* inVectorT_ptr,
+    void* outVectorT_ptr) {  // NOLINT
   TaskT* task = butil::get_object();
   if (!task) {
     LOG(ERROR) << "Failed get TaskT from object pool";
     return TaskHandler::valid_handle();
   }
 
+  /*
   if (!BatchTasks::check_valid(in, out, _batch_align)) {
     LOG(ERROR) << "Invalid input & output";
     return TaskHandler::valid_handle();
   }
+  */
 
   int fds[2];
   int rc = pipe(fds);
@@ -150,10 +154,9 @@ TaskHandler TaskExecutor::schedule(const InArrayT& in,
   task->write_fd = fds[1];
   task->owner_tid = ::syscall(SYS_gettid);
 
-  task->in = ∈
-  task->out = &out;
-  task->rem = in.size();
-  task->size = in.size();
+  task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr;
+  task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr;
+  task->rem = task->batch_size();
   task->index.store(0, butil::memory_order_relaxed);
 
   AutoMutex lock(_mut);
@@ -163,8 +166,13 @@ TaskHandler TaskExecutor::schedule(const InArrayT& in,
   return TaskHandler(*task);
 }
 
+// This function is accessed by multiple threads, so it takes AutoMutex first.
+// That is what makes batch.append_task() thread safe; no extra lock is needed
+// inside append_task().
 template 
-bool TaskExecutor::fetch_batch(BatchTasks& batch) {  // NOLINT
+bool TaskExecutor<TaskT>::move_task_to_batch(
+    BatchTasks<TaskT>& batch) {  // NOLINT
   AutoMutex lock(_mut);
   while (_task_queue.empty()) {
     THREAD_COND_WAIT(&_cond, &_mut);
@@ -187,8 +195,30 @@ bool TaskExecutor::fetch_batch(BatchTasks& batch) {  // NOLINT
   return true;
 }
 
+// This function is accessed by multiple threads; move_task_to_batch() locks
+// internally, and it packages one TaskT as one or several TaskMeta.
+// The TaskT comes from the singleton TaskExecutor's _task_queue. Although a
+// TaskMeta is a local variable, several TaskMeta may point to the same TaskT
+// taken from that queue. The TaskMeta are put into the local variable
+// BatchTasks `batch`.
+
+// batch.merge_tasks() and batch.notify_tasks() take no lock. The BatchTasks
+// `batch` itself is a local variable, so it is thread safe. If
+// batch.merge_tasks() or batch.notify_tasks() modifies a TaskMeta, pay
+// attention: different threads deal with different TaskMeta (each is created
+// as a local variable), but different TaskMeta may point to the same TaskT
+// taken from the singleton TaskExecutor's _task_queue.
+
 template 
 int TaskExecutor::work(ThreadContext* context) {
+  if (MempoolWrapper::instance().thread_initialize() != 0) {
+    LOG(ERROR) << "Failed thread initialize mempool";
+    return -1;
+  }
+
   if (_thread_init_fn != NULL) {
     if (_thread_init_fn(context->user_thread_context) != 0) {
       LOG(ERROR) << "execute thread init thunk failed, BSF thread will exit";
@@ -207,10 +237,15 @@ int TaskExecutor::work(ThreadContext* context) {
       }
     }
 
+    if (MempoolWrapper::instance().thread_clear() != 0) {
+      LOG(ERROR) << "Failed thread clear mempool";
+      return -1;
+    }
+
     BatchTasks batch(_batch_size, _batch_align);
-    if (fetch_batch(batch)) {
+    if (move_task_to_batch(batch)) {
       batch.merge_tasks();
-      _fn(batch.in(), batch.out());
+      _fn(&batch.in(), &batch.out());
       batch.notify_tasks();
     }
   }
@@ -219,9 +254,10 @@ int TaskExecutor::work(ThreadContext* context) {
 }
 
 template 
-bool TaskManager::schedule(const InArrayT& in,
-                                              OutArrayT& out) {  // NOLINT
-  TaskHandler handler = _executor.schedule(in, out);
+bool TaskManager<InItemT, OutItemT>::schedule(const void* in,
+                                              void* out) {  // NOLINT
+  TaskHandler<TaskT> handler =
+      TaskExecutorVector<TaskT>::instance()[_model_index].schedule(in, out);
 
   if (handler.valid()) {
     _task_owned = handler;
diff --git a/core/predictor/framework/bsf.h b/core/predictor/framework/bsf.h
index 36a00c381130c191de713e5024c7247d64cb96e7..7a8629e75b87aec889a1cce98b6392dddad32ce0 100644
--- a/core/predictor/framework/bsf.h
+++ b/core/predictor/framework/bsf.h
@@ -16,7 +16,7 @@
 
 #include 
 #include 
-#include 
+#include 
 #include 
 
 #ifdef BCLOUD
@@ -29,46 +29,186 @@
 
 #include "boost/function.hpp"
 
+#include "core/predictor/framework/memory.h"
+#include "paddle_inference_api.h"
+
 namespace im {
 namespace bsf {
 
 static const size_t DEFAULT_BATCH_SIZE = 100;
 
+// InItemT is paddle::PaddleTensor and InVectorT is
+// std::vector<paddle::PaddleTensor>.
+// The elements of InVectorT are the different feed vars, not the batch;
+// the batch is already contained inside each paddle::PaddleTensor.
+
+// size_t `rem` records how many batch items have not yet been put into
+// BatchTasks. `rem` does not need to be atomic, because the `put` operation is
+// synchronous; more precisely, a lock is already taken outside of `put`.
+
+// size_t `index` records how many batch items have finished processing.
+// `index` needs to be atomic, because the `notify` operation is asynchronous.
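+// A minimal sketch of one Task's contents (shape values are assumed for
+// illustration): a request with two feed vars and batch 8 arrives as
+//   InVectorT{ tensor0{shape = [8, 3, 224, 224]}, tensor1{shape = [8, 1]} };
+// `rem` starts at 8 and shrinks as pieces of the batch are appended to
+// BatchTasks, while `index` grows toward 8 as results are copied back.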
 template 
 struct Task {
-  typedef std::vector InArrayT;
-  typedef std::vector OutArrayT;
+  typedef std::vector<InItemT> InVectorT;
+  typedef std::vector<OutItemT> OutVectorT;
   typedef InItemT InType;
   typedef OutItemT OutType;
   typedef Task TaskT;
+  typedef std::vector<int> ShapeVector;
+  typedef std::vector<ShapeVector> VectorOfShapeVector;
 
   int read_fd;
   int write_fd;
-
   pid_t owner_tid;
-
-  const InArrayT* in;
-  OutArrayT* out;
-
+  const InVectorT* inVectorT_ptr;
+  OutVectorT* outVectorT_ptr;
   size_t rem;
-  size_t size;
-
-  size_t batch_size() { return in->size(); }
-
   butil::atomic index;
 
   Task() {
     read_fd = -1;
     write_fd = -1;
     owner_tid = -1;
-    in = NULL;
-    out = NULL;
+    inVectorT_ptr = NULL;
+    outVectorT_ptr = NULL;
     rem = -1;
-    size = -1;
     index.store(0, butil::memory_order_relaxed);
   }
+
+  bool check_feedvar_valid(int feedvar_index) {
+    if (feedvar_index < 0 || inVectorT_ptr->size() <= feedvar_index) {
+      LOG(ERROR) << "feedvar doesnt exsit or feedvar_index error";
+      return 0;
+    }
+
+    if ((*inVectorT_ptr)[feedvar_index].shape.size() <= 0) {
+      LOG(ERROR) << "feedvar[" << feedvar_index << "].shape.size()<=0,error";
+      return 0;
+    }
+
+    return 1;
+  }
+
+  // For now, it is simply assumed that the first dimension of the data is the
+  // batch, i.e. the batch size is PaddleTensor.shape[0].
+
+  // If batch information is added to feedvar.prototxt, we can read it from
+  // there instead of assuming it.
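+  // Sketch (shape values assumed): for a feed var with shape [8, 3, 224, 224],
+  // feedvar_batch_size() below returns shape[0] == 8.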
+  size_t feedvar_batch_size(int feedvar_index) {
+    if (!check_feedvar_valid(feedvar_index)) {
+      return 0;
+    }
+
+    return (*inVectorT_ptr)[feedvar_index].shape[0];
+  }
+
+  size_t feedvar_element_bytesize(int feedvar_index) {
+    if (!check_feedvar_valid(feedvar_index)) {
+      return 0;
+    }
+    int dtype = (*inVectorT_ptr)[feedvar_index].dtype;
+    if (dtype == paddle::PaddleDType::INT64) {
+      return sizeof(int64_t);
+    }
+    if (dtype == paddle::PaddleDType::FLOAT32) {
+      return sizeof(float);
+    }
+    if (dtype == paddle::PaddleDType::INT32) {
+      return sizeof(int32_t);
+    }
+    if (dtype == paddle::PaddleDType::UINT8) {
+      return sizeof(char);
+    }
+    return 0;
+  }
+
+  // For now, the implementation of this function is based on the assumption
+  // that shape[0] == batch_size.
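+  // Worked sketch (shape values assumed): shape [8, 3, 224, 224] gives
+  // feedvar_element_num() == 3 * 224 * 224 per sample; multiplied by
+  // feedvar_element_bytesize() this yields feedvar_bytesize(), the byte size
+  // of one sample.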
+  size_t feedvar_element_num(int feedvar_index) {
+    if (!check_feedvar_valid(feedvar_index)) {
+      return 0;
+    }
+    size_t element_num = 1;
+    if ((*inVectorT_ptr)[feedvar_index].shape.size() == 1) {
+      // shape[0] is batch_size, and [10, 1] is equivalent to [10],
+      // so if shape[1] does not exist we should return 1.
+      return 1;
+    }
+    // Start from shape[1], because shape[0] is batch_size.
+    for (int i = 1; i < (*inVectorT_ptr)[feedvar_index].shape.size(); ++i) {
+      element_num *= (*inVectorT_ptr)[feedvar_index].shape[i];
+    }
+    return element_num;
+  }
+
+  size_t feedvar_bytesize(int feedvar_index) {
+    return feedvar_element_num(feedvar_index) *
+           feedvar_element_bytesize(feedvar_index);
+  }
+
+  ShapeVector feedvar_shape_nobatch(int feedvar_index) {
+    if (!check_feedvar_valid(feedvar_index)) {
+      return ShapeVector();
+    }
+    return ShapeVector{(*inVectorT_ptr)[feedvar_index].shape.begin() + 1,
+                       (*inVectorT_ptr)[feedvar_index].shape.end()};
+  }
+
+  VectorOfShapeVector feedvar_shape_nobatch() {
+    VectorOfShapeVector vector_of_feedvar_shape_nobatch;
+    // reserve instead of size-constructing, so push_back does not append after
+    // N default-constructed empty entries.
+    vector_of_feedvar_shape_nobatch.reserve(inVectorT_ptr->size());
+    for (int index = 0; index < inVectorT_ptr->size(); ++index) {
+      vector_of_feedvar_shape_nobatch.push_back(feedvar_shape_nobatch(index));
+    }
+    return vector_of_feedvar_shape_nobatch;
+  }
+
+  // At present it is assumed that the batch of all feed vars is consistent,
+  // so PaddleTensor.shape[0] should be the same for every feed var.
+  bool check_batch_align() {
+    int batch_size_align = feedvar_batch_size(0);
+    for (int feedvar_index = 0; feedvar_index < inVectorT_ptr->size();
+         ++feedvar_index) {
+      if (feedvar_batch_size(feedvar_index) != batch_size_align) {
+        return 0;
+      }
+    }
+    /*
+    for(int fetchvar_index = 0; fetchvar_index < outVectorT_ptr->size();
+    ++fetchvar_index) {
+      if(fetchvar_batch_size(fetchvar_index) != batch_size_align) {
+        return 0;
+      }
+    }
+    */
+    return 1;
+  }
+
+  size_t batch_size() {
+    if (check_batch_align()) {
+      return feedvar_batch_size(0);
+    }
+    return 0;
+  }
 };
 
+// A TaskMeta is either a whole Task or part of the batch of a Task.
+// A Task is the original request from the user.
+// For example, suppose there are 4 requests, each a Task with batch 30, and
+// the batch of BatchTasks is 100, i.e. 100 batch items can be handled at a
+// time. Then TaskMeta-1:{task-1,0,30}, TaskMeta-2:{task-2,0,30},
+// TaskMeta-3:{task-3,0,30}, but the last Task is divided into 2 TaskMeta:
+// TaskMeta-4:{task-4,0,10} and TaskMeta-5:{task-4,10,30}.
+// TaskMeta-1 ~ TaskMeta-4 go into BatchTasks-1; TaskMeta-5 goes into
+// BatchTasks-2.
+
+// TaskMeta is necessary because we need to know the correspondence between
+// `batch_out` (which lives in BatchTasks) and `outVectorT_ptr` (which lives in
+// the Task), especially when one Task is divided into several TaskMeta that
+// are put into several different BatchTasks.
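+// A minimal sketch of the split arithmetic in append_task() (numbers taken
+// from the example above): with _rem_size = 100, appending task-1..task-3
+// (rem = 30 each) leaves _rem_size = 10; task-4 then contributes
+// add = min(task->rem, _rem_size) = min(30, 10) = 10 as TaskMeta-4 and keeps
+// rem = 20 for the next BatchTasks.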
 template 
 struct TaskMeta {
   TaskMeta(TaskT* ptr, size_t start, size_t add)
@@ -79,6 +219,11 @@ struct TaskMeta {
   size_t end;
 };
 
+// Each TaskT already includes a batch in itself.
+// BatchTasks needs to combine several `small TaskMeta` into a new `big TaskT`.
+// The only difference between the `big TaskT` and a `small TaskT` is
+// TaskT.inVectorT_ptr->[feedvar_index].shape[0], which is the batch_size.
 template 
 class BatchTasks {
  public:
@@ -91,33 +236,38 @@ class BatchTasks {
         _rem_size(batch_size),
         _batch_align(batch_align) {
     _batch_in.clear();
+    _batch_in_offset.clear();
     _batch_out.clear();
-    _tasks.clear();
+    _batch_out_offset.clear();
+    _taskmeta_vector.clear();
   }
 
   ~BatchTasks() {
     _batch_in.clear();
+    _batch_in_offset.clear();
     _batch_out.clear();
-    _tasks.clear();
+    _batch_out_offset.clear();
+    _taskmeta_vector.clear();
   }
 
   // synchronized operation
+  // because upper-level callers of this function already hold the lock.
   size_t append_task(TaskT* task) {
     size_t add = std::min(task->rem, _rem_size);
     if (!_batch_align) {
       add = task->rem;
     }
-
-    TaskMetaT tm(task, task->in->size() - task->rem, add);
-    _tasks.push_back(tm);
+    int start_index = task->batch_size() - task->rem;
+    TaskMetaT tm(task, start_index, add);
+    _taskmeta_vector.push_back(tm);
 
     task->rem -= add;
     _rem_size -= add;
     return _rem_size;
   }
 
-  static bool check_valid(const typename TaskT::InArrayT& in,
-                          const typename TaskT::OutArrayT& out,
+  static bool check_valid(const typename TaskT::InVectorT& in,
+                          const typename TaskT::OutVectorT& out,
                           bool align) {
     (void)in;
     (void)out;
@@ -125,40 +275,220 @@ class BatchTasks {
     return true;
   }
 
+  // This should be reworked entirely. Maybe we do not need to do this inside
+  // BatchTasks; the copy work could be done outside of it, since next time we
+  // may not need the extra copy at all and could copy every Task directly into
+  // the Predictor.
+
+  // lod is not supported. If lod is set, the bsf task must not be used.
+
+  // batch.merge_tasks() is a thread-safe function, because batch is a local
+  // variable and the Task is only read, not written.
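+  // Sketch of the copy performed below (sizes assumed for illustration): for
+  // feed var `index` with feedvar_bytesize = 12 bytes per sample, a TaskMeta
+  // covering samples [begin = 10, end = 30) copies
+  //   length = 12 * (30 - 10) bytes
+  // from feedVarTensor.data.data() + 12 * 10 into
+  // _batch_in[index].data.data() + _batch_in_offset[index].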
   void merge_tasks() {
-    for (size_t ti = 0; ti < _tasks.size(); ++ti) {
-      TaskMetaT& tm = _tasks[ti];
-      for (size_t vi = tm.begin; vi < tm.end; ++vi) {
-        _batch_in.push_back((*tm.task->in)[vi]);
-        _batch_out.push_back((*tm.task->out)[vi]);
+    if (_taskmeta_vector.size() <= 0) {
+      return;
+    }
+
+    // For now, the batch of each feed var is assumed to be consistent.
+    // If it is not, use feedvar_batch_size() instead of task->batch_size().
+    int temp_batch = 0;
+    for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
+      TaskMetaT& tm = _taskmeta_vector[ti];
+      temp_batch += tm.task->batch_size();
+    }
+    if (temp_batch > _batch_size) {
+      LOG(ERROR) << "_realNumber_batch_in >_batch_size, error.";
+      return;
+    }
+
+    int feedvar_num = _taskmeta_vector[0].task->inVectorT_ptr->size();
+    if (_batch_in_offset.size() == 0) {
+      _batch_in_offset.resize(feedvar_num, 0);
+      _realNumber_batch_in.resize(feedvar_num, temp_batch);
+    }
+
+    for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
+      TaskMetaT& tm = _taskmeta_vector[ti];
+
+      for (int index = 0; index < feedvar_num; ++index) {
+        const paddle::PaddleTensor& feedVarTensor =
+            (*tm.task->inVectorT_ptr)[index];
+        size_t feedvar_bytesize = tm.task->feedvar_bytesize(index);
+
+        if (ti == 0) {
+          if (feedVarTensor.lod.size() > 0 && feedVarTensor.lod[0].size() > 0) {
+            LOG(ERROR) << "lod Tensor is not supported now.";
+            return;
+          }
+          // For now, we assume that feedvar_bytesize is the same for every
+          // task, which means auto embedding is not supported; it can still
+          // differ between different feed vars.
+          paddle::PaddleTensor paddleTensor;
+          paddleTensor.dtype = feedVarTensor.dtype;
+          paddleTensor.name = feedVarTensor.name;
+          paddleTensor.lod = feedVarTensor.lod;
+          paddleTensor.shape = feedVarTensor.shape;
+          paddleTensor.shape[0] = _realNumber_batch_in[index];
+          paddleTensor.data.Resize(feedvar_bytesize *
+                                   _realNumber_batch_in[index]);
+          _batch_in.push_back(paddleTensor);
+        }
+
+        void* dst_ptr = _batch_in[index].data.data() + _batch_in_offset[index];
+        void* source_ptr =
+            feedVarTensor.data.data() + feedvar_bytesize * tm.begin;
+        size_t length = feedvar_bytesize * (tm.end - tm.begin);
+        memcpy(dst_ptr, source_ptr, length);
+        _batch_in_offset[index] += length;
       }
     }
   }
 
+  bool check_fetchvar_valid(int fetchvar_index) {
+    if (fetchvar_index < 0 || _batch_out.size() <= fetchvar_index) {
+      LOG(ERROR) << "fetchvar doesnt exsit or fetchvar_index error";
+      return 0;
+    }
+
+    if (_batch_out[fetchvar_index].shape.size() <= 0) {
+      LOG(ERROR) << "fetchvar[" << fetchvar_index << "].shape.size()<=0,error";
+      return 0;
+    }
+
+    return 1;
+  }
+
+  size_t fetchvar_batch_size(int fetchvar_index) {
+    if (!check_fetchvar_valid(fetchvar_index)) {
+      return 0;
+    }
+
+    return _batch_out[fetchvar_index].shape[0];
+  }
+
+  size_t fetchvar_element_bytesize(int fetchvar_index) {
+    if (!check_fetchvar_valid(fetchvar_index)) {
+      return 0;
+    }
+    int dtype = _batch_out[fetchvar_index].dtype;
+    if (dtype == paddle::PaddleDType::INT64) {
+      return sizeof(int64_t);
+    }
+    if (dtype == paddle::PaddleDType::FLOAT32) {
+      return sizeof(float);
+    }
+    if (dtype == paddle::PaddleDType::INT32) {
+      return sizeof(int32_t);
+    }
+    if (dtype == paddle::PaddleDType::UINT8) {
+      return sizeof(char);
+    }
+    return 0;
+  }
+
+  // For now, the implementation of this function is based on the assumption
+  // that shape[0] == batch_size.
+  size_t fetchvar_element_num(int fetchvar_index) {
+    if (!check_fetchvar_valid(fetchvar_index)) {
+      return 0;
+    }
+    size_t element_num = 1;
+    if (_batch_out[fetchvar_index].shape.size() == 1) {
+      // because shape[0] is batch_size.
+      return 1;
+    }
+    // Start from shape[1], because shape[0] is batch_size.
+    for (int i = 1; i < _batch_out[fetchvar_index].shape.size(); ++i) {
+      element_num *= _batch_out[fetchvar_index].shape[i];
+    }
+    return element_num;
+  }
+
+  size_t fetchvar_bytesize(int fetchvar_index) {
+    return fetchvar_element_num(fetchvar_index) *
+           fetchvar_element_bytesize(fetchvar_index);
+  }
+
+  bool check_fetchvar_batch_align() {
+    int batch_size_align = fetchvar_batch_size(0);
+
+    for (int fetchvar_index = 0; fetchvar_index < _batch_out.size();
+         ++fetchvar_index) {
+      if (fetchvar_batch_size(fetchvar_index) != batch_size_align) {
+        return 0;
+      }
+    }
+
+    return 1;
+  }
+
+  size_t fetchvar_batch_size() {
+    if (check_fetchvar_batch_align()) {
+      return fetchvar_batch_size(0);
+    }
+    return 0;
+  }
+
   void notify_tasks() {
-    if (_batch_out.size() != _batch_in.size()) {
-      LOG(ERROR) << "batch size not consistency: " << _batch_out.size()
-                 << " != " << _batch_in.size();
+    if (_taskmeta_vector.size() <= 0) {
+      LOG(ERROR) << "_taskmeta_vector.size() <=0, error.";
+      return;
+    }
+    if (_realNumber_batch_in[0] != fetchvar_batch_size()) {
+      LOG(ERROR) << "_batch_out`s batch != _batch_in`s batch, error.";
       return;
     }
 
-    for (size_t ti = 0, bi = 0; ti < _tasks.size(); ++ti) {
-      TaskT* task = _tasks[ti].task;
-      size_t begin = _tasks[ti].begin;
-      size_t end = _tasks[ti].end;
+    int fetchvar_num = _batch_out.size();
+    if (_batch_out_offset.size() == 0) {
+      _batch_out_offset.resize(fetchvar_num, 0);
+    }
+
+    for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
+      TaskT* task = _taskmeta_vector[ti].task;
+      size_t begin = _taskmeta_vector[ti].begin;
+      size_t end = _taskmeta_vector[ti].end;
       size_t add = end - begin;
 
-      for (size_t oi = begin; oi < end; ++oi, ++bi) {
-        if (bi >= _batch_in.size()) {
-          LOG(ERROR) << "batch index overflow: " << bi << " > "
-                     << _batch_in.size();
+      for (int index = 0; index < fetchvar_num; ++index) {
+        // task->outVectorT_ptr is empty before core->run(); the first time we
+        // copy from _batch_out, so it needs to be initialized here.
+        size_t fetchvar_bytesize_index = fetchvar_bytesize(index);
+        if (task->outVectorT_ptr->size() <= index) {
+          paddle::PaddleTensor tensor_out;
+          tensor_out.name = _batch_out[index].name;
+          tensor_out.dtype = paddle::PaddleDType(_batch_out[index].dtype);
+          tensor_out.shape = _batch_out[index].shape;
+          tensor_out.shape[0] = task->batch_size();
+          tensor_out.lod = _batch_out[index].lod;
+          // resize all batch memory at one time
+          size_t databuf_size = task->batch_size() * fetchvar_bytesize_index;
+          tensor_out.data.Resize(databuf_size);
+          task->outVectorT_ptr->push_back(tensor_out);
+        }
+
+        paddle::PaddleTensor& fetchVarTensor = (*task->outVectorT_ptr)[index];
+
+        void* dst_ptr =
+            fetchVarTensor.data.data() + fetchvar_bytesize_index * begin;
+        size_t length = fetchvar_bytesize_index * add;
+        if (_batch_out_offset[index] + length >
+            fetchvar_batch_size() * fetchvar_bytesize(index)) {
+          LOG(ERROR) << "_batch_out is less than taskmeta, error.";
           return;
         }
-        (*task->out)[oi] = _batch_out[bi];
+        void* source_ptr =
+            _batch_out[index].data.data() + _batch_out_offset[index];
+
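+        // copy this taskmeta's slice of the fetchvar from _batch_out into
+        // the task's output tensor.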
+        memcpy(dst_ptr, source_ptr, length);
+        _batch_out_offset[index] += length;
       }
 
       size_t index = task->index.fetch_add(add);
-      if ((index + add) >= task->in->size()) {
+      if ((index + add) >= task->batch_size()) {
         char c = 0;
         while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) {
         }
@@ -167,22 +497,33 @@ class BatchTasks {
     }
   }
 
-  const typename TaskT::InArrayT& in() const { return _batch_in; }
+  const typename TaskT::InVectorT& in() const { return _batch_in; }
 
-  typename TaskT::OutArrayT& out() { return _batch_out; }
+  typename TaskT::OutVectorT& out() { return _batch_out; }
 
-  size_t task_size() { return _tasks.size(); }
+  size_t task_size() { return _taskmeta_vector.size(); }
 
  private:
-  std::vector<TaskMetaT> _tasks;
-  typename TaskT::InArrayT _batch_in;
-  typename TaskT::OutArrayT _batch_out;
+  std::vector<TaskMetaT> _taskmeta_vector;
+  typename TaskT::InVectorT _batch_in;
+  std::vector<size_t> _batch_in_offset;
+  std::vector<size_t> _realNumber_batch_in;
+  typename TaskT::OutVectorT _batch_out;
+  std::vector<size_t> _batch_out_offset;
+  std::vector<size_t> _realNumber_batch_out;
   size_t _rem_size;
   size_t _batch_size;
   bool _batch_align;
 };
 
 // BSF task handle
+// TaskHandler is the handle of a Task.
+// `read_fd` is used to receive the completion signal in the brpc thread.
+// `write_fd` is used to send that signal from the bsf thread.
+// When all TaskMeta of a Task are done, the bsf thread writes to `write_fd`.
+// The brpc thread keeps reading `read_fd` in a while loop, so it receives
+// the signal as soon as the Task is done.
+// Thus `read_fd` and `write_fd` are used to communicate between the two
+// threads.
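+// For illustration only: a minimal sketch of the brpc-side wait, assuming
+// the usual pipe-style handshake (`handler` is a hypothetical TaskHandler):
+//   char c = 0;
+//   while (read(handler.read_fd, &c, 1) != 1 && errno == EINTR) {
+//   }
+//   // at this point the task's results have been written back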
 template <typename TaskT>
 struct TaskHandler {
   int read_fd;
@@ -205,12 +546,11 @@ struct TaskHandler {
   }
 };
 
+// TaskExecutor is a thread pool.
 template <typename TaskT>
 class TaskExecutor;
 
-template <typename InItemT, typename OutItemT>
-class TaskManager;
-
+// ThreadContext is used to start a bsf thread.
 template <typename TaskT>
 struct ThreadContext {
   TaskExecutor<TaskT>* executor;
@@ -231,14 +571,24 @@ struct ThreadContext {
   }
 };
 
+// TaskExecutor is a thread pool.
+// Each model has its own TaskExecutor.
+// TaskT is actually a Request preprocessed by ReaderOp.
+// A TaskT is divided into TaskMeta pieces, which are
+// put into _task_queue by schedule() in the brpc thread.
+// A TaskHandler is returned to the brpc thread.
+// The start() function creates `thread_num` bsf threads.
+// Every bsf thread checks _task_queue and takes TaskMeta from it.
+// When all TaskMeta of a Task are done, the TaskHandler is notified.
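+// Roughly, each bsf thread runs a loop like the following (an illustrative
+// sketch only, not the exact implementation; constructor arguments are
+// assumed):
+//   while (!_stop) {
+//     BatchTasks<TaskT> batch(_batch_size, _batch_align);
+//     move_task_to_batch(batch);       // pull TaskMeta from _task_queue
+//     _fn(&batch.in(), &batch.out());  // run inference on the merged batch
+//     batch.notify_tasks();            // copy results back and signal brpc
+//   }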
 template <typename TaskT>
 class TaskExecutor {
  public:
   typedef typename TaskT::InType InType;
   typedef typename TaskT::OutType OutType;
-  typedef typename TaskT::InArrayT InArrayT;
-  typedef typename TaskT::OutArrayT OutArrayT;
+  typedef typename TaskT::InVectorT InVectorT;
+  typedef typename TaskT::OutVectorT OutVectorT;
   typedef std::vector<TaskT> TaskArrayT;
+  typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper;
 
   TaskExecutor()
       : _stop(false),
@@ -258,9 +608,11 @@ class TaskExecutor {
     THREAD_COND_DESTROY(&_cond);
   }
 
-  static TaskExecutor* instance() {
-    static TaskExecutor singleton;
-    return &singleton;
+  // needed because vector.resize uses the copy or move constructor.
+  TaskExecutor(TaskExecutor&& other) noexcept {
+    if (this != &other) {
+      TaskExecutor();
+    }
   }
 
   void set_batch_size(size_t batch_size) { _batch_size = batch_size; }
@@ -277,8 +629,7 @@ class TaskExecutor {
     _thread_reset_fn = reset_fn;
   }
 
-  void set_thread_callback_fn(
-      boost::function<void(const InArrayT&, OutArrayT&)> cb) {
+  void set_thread_callback_fn(boost::function<void(const void*, void*)> cb) {
     _fn = cb;
   }
 
@@ -287,15 +638,21 @@ class TaskExecutor {
 
   static void* thread_entry(void* args);
 
- private:
-  TaskExecutor(TaskExecutor const& other);
-  TaskExecutor* operator=(TaskExecutor const& other);
-
   int work(ThreadContext<TaskT>* context);
 
-  TaskHandler<TaskT> schedule(const InArrayT&, OutArrayT&);
+  TaskHandler<TaskT> schedule(const void*, void*);
 
-  bool fetch_batch(BatchTasks<TaskT>& batch);  // NOLINT
+  bool move_task_to_batch(BatchTasks<TaskT>& batch);  // NOLINT
+
+ private:
+  TaskExecutor(TaskExecutor