Commit 770b6c26 authored by bjjwwang

Merge branch 'develop' of https://github.com/paddlepaddle/serving into develop

......@@ -188,7 +188,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
| `precision` | str | FP32 | Precision mode; supports FP32, FP16, INT8 |
| `use_calib` | bool | False | Only for deployment with TensorRT |
| `use_calib` | bool | False | Use TRT int8 calibration |
| `gpu_multi_stream` | bool | False | Enable GPU multi-stream to achieve higher QPS |
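As a quick illustration of how these flags combine with the `paddle_serving_server.serve` command shown above, here is a minimal Python sketch that launches the server via `subprocess`; the flag values and the spelling of the boolean switch are assumptions and should be checked against `--help`.

```python
# Minimal sketch: start the serving process with some of the flags documented
# in the table above. Values are illustrative only; boolean flags are assumed
# to be bare switches (verify with: python3 -m paddle_serving_server.serve --help).
import subprocess

cmd = [
    "python3", "-m", "paddle_serving_server.serve",
    "--model", "uci_housing_model",
    "--thread", "10",
    "--port", "9393",
    "--precision", "FP16",   # one of FP32 / FP16 / INT8 (see table)
    "--gpu_multi_stream",    # assumption: passed as a bare switch
]
subprocess.run(cmd, check=True)
```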
#### Description of asynchronous model
......
......@@ -187,7 +187,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
| `precision` | str | FP32 | Precision mode; supports FP32, FP16, INT8 |
| `use_calib` | bool | False | Only for deployment with TensorRT |
| `use_calib` | bool | False | Use TRT int8 calibration |
| `gpu_multi_stream` | bool | False | Enable GPU multi-stream to achieve higher QPS |
#### Description of asynchronous model
......
......@@ -61,8 +61,11 @@ else()
endif()
if(CUDNN_FOUND)
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
if(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_version.h")
file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
elseif(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn.h")
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
endif()
get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
......
......@@ -27,52 +27,54 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/
message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
SET(PADDLE_VERSION "2.1.0")
SET(PADDLE_VERSION "2.2.0-rc0")
if (WITH_GPU)
if(CUDA_VERSION EQUAL 11.0)
set(CUDA_SUFFIX "cuda11.0-cudnn8-mkl-gcc8.2")
message("CUDA: ${CUDA_VERSION}, CUDNN_MAJOR_VERSION: ${CUDNN_MAJOR_VERSION}")
# CUDA 11.0 is not supported; CUDA 11.2 is added below.
if(CUDA_VERSION EQUAL 10.1)
set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5")
set(WITH_TRT ON)
elseif(CUDA_VERSION EQUAL 10.2)
set(CUDA_SUFFIX "cuda10.2-cudnn8-mkl-gcc8.2")
set(WITH_TRT ON)
elseif(CUDA_VERSION EQUAL 10.1)
set(CUDA_SUFFIX "cuda10.1-cudnn7-mkl-gcc8.2")
if(CUDNN_MAJOR_VERSION EQUAL 7)
set(CUDA_SUFFIX "x86-64_gcc5.4_avx_mkl_cuda10.2_cudnn7.6.5_trt6.0.1.5")
set(WITH_TRT ON)
elseif(CUDNN_MAJOR_VERSION EQUAL 8)
set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda10.2_cudnn8.1.1_trt7.2.3.4")
set(WITH_TRT ON)
endif()
elseif(CUDA_VERSION EQUAL 11.2)
set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda11.2_cudnn8.2.1_trt8.0.3.4")
set(WITH_TRT ON)
elseif(CUDA_VERSION EQUAL 10.0)
set(CUDA_SUFFIX "cuda10-cudnn7-avx-mkl")
elseif(CUDA_VERSION EQUAL 9.0)
set(CUDA_SUFFIX "cuda9-cudnn7-avx-mkl")
endif()
else()
set(WITH_TRT OFF)
endif()
if (WITH_GPU)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-${CUDA_SUFFIX}")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/GPU/${CUDA_SUFFIX}")
elseif (WITH_LITE)
if (WITH_XPU)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}-xpu")
SET(PADDLE_LIB_VERSION "arm64_gcc7.3_openblas")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}")
endif()
else()
if (WITH_AVX)
if (WITH_MKLML)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-avx-mkl")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_avx_mkl")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-avx-openblas")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_avx_openblas")
endif()
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-noavx-openblas")
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_openblas")
endif()
endif()
if(WITH_LITE)
SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
SET(PADDLE_LIB_PATH "https://paddle-inference-lib.bj.bcebos.com/2.2.0-rc0/cxx_c/Linux/XPU/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tar.gz ")
else()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
endif()
......
......@@ -12,41 +12,97 @@
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
syntax = "proto3";
package baidu.paddle_serving.predictor.general_model;
option java_multiple_files = true;
option cc_generic_services = true;
message Tensor {
repeated string data = 1;
repeated int32 int_data = 2;
repeated int64 int64_data = 3;
repeated float float_data = 4;
optional int32 elem_type =
5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
repeated int32 shape = 6; // shape should include batch
repeated int32 lod = 7; // only for fetch tensor currently
optional string name = 8; // get from the Model prototxt
optional string alias_name = 9; // get from the Model prototxt
// VarType: INT64
repeated int64 int64_data = 1;
// VarType: FP32
repeated float float_data = 2;
// VarType: INT32
repeated int32 int_data = 3;
// VarType: FP64
repeated double float64_data = 4;
// VarType: UINT32
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string data = 9;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 20 => STRING
int32 elem_type = 10;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 11;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 12;
// Correspond to the variable 'name' in the model description prototxt.
string name = 13;
// Correspond to the variable 'alias_name' in the model description prototxt.
string alias_name = 14; // get from the Model prototxt
// VarType: FP16, INT16, INT8, BF16, UINT8
bytes tensor_content = 15;
};
message Request {
repeated Tensor tensor = 1;
repeated string fetch_var_names = 2;
optional bool profile_server = 3 [ default = false ];
required uint64 log_id = 4 [ default = 0 ];
bool profile_server = 3;
uint64 log_id = 4;
};
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
// Error messages
string err_msg = 4;
};
message ModelOutput {
repeated Tensor tensor = 1;
optional string engine_name = 2;
string engine_name = 2;
}
service GeneralModelService {
rpc inference(Request) returns (Response) {}
rpc debug(Request) returns (Response) {}
rpc inference(Request) returns (Response);
rpc debug(Request) returns (Response);
};
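The proto3 messages above can be filled through the generated protobuf bindings. Below is a minimal Python sketch that builds a Request carrying a single FP32 tensor; the module name `general_model_service_pb2` is an assumption — the actual generated module path in this repo may differ.

```python
# Minimal sketch: build a Request for the messages defined above.
# Assumption: the generated Python module is importable as
# general_model_service_pb2; the real module path may differ.
import general_model_service_pb2 as pb

req = pb.Request()
req.log_id = 1
req.fetch_var_names.append("price")

tensor = req.tensor.add()
tensor.name = "x"             # 'name' from the model description prototxt
tensor.alias_name = "x"       # 'alias_name' from the model description prototxt
tensor.elem_type = 1          # 1 => FP32, so the payload goes in float_data
tensor.shape.extend([1, 13])  # shape includes the batch dimension
tensor.float_data.extend([0.0] * 13)

payload = req.SerializeToString()  # ready to send over the wire
```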
......@@ -22,11 +22,8 @@ message EngineDesc {
required string reloadable_type = 4;
required string model_dir = 5;
repeated int32 gpu_ids = 6;
required int32 runtime_thread_num = 7;
required int32 batch_infer_size = 8;
required int32 enable_batch_align = 9;
optional string version_file = 10;
optional string version_type = 11;
optional string version_file = 7;
optional string version_type = 8;
/*
* Sparse Parameter Service type. Valid types are:
......@@ -39,17 +36,34 @@ message EngineDesc {
LOCAL = 1;
REMOTE = 2;
}
optional SparseParamServiceType sparse_param_service_type = 12;
optional string sparse_param_service_table_name = 13;
optional bool enable_memory_optimization = 14;
optional bool enable_ir_optimization = 15;
optional bool use_trt = 16;
optional bool use_lite = 17;
optional bool use_xpu = 18;
optional bool use_gpu = 19;
optional bool combined_model = 20;
optional bool encrypted_model = 21;
optional bool gpu_multi_stream = 22;
optional SparseParamServiceType sparse_param_service_type = 10;
optional string sparse_param_service_table_name = 11;
optional bool enable_memory_optimization = 12;
optional bool enable_ir_optimization = 13;
optional bool use_trt = 14;
optional bool use_lite = 15;
optional bool use_xpu = 16;
optional bool use_gpu = 17;
optional bool combined_model = 18;
optional bool encrypted_model = 19;
optional bool gpu_multi_stream = 20;
/*
* "runtime_thread_num": n == 0 means do not use the asynchronous task
* scheduling mode; n > 0 means the number of Predictors for this engine
* in asynchronous task scheduling mode.
* "batch_infer_size": the max batch size for this engine in asynchronous
* task scheduling mode.
* "enable_overrun": always put a whole task into the TaskQueue even if the
* total batch is bigger than "batch_infer_size".
* "allow_split_request": allow a task (which corresponds to a request) to
* be split.
*/
optional int32 runtime_thread_num = 30 [ default = 0 ];
optional int32 batch_infer_size = 31 [ default = 32 ];
optional bool enable_overrun = 32 [ default = false ];
optional bool allow_split_request = 33 [ default = true ];
};
// model_toolkit conf
......@@ -61,11 +75,14 @@ message ResourceConf {
repeated string model_toolkit_file = 2;
repeated string general_model_path = 3;
repeated string general_model_file = 4;
optional string cube_config_path = 5;
optional string cube_config_file = 6;
optional int32 cube_quant_bits = 7; // set 0 if no quant.
optional string auth_product_name = 8;
optional string auth_container_id = 9;
optional string cube_config_path = 10;
optional string cube_config_file = 11;
optional int32 cube_quant_bits = 12;
optional string cube_cache_path = 13;
optional string auth_product_name = 20;
optional string auth_container_id = 21;
};
// DAG node dependency info
......
[{
"dict_name": "test",
"shard": 2,
"nodes": [{
"ip": "127.0.0.1",
"port": 8731
},{
"ip": "127.0.0.1",
"port": 8730
}]
}]
package main
import (
"encoding/json"
"flag"
"fmt"
"io/ioutil"
)
func main() {
dict_name := flag.String("n", "test", "cube name")
conf_path := flag.String("c", "./conf/cube.conf", "cube conf path")
input_path := flag.String("i", "./input.json", "keys to seek")
output_path := flag.String("o", "./output.json", "result to save")
flag.Parse()
bytes, err := ioutil.ReadFile(*conf_path)
if err != nil {
fmt.Println("读取配置文件失败", err)
return
}
var meta Meta
err = json.Unmarshal(bytes, &meta.Servers)
if err != nil {
fmt.Println("解析数据失败", err)
return
}
err = meta.Seek(*dict_name, *input_path, *output_path)
if err != nil {
fmt.Println(err)
}
return
}
{"keys": [0,1,2,3,4,5,6,7]}
{"keys": [1]}
package main
import "fmt"
type Meta struct {
Servers []CubeServer `json:"servers,omitempty"`
}
func (meta *Meta) Seek(dict_name string, input string, output string) (err error) {
var server CubeServer
for _, s := range meta.Servers {
if s.Name == dict_name {
server = s
break
}
}
if server.Name != dict_name {
err = fmt.Errorf("%s server not exist", dict_name)
return err
}
err = server.Seek(input, output)
return err
}
package main
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"os"
)
type Input struct {
Keys []uint64 `json:"keys"`
}
type SingleValue struct {
Status uint32 `json:"status"`
Value string `json:"value"`
}
type Output struct {
Values []SingleValue `json:"values"`
}
type ServerNode struct {
Ip string `json:"ip"`
Port uint64 `json:"port"`
}
type CubeServer struct {
Name string `json:"dict_name"`
Shard uint64 `json:"shard"`
Nodes []ServerNode `json:"nodes"`
}
func (server *CubeServer) SplitKeys(keys []uint64) (splited_keys map[uint64]Input, offset map[uint64][]uint64) {
splited_keys = make(map[uint64]Input)
offset = make(map[uint64][]uint64)
for i, key := range keys {
shard_id := key % server.Shard
temp_split, _ := splited_keys[shard_id]
temp_split.Keys = append(temp_split.Keys, key)
splited_keys[shard_id] = temp_split
temp_offset, _ := offset[shard_id]
temp_offset = append(temp_offset, uint64(i))
offset[shard_id] = temp_offset
}
return splited_keys, offset
}
func (server *CubeServer) Seek(input string, output_path string) (err error) {
file, err := os.Open(input)
if err != nil {
return err
}
defer file.Close()
buf := bufio.NewReader(file)
for {
line, err := buf.ReadBytes('\n')
//line = strings.TrimSpace(line)
if err != nil || io.EOF == err {
break
}
var temp_input Input
json.Unmarshal(line, &temp_input)
key_nums := len(temp_input.Keys)
var output Output
output.Values = make([]SingleValue, key_nums) // one result slot per key
splited_keys, offset := server.SplitKeys(temp_input.Keys)
for shard_id, keys := range splited_keys {
cur_output, _ := server.Post(shard_id, keys)
for index, single_value := range cur_output.Values {
output.Values[offset[shard_id][index]] = single_value
}
}
json_str, _ := json.Marshal(output)
fp, err := os.OpenFile(output_path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0755)
if err != nil {
log.Fatal(err)
}
// close the file inside the loop instead of deferring, so handles do not
// accumulate until Seek returns
_, err = fp.Write(json_str)
fp.Close()
}
return err
}
func (server *CubeServer) Post(shard_id uint64, input Input) (output Output, err error) {
if shard_id >= uint64(len(server.Nodes)) {
err = fmt.Errorf("have no shard:%v", shard_id)
return output, err
}
json_str, _ := json.Marshal(input)
URL := fmt.Sprintf("http://%s:%v/DictService/seek", server.Nodes[shard_id].Ip, server.Nodes[shard_id].Port)
req, err := http.NewRequest("POST", URL, bytes.NewBuffer(json_str))
if err != nil {
return output, err
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return output, err
}
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return output, err
}
err = json.Unmarshal(body, &output)
return output, err
}
[{
"dict_name": "test",
"shard": 2,
"nodes": [{
"ip": "127.0.0.1",
"port": 8731
},{
"ip": "127.0.0.1",
"port": 8730
}]
}]
#coding=utf-8
import requests
import sys
import json
class Meta(object):
"""Routing info of the cube shard servers."""
def __init__(self, conf_path):
"""Initialize the routing table from the config file."""
self.server_api = "/DictService/seek"
self.server_meta = {}
with open(conf_path, "r", encoding="utf8") as fp:
cube_servers = json.load(fp)
for server in cube_servers:
self.server_meta[server["dict_name"]] = server
def seek(self, dict_name, keys_path, save_path):
"""查询"""
save_file = open(save_path, 'w')
with open(keys_path, "r", encoding="utf8") as fp:
lines = fp.readlines()
for line in lines:
json_line = json.loads(line)
values = [{} for i in range(len(json_line["keys"]))]
splited_keys, offset = self.split_keys(json_line, dict_name)
for shard_id, keys in splited_keys.items():
results = self.post(dict_name, shard_id, keys)
for i, result in enumerate(results["values"]):
values[offset[shard_id][i]] = result
cur_line_results = {}
cur_line_results["values"] = values
json.dump(cur_line_results, save_file)
save_file.write("\n")
save_file.close()
def split_keys(self, json_line, dict_name):
"""Decide which shard each key goes to, based on the key value and the shard count."""
keys_split = {}
offset = {}
i = 0
for key in json_line["keys"]:
shard_id = key % self.server_meta[dict_name]["shard"]
if shard_id not in keys_split:
keys_split[shard_id] = []
keys_split[shard_id].append(key)
if shard_id not in offset:
offset[shard_id] = []
offset[shard_id].append(i)
i += 1
return keys_split, offset
def post(self, dict_name, shard_id, keys):
"""向分片server发送post请求"""
api = "http://%s:%s%s" % (self.server_meta[dict_name]["nodes"][shard_id]["ip"],
self.server_meta[dict_name]["nodes"][shard_id]["port"],
self.server_api)
data = {"keys": keys}
response = requests.post(api, json.dumps(data))
return response.json()
if __name__ == '__main__':
if len(sys.argv) != 5:
print('usage: python demo.py conf_path dict_name keys_path save_path')
exit(0)
conf_path = sys.argv[1]
dict_name = sys.argv[2]
keys_path = sys.argv[3]
save_path = sys.argv[4]
meta = Meta(conf_path)
meta.seek(dict_name, keys_path, save_path)
{"keys": [0,1,2,3,4,5,6,7]}
{"keys": [1]}
\ No newline at end of file
# Cube Python API documentation
Deploy cube by following [the deployment and usage of the large-scale sparse parameter service Cube](https://github.com/PaddlePaddle/Serving/blob/master/doc/DEPLOY.md#2-大规模稀疏参数服务cube的部署和使用).
The Python API can replace the deployment and usage of the prediction service described in Section 3 of that document.
## Configuration
conf/cube.conf sets, in JSON format, the ip and port of each shard's cube server; `shard` must equal the number of shards. Example:
```bash
[{
"dict_name": "test",
"shard": 2,
"nodes": [{
"ip": "127.0.0.1",
"port": 8731
},{
"ip": "127.0.0.1",
"port": 8730
}]
}]
```
## Data format
```bash
{"keys": [0,1,2,3,4,5,6,7]}
{"keys": [1]}
```
Batch queries are supported; each line is one query.
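The input file can also be produced programmatically; a minimal Python sketch that writes the two sample lines above (the file name `input.json` just follows the usage example below):

```python
# Minimal sketch: write query keys in the line-delimited JSON format shown above.
import json

with open("input.json", "w") as f:
    f.write(json.dumps({"keys": [0, 1, 2, 3, 4, 5, 6, 7]}) + "\n")
    f.write(json.dumps({"keys": [1]}) + "\n")
```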
## Usage
```bash
cd ./python-api
python3 demo.py conf/cube.conf test input.json result.json
```
\ No newline at end of file
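Besides the command line above, the `Meta` class from demo.py can be called directly; a minimal sketch, assuming demo.py is importable from the working directory:

```python
# Minimal sketch: use the Python API programmatically instead of via the CLI.
# Assumes demo.py (shown above) is importable from the current directory.
from demo import Meta

meta = Meta("conf/cube.conf")
# dict_name, input keys file, output file -- same arguments as the CLI usage above
meta.seek("test", "input.json", "result.json")
```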
{"values": [{"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}]}
{"values": [{"status": 4294967295, "value": ""}]}
......@@ -3,3 +3,24 @@ add_subdirectory(pybind11)
pybind11_add_module(serving_client src/general_model.cpp src/pybind_general_model.cpp)
target_link_libraries(serving_client PRIVATE -Wl,--whole-archive utils sdk-cpp pybind python -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
endif()
if(CLIENT)
FILE(GLOB client_srcs include/*.h src/client.cpp src/brpc_client.cpp)
add_library(client ${client_srcs})
add_dependencies(client utils sdk-cpp)
target_link_libraries(client utils sdk-cpp)
endif()
if(CLIENT)
include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../../)
add_executable(simple_client example/simple_client.cpp)
add_dependencies(simple_client utils sdk-cpp client)
target_link_libraries(simple_client -Wl,--whole-archive
-Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
target_link_libraries(simple_client utils)
target_link_libraries(simple_client sdk-cpp)
target_link_libraries(simple_client client)
endif()
\ No newline at end of file
# C++ client for Paddle Serving
(简体中文|[English](./README.md))
## Sending requests to the BRPC-Server
### Starting the server
Taking the fit_a_line model as an example, the server is started with the same command as a regular BRPC-Server.
```
cd ../../python/examples/fit_a_line
sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
```
### Client prediction
The client currently supports BRPC.
Wrapper functions for BRPC are already implemented; see [brpc_client.cpp](./src/brpc_client.cpp) for details.
```
./simple_client --client_conf="uci_housing_client/serving_client_conf.prototxt" --server_port="127.0.0.1:9393" --test_type="brpc" --sample_type="fit_a_line"
```
See [simple_client.cpp](./example/simple_client.cpp) for more examples.
| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------------------------------------ | ----------------------------------------------------- |
| `client_conf` | str | `"serving_client_conf.prototxt"` | Path of client conf |
| `server_port` | str | `"127.0.0.1:9393"` | Exposed ip:port of server |
| `test_type` | str | `"brpc"` | Request mode; currently only "brpc" is supported |
| `sample_type` | str | `"fit_a_line"` | Sample type, one of "fit_a_line", "bert" |
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <vector>
#include "core/general-client/include/brpc_client.h"
using baidu::paddle_serving::client::ServingClient;
using baidu::paddle_serving::client::ServingBrpcClient;
using baidu::paddle_serving::client::PredictorInputs;
using baidu::paddle_serving::client::PredictorOutputs;
DEFINE_string(server_port, "127.0.0.1:9292", "ip:port");
DEFINE_string(client_conf, "serving_client_conf.prototxt", "Path of client conf");
DEFINE_string(test_type, "brpc", "brpc");
// fit_a_line, bert
DEFINE_string(sample_type, "fit_a_line", "List: fit_a_line, bert");
namespace {
int prepare_fit_a_line(PredictorInputs& input, std::vector<std::string>& fetch_name) {
std::vector<float> float_feed = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
0.0582f, -0.0727f, -0.1583f, -0.0584f,
0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
std::vector<int> float_shape = {1, 13};
std::string feed_name = "x";
fetch_name = {"price"};
std::vector<int> lod;
input.add_float_data(float_feed, feed_name, float_shape, lod);
return 0;
}
int prepare_bert(PredictorInputs& input, std::vector<std::string>& fetch_name) {
{
std::vector<float> float_feed(128, 0.0f);
float_feed[0] = 1.0f;
std::vector<int> float_shape = {1, 128, 1};
std::string feed_name = "input_mask";
std::vector<int> lod;
input.add_float_data(float_feed, feed_name, float_shape, lod);
}
{
std::vector<int64_t> feed(128, 0);
std::vector<int> shape = {1, 128, 1};
std::string feed_name = "position_ids";
std::vector<int> lod;
input.add_int64_data(feed, feed_name, shape, lod);
}
{
std::vector<int64_t> feed(128, 0);
feed[0] = 101;
std::vector<int> shape = {1, 128, 1};
std::string feed_name = "input_ids";
std::vector<int> lod;
input.add_int64_data(feed, feed_name, shape, lod);
}
{
std::vector<int64_t> feed(128, 0);
std::vector<int> shape = {1, 128, 1};
std::string feed_name = "segment_ids";
std::vector<int> lod;
input.add_int64_data(feed, feed_name, shape, lod);
}
fetch_name = {"pooled_output"};
return 0;
}
} // namespace
int main(int argc, char* argv[]) {
google::ParseCommandLineFlags(&argc, &argv, true);
std::string url = FLAGS_server_port;
std::string conf = FLAGS_client_conf;
std::string test_type = FLAGS_test_type;
std::string sample_type = FLAGS_sample_type;
LOG(INFO) << "url = " << url << ";"
<< "client_conf = " << conf << ";"
<< "test_type = " << test_type
<< "sample_type = " << sample_type;
std::unique_ptr<ServingClient> client;
// default type is brpc
// will add grpc&http in the future
if (test_type == "brpc") {
client.reset(new ServingBrpcClient());
} else {
client.reset(new ServingBrpcClient());
}
std::vector<std::string> confs;
confs.push_back(conf);
if (client->init(confs, url) != 0) {
LOG(ERROR) << "Failed to init client!";
return 0;
}
PredictorInputs input;
PredictorOutputs output;
std::vector<std::string> fetch_name;
if (sample_type == "fit_a_line") {
prepare_fit_a_line(input, fetch_name);
}
else if (sample_type == "bert") {
prepare_bert(input, fetch_name);
}
else {
prepare_fit_a_line(input, fetch_name);
}
if (client->predict(input, output, fetch_name, 0) != 0) {
LOG(ERROR) << "Failed to predict!";
}
else {
LOG(INFO) << output.print();
}
return 0;
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "core/general-client/include/client.h"
#include "core/sdk-cpp/include/predictor_sdk.h"
using baidu::paddle_serving::sdk_cpp::Predictor;
using baidu::paddle_serving::sdk_cpp::PredictorApi;
namespace baidu {
namespace paddle_serving {
namespace client {
class ServingBrpcClient : public ServingClient {
public:
ServingBrpcClient() {};
~ServingBrpcClient() {};
virtual int connect(const std::string server_port);
int predict(const PredictorInputs& inputs,
PredictorOutputs& outputs,
const std::vector<std::string>& fetch_name,
const uint64_t log_id);
private:
// generate default SDKConf
std::string gen_desc(const std::string server_port);
private:
PredictorApi _api;
Predictor* _predictor;
};
} // namespace client
} // namespace paddle_serving
} // namespace baidu
\ No newline at end of file
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include <map>
#include <sstream>
#include <memory>
namespace baidu {
namespace paddle_serving {
namespace predictor {
namespace general_model {
class Request;
class Response;
}
}
namespace client {
class PredictorInputs;
class PredictorOutputs;
class ServingClient {
public:
ServingClient() {};
virtual ~ServingClient() = default;
int init(const std::vector<std::string>& client_conf,
const std::string server_port);
int load_client_config(const std::vector<std::string>& client_conf);
virtual int connect(const std::string server_port) = 0;
virtual int predict(const PredictorInputs& inputs,
PredictorOutputs& outputs,
const std::vector<std::string>& fetch_name,
const uint64_t log_id) = 0;
protected:
std::map<std::string, int> _feed_name_to_idx;
std::vector<std::string> _feed_name;
std::map<std::string, int> _fetch_name_to_idx;
std::map<std::string, std::string> _fetch_name_to_var_name;
std::map<std::string, int> _fetch_name_to_type;
std::vector<std::vector<int>> _shape;
std::vector<int> _type;
std::vector<int64_t> _last_request_ts;
};
class PredictorData {
public:
PredictorData() {};
virtual ~PredictorData() {};
void add_float_data(const std::vector<float>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 1);
void add_int64_data(const std::vector<int64_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 0);
void add_int32_data(const std::vector<int32_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 2);
void add_string_data(const std::string& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 20);
const std::map<std::string, std::vector<float>>& float_data_map() const {
return _float_data_map;
};
std::map<std::string, std::vector<float>>* mutable_float_data_map() {
return &_float_data_map;
};
const std::map<std::string, std::vector<int64_t>>& int64_data_map() const {
return _int64_data_map;
};
std::map<std::string, std::vector<int64_t>>* mutable_int64_data_map() {
return &_int64_data_map;
};
const std::map<std::string, std::vector<int32_t>>& int_data_map() const {
return _int32_data_map;
};
std::map<std::string, std::vector<int32_t>>* mutable_int_data_map() {
return &_int32_data_map;
};
const std::map<std::string, std::string>& string_data_map() const {
return _string_data_map;
};
std::map<std::string, std::string>* mutable_string_data_map() {
return &_string_data_map;
};
const std::map<std::string, std::vector<int>>& shape_map() const {
return _shape_map;
};
std::map<std::string, std::vector<int>>* mutable_shape_map() {
return &_shape_map;
};
const std::map<std::string, std::vector<int>>& lod_map() const {
return _lod_map;
};
std::map<std::string, std::vector<int>>* mutable_lod_map() {
return &_lod_map;
};
int get_datatype(std::string name) const;
void set_datatype(std::string name, int type);
std::string print();
private:
// used to print vector data map e.g. _float_data_map
template<typename T1, typename T2>
std::string map2string(const std::map<T1, std::vector<T2>>& map) {
std::ostringstream oss;
oss.str("");
oss.precision(6);
oss.setf(std::ios::fixed);
std::string key_seg = ":";
std::string val_seg = ",";
std::string end_seg = "\n";
typename std::map<T1, std::vector<T2>>::const_iterator it = map.begin();
typename std::map<T1, std::vector<T2>>::const_iterator itEnd = map.end();
for (; it != itEnd; it++) {
oss << "{";
oss << it->first << key_seg;
const std::vector<T2>& v = it->second;
oss << v.size() << key_seg;
for (size_t i = 0; i < v.size(); ++i) {
if (i != v.size() - 1) {
oss << v[i] << val_seg;
}
else {
oss << v[i];
}
}
oss << "}";
}
return oss.str();
};
// used to print data map without vector e.g. _string_data_map
template<typename T1, typename T2>
std::string map2string(const std::map<T1, T2>& map) {
std::ostringstream oss;
oss.str("");
std::string key_seg = ":";
std::string val_seg = ",";
std::string end_seg = "\n";
typename std::map<T1, T2>::const_iterator it = map.begin();
typename std::map<T1, T2>::const_iterator itEnd = map.end();
for (; it != itEnd; it++) {
oss << "{";
oss << it->first << key_seg
<< "size=" << it->second.size() << key_seg
<< "type=" << this->get_datatype(it->first);
oss << "}";
}
return oss.str();
};
protected:
std::map<std::string, std::vector<float>> _float_data_map;
std::map<std::string, std::vector<int64_t>> _int64_data_map;
std::map<std::string, std::vector<int32_t>> _int32_data_map;
std::map<std::string, std::string> _string_data_map;
std::map<std::string, std::vector<int>> _shape_map;
std::map<std::string, std::vector<int>> _lod_map;
std::map<std::string, int> _datatype_map;
};
class PredictorInputs : public PredictorData {
public:
PredictorInputs() {};
virtual ~PredictorInputs() {};
// generate proto from inputs
// feed_name_to_idx: mapping alias name to idx
// feed_name: mapping idx to name
static int GenProto(const PredictorInputs& inputs,
const std::map<std::string, int>& feed_name_to_idx,
const std::vector<std::string>& feed_name,
predictor::general_model::Request& req);
};
class PredictorOutputs {
public:
struct PredictorOutput {
std::string engine_name;
PredictorData data;
};
PredictorOutputs() {};
virtual ~PredictorOutputs() {};
const std::vector<std::shared_ptr<PredictorOutputs::PredictorOutput>>& datas() {
return _datas;
};
std::vector<std::shared_ptr<PredictorOutputs::PredictorOutput>>* mutable_datas() {
return &_datas;
};
void add_data(const std::shared_ptr<PredictorOutputs::PredictorOutput>& data) {
_datas.push_back(data);
};
std::string print();
void clear();
// Parse proto to outputs
// fetch_name: name of data to be output
// fetch_name_to_type: mapping of fetch_name to datatype
static int ParseProto(const predictor::general_model::Response& res,
const std::vector<std::string>& fetch_name,
std::map<std::string, int>& fetch_name_to_type,
PredictorOutputs& outputs);
protected:
std::vector<std::shared_ptr<PredictorOutputs::PredictorOutput>> _datas;
};
} // namespace client
} // namespace paddle_serving
} // namespace baidu
\ No newline at end of file
......@@ -51,8 +51,13 @@ class ModelRes {
res._float_value_map.end());
_int32_value_map.insert(res._int32_value_map.begin(),
res._int32_value_map.end());
_string_value_map.insert(res._string_value_map.begin(),
res._string_value_map.end());
_shape_map.insert(res._shape_map.begin(), res._shape_map.end());
_lod_map.insert(res._lod_map.begin(), res._lod_map.end());
_tensor_alias_names.insert(_tensor_alias_names.end(),
res._tensor_alias_names.begin(),
res._tensor_alias_names.end());
}
ModelRes(ModelRes&& res) {
_engine_name = std::move(res._engine_name);
......@@ -65,10 +70,17 @@ class ModelRes {
_int32_value_map.insert(
std::make_move_iterator(std::begin(res._int32_value_map)),
std::make_move_iterator(std::end(res._int32_value_map)));
_string_value_map.insert(
std::make_move_iterator(std::begin(res._string_value_map)),
std::make_move_iterator(std::end(res._string_value_map)));
_shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)),
std::make_move_iterator(std::end(res._shape_map)));
_lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)),
std::make_move_iterator(std::end(res._lod_map)));
_tensor_alias_names.insert(
_tensor_alias_names.end(),
std::make_move_iterator(std::begin(res._tensor_alias_names)),
std::make_move_iterator(std::end(res._tensor_alias_names)));
}
~ModelRes() {}
const std::vector<int64_t>& get_int64_by_name(const std::string& name) {
......@@ -89,6 +101,12 @@ class ModelRes {
std::vector<int32_t>&& get_int32_by_name_with_rv(const std::string& name) {
return std::move(_int32_value_map[name]);
}
const std::string& get_string_by_name(const std::string& name) {
return _string_value_map[name];
}
std::string&& get_string_by_name_with_rv(const std::string& name) {
return std::move(_string_value_map[name]);
}
const std::vector<int>& get_shape_by_name(const std::string& name) {
return _shape_map[name];
}
......@@ -105,6 +123,10 @@ class ModelRes {
_engine_name = engine_name;
}
const std::string& engine_name() { return _engine_name; }
const std::vector<std::string>& tensor_alias_names() {
return _tensor_alias_names;
}
ModelRes& operator=(ModelRes&& res) {
if (this != &res) {
_engine_name = std::move(res._engine_name);
......@@ -117,10 +139,17 @@ class ModelRes {
_int32_value_map.insert(
std::make_move_iterator(std::begin(res._int32_value_map)),
std::make_move_iterator(std::end(res._int32_value_map)));
_string_value_map.insert(
std::make_move_iterator(std::begin(res._string_value_map)),
std::make_move_iterator(std::end(res._string_value_map)));
_shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)),
std::make_move_iterator(std::end(res._shape_map)));
_lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)),
std::make_move_iterator(std::end(res._lod_map)));
_tensor_alias_names.insert(
_tensor_alias_names.end(),
std::make_move_iterator(std::begin(res._tensor_alias_names)),
std::make_move_iterator(std::end(res._tensor_alias_names)));
}
return *this;
}
......@@ -130,8 +159,10 @@ class ModelRes {
std::map<std::string, std::vector<int64_t>> _int64_value_map;
std::map<std::string, std::vector<float>> _float_value_map;
std::map<std::string, std::vector<int32_t>> _int32_value_map;
std::map<std::string, std::string> _string_value_map;
std::map<std::string, std::vector<int>> _shape_map;
std::map<std::string, std::vector<int>> _lod_map;
std::vector<std::string> _tensor_alias_names;
};
class PredictorRes {
......@@ -168,6 +199,14 @@ class PredictorRes {
const std::string& name) {
return std::move(_models[model_idx].get_int32_by_name_with_rv(name));
}
const std::string& get_string_by_name(const int model_idx,
const std::string& name) {
return _models[model_idx].get_string_by_name(name);
}
std::string&& get_string_by_name_with_rv(const int model_idx,
const std::string& name) {
return std::move(_models[model_idx].get_string_by_name_with_rv(name));
}
const std::vector<int>& get_shape_by_name(const int model_idx,
const std::string& name) {
return _models[model_idx].get_shape_by_name(name);
......@@ -193,11 +232,16 @@ class PredictorRes {
}
const std::string& variant_tag() { return _variant_tag; }
const std::vector<std::string>& get_engine_names() { return _engine_names; }
const std::vector<std::string>& get_tensor_alias_names(const int model_idx) {
_tensor_alias_names = _models[model_idx].tensor_alias_names();
return _tensor_alias_names;
}
private:
std::vector<ModelRes> _models;
std::string _variant_tag;
std::vector<std::string> _engine_names;
std::vector<std::string> _tensor_alias_names;
};
class PredictorClient {
......@@ -222,10 +266,14 @@ class PredictorClient {
const std::vector<std::string>& float_feed_name,
const std::vector<std::vector<int>>& float_shape,
const std::vector<std::vector<int>>& float_lod_slot_batch,
const std::vector<py::array_t<int64_t>>& int_feed,
const std::vector<std::string>& int_feed_name,
const std::vector<std::vector<int>>& int_shape,
const std::vector<std::vector<int>>& int_lod_slot_batch,
const std::vector<py::array_t<int32_t>> &int32_feed,
const std::vector<std::string> &int32_feed_name,
const std::vector<std::vector<int>> &int32_shape,
const std::vector<std::vector<int>> &int32_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int64_feed,
const std::vector<std::string> &int64_feed_name,
const std::vector<std::vector<int>> &int64_shape,
const std::vector<std::vector<int>> &int64_lod_slot_batch,
const std::vector<std::string>& string_feed,
const std::vector<std::string>& string_feed_name,
const std::vector<std::vector<int>>& string_shape,
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-client/include/brpc_client.h"
#include "core/sdk-cpp/include/common.h"
#include "core/util/include/timer.h"
#include "core/sdk-cpp/builtin_format.pb.h"
#include "core/sdk-cpp/general_model_service.pb.h"
DEFINE_bool(profile_client, false, "");
DEFINE_bool(profile_server, false, "");
#define BRPC_MAX_BODY_SIZE 512 * 1024 * 1024
namespace baidu {
namespace paddle_serving {
namespace client {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
using configure::SDKConf;
using configure::VariantConf;
using configure::Predictor;
using configure::VariantConf;
int ServingBrpcClient::connect(const std::string server_port) {
brpc::fLU64::FLAGS_max_body_size = BRPC_MAX_BODY_SIZE;
if (_api.create(gen_desc(server_port)) != 0) {
LOG(ERROR) << "Predictor Creation Failed";
return -1;
}
// _api.thrd_initialize();
return 0;
}
std::string ServingBrpcClient::gen_desc(const std::string server_port) {
// default config for brpc
SDKConf sdk_conf;
Predictor* predictor = sdk_conf.add_predictors();
predictor->set_name("general_model");
predictor->set_service_name("baidu.paddle_serving.predictor.general_model.GeneralModelService");
predictor->set_endpoint_router("WeightedRandomRender");
predictor->mutable_weighted_random_render_conf()->set_variant_weight_list("100");
VariantConf* predictor_var = predictor->add_variants();
predictor_var->set_tag("default_tag_1");
std::string cluster = "list://" + server_port;
predictor_var->mutable_naming_conf()->set_cluster(cluster);
VariantConf* var = sdk_conf.mutable_default_variant_conf();
var->set_tag("default");
var->mutable_connection_conf()->set_connect_timeout_ms(2000);
var->mutable_connection_conf()->set_rpc_timeout_ms(200000);
var->mutable_connection_conf()->set_connect_retry_count(2);
var->mutable_connection_conf()->set_max_connection_per_host(100);
var->mutable_connection_conf()->set_hedge_request_timeout_ms(-1);
var->mutable_connection_conf()->set_hedge_fetch_retry_count(2);
var->mutable_connection_conf()->set_connection_type("pooled");
var->mutable_connection_conf()->set_connect_timeout_ms(2000);
var->mutable_naming_conf()->set_cluster_filter_strategy("Default");
var->mutable_naming_conf()->set_load_balance_strategy("la");
var->mutable_rpc_parameter()->set_compress_type(0);
var->mutable_rpc_parameter()->set_package_size(20);
var->mutable_rpc_parameter()->set_protocol("baidu_std");
var->mutable_rpc_parameter()->set_max_channel_per_request(3);
return sdk_conf.SerializePartialAsString();
}
int ServingBrpcClient::predict(const PredictorInputs& inputs,
PredictorOutputs& outputs,
const std::vector<std::string>& fetch_name,
const uint64_t log_id) {
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
// thread initialize for StubTLS
_api.thrd_initialize();
std::string variant_tag;
// predictor is bound to request with brpc::Controller
_predictor = _api.fetch_predictor("general_model", &variant_tag);
if (_predictor == NULL) {
LOG(ERROR) << "Failed fetch predictor so predict error!";
return -1;
}
// predict_res_batch.set_variant_tag(variant_tag);
VLOG(2) << "fetch general model predictor done.";
VLOG(2) << "variant_tag:" << variant_tag;
VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size;
Request req;
req.set_log_id(log_id);
for (auto &name : fetch_name) {
req.add_fetch_var_names(name);
}
if (PredictorInputs::GenProto(inputs, _feed_name_to_idx, _feed_name, req) != 0) {
LOG(ERROR) << "Failed to preprocess req!";
return -1;
}
int64_t preprocess_end = timeline.TimeStampUS();
int64_t client_infer_start = timeline.TimeStampUS();
Response res;
int64_t client_infer_end = 0;
int64_t postprocess_start = 0;
int64_t postprocess_end = 0;
if (FLAGS_profile_server) {
req.set_profile_server(true);
}
res.Clear();
if (_predictor->inference(&req, &res) != 0) {
LOG(ERROR) << "failed call predictor with req: " << req.ShortDebugString();
return -1;
}
client_infer_end = timeline.TimeStampUS();
postprocess_start = client_infer_end;
if (PredictorOutputs::ParseProto(res, fetch_name, _fetch_name_to_type, outputs) != 0) {
LOG(ERROR) << "Failed to post_process res!";
return -1;
}
postprocess_end = timeline.TimeStampUS();
if (FLAGS_profile_client) {
std::ostringstream oss;
oss << "PROFILE\t"
<< "pid:" << getpid() << "\t"
<< "prepro_0:" << preprocess_start << " "
<< "prepro_1:" << preprocess_end << " "
<< "client_infer_0:" << client_infer_start << " "
<< "client_infer_1:" << client_infer_end << " ";
if (FLAGS_profile_server) {
int op_num = res.profile_time_size() / 2;
for (int i = 0; i < op_num; ++i) {
oss << "op" << i << "_0:" << res.profile_time(i * 2) << " ";
oss << "op" << i << "_1:" << res.profile_time(i * 2 + 1) << " ";
}
}
oss << "postpro_0:" << postprocess_start << " ";
oss << "postpro_1:" << postprocess_end;
fprintf(stderr, "%s\n", oss.str().c_str());
}
// release predictor
_api.thrd_clear();
std::ostringstream oss;
oss << "[client]"
<< "logid=" << log_id <<",";
if (FLAGS_profile_client) {
double pre_cost = (preprocess_end - preprocess_start) / 1000.0;
double infer_cost = (client_infer_end - client_infer_start) / 1000.0;
double post_cost = (postprocess_end - postprocess_start) / 1000.0;
oss << "client_pre_cost=" << pre_cost << "ms,"
<< "client_infer_cost=" << infer_cost << "ms,"
<< "client_post_cost=" << post_cost << "ms,";
}
double client_cost = (postprocess_end - preprocess_start) / 1000.0;
oss << "client_cost=" << client_cost << "ms,";
int op_num = res.profile_time_size() / 2;
if (FLAGS_profile_server) {
for (int i = 0; i < op_num - 1; ++i) {
double t = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "op" << i << "=" << t << "ms,";
}
}
if (op_num > 0) {
int i = op_num - 1;
double server_cost = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "server_cost=" << server_cost << "ms.";
}
LOG(INFO) << oss.str();
return 0;
}
} // namespace client
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-client/include/client.h"
#include "core/sdk-cpp/include/common.h"
#include "core/sdk-cpp/general_model_service.pb.h"
namespace baidu {
namespace paddle_serving {
namespace client {
using configure::GeneralModelConfig;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
// support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
enum ProtoDataType {
P_INT64 = 0,
P_FLOAT32,
P_INT32,
P_FP64,
P_INT16,
P_FP16,
P_BF16,
P_UINT8,
P_INT8,
P_BOOL,
P_COMPLEX64,
P_COMPLEX128,
P_STRING = 20,
};
int ServingClient::init(const std::vector<std::string>& client_conf,
const std::string server_port) {
if (load_client_config(client_conf) != 0) {
LOG(ERROR) << "Failed to load client config";
return -1;
}
// pure virtual func, subclass implementation
if (connect(server_port) != 0) {
LOG(ERROR) << "Failed to connect";
return -1;
}
return 0;
}
int ServingClient::load_client_config(const std::vector<std::string> &conf_file) {
try {
GeneralModelConfig model_config;
if (configure::read_proto_conf(conf_file[0].c_str(), &model_config) != 0) {
LOG(ERROR) << "Failed to load general model config"
<< ", file path: " << conf_file[0];
return -1;
}
_feed_name_to_idx.clear();
_fetch_name_to_idx.clear();
_shape.clear();
int feed_var_num = model_config.feed_var_size();
_feed_name.clear();
VLOG(2) << "feed var num: " << feed_var_num;
for (int i = 0; i < feed_var_num; ++i) {
_feed_name_to_idx[model_config.feed_var(i).alias_name()] = i;
VLOG(2) << "feed [" << i << "]"
<< " name: " << model_config.feed_var(i).name();
_feed_name.push_back(model_config.feed_var(i).name());
VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name()
<< " index: " << i;
std::vector<int> tmp_feed_shape;
VLOG(2) << "feed"
<< "[" << i << "] shape:";
for (int j = 0; j < model_config.feed_var(i).shape_size(); ++j) {
tmp_feed_shape.push_back(model_config.feed_var(i).shape(j));
VLOG(2) << "shape[" << j << "]: " << model_config.feed_var(i).shape(j);
}
_type.push_back(model_config.feed_var(i).feed_type());
VLOG(2) << "feed"
<< "[" << i
<< "] feed type: " << model_config.feed_var(i).feed_type();
_shape.push_back(tmp_feed_shape);
}
if (conf_file.size() > 1) {
model_config.Clear();
if (configure::read_proto_conf(conf_file[conf_file.size() - 1].c_str(),
&model_config) != 0) {
LOG(ERROR) << "Failed to load general model config"
<< ", file path: " << conf_file[conf_file.size() - 1];
return -1;
}
}
int fetch_var_num = model_config.fetch_var_size();
VLOG(2) << "fetch_var_num: " << fetch_var_num;
for (int i = 0; i < fetch_var_num; ++i) {
_fetch_name_to_idx[model_config.fetch_var(i).alias_name()] = i;
VLOG(2) << "fetch [" << i << "]"
<< " alias name: " << model_config.fetch_var(i).alias_name();
_fetch_name_to_var_name[model_config.fetch_var(i).alias_name()] =
model_config.fetch_var(i).name();
_fetch_name_to_type[model_config.fetch_var(i).alias_name()] =
model_config.fetch_var(i).fetch_type();
}
} catch (std::exception &e) {
LOG(ERROR) << "Failed load general model config" << e.what();
return -1;
}
return 0;
}
void PredictorData::add_float_data(const std::vector<float>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_float_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
void PredictorData::add_int64_data(const std::vector<int64_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_int64_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
void PredictorData::add_int32_data(const std::vector<int32_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_int32_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
void PredictorData::add_string_data(const std::string& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_string_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
int PredictorData::get_datatype(std::string name) const {
std::map<std::string, int>::const_iterator it = _datatype_map.find(name);
if (it != _datatype_map.end()) {
return it->second;
}
return 0;
}
void PredictorData::set_datatype(std::string name, int type) {
_datatype_map[name] = type;
}
std::string PredictorData::print() {
std::string res;
res.append(map2string<std::string, float>(_float_data_map));
res.append(map2string<std::string, int64_t>(_int64_data_map));
res.append(map2string<std::string, int32_t>(_int32_data_map));
res.append(map2string<std::string, std::string>(_string_data_map));
return res;
}
int PredictorInputs::GenProto(const PredictorInputs& inputs,
const std::map<std::string, int>& feed_name_to_idx,
const std::vector<std::string>& feed_name,
Request& req) {
const std::map<std::string, std::vector<float>>& float_feed_map = inputs.float_data_map();
const std::map<std::string, std::vector<int64_t>>& int64_feed_map = inputs.int64_data_map();
const std::map<std::string, std::vector<int32_t>>& int32_feed_map = inputs.int_data_map();
const std::map<std::string, std::string>& string_feed_map = inputs.string_data_map();
const std::map<std::string, std::vector<int>>& shape_map = inputs.shape_map();
const std::map<std::string, std::vector<int>>& lod_map = inputs.lod_map();
VLOG(2) << "float feed name size: " << float_feed_map.size();
VLOG(2) << "int feed name size: " << int64_feed_map.size();
VLOG(2) << "string feed name size: " << string_feed_map.size();
// batch is already in Tensor.
for (std::map<std::string, std::vector<float>>::const_iterator iter = float_feed_map.begin();
iter != float_feed_map.end();
++iter) {
std::string name = iter->first;
const std::vector<float>& float_data = iter->second;
const std::vector<int>& float_shape = shape_map.at(name);
const std::vector<int>& float_lod = lod_map.at(name);
// default datatype = P_FLOAT32
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
VLOG(2) << "prepare float feed " << name << " idx " << idx;
int total_number = float_data.size();
Tensor *tensor = req.add_tensor();
VLOG(2) << "prepare float feed " << name << " shape size "
<< float_shape.size();
for (uint32_t j = 0; j < float_shape.size(); ++j) {
tensor->add_shape(float_shape[j]);
}
for (uint32_t j = 0; j < float_lod.size(); ++j) {
tensor->add_lod(float_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_float_data()->Resize(total_number, 0);
memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float));
}
for (std::map<std::string, std::vector<int64_t>>::const_iterator iter = int64_feed_map.begin();
iter != int64_feed_map.end();
++iter) {
std::string name = iter->first;
const std::vector<int64_t>& int64_data = iter->second;
const std::vector<int>& int64_shape = shape_map.at(name);
const std::vector<int>& int64_lod = lod_map.at(name);
// default datatype = P_INT64
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
Tensor *tensor = req.add_tensor();
int total_number = int64_data.size();
for (uint32_t j = 0; j < int64_shape.size(); ++j) {
tensor->add_shape(int64_shape[j]);
}
for (uint32_t j = 0; j < int64_lod.size(); ++j) {
tensor->add_lod(int64_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_int64_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int64_data()->mutable_data(), int64_data.data(), total_number * sizeof(int64_t));
}
for (std::map<std::string, std::vector<int32_t>>::const_iterator iter = int32_feed_map.begin();
iter != int32_feed_map.end();
++iter) {
std::string name = iter->first;
const std::vector<int32_t>& int32_data = iter->second;
const std::vector<int>& int32_shape = shape_map.at(name);
const std::vector<int>& int32_lod = lod_map.at(name);
// default datatype = P_INT32
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
Tensor *tensor = req.add_tensor();
int total_number = int32_data.size();
for (uint32_t j = 0; j < int32_shape.size(); ++j) {
tensor->add_shape(int32_shape[j]);
}
for (uint32_t j = 0; j < int32_lod.size(); ++j) {
tensor->add_lod(int32_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_int_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int_data()->mutable_data(), int32_data.data(), total_number * sizeof(int32_t));
}
for (std::map<std::string, std::string>::const_iterator iter = string_feed_map.begin();
iter != string_feed_map.end();
++iter) {
std::string name = iter->first;
const std::string& string_data = iter->second;
const std::vector<int>& string_shape = shape_map.at(name);
const std::vector<int>& string_lod = lod_map.at(name);
// default datatype = P_STRING
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
Tensor *tensor = req.add_tensor();
for (uint32_t j = 0; j < string_shape.size(); ++j) {
tensor->add_shape(string_shape[j]);
}
for (uint32_t j = 0; j < string_lod.size(); ++j) {
tensor->add_lod(string_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
if (datatype == P_STRING) {
const int string_shape_size = string_shape.size();
// string_shape[vec_idx] = [1]; because numpy has no string dtype,
// strings are passed via vector<vector<string> >.
if (string_shape_size != 1) {
LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
<< string_shape_size;
return -1;
}
switch (string_shape_size) {
case 1: {
tensor->add_data(string_data);
break;
}
}
} else {
tensor->set_tensor_content(string_data);
}
}
return 0;
}
std::string PredictorOutputs::print() {
std::string res = "";
for (size_t i = 0; i < _datas.size(); ++i) {
res.append(_datas[i]->engine_name);
res.append(":");
res.append(_datas[i]->data.print());
res.append("\n");
}
return res;
}
void PredictorOutputs::clear() {
_datas.clear();
}
int PredictorOutputs::ParseProto(const Response& res,
const std::vector<std::string>& fetch_name,
std::map<std::string, int>& fetch_name_to_type,
PredictorOutputs& outputs) {
VLOG(2) << "get model output num";
uint32_t model_num = res.outputs_size();
VLOG(2) << "model num: " << model_num;
for (uint32_t m_idx = 0; m_idx < model_num; ++m_idx) {
VLOG(2) << "process model output index: " << m_idx;
auto& output = res.outputs(m_idx);
std::shared_ptr<PredictorOutputs::PredictorOutput> predictor_output =
std::make_shared<PredictorOutputs::PredictorOutput>();
predictor_output->engine_name = output.engine_name();
PredictorData& predictor_data = predictor_output->data;
std::map<std::string, std::vector<float>>& float_data_map = *predictor_output->data.mutable_float_data_map();
std::map<std::string, std::vector<int64_t>>& int64_data_map = *predictor_output->data.mutable_int64_data_map();
std::map<std::string, std::vector<int32_t>>& int32_data_map = *predictor_output->data.mutable_int_data_map();
std::map<std::string, std::string>& string_data_map = *predictor_output->data.mutable_string_data_map();
std::map<std::string, std::vector<int>>& shape_map = *predictor_output->data.mutable_shape_map();
std::map<std::string, std::vector<int>>& lod_map = *predictor_output->data.mutable_lod_map();
int idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
int shape_size = output.tensor(idx).shape_size();
VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
<< shape_size;
shape_map[name].resize(shape_size);
for (int i = 0; i < shape_size; ++i) {
shape_map[name][i] = output.tensor(idx).shape(i);
}
int lod_size = output.tensor(idx).lod_size();
if (lod_size > 0) {
lod_map[name].resize(lod_size);
for (int i = 0; i < lod_size; ++i) {
lod_map[name][i] = output.tensor(idx).lod(i);
}
}
idx += 1;
}
idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
if (fetch_name_to_type[name] == P_INT64) {
VLOG(2) << "fetch var " << name << "type int64";
int size = output.tensor(idx).int64_data_size();
int64_data_map[name] = std::vector<int64_t>(
output.tensor(idx).int64_data().begin(),
output.tensor(idx).int64_data().begin() + size);
} else if (fetch_name_to_type[name] == P_FLOAT32) {
VLOG(2) << "fetch var " << name << "type float";
int size = output.tensor(idx).float_data_size();
float_data_map[name] = std::vector<float>(
output.tensor(idx).float_data().begin(),
output.tensor(idx).float_data().begin() + size);
} else if (fetch_name_to_type[name] == P_INT32) {
VLOG(2) << "fetch var " << name << "type int32";
int size = output.tensor(idx).int_data_size();
int32_data_map[name] = std::vector<int32_t>(
output.tensor(idx).int_data().begin(),
output.tensor(idx).int_data().begin() + size);
} else if (fetch_name_to_type[name] == P_UINT8
|| fetch_name_to_type[name] == P_INT8
|| fetch_name_to_type[name] == P_FP16) {
VLOG(2) << "fetch var [" << name << "]type="
<< fetch_name_to_type[name];
string_data_map[name] = output.tensor(idx).tensor_content();
}
predictor_data.set_datatype(name, output.tensor(idx).elem_type());
idx += 1;
}
outputs.add_data(predictor_output);
}
return 0;
}
} // namespace client
} // namespace paddle_serving
} // namespace baidu
......@@ -25,7 +25,22 @@ using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
// support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
enum ProtoDataType {
P_INT64 = 0,
P_FLOAT32,
P_INT32,
P_FP64,
P_INT16,
P_FP16,
P_BF16,
P_UINT8,
P_INT8,
P_BOOL,
P_COMPLEX64,
P_COMPLEX128,
P_STRING = 20,
};
std::once_flag gflags_init_flag;
namespace py = pybind11;
......@@ -152,10 +167,14 @@ int PredictorClient::numpy_predict(
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<int>> &float_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int_feed,
const std::vector<std::string> &int_feed_name,
const std::vector<std::vector<int>> &int_shape,
const std::vector<std::vector<int>> &int_lod_slot_batch,
const std::vector<py::array_t<int32_t>> &int32_feed,
const std::vector<std::string> &int32_feed_name,
const std::vector<std::vector<int>> &int32_shape,
const std::vector<std::vector<int>> &int32_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int64_feed,
const std::vector<std::string> &int64_feed_name,
const std::vector<std::vector<int>> &int64_shape,
const std::vector<std::vector<int>> &int64_lod_slot_batch,
const std::vector<std::string> &string_feed,
const std::vector<std::string> &string_feed_name,
const std::vector<std::vector<int>> &string_shape,
......@@ -168,15 +187,14 @@ int PredictorClient::numpy_predict(
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
int fetch_name_num = fetch_name.size();
_api.thrd_initialize();
std::string variant_tag;
_predictor = _api.fetch_predictor("general_model", &variant_tag);
predict_res_batch.set_variant_tag(variant_tag);
VLOG(2) << "fetch general model predictor done.";
VLOG(2) << "float feed name size: " << float_feed_name.size();
VLOG(2) << "int feed name size: " << int_feed_name.size();
VLOG(2) << "int feed name size: " << int32_feed_name.size();
VLOG(2) << "int feed name size: " << int64_feed_name.size();
VLOG(2) << "string feed name size: " << string_feed_name.size();
VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size;
Request req;
......@@ -193,7 +211,11 @@ int PredictorClient::numpy_predict(
tensor_vec.push_back(req.add_tensor());
}
for (auto &name : int_feed_name) {
for (auto &name : int32_feed_name) {
tensor_vec.push_back(req.add_tensor());
}
for (auto &name : int64_feed_name) {
tensor_vec.push_back(req.add_tensor());
}
......@@ -233,37 +255,63 @@ int PredictorClient::numpy_predict(
}
vec_idx = 0;
for (auto &name : int_feed_name) {
for (auto &name : int32_feed_name) {
int idx = _feed_name_to_idx[name];
if (idx >= tensor_vec.size()) {
LOG(ERROR) << "idx > tensor_vec.size()";
return -1;
}
Tensor *tensor = tensor_vec[idx];
int nbytes = int_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0));
int total_number = int_feed[vec_idx].size();
int nbytes = int32_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(int32_feed[vec_idx].data(0));
int total_number = int32_feed[vec_idx].size();
for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
tensor->add_shape(int_shape[vec_idx][j]);
for (uint32_t j = 0; j < int32_shape[vec_idx].size(); ++j) {
tensor->add_shape(int32_shape[vec_idx][j]);
}
for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(int_lod_slot_batch[vec_idx][j]);
for (uint32_t j = 0; j < int32_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(int32_lod_slot_batch[vec_idx][j]);
}
tensor->set_elem_type(_type[idx]);
tensor->set_name(_feed_name[idx]);
tensor->set_alias_name(name);
if (_type[idx] == P_INT64) {
tensor->mutable_int64_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
} else {
tensor->mutable_int_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
tensor->mutable_int_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
vec_idx++;
}
// Add INT64 feed data of int64_input to int64_data
vec_idx = 0;
for (auto &name : int64_feed_name) {
int idx = _feed_name_to_idx[name];
if (idx >= tensor_vec.size()) {
LOG(ERROR) << "idx > tensor_vec.size()";
return -1;
}
Tensor *tensor = tensor_vec[idx];
int nbytes = int64_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(int64_feed[vec_idx].data(0));
int total_number = int64_feed[vec_idx].size();
for (uint32_t j = 0; j < int64_shape[vec_idx].size(); ++j) {
tensor->add_shape(int64_shape[vec_idx][j]);
}
for (uint32_t j = 0; j < int64_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(int64_lod_slot_batch[vec_idx][j]);
}
tensor->set_elem_type(_type[idx]);
tensor->set_name(_feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_int64_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
vec_idx++;
}
// Add string_input feed data: non-P_STRING types (UINT8, INT8, FLOAT16)
// go to tensor_content, while P_STRING goes to the data field.
vec_idx = 0;
for (auto &name : string_feed_name) {
int idx = _feed_name_to_idx[name];
......@@ -279,22 +327,27 @@ int PredictorClient::numpy_predict(
for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(string_lod_slot_batch[vec_idx][j]);
}
tensor->set_elem_type(P_STRING);
tensor->set_name(_feed_name[idx]);
tensor->set_alias_name(name);
const int string_shape_size = string_shape[vec_idx].size();
// string_shape[vec_idx] = [1]; because numpy has no string dtype,
// strings are passed via vector<vector<string> >.
if (string_shape_size != 1) {
LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
<< string_shape_size;
return -1;
}
switch (string_shape_size) {
case 1: {
tensor->add_data(string_feed[vec_idx]);
break;
if (_type[idx] != P_STRING) {
tensor->set_elem_type(_type[idx]);
tensor->set_tensor_content(string_feed[vec_idx]);
} else {
tensor->set_elem_type(P_STRING);
const int string_shape_size = string_shape[vec_idx].size();
// string_shape[vec_idx] = [1]; because numpy has no string dtype,
// strings are passed via vector<vector<string> >.
if (string_shape_size != 1) {
LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
<< string_shape_size;
return -1;
}
switch (string_shape_size) {
case 1: {
tensor->add_data(string_feed[vec_idx]);
break;
}
}
}
vec_idx++;
......@@ -308,10 +361,8 @@ int PredictorClient::numpy_predict(
int64_t postprocess_start = 0;
int64_t postprocess_end = 0;
if (FLAGS_profile_client) {
if (FLAGS_profile_server) {
req.set_profile_server(true);
}
if (FLAGS_profile_server) {
req.set_profile_server(true);
}
res.Clear();
......@@ -329,10 +380,12 @@ int PredictorClient::numpy_predict(
auto output = res.outputs(m_idx);
ModelRes model;
model.set_engine_name(output.engine_name());
int idx = 0;
for (auto &name : fetch_name) {
// At the ResponseOp, the output data has already been arranged according to
// fetch_name, so the outputs correspond strictly to fetch_name and can be
// processed in order.
for (int idx = 0; idx < output.tensor_size(); ++idx) {
// int idx = _fetch_name_to_idx[name];
const std::string name = output.tensor(idx).alias_name();
model._tensor_alias_names.push_back(name);
int shape_size = output.tensor(idx).shape_size();
VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
<< shape_size;
......@@ -347,13 +400,7 @@ int PredictorClient::numpy_predict(
model._lod_map[name][i] = output.tensor(idx).lod(i);
}
}
idx += 1;
}
idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
if (_fetch_name_to_type[name] == P_INT64) {
VLOG(2) << "ferch var " << name << "type int64";
int size = output.tensor(idx).int64_data_size();
......@@ -372,8 +419,16 @@ int PredictorClient::numpy_predict(
model._int32_value_map[name] = std::vector<int32_t>(
output.tensor(idx).int_data().begin(),
output.tensor(idx).int_data().begin() + size);
} else if (_fetch_name_to_type[name] == P_UINT8) {
  VLOG(2) << "fetch var " << name << " type: uint8";
  model._string_value_map[name] = output.tensor(idx).tensor_content();
} else if (_fetch_name_to_type[name] == P_INT8) {
  VLOG(2) << "fetch var " << name << " type: int8";
  model._string_value_map[name] = output.tensor(idx).tensor_content();
} else if (_fetch_name_to_type[name] == P_FP16) {
  VLOG(2) << "fetch var " << name << " type: float16";
  model._string_value_map[name] = output.tensor(idx).tensor_content();
}
idx += 1;
}
predict_res_batch.add_model_res(std::move(model));
}
......@@ -403,6 +458,36 @@ int PredictorClient::numpy_predict(
}
_api.thrd_clear();
std::ostringstream oss;
oss << "[client]"
<< "logid=" << log_id <<",";
if (FLAGS_profile_client) {
double pre_cost = (preprocess_end - preprocess_start) / 1000.0;
double infer_cost = (client_infer_end - client_infer_start) / 1000.0;
double post_cost = (postprocess_end - postprocess_start) / 1000.0;
oss << "client_pre_cost=" << pre_cost << "ms,"
<< "client_infer_cost=" << infer_cost << "ms,"
<< "client_post_cost=" << post_cost << "ms,";
}
double client_cost = (postprocess_end - preprocess_start) / 1000.0;
oss << "client_cost=" << client_cost << "ms,";
int op_num = res.profile_time_size() / 2;
if (FLAGS_profile_server) {
for (int i = 0; i < op_num - 1; ++i) {
double t = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "op" << i << "=" << t << "ms,";
}
}
if (op_num > 0) {
int i = op_num - 1;
double server_cost = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "server_cost=" << server_cost << "ms.";
}
LOG(INFO) << oss.str();
return 0;
}
} // namespace general_model
......
......@@ -49,6 +49,19 @@ PYBIND11_MODULE(serving_client, m) {
});
return py::array(ptr->size(), ptr->data(), capsule);
})
.def("get_int32_by_name",
[](PredictorRes &self, int model_idx, std::string &name) {
std::vector<int32_t> *ptr = new std::vector<int32_t>(
std::move(self.get_int32_by_name_with_rv(model_idx, name)));
auto capsule = py::capsule(ptr, [](void *p) {
delete reinterpret_cast<std::vector<int32_t> *>(p);
});
return py::array(ptr->size(), ptr->data(), capsule);
})
.def("get_string_by_name",
[](PredictorRes &self, int model_idx, std::string &name) {
return self.get_string_by_name_with_rv(model_idx, name);
})
.def("get_shape",
[](PredictorRes &self, int model_idx, std::string &name) {
std::vector<int> *ptr = new std::vector<int>(
......@@ -69,7 +82,10 @@ PYBIND11_MODULE(serving_client, m) {
})
.def("variant_tag", [](PredictorRes &self) { return self.variant_tag(); })
.def("get_engine_names",
[](PredictorRes &self) { return self.get_engine_names(); });
[](PredictorRes &self) { return self.get_engine_names(); })
.def("get_tensor_alias_names", [](PredictorRes &self, int model_idx) {
return self.get_tensor_alias_names(model_idx);
});
py::class_<PredictorClient>(m, "PredictorClient", py::buffer_protocol())
.def(py::init())
......@@ -101,10 +117,14 @@ PYBIND11_MODULE(serving_client, m) {
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<int>> &float_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int_feed,
const std::vector<std::string> &int_feed_name,
const std::vector<std::vector<int>> &int_shape,
const std::vector<std::vector<int>> &int_lod_slot_batch,
const std::vector<py::array_t<int32_t>> &int32_feed,
const std::vector<std::string> &int32_feed_name,
const std::vector<std::vector<int>> &int32_shape,
const std::vector<std::vector<int>> &int32_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int64_feed,
const std::vector<std::string> &int64_feed_name,
const std::vector<std::vector<int>> &int64_shape,
const std::vector<std::vector<int>> &int64_lod_slot_batch,
const std::vector<std::string> &string_feed,
const std::vector<std::string> &string_feed_name,
const std::vector<std::vector<int>> &string_shape,
......@@ -117,10 +137,14 @@ PYBIND11_MODULE(serving_client, m) {
float_feed_name,
float_shape,
float_lod_slot_batch,
int_feed,
int_feed_name,
int_shape,
int_lod_slot_batch,
int32_feed,
int32_feed_name,
int32_shape,
int32_lod_slot_batch,
int64_feed,
int64_feed_name,
int64_shape,
int64_lod_slot_batch,
string_feed,
string_feed_name,
string_shape,
......
......@@ -191,42 +191,64 @@ int GeneralDetectionOp::inference() {
boxes = post_processor_.FilterTagDetRes(boxes, ratio_h, ratio_w, srcimg);
for (int i = boxes.size() - 1; i >= 0; i--) {
crop_img = GetRotateCropImage(img, boxes[i]);
float wh_ratio = float(crop_img.cols) / float(crop_img.rows);
float max_wh_ratio = 0.0f;
std::vector<cv::Mat> crop_imgs;
std::vector<cv::Mat> resize_imgs;
int max_resize_w = 0;
int max_resize_h = 0;
int box_num = boxes.size();
std::vector<std::vector<float>> output_rec;
for (int i = 0; i < box_num; ++i) {
cv::Mat line_img = GetRotateCropImage(img, boxes[i]);
float wh_ratio = float(line_img.cols) / float(line_img.rows);
max_wh_ratio = max_wh_ratio > wh_ratio ? max_wh_ratio : wh_ratio;
crop_imgs.push_back(line_img);
}
for (int i = 0; i < box_num; ++i) {
cv::Mat resize_img;
crop_img = crop_imgs[i];
this->resize_op_rec.Run(
crop_img, resize_img_rec, wh_ratio, this->use_tensorrt_);
crop_img, resize_img, max_wh_ratio, this->use_tensorrt_);
this->normalize_op_.Run(
&resize_img_rec, this->mean_rec, this->scale_rec, this->is_scale_);
std::vector<float> output_rec(
1 * 3 * resize_img_rec.rows * resize_img_rec.cols, 0.0f);
this->permute_op_.Run(&resize_img_rec, output_rec.data());
// Inference.
output_shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols};
out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
databuf_size_out = out_num * sizeof(float);
databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out);
if (!databuf_data_out) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size_out;
return -1;
}
memcpy(databuf_data_out, output_rec.data(), databuf_size_out);
databuf_char_out = reinterpret_cast<char*>(databuf_data_out);
paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out);
paddle::PaddleTensor tensor_out;
tensor_out.name = "image";
tensor_out.dtype = paddle::PaddleDType::FLOAT32;
tensor_out.shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols};
tensor_out.data = paddleBuf;
out->push_back(tensor_out);
&resize_img, this->mean_rec, this->scale_rec, this->is_scale_);
max_resize_w = std::max(max_resize_w, resize_img.cols);
max_resize_h = std::max(max_resize_h, resize_img.rows);
resize_imgs.push_back(resize_img);
}
int buf_size = 3 * max_resize_h * max_resize_w;
output_rec = std::vector<std::vector<float>>(box_num,
std::vector<float>(buf_size, 0.0f));
for (int i = 0; i < box_num; ++i) {
resize_img_rec = resize_imgs[i];
this->permute_op_.Run(&resize_img_rec, output_rec[i].data());
}
// Inference.
output_shape = {box_num, 3, max_resize_h, max_resize_w};
out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
databuf_size_out = out_num * sizeof(float);
databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out);
if (!databuf_data_out) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size_out;
return -1;
}
int offset = buf_size * sizeof(float);
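// Each cropped text line occupies buf_size floats, so box i is copied into
// the shared batch buffer at byte offset i * offset.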
for (int i = 0; i < box_num; ++i) {
memcpy(databuf_data_out + i * offset, output_rec[i].data(), offset);
}
databuf_char_out = reinterpret_cast<char*>(databuf_data_out);
paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out);
paddle::PaddleTensor tensor_out;
tensor_out.name = "image";
tensor_out.dtype = paddle::PaddleDType::FLOAT32;
tensor_out.shape = output_shape;
tensor_out.data = paddleBuf;
out->push_back(tensor_out);
}
out->erase(out->begin(), out->begin() + infer_outnum);
......
......@@ -63,7 +63,7 @@ class GeneralDetectionOp
double det_db_thresh_ = 0.3;
double det_db_box_thresh_ = 0.5;
double det_db_unclip_ratio_ = 2.0;
double det_db_unclip_ratio_ = 1.5;
std::vector<float> mean_det = {0.485f, 0.456f, 0.406f};
std::vector<float> scale_det = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
......
......@@ -20,6 +20,7 @@
#include <unordered_map>
#include <utility>
#include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/framework/cache.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
......@@ -36,10 +37,11 @@ using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
using baidu::paddle_serving::predictor::CubeCache;
// DistKV Infer Op: seek cube and then call paddle inference
// op seq: general_reader-> dist_kv_infer -> general_response
int GeneralDistKVInferOp::inference() {
int GeneralDistKVInferOp::inference() {
VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) {
......@@ -60,8 +62,8 @@ int GeneralDistKVInferOp::inference() {
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!output_blob) {
LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error";
return -1;
LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error";
return -1;
}
output_blob->SetLogId(log_id);
......@@ -70,21 +72,30 @@ int GeneralDistKVInferOp::inference() {
<< ") Failed mutable depended argument, op:" << pre_name;
return -1;
}
Timer timeline;
timeline.Start();
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
std::vector<uint64_t> keys;
std::vector<uint64_t> unique_keys;
std::unordered_map<uint64_t, rec::mcube::CubeValue *> key_map;
std::vector<rec::mcube::CubeValue> values;
int sparse_count = 0; // sparse inputs counts, sparse would seek cube
int dense_count = 0; // dense inputs counts, dense would directly call paddle infer
// count of sparse inputs; sparse inputs are looked up in cube
int sparse_count = 0;
// count of dense inputs; dense inputs go directly to paddle inference
int dense_count = 0;
std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
size_t key_len = 0;
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
// dense input type is not int64
++dense_count;
continue;
}
// sparse input type is int64
++sparse_count;
size_t elem_num = 1;
for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
elem_num *= in->at(i).shape[s];
......@@ -94,7 +105,8 @@ int GeneralDistKVInferOp::inference() {
dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
}
keys.resize(key_len);
VLOG(3) << "(logid=" << log_id << ") cube number of keys to look up: " << key_len;
unique_keys.resize(key_len);
int key_idx = 0;
for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
std::copy(dataptr_size_pairs[i].first,
......@@ -102,20 +114,81 @@ int GeneralDistKVInferOp::inference() {
keys.begin() + key_idx);
key_idx += dataptr_size_pairs[i].second;
}
// filter duplicate keys
int unique_keys_count = 0;
for (size_t i = 0; i < keys.size(); ++i) {
if (key_map.find(keys[i]) == key_map.end()) {
key_map[keys[i]] = nullptr;
unique_keys[unique_keys_count++] = keys[i];
}
}
unique_keys.resize(unique_keys_count);
VLOG(1) << "(logid=" << log_id
<< ") cube number of keys to look up: " << key_len
<< " uniq keys: " << unique_keys_count;
// filter keys that hit the cube cache
size_t hit_counts = 0;
int64_t seek_cache_start = timeline.TimeStampUS();
CubeCache *p_cube_cache =
InferManager::instance().get_cube_cache(engine_name().c_str());
if (p_cube_cache != nullptr) {
for (size_t i = 0; i < unique_keys_count; ++i) {
rec::mcube::CubeValue *hit_val = p_cube_cache->get_data(unique_keys[i]);
if (hit_val) {
// LOG(WARNING) << "Hit one cache. key:" << unique_keys[i];
key_map[unique_keys[i]] = hit_val;
if (hit_counts % 100 == 0) {
LOG(WARNING) << "hit cache! key:" << unique_keys[i]
<< " value:" << hit_val->buff;
}
unique_keys[i] = 0;
++hit_counts;
}
}
} else {
LOG(WARNING) << "get cube cache fail. model: " << engine_name();
}
// clear unique keys which hit caches
if (hit_counts > 0) {
for (auto it = unique_keys.begin(); it < unique_keys.end();) {
if (*it == 0) {
it = unique_keys.erase(it);
--unique_keys_count;
} else {
++it;
}
}
}
int64_t seek_cache_end = timeline.TimeStampUS();
VLOG(2) << "cache hit " << hit_counts
<< " keys in cube cache, last unique_keys:" << unique_keys.size()
<< " , seek_time:" << seek_cache_end - seek_cache_start;
// seek sparse params
rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
std::vector<std::string> table_names = cube->get_table_names();
if (table_names.size() == 0) {
LOG(ERROR) << "cube init error or cube config not given.";
return -1;
}
// gather keys and seek cube servers, put results in values
int ret = cube->seek(table_names[0], keys, &values);
VLOG(3) << "(logid=" << log_id << ") cube seek status: " << ret;
int64_t seek_start = timeline.TimeStampUS();
int ret = cube->seek(table_names[0], unique_keys, &values);
int64_t seek_end = timeline.TimeStampUS();
VLOG(2) << "(logid=" << log_id << ") cube seek status: " << ret
<< " , unique_key: " << unique_keys.size()
<< " , seek_time: " << seek_end - seek_start;
for (size_t i = 0; i < unique_keys.size(); ++i) {
key_map[unique_keys[i]] = &values[i];
}
if (values.size() != keys.size() || values[0].buff.size() == 0) {
LOG(ERROR) << "cube value return null";
}
// EMBEDDING_SIZE is the length of the sparse embedding vector; users may override it here.
size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
// size_t EMBEDDING_SIZE = (values[0].buff.size() - 10) / sizeof(float);
//size_t EMBEDDING_SIZE = 9;
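// Illustrative example (hypothetical sizes): if each cube value buffer holds
// 36 bytes, then EMBEDDING_SIZE = 36 / sizeof(float) = 9 floats per key.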
TensorVector sparse_out;
sparse_out.resize(sparse_count);
TensorVector dense_out;
......@@ -126,8 +199,10 @@ int GeneralDistKVInferOp::inference() {
std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance();
std::shared_ptr<PaddleGeneralModelConfig> model_config = resource.get_general_model_config().front();
// copy data to tensor
std::shared_ptr<PaddleGeneralModelConfig> model_config =
resource.get_general_model_config().front();
int cube_key_found = 0;
int cube_key_miss = 0;
for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) {
dense_out[dense_idx] = in->at(i);
......@@ -142,43 +217,75 @@ int GeneralDistKVInferOp::inference() {
sparse_out[sparse_idx].lod[x].begin());
}
sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
sparse_out[sparse_idx].shape.push_back(sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i];
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float));
float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
if (!dst_ptr) {
VLOG(2) << "dst_ptr is null. sparse_idx:" << sparse_idx;
continue;
}
for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
memcpy(data_ptr,
values[cube_val_idx].buff.data(),
values[cube_val_idx].buff.size());
cube_val_idx++;
uint64_t cur_key = keys[cube_val_idx];
rec::mcube::CubeValue *cur_val = key_map[cur_key];
if (cur_val->buff.size() == 0) {
memset(data_ptr, (float)0.0, sizeof(float) * EMBEDDING_SIZE);
++cube_key_miss;
++cube_val_idx;
continue;
}
// The data generated by pslib has 10 bytes of information to be filtered
// out
memcpy(data_ptr, cur_val->buff.data(), cur_val->buff.size() );
// VLOG(3) << keys[cube_val_idx] << ":" << data_ptr[0] << ", " <<
// data_ptr[1] << ", " <<data_ptr[2] << ", " <<data_ptr[3] << ", "
// <<data_ptr[4] << ", " <<data_ptr[5] << ", " <<data_ptr[6] << ", "
// <<data_ptr[7] << ", " <<data_ptr[8];
++cube_key_found;
++cube_val_idx;
}
++sparse_idx;
}
VLOG(3) << "(logid=" << log_id << ") sparse tensor load success.";
bool cube_fail = (cube_key_found == 0);
if (cube_fail) {
LOG(WARNING) << "(logid=" << log_id << ") cube seek fail";
}
VLOG(2) << "(logid=" << log_id << ") cube key found: " << cube_key_found
<< " , cube key miss: " << cube_key_miss;
VLOG(2) << "(logid=" << log_id << ") sparse tensor load success.";
timeline.Pause();
VLOG(2) << "dist kv, cube and datacopy time: " << timeline.ElapsedUS();
TensorVector infer_in;
infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
int batch_size = input_blob->_batch_size;
output_blob->_batch_size = batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
// call paddle inference here
if (InferManager::instance().infer(
engine_name().c_str(), &infer_in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id << ") Failed do infer in fluid model: " << engine_name();
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name();
return -1;
}
int64_t end = timeline.TimeStampUS();
if (cube_fail) {
float *out_ptr = static_cast<float *>(out->at(0).data.data());
out_ptr[0] = 0.0;
}
timeline.Pause();
VLOG(2) << "dist kv, pure paddle infer time: " << timeline.ElapsedUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
return 0;
}
DEFINE_OP(GeneralDistKVInferOp);
......
......@@ -31,7 +31,22 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
// support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
enum ProtoDataType {
P_INT64 = 0,
P_FLOAT32,
P_INT32,
P_FP64,
P_INT16,
P_FP16,
P_BF16,
P_UINT8,
P_INT8,
P_BOOL,
P_COMPLEX64,
P_COMPLEX128,
P_STRING = 20,
};
int GeneralReaderOp::inference() {
// read request from client
......@@ -78,6 +93,7 @@ int GeneralReaderOp::inference() {
int64_t elem_type = 0;
int64_t elem_size = 0;
int64_t databuf_size = 0;
const void* src_ptr = nullptr;
for (int i = 0; i < var_num; ++i) {
paddle::PaddleTensor paddleTensor;
const Tensor &tensor = req->tensor(i);
......@@ -86,19 +102,38 @@ int GeneralReaderOp::inference() {
elem_size = 0;
databuf_size = 0;
elem_type = tensor.elem_type();
VLOG(2) << "var[" << i << "] has elem type: " << elem_type;
src_ptr = nullptr;
if (elem_type == P_INT64) { // int64
elem_size = sizeof(int64_t);
paddleTensor.dtype = paddle::PaddleDType::INT64;
data_len = tensor.int64_data_size();
src_ptr = tensor.int64_data().data();
} else if (elem_type == P_FLOAT32) {
elem_size = sizeof(float);
paddleTensor.dtype = paddle::PaddleDType::FLOAT32;
data_len = tensor.float_data_size();
src_ptr = tensor.float_data().data();
} else if (elem_type == P_INT32) {
elem_size = sizeof(int32_t);
paddleTensor.dtype = paddle::PaddleDType::INT32;
data_len = tensor.int_data_size();
src_ptr = tensor.int_data().data();
} else if (elem_type == P_UINT8) {
elem_size = sizeof(uint8_t);
paddleTensor.dtype = paddle::PaddleDType::UINT8;
data_len = tensor.tensor_content().size();
src_ptr = tensor.tensor_content().data();
} else if (elem_type == P_INT8) {
elem_size = sizeof(int8_t);
paddleTensor.dtype = paddle::PaddleDType::INT8;
data_len = tensor.tensor_content().size();
src_ptr = tensor.tensor_content().data();
} else if (elem_type == P_FP16) {
// copy bytes from tensor content to TensorVector
elem_size = 1;
paddleTensor.dtype = paddle::PaddleDType::FLOAT16;
data_len = tensor.tensor_content().size();
src_ptr = tensor.tensor_content().data();
} else if (elem_type == P_STRING) {
// use paddle::PaddleDType::UINT8 for String.
elem_size = sizeof(char);
......@@ -109,8 +144,18 @@ int GeneralReaderOp::inference() {
// now only support single string
for (int idx = 0; idx < tensor.data_size(); idx++) {
data_len += tensor.data()[idx].length() + 1;
src_ptr = tensor.data()[idx].data();
}
}
VLOG(2) << "var[" << i << "] has elem type: " << elem_type << ";"
<< "elem_size=" << elem_size << ";"
<< "dtype=" << paddleTensor.dtype << ";"
<< "data_len=" << data_len;
if (src_ptr == nullptr) {
LOG(ERROR) << "Not support var[" << i << "] with elem_type["
<< elem_type << "]";
continue;
}
// implement lod tensor here
// only support 1-D lod
// TODO(HexToString): support 2-D lod
......@@ -141,44 +186,17 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] has lod_tensor and len=" << out->at(i).lod[0].back();
}
if (elem_type == P_INT64) {
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << tensor.int64_data(0);
if (!dst_ptr) {
LOG(ERROR) << "dst_ptr is nullptr";
return -1;
}
memcpy(dst_ptr, tensor.int64_data().data(), databuf_size);
/*
int elem_num = tensor.int64_data_size();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[k] = tensor.int64_data(k);
}
*/
} else if (elem_type == P_FLOAT32) {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << tensor.float_data(0);
if (!dst_ptr) {
LOG(ERROR) << "dst_ptr is nullptr";
return -1;
}
memcpy(dst_ptr, tensor.float_data().data(), databuf_size);
/*int elem_num = tensor.float_data_size();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[k] = tensor.float_data(k);
}*/
} else if (elem_type == P_INT32) {
int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << tensor.int_data(0);
if (!dst_ptr) {
LOG(ERROR) << "dst_ptr is nullptr";
return -1;
}
memcpy(dst_ptr, tensor.int_data().data(), databuf_size);
} else if (elem_type == P_STRING) {
void* dst_ptr = out->at(i).data.data();
if (!dst_ptr) {
LOG(ERROR) << "dst_ptr is nullptr";
return -1;
}
// For common data, we just copy from src to dst
// For string data, we need to iterate through all str
if (elem_type != P_STRING) {
memcpy(dst_ptr, src_ptr, databuf_size);
} else {
char *dst_ptr = static_cast<char *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << tensor.data(0);
......
......@@ -74,10 +74,19 @@ int GeneralResponseOp::inference() {
// and the order of Output is the same as the prototxt FetchVar.
// otherwise, you can only get the Output by the corresponding of
// Name -- Alias_name.
fetch_index.resize(req->fetch_var_names_size());
for (int i = 0; i < req->fetch_var_names_size(); ++i) {
fetch_index[i] =
model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
if (req->fetch_var_names_size() > 0) {
fetch_index.resize(req->fetch_var_names_size());
for (int i = 0; i < req->fetch_var_names_size(); ++i) {
fetch_index[i] =
model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
}
} else {
fetch_index.resize(model_config->_fetch_alias_name.size());
for (int i = 0; i < model_config->_fetch_alias_name.size(); ++i) {
fetch_index[i] =
model_config
->_fetch_alias_name_to_index[model_config->_fetch_alias_name[i]];
}
}
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
......@@ -105,7 +114,7 @@ int GeneralResponseOp::inference() {
// fetch_index is the real index in FetchVar of Fetchlist
// for example, FetchVar = {0:A, 1:B, 2:C}
// FetchList = {0:C,1:A}, at this situation.
// fetch_index = [2,0], C`index = 2 and A`index = 0
// fetch_index = [2,0], C`index = 2 and A`index = 0
for (auto &idx : fetch_index) {
Tensor *tensor = output->add_tensor();
tensor->set_name(in->at(idx).name);
......@@ -159,6 +168,21 @@ int GeneralResponseOp::inference() {
google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
data_ptr + cap);
output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data);
} else if (dtype == paddle::PaddleDType::UINT8) {
tensor->set_elem_type(7);
VLOG(2) << "(logid=" << log_id << ")Prepare uint8 var ["
<< model_config->_fetch_name[idx] << "].";
tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
} else if (dtype == paddle::PaddleDType::INT8) {
tensor->set_elem_type(8);
VLOG(2) << "(logid=" << log_id << ")Prepare int8 var ["
<< model_config->_fetch_name[idx] << "].";
tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
} else if (dtype == paddle::PaddleDType::FLOAT16) {
tensor->set_elem_type(5);
VLOG(2) << "(logid=" << log_id << ")Prepare float16 var ["
<< model_config->_fetch_name[idx] << "].";
tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
}
VLOG(2) << "(logid=" << log_id << ") fetch var ["
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
syntax = "proto3";
import "pds_option.proto";
import "builtin_format.proto";
package baidu.paddle_serving.predictor.general_model;
......@@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model;
option cc_generic_services = true;
message Tensor {
repeated string data = 1;
repeated int32 int_data = 2;
repeated int64 int64_data = 3;
repeated float float_data = 4;
optional int32 elem_type =
5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
repeated int32 shape = 6; // shape should include batch
repeated int32 lod = 7; // only for fetch tensor currently
optional string name = 8; // get from the Model prototxt
optional string alias_name = 9; // get from the Model prototxt
// VarType: INT64
repeated int64 int64_data = 1;
// VarType: FP32
repeated float float_data = 2;
// VarType: INT32
repeated int32 int_data = 3;
// VarType: FP64
repeated double float64_data = 4;
// VarType: UINT32
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (Not supported) VarType: COMPLEX64; element 2x holds the real part and
// element 2x+1 holds the imaginary part
repeated float complex64_data = 7;
// (Not supported) VarType: COMPLEX128; element 2x holds the real part and
// element 2x+1 holds the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string data = 9;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 20 => STRING
int32 elem_type = 10;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 11;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 12;
// Correspond to the variable 'name' in the model description prototxt.
string name = 13;
// Correspond to the variable 'alias_name' in the model description prototxt.
string alias_name = 14; // get from the Model prototxt
// VarType: FP16, INT16, INT8, BF16, UINT8
bytes tensor_content = 15;
};
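// Illustrative example (hypothetical values, protobuf text format): an FP32
// feed of shape [1, 3] without lod could be encoded as
//   tensor {
//     float_data: 0.1
//     float_data: 0.2
//     float_data: 0.3
//     elem_type: 1
//     shape: 1
//     shape: 3
//     name: "x"
//     alias_name: "input"
//   }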
message Request {
repeated Tensor tensor = 1;
repeated string fetch_var_names = 2;
optional bool profile_server = 3 [ default = false ];
required uint64 log_id = 4 [ default = 0 ];
bool profile_server = 3;
uint64 log_id = 4;
};
message Response {
repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
// Error messages
string err_msg = 4;
};
message ModelOutput {
repeated Tensor tensor = 1;
optional string engine_name = 2;
string engine_name = 2;
}
service GeneralModelService {
......
......@@ -276,43 +276,65 @@ class PdsCodeGenerator : public CodeGenerator {
"output_name",
google::protobuf::dots_to_colons(m->output_type()->full_name()));
if (m->name() == "inference") {
std::string inference_body = "";
inference_body += " brpc::ClosureGuard done_guard(done);\n";
inference_body += " brpc::Controller* cntl = \n";
inference_body += " static_cast<brpc::Controller*>(cntl_base);\n";
inference_body += " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n";
inference_body += " uint64_t log_id = request->log_id();\n";
inference_body += " cntl->set_log_id(log_id);\n";
inference_body += " ::baidu::paddle_serving::predictor::InferService* svr = \n";
inference_body += " ";
inference_body += "::baidu::paddle_serving::predictor::InferServiceManager::instance(";
inference_body += ").item(\"$service$\");\n";
inference_body += " if (svr == NULL) {\n";
inference_body += " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: ";
inference_body += "$service$\";\n";
inference_body += " cntl->SetFailed(404, \"Not found service: $service$\");\n";
inference_body += " return ;\n";
inference_body += " }\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "remote_side=\[\" << cntl->remote_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "local_side=\[\" << cntl->local_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "service_name=\[\" << \"$name$\" << \"\]\";\n"; // NOLINT
inference_body += " int err_code = svr->inference(request, response, log_id);\n";
inference_body += " if (err_code != 0) {\n";
inference_body += " LOG(WARNING)\n";
inference_body += " << \"(logid=\" << log_id << \") Failed call ";
inference_body += "inferservice[$name$], name[$service$]\"\n";
inference_body += " << \", error_code: \" << err_code;\n";
inference_body += " cntl->SetFailed(err_code, \"InferService inference ";
inference_body += "failed!\");\n";
inference_body += " }\n";
inference_body += " gettimeofday(&tv, NULL);\n";
inference_body += " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n";
if (service_name == "GeneralModelService") {
inference_body += " std::ostringstream oss;\n";
inference_body += " oss << \"[serving]\"\n";
inference_body += " << \"logid=\" << log_id << \",\";\n";
inference_body += " int op_num = response->profile_time_size() / 2;\n";
inference_body += " for (int i = 0; i < op_num; ++i) {\n";
inference_body += " double t = (response->profile_time(i * 2 + 1)\n";
inference_body += " - response->profile_time(i * 2)) / 1000.0;\n";
inference_body += " oss << \"op\" << i << \"=\" << t << \"ms,\";\n";
inference_body += " }\n";
inference_body += " double total_time = (end - start) / 1000.0;\n";
inference_body += " oss << \"cost=\" << total_time << \"ms.\";\n";
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << oss.str();\n";
inference_body += " response->add_profile_time(start);\n";
inference_body += " response->add_profile_time(end);\n";
} else {
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - "; // NOLINT
inference_body += "start) << \"\]\";\n";
}
printer->Print(
" baidu::rpc::ClosureGuard done_guard(done);\n"
" baidu::rpc::Controller* cntl = \n"
" static_cast<baidu::rpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = request->log_id();\n"
" cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n"
" "
"::baidu::paddle_serving::predictor::InferServiceManager::instance("
").item(\"$service$\");\n"
" if (svr == NULL) {\n"
" LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: "
"$service$\";\n"
" cntl->SetFailed(404, \"Not found service: $service$\");\n"
" return ;\n"
" }\n"
" LOG(INFO) << \"(logid=\" << log_id << \") remote_side=\[\" " // NOLINT
"<< cntl->remote_side() << \"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") local_side=\[\" " // NOLINT
"<< cntl->local_side() << \"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") service_name=\[\" " // NOLINT
"<< \"$name$\" << \"\]\";\n"
" int err_code = svr->inference(request, response, log_id);\n"
" if (err_code != 0) {\n"
" LOG(WARNING)\n"
" << \"(logid=\" << log_id << \") Failed call "
"inferservice[$name$], name[$service$]\"\n"
" << \", error_code: \" << err_code;\n"
" cntl->SetFailed(err_code, \"InferService inference "
"failed!\");\n"
" }\n"
" gettimeofday(&tv, NULL);\n"
" long end = tv.tv_sec * 1000000 + tv.tv_usec;\n"
" // flush notice log\n"
" LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT
"start) << \"\]\";\n", // NOLINT
inference_body.c_str(),
"name",
class_name,
"service",
......@@ -1021,45 +1043,65 @@ class PdsCodeGenerator : public CodeGenerator {
"output_name",
google::protobuf::dots_to_colons(m->output_type()->full_name()));
if (m->name() == "inference") {
std::string inference_body = "";
inference_body += " brpc::ClosureGuard done_guard(done);\n";
inference_body += " brpc::Controller* cntl = \n";
inference_body += " static_cast<brpc::Controller*>(cntl_base);\n";
inference_body += " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n";
inference_body += " uint64_t log_id = request->log_id();\n";
inference_body += " cntl->set_log_id(log_id);\n";
inference_body += " ::baidu::paddle_serving::predictor::InferService* svr = \n";
inference_body += " ";
inference_body += "::baidu::paddle_serving::predictor::InferServiceManager::instance(";
inference_body += ").item(\"$service$\");\n";
inference_body += " if (svr == NULL) {\n";
inference_body += " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: ";
inference_body += "$service$\";\n";
inference_body += " cntl->SetFailed(404, \"Not found service: $service$\");\n";
inference_body += " return ;\n";
inference_body += " }\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "remote_side=\[\" << cntl->remote_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "local_side=\[\" << cntl->local_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "service_name=\[\" << \"$name$\" << \"\]\";\n"; // NOLINT
inference_body += " int err_code = svr->inference(request, response, log_id);\n";
inference_body += " if (err_code != 0) {\n";
inference_body += " LOG(WARNING)\n";
inference_body += " << \"(logid=\" << log_id << \") Failed call ";
inference_body += "inferservice[$name$], name[$service$]\"\n";
inference_body += " << \", error_code: \" << err_code;\n";
inference_body += " cntl->SetFailed(err_code, \"InferService inference ";
inference_body += "failed!\");\n";
inference_body += " }\n";
inference_body += " gettimeofday(&tv, NULL);\n";
inference_body += " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n";
if (service_name == "GeneralModelService") {
inference_body += " std::ostringstream oss;\n";
inference_body += " oss << \"[serving]\"\n";
inference_body += " << \"logid=\" << log_id << \",\";\n";
inference_body += " int op_num = response->profile_time_size() / 2;\n";
inference_body += " for (int i = 0; i < op_num; ++i) {\n";
inference_body += " double t = (response->profile_time(i * 2 + 1)\n";
inference_body += " - response->profile_time(i * 2)) / 1000.0;\n";
inference_body += " oss << \"op\" << i << \"=\" << t << \"ms,\";\n";
inference_body += " }\n";
inference_body += " double total_time = (end - start) / 1000.0;\n";
inference_body += " oss << \"cost=\" << total_time << \"ms.\";\n";
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << oss.str();\n";
inference_body += " response->add_profile_time(start);\n";
inference_body += " response->add_profile_time(end);\n";
} else {
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - "; // NOLINT
inference_body += "start) << \"\]\";\n";
}
printer->Print(
" brpc::ClosureGuard done_guard(done);\n"
" brpc::Controller* cntl = \n"
" static_cast<brpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = request->log_id();\n"
" cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n"
" "
"::baidu::paddle_serving::predictor::InferServiceManager::instance("
").item(\"$service$\");\n"
" if (svr == NULL) {\n"
" LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: "
"$service$\";\n"
" cntl->SetFailed(404, \"Not found service: $service$\");\n"
" return ;\n"
" }\n"
" LOG(INFO) << \"(logid=\" << log_id << \") "
"remote_side=\[\" << cntl->remote_side() << " // NOLINT
"\"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") "
"local_side=\[\" << cntl->local_side() << " // NOLINT
"\"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") "
"service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT
" int err_code = svr->inference(request, response, log_id);\n"
" if (err_code != 0) {\n"
" LOG(WARNING)\n"
" << \"(logid=\" << log_id << \") Failed call "
"inferservice[$name$], name[$service$]\"\n"
" << \", error_code: \" << err_code;\n"
" cntl->SetFailed(err_code, \"InferService inference "
"failed!\");\n"
" }\n"
" gettimeofday(&tv, NULL);\n"
" long end = tv.tv_sec * 1000000 + tv.tv_usec;\n"
" // flush notice log\n"
" LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT
"start) << \"\]\";\n", // NOLINT
inference_body.c_str(),
"name",
class_name,
"service",
......@@ -1492,11 +1534,6 @@ class PdsCodeGenerator : public CodeGenerator {
const FieldDescriptor* fd = in_shared_fields[si];
std::string field_name = fd->name();
printer->Print("\n/////$field_name$\n", "field_name", field_name);
if (fd->is_optional()) {
printer->Print(
"if (req->has_$field_name$()) {\n", "field_name", field_name);
printer->Indent();
}
if (fd->cpp_type() ==
google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE ||
fd->is_repeated()) {
......@@ -1509,10 +1546,6 @@ class PdsCodeGenerator : public CodeGenerator {
"field_name",
field_name);
}
if (fd->is_optional()) {
printer->Outdent();
printer->Print("}\n");
}
}
printer->Print(
......
......@@ -25,7 +25,7 @@ DEFINE_int32(port, 8010, "");
DEFINE_string(workflow_path, "./conf", "");
DEFINE_string(workflow_file, "workflow.prototxt", "");
DEFINE_string(inferservice_path, "./conf", "");
DEFINE_string(inferservice_file, "service.prototxt", "");
DEFINE_string(inferservice_file, "infer_service.prototxt", "");
DEFINE_string(logger_path, "./conf", "");
DEFINE_string(logger_file, "log.conf", "");
DEFINE_string(resource_path, "./conf", "");
......
FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../cube/cube-builder/src/seqfile_reader.cpp)
LIST(APPEND pdserving_srcs ${framework_srcs})
LIST(APPEND pclient_srcs ${framework_srcs})
......@@ -26,9 +26,90 @@
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/memory.h"
// this file is included by bsf.h
namespace im {
namespace bsf {
template <typename InItemT, typename OutItemT>
bool Task<InItemT, OutItemT>::task_fetch_init(BatchTasks<TaskT>& batchTask) {
// Double-checked locking to reduce lock granularity.
if (!fetch_init) {
if (taskmeta_num > 1) {
// When the task has been split into multiple taskmetas, a lock is required.
AutoMutex lock(task_mut);
task_fetch_create(batchTask);
} else {
// When the task has only one taskmeta, no lock is needed.
task_fetch_create(batchTask);
}
}
return true;
}
template <typename InItemT, typename OutItemT>
bool Task<InItemT, OutItemT>::task_fetch_create(BatchTasks<TaskT>& batchTask) {
if (!fetch_init) {
vector_fetch_lod_index = batchTask.vector_fetch_lod_index;
set_fetch_nobatch_index = batchTask.set_fetch_nobatch_index;
OutVectorT taskMetaOutLodTensor;
size_t fetchvar_num = batchTask._batch_out.size();
for (size_t fetchvar_index = 0; fetchvar_index < fetchvar_num;
++fetchvar_index) {
size_t fetchvar_bytesize_index =
batchTask.fetchvar_bytesize(fetchvar_index);
size_t fetchvar_batch = 0;
// 1. nobatch fetchvar case
if (set_fetch_nobatch_index.size() > 0 &&
set_fetch_nobatch_index.find(fetchvar_index) !=
set_fetch_nobatch_index.end()) {
fetchvar_batch = 1;
} else if (vector_fetch_lod_index.size() > 0 &&
std::find(vector_fetch_lod_index.begin(),
vector_fetch_lod_index.end(),
fetchvar_index) != vector_fetch_lod_index.end()) {
// lod fetchvar case: the total shape[0] cannot be determined yet.
// Allocate task_num temporary buffers according to the number of taskmetas
// in the task, copy each lod fetchvar into its own temporary buffer, then
// compute the total size at the end and merge the fetchvar data and lod.
fetchvar_batch = 0;
} else {
// ordinary fetchvar case: the total fetchvar_batch of this Task equals
// the total input batch_size()
fetchvar_batch = batch_size();
}
paddle::PaddleTensor tensor_out;
tensor_out.name = batchTask._batch_out[fetchvar_index].name;
tensor_out.dtype =
paddle::PaddleDType(batchTask._batch_out[fetchvar_index].dtype);
tensor_out.shape = batchTask._batch_out[fetchvar_index].shape;
tensor_out.shape[0] = fetchvar_batch;
if (fetchvar_batch != 0) {
// lod is empty at this point.
tensor_out.lod = batchTask._batch_out[fetchvar_index].lod;
// resize all batch memory at one time
size_t databuf_size = fetchvar_batch * fetchvar_bytesize_index;
tensor_out.data.Resize(databuf_size);
} else {
// When taskmeta_num == 1, only one taskMeta operates on the task at a time,
// so there is no thread-safety concern and taskMeta->task can resize and
// copy directly. When the task is split into multiple taskMetas, temporary
// objects record the pieces, which are merged once all have been collected.
if (taskmeta_num > 1) {
taskMetaOutLodTensor.push_back(tensor_out);
}
}
outVectorT_ptr->push_back(tensor_out);
}
// outLodTensorVector is effectively a two-level vector whose shape is
// taskmeta_num * vector_fetch_lod_index.size();
outLodTensorVector.resize(taskmeta_num, taskMetaOutLodTensor);
fetch_init = true;
}
return true;
}
template <typename TaskT>
void* TaskExecutor<TaskT>::thread_entry(void* args) {
ThreadContext<TaskT>* context = static_cast<ThreadContext<TaskT>*>(args);
......@@ -134,9 +215,10 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
LOG(ERROR) << "Failed get TaskT from object pool";
return TaskHandler<TaskT>::valid_handle();
}
task->clear();
/*
if (!BatchTasks<TaskT>::check_valid(in, out, _batch_align)) {
if (!BatchTasks<TaskT>::check_valid(in, out, _overrun)) {
LOG(ERROR) << "Invalid input & output";
return TaskHandler<TaskT>::valid_handle();
}
......@@ -156,9 +238,11 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr;
task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr;
if (!task->task_init()) {
LOG(ERROR) << "task->init() failed";
}
task->rem = task->batch_size();
task->index.store(0, butil::memory_order_relaxed);
AutoMutex lock(_mut);
_task_queue.push_back(task);
THREAD_COND_SIGNAL(&_cond);
......@@ -168,11 +252,12 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
// this function is accessed by multi thread.
// so AutoMutex at first.
// so batch.append_task is thread safe.
// so batchTask.append_task is thread safe.
// you dont need to add extra lock in append_task()
// task is already init.
template <typename TaskT>
bool TaskExecutor<TaskT>::move_task_to_batch(
BatchTasks<TaskT>& batch) { // NOLINT
BatchTasks<TaskT>& batchTask) { // NOLINT
AutoMutex lock(_mut);
while (_task_queue.empty()) {
THREAD_COND_WAIT(&_cond, &_mut);
......@@ -183,15 +268,65 @@ bool TaskExecutor<TaskT>::move_task_to_batch(
return false;
}
TaskT* previous_task = nullptr;
while (!_task_queue.empty()) {
TaskT* task = _task_queue.front();
size_t rem = batch.append_task(task);
// It cannot be known in advance whether a fetchVar is a lod tensor (even if
// the input is non-lod, the output may still be lod).
// The simple approach: never split a task, i.e. user requests may be merged
// and predicted together, but a single request is never split into two
// smaller parts. Just set the engine attribute allow_split_request = false.
// The complex approach: allow splitting a Task, whether or not lod is
// involved. The difficulty is that before prediction we know how many
// taskmetas the task was split into, but only after prediction do we know
// how many fetchvars there are and how many of them are lod.
// Therefore the task must first create taskmeta_num * fetchvar_num (lod
// type) temporary PaddleTensors (holding data and lod).
// Since the unit of multi-threaded scheduling is the taskmeta, these can
// only be created in notify_task via taskmeta->task. Because multiple
// taskmetas map to one task, there is multi-thread contention, so the task
// must take a lock. Atomic operations are not enough, because every thread
// must wait until the PaddleTensors above have been created before it can
// continue. Ordinary (non-lod) fetches also need the lock to create their
// PaddleTensors before data can be copied into them.

// _overrun controls whether the asynchronous BatchTasks may temporarily
// exceed its limit in a single round.
// When _overrun is true, even if only 1 batch slot remains in BatchTasks, a
// complete Task is still placed into it, temporarily exceeding the limit.
// When _overrun is false, this is not allowed.
// If the model itself has a maximum batch limit, set it to false (the
// default). If the model has no maximum batch limit but a maximum batch was
// configured for BatchTasks, setting it to true can be considered.
// _allow_split_request == true allows splitting a task: if 1 batch slot
// remains in BatchTasks, 1 batch is split off from the next Task.
// _allow_split_request == false means tasks are never split; a remaining
// 1-batch slot in BatchTasks is wasted.
// The default is true, which allows splitting tasks to maximize utilization.
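// Illustrative example (hypothetical numbers): capacity 8, 7 slots filled,
// next task has batch_size = 4:
//   _allow_split_request = true  -> 1 batch is split off; the other 3 wait.
//   _allow_split_request = false, _overrun = true  -> the whole task (4) is
//     appended, temporarily exceeding the capacity.
//   _allow_split_request = false, _overrun = false -> the task waits for the
//     next BatchTasks round.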
if (!batchTask.get_allow_split_request()) {
if (task->batch_size() > batchTask.get_rem_size() &&
!batchTask.get_overrun()) {
break;
}
}
// combine_task_valid decides whether two tasks can be merged.
// Apart from the outermost dimension, the inner shapes must be identical for
// a merge; otherwise we break out of the loop and the task goes into the
// next batchTask. This guarantees that every task passed to
// batchTask.append_task(task) shares the same inner shapes.
// For feedvars with shape[0] == 1 but != batch, only one of the values is
// kept when merging, so that feedvar must be identical for the merge to
// happen; otherwise we break out of the loop and the task goes into the
// next batchTask.
// PaddleTensor and PaddleBuf currently do not overload operator==, so only
// a raw memory comparison is possible.
// TODO(HexToString): consider supporting AutoPadding later.
if (previous_task != nullptr) {
if (!task->combine_task_valid(previous_task)) {
break;
}
}
size_t rem = batchTask.append_task(task);
previous_task = task;
if (task->rem <= 0) {
_task_queue.pop_front();
}
if (rem <= 0) break;
}
LOG(INFO) << "Number of tasks remaining in _task_queue is"
<< _task_queue.size();
return true;
}
......@@ -201,11 +336,12 @@ bool TaskExecutor<TaskT>::move_task_to_batch(
// TaskT is from the SingleTon TaskExecutor`s _task_queue
// although TaskMeta is a local variable, but several TaskMeta may points to
// the same TaskT which is get from the SingleTon TaskExecutor`s _task_queue.
// put TaskMeta to the local variable BatchTasks<TaskT> batch.
// put TaskMeta to the local variable BatchTasks<TaskT> batchTask.
// batch.merge_tasks() and batch.notify_tasks() has no lock.
// BatchTasks<TaskT> batch itself is a local variable, it`s thread safe.
// If batch.merge_tasks() and batch.notify_tasks() do something to TaskMeta
// batchTask.merge_tasks() and batchTask.notify_tasks() has no lock.
// BatchTasks<TaskT> batchTask itself is a local variable, it`s thread safe.
// If batchTask.merge_tasks() and batchTask.notify_tasks() do something to
// TaskMeta
// you need to pay attention to that.
// Multi-Thread deal with different TaskMeta(cause it`s created as local
// variable)
......@@ -242,11 +378,23 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
return -1;
}
BatchTasks<TaskT> batch(_batch_size, _batch_align);
if (move_task_to_batch(batch)) {
batch.merge_tasks();
_fn(&batch.in(), &batch.out());
batch.notify_tasks();
// move_task_to_batch() takes the original task from the `_task_queue` and
// puts it into the BatchTasks' own Vector<taskmeta>; the capacity of that
// vector is decided by `_batch_size` or `_overrun`.
// merge_tasks() moves the input data from the Vector<taskmeta> into
// `_batch_in`, because the predictor's input is `_batch_in`.
// notify_tasks() moves the output data from `_batch_out` into every single
// taskmeta, because the predictor's output is `_batch_out`.
BatchTasks<TaskT> batchTask(_batch_size, _overrun, _allow_split_request);
if (move_task_to_batch(batchTask)) {
batchTask.merge_tasks();
_fn(&batchTask.in(), &batchTask.out());
batchTask.notify_tasks();
}
}
......
This diff has been collapsed.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#include "core/predictor/framework/cache.h"
#include <dirent.h>
#include <sys/stat.h>
#include <fstream>
#include <string>
#include <utility>
#include "core/cube/cube-builder/include/cube-builder/seqfile_reader.h"
namespace baidu {
namespace paddle_serving {
namespace predictor {
int CubeCache::clear() {
for (auto it = _map_cache.begin(); it != _map_cache.end(); ++it) {
if (it->second) {
delete (it->second);
it->second = nullptr;
}
}
_map_cache.clear();
return 0;
}
rec::mcube::CubeValue* CubeCache::get_data(uint64_t key) {
auto it = _map_cache.find(key);
if (it != _map_cache.end()) {
return it->second;
}
return nullptr;
}
int CubeCache::reload_data(const std::string& cache_path) {
LOG(INFO) << "cube cache is loading data, path: " << cache_path;
DIR* dp = nullptr;
struct dirent* dirp = nullptr;
struct stat st;
// clear cache data
clear();
// loading data from cache files
if (stat(cache_path.c_str(), &st) < 0 || !S_ISDIR(st.st_mode)) {
LOG(ERROR) << "invalid cache path " << cache_path;
return -1;
}
if ((dp = opendir(cache_path.c_str())) == nullptr) {
LOG(ERROR) << "opendir " << cache_path << " fail.";
return -1;
}
while ((dirp = readdir(dp)) != nullptr) {
// filtering by file type.
if (dirp->d_type != DT_REG) {
continue;
}
// Filter upper-level directories and hidden files
if ((!strncmp(dirp->d_name, ".", 1)) || (!strncmp(dirp->d_name, "..", 2))) {
continue;
}
// Match files whose name contains 'part-'
if (std::string(dirp->d_name).find("part-") != std::string::npos) {
SequenceFileRecordReader reader(cache_path + "/" + dirp->d_name);
if (reader.open() != 0) {
LOG(ERROR) << "open file failed! " << dirp->d_name;
continue;
}
if (reader.read_header() != 0) {
LOG(ERROR) << "read header error! " << dirp->d_name;
reader.close();
continue;
}
Record record(reader.get_header());
while (reader.next(&record) == 0) {
uint64_t key =
*reinterpret_cast<uint64_t*>(const_cast<char*>(record.key.data()));
auto it_find = _map_cache.find(key);
if (it_find != _map_cache.end()) {
// loaded a duplicate key
LOG(WARNING) << "Load duplicate key:" << key
<< " from file:" << dirp->d_name;
continue;
}
rec::mcube::CubeValue* new_value = new rec::mcube::CubeValue();
new_value->error = 0;
new_value->buff.swap(record.value);
_map_cache.insert(std::make_pair(key, new_value));
}
LOG(WARNING) << "Load cube cache file " << dirp->d_name << " done.";
}
LOG(WARNING) << "Load all cube cache files done";
}
return 0;
}
} // namespace predictor
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sys/types.h>
#include <numeric>
#include <string>
#include <unordered_map>
#include "core/cube/cube-api/include/cube_api.h"
namespace baidu {
namespace paddle_serving {
namespace predictor {
// Large models that use sparse parameters may use cube cache.
// When the cube cache exists, the model is required to be
// consistent with the version of the cube cache. Therefore,
// when the model is updated, the model and the cube cache are
// required to be reloaded at the same time.
// All cached data is loaded at once and never updated in place, so the
// two cube caches can be switched lock free.
class CubeCache {
public:
CubeCache() {}
~CubeCache() { clear(); }
// clear cache data.
int clear();
// get cache data by key
rec::mcube::CubeValue* get_data(uint64_t key);
// reload all cache files from cache_path
int reload_data(const std::string& cache_path);
private:
// lock-free switching; key type is uint64_t, value type is CubeValue*
std::unordered_map<uint64_t, rec::mcube::CubeValue*> _map_cache;
};
} // namespace predictor
} // namespace paddle_serving
} // namespace baidu
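// --- Editor's sketch, not part of the original source ----------------------
// Minimal usage of the CubeCache declared above, assuming cache.h and
// cube_api.h are included; the directory and key below are hypothetical.
static void cube_cache_usage_example() {
  baidu::paddle_serving::predictor::CubeCache cache;
  // Loads every 'part-*' SequenceFile found under this directory.
  if (cache.reload_data("./cube_cache_dir") != 0) {
    return;  // invalid path or unreadable cache files
  }
  const uint64_t key = 12345;  // a sparse-parameter key
  rec::mcube::CubeValue* value = cache.get_data(key);  // nullptr if missing
  if (value != nullptr) {
    // value->buff holds the raw bytes stored for this key.
  }
}
// --- end of sketch ----------------------------------------------------------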
......@@ -21,6 +21,15 @@
#include <string>
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/op_repository.h"
#ifdef BCLOUD
#include <base/atomicops.h>
#else
#include <butil/atomicops.h>
#endif
#include <errno.h>
#include "core/predictor/framework/resource.h"
using baidu::paddle_serving::predictor::Resource;
namespace baidu {
namespace paddle_serving {
......@@ -238,6 +247,77 @@ const Channel* DagView::get_response_channel(const uint64_t log_id) const {
return last_op->mutable_channel();
}
void* call_back(void* ori_args) {
Resource::instance().thread_initialize();
Args* args = (Args*)ori_args;
Op* op = static_cast<Op*>(args->_op);
uint64_t log_id = static_cast<uint64_t>(args->_log_id);
bool debug = static_cast<bool>(args->_debug);
args->errcode = op->process(log_id, debug);
return nullptr;
}
int ParallelDagView::execute_one_stage(ViewStage* vstage,
const uint64_t log_id,
butil::IOBufBuilder* debug_os) {
butil::Timer stage_time(butil::Timer::STARTED);
uint32_t node_size = vstage->nodes.size();
std::vector<THREAD_T> tids(node_size);
Args* args = new Args[node_size];
VLOG(2) << "(logid=" << log_id << ") vstage->nodes.size(): " << node_size;
for (uint32_t ni = 0; ni < node_size; ni++) {
ViewNode* vnode = vstage->nodes[ni];
DagNode* conf = vnode->conf;
Op* op = vnode->op;
TRACEPRINTF(
"(logid=%" PRIu64 ") start to execute op[%s]", log_id, op->name());
args[ni]._op = op;
args[ni]._log_id = log_id;
args[ni]._debug = (debug_os != NULL);
int rc = THREAD_CREATE(&tids[ni], NULL, call_back, (void*)(args + ni));
if (rc != 0) {
LOG(ERROR) << "failed to create ParallelDagView worker thread: index="
<< ni << ", rc=" << rc << ", errno=" << errno << ":"
<< strerror(errno);
delete[] args;
return -1;
}
}
for (uint32_t ni = 0; ni < node_size; ni++) {
THREAD_JOIN(tids[ni], NULL);
int errcode = args[ni].errcode;
Op* op = args[ni]._op;
TRACEPRINTF(
"(logid=%" PRIu64 ") finish to execute op[%s]", log_id, op->name());
if (errcode < 0) {
LOG(ERROR) << "(logid=" << log_id
<< ") Execute failed, Op:" << op->debug_string();
delete[] args;
return errcode;
}
if (errcode > 0) {
LOG(INFO) << "(logid=" << log_id
<< ") Execute ignore, Op:" << op->debug_string();
continue;
}
if (debug_os) {
(*debug_os) << "(logid=" << log_id << ") {\"op_name\": \"" << op->name()
<< "\", \"debug_str:\": \"" << op->debug_string()
<< "\", \"time_info\": \"" << op->time_info() << "\"}";
}
// LOG(DEBUG) << "Execute succ, Op:" << op->debug_string();
}
stage_time.stop();
PredictorMetric::GetInstance()->update_latency_metric(
STAGE_METRIC_PREFIX + vstage->full_name, stage_time.u_elapsed());
delete[] args;
return ERR_OK;
}
} // namespace predictor
} // namespace paddle_serving
} // namespace baidu
......@@ -24,7 +24,7 @@ namespace baidu {
namespace paddle_serving {
namespace predictor {
class Op;
// class Op;
struct ViewNode {
Op* op; // op->full_name == service_workflow_stageindex_opname
......@@ -75,11 +75,20 @@ class DagView {
Bus* _bus;
};
struct Args {
Op* _op;
uint64_t _log_id;
bool _debug;
int errcode;
};
// The derived DagView supports a parallel execution
// strategy by implementing execute_one_stage().
class ParallelDagView : public DagView {
public:
int execute_one_stage(ViewStage* vstage, butil::IOBufBuilder*) { return 0; }
virtual int execute_one_stage(ViewStage* vstage,
const uint64_t log_id,
butil::IOBufBuilder* debug_os);
};
} // namespace predictor
......
......@@ -25,7 +25,8 @@ int ReloadableInferEngine::proc_initialize_impl(
_model_dir = conf.model_dir();
_infer_thread_num = conf.runtime_thread_num();
_infer_batch_size = conf.batch_infer_size();
_infer_batch_align = conf.enable_batch_align();
_infer_overrun = conf.enable_overrun();
_allow_split_request = conf.allow_split_request();
_conf = conf;
......@@ -56,9 +57,6 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
}
// init bsf framework
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this));
......@@ -69,8 +67,10 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
boost::bind(&InferEngine::task_infer_impl, this, _1, _2));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_size(
_infer_batch_size);
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_align(
_infer_batch_align);
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_overrun(
_infer_overrun);
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_allow_split_request(_allow_split_request);
if (im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].start(
_infer_thread_num) != 0) {
LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num;
......@@ -79,7 +79,8 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
LOG(WARNING) << "Enable batch schedule framework, thread_num:"
<< _infer_thread_num << ", batch_size:" << _infer_batch_size
<< ", enable_batch_align:" << _infer_batch_align;
<< ", enable_overrun:" << _infer_overrun
<< ", allow_split_request:" << _allow_split_request;
return 0;
}
......@@ -348,7 +349,7 @@ T* VersionedInferEngine::get_core() {
}
template <typename T>
T* VersionedInferEngine::get_core(uint64_t version) {
T* VersionedInferEngine::get_core(const uint64_t version) {
auto iter = _versions.find(version);
if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version;
......@@ -363,6 +364,15 @@ T* VersionedInferEngine::get_core(uint64_t version) {
return NULL;
}
CubeCache* VersionedInferEngine::get_cube_cache() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
return nullptr;
}
return engine->get_cube_cache();
}
int VersionedInferEngine::proc_initialize_impl(
const configure::EngineDesc& conf, bool) {
return -1;
......@@ -382,6 +392,11 @@ int VersionedInferEngine::task_infer_impl(const void* in,
return -1;
}
int InferManager::set_taskexecutor_num(size_t total_engine_num) {
im::bsf::TaskExecutorVector<TaskT>::instance().resize(total_engine_num);
return 0;
}
int InferManager::proc_initialize(const char* path,
const char* file,
std::shared_ptr<int> engine_index_ptr) {
......@@ -391,8 +406,6 @@ int InferManager::proc_initialize(const char* path,
return -1;
}
uint32_t engine_num = model_toolkit_conf.engines_size();
im::bsf::TaskExecutorVector<TaskT>::instance().resize(*engine_index_ptr +
engine_num);
for (uint32_t ei = 0; ei < engine_num; ++ei) {
LOG(INFO) << "model_toolkit_conf.engines(" << ei
<< ").name: " << model_toolkit_conf.engines(ei).name();
......@@ -502,6 +515,15 @@ T* InferManager::get_core(const char* model_name) {
return NULL;
}
CubeCache* InferManager::get_cube_cache(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
return nullptr;
}
return it->second->get_cube_cache();
}
// Versioned inference interface
int InferManager::infer(const char* model_name,
const void* in,
......@@ -517,7 +539,7 @@ int InferManager::infer(const char* model_name,
}
template <typename T>
T* InferManager::get_core(const char* model_name, uint64_t version) {
T* InferManager::get_core(const char* model_name, const uint64_t version) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
......
......@@ -135,6 +135,17 @@ int Resource::initialize(const std::string& path, const std::string& file) {
if (FLAGS_enable_model_toolkit) {
size_t model_toolkit_num = resource_conf.model_toolkit_path_size();
// For now we assume that each model_toolkit contains exactly one engine,
// so model_toolkit_num == total number of engines.
// If a model_toolkit can contain multiple engines in the future,
// first count the total number of engines in a for loop and only then call
// set_taskexecutor_num.
// Never resize im::bsf::TaskExecutorVector<TaskT>::instance() dynamically:
// TaskExecutor is a thread pool that contains a lock, and its worker loop
// has already started taking that lock once the engine is initialized.
// A later resize moves the memory, so the workers keep using the old lock
// while the relocated TaskExecutor`s lock memory has already changed.
if (InferManager::instance().set_taskexecutor_num(model_toolkit_num) != 0) {
LOG(ERROR) << "failed set_taskexecutor_num";
return -1;
}
std::shared_ptr<int> engine_index_ptr(new int(0));
for (size_t mi = 0; mi < model_toolkit_num; ++mi) {
std::string model_toolkit_path = resource_conf.model_toolkit_path(mi);
......@@ -165,18 +176,18 @@ int Resource::initialize(const std::string& path, const std::string& file) {
rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() +
"/" + resource_conf.cube_config_file();
this->cube_config_fullpath = cube_config_fullpath;
this->cube_quant_bits = resource_conf.has_cube_quant_bits()
? resource_conf.cube_quant_bits()
: 0;
if (this->cube_quant_bits != 0 && this->cube_quant_bits != 8) {
this->_cube_config_fullpath = cube_config_fullpath;
this->_cube_quant_bits = resource_conf.has_cube_quant_bits()
? resource_conf.cube_quant_bits()
: 0;
if (this->_cube_quant_bits != 0 && this->_cube_quant_bits != 8) {
LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8.";
return -1;
}
if (this->cube_quant_bits == 0) {
if (this->_cube_quant_bits == 0) {
LOG(INFO) << "cube quant mode OFF";
} else {
LOG(INFO) << "cube quant mode ON, quant bits: " << this->cube_quant_bits;
LOG(INFO) << "cube quant mode ON, quant bits: " << this->_cube_quant_bits;
}
}
......@@ -187,10 +198,10 @@ int Resource::initialize(const std::string& path, const std::string& file) {
// model config
int Resource::general_model_initialize(const std::string& path,
const std::string& file) {
if (this->cube_config_fullpath.size() != 0) {
LOG(INFO) << "init cube by config file : " << this->cube_config_fullpath;
if (this->_cube_config_fullpath.size() != 0) {
LOG(INFO) << "init cube by config file : " << this->_cube_config_fullpath;
rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
int ret = cube->init(this->cube_config_fullpath.c_str());
int ret = cube->init(this->_cube_config_fullpath.c_str());
if (ret != 0) {
LOG(ERROR) << "cube init error";
return -1;
......@@ -315,7 +326,7 @@ int Resource::thread_clear() {
}
return 0;
}
size_t Resource::get_cube_quant_bits() { return this->cube_quant_bits; }
size_t Resource::get_cube_quant_bits() { return this->_cube_quant_bits; }
int Resource::reload() {
if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) {
......
......@@ -16,8 +16,10 @@
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/infer.h"
......@@ -27,6 +29,8 @@ namespace baidu {
namespace paddle_serving {
namespace predictor {
// Paddle general model configuration; reads the model configuration
// information from the general_model_config.proto file.
class PaddleGeneralModelConfig {
public:
PaddleGeneralModelConfig() {}
......@@ -34,23 +38,47 @@ class PaddleGeneralModelConfig {
~PaddleGeneralModelConfig() {}
public:
// feed/fetch name and alias_name
std::vector<std::string> _feed_name;
std::vector<std::string> _feed_alias_name;
std::vector<int> _feed_type; // 0 int64, 1 float
std::vector<bool> _is_lod_feed; // true lod tensor
std::vector<bool> _is_lod_fetch; // whether a fetch var is lod_tensor
std::vector<int> _capacity; // capacity for each tensor
/*
feed_shape_ for each fed variable
feed_shape_[i][j] represents the jth dim for ith input Tensor
if is_lod_feed_[i] == False, feed_shape_[i][0] = -1
*/
std::vector<std::vector<int>> _feed_shape;
std::vector<std::string> _fetch_name;
std::vector<std::string> _fetch_alias_name;
// Be consistent with the var type conversion in the model saving interface
// (python/paddle_serving_client/io/__init__.py):
// int64 => 0;
// float32 => 1;
// int32 => 2;
// float64 => 3;
// int16 => 4;
// float16 => 5;
// bfloat16 => 6;
// uint8 => 7;
// int8 => 8;
// bool => 9;
// complex64 => 10;
// complex128 => 11;
std::vector<int> _feed_type;
// whether a feed or fetch var is lod_tensor.
std::vector<bool> _is_lod_feed;
std::vector<bool> _is_lod_fetch;
// capacity for each tensor
std::vector<int> _capacity;
// _feed_shape and _fetch_shape represent the dimensional information of the
// tensors.
// For example, _feed_shape[i][j] is the j-th dim of the i-th input tensor.
// If _is_lod_feed[i] == false, _feed_shape[i][0] = -1.
std::vector<std::vector<int>> _feed_shape;
std::vector<std::vector<int>> _fetch_shape;
// fetch name -> index of fetch_name vector.
std::map<std::string, int> _fetch_name_to_index;
// fetch alias name -> index of fetch_alias_name vector.
std::map<std::string, int> _fetch_alias_name_to_index;
};
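// --- Editor's sketch, not part of the original source ----------------------
// The _feed_type convention listed above, spelled out as a hypothetical enum
// purely for readability; the framework itself stores these codes as plain
// ints.
enum FeedVarType {
  FEED_INT64 = 0,
  FEED_FLOAT32 = 1,
  FEED_INT32 = 2,
  FEED_FLOAT64 = 3,
  FEED_INT16 = 4,
  FEED_FLOAT16 = 5,
  FEED_BFLOAT16 = 6,
  FEED_UINT8 = 7,
  FEED_INT8 = 8,
  FEED_BOOL = 9,
  FEED_COMPLEX64 = 10,
  FEED_COMPLEX128 = 11
};
// --- end of sketch ----------------------------------------------------------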
......@@ -73,33 +101,50 @@ class Resource {
return ins;
}
// initialize resource
int initialize(const std::string& path, const std::string& file);
// load all model configurations from prototxt
int general_model_initialize(const std::string& path,
const std::string& file);
// initialize thread local data
int thread_initialize();
// clear thread local data
int thread_clear();
// reload resources
int reload();
// finalize
int finalize();
// get all model configs
std::vector<std::shared_ptr<PaddleGeneralModelConfig>>
get_general_model_config();
// print all configurations of all models
void print_general_model_config(
const std::shared_ptr<PaddleGeneralModelConfig>& config);
// get cube quantization bit size
size_t get_cube_quant_bits();
private:
int thread_finalize() { return 0; }
private:
// configuration information of all models, loaded from prototxt files
std::vector<std::shared_ptr<PaddleGeneralModelConfig>> _configs;
std::string cube_config_fullpath;
int cube_quant_bits; // 0 if no quantization
// full path of cube configuration file.
std::string _cube_config_fullpath;
// cube quantization bit size; supports 0 or 8, set 0 for no quantization.
size_t _cube_quant_bits;
// bthread local key
THREAD_KEY_T _tls_bspec_key;
};
......
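// --- Editor's sketch, not part of the original source ----------------------
// Typical lifecycle of the Resource singleton declared above; the config
// file names are hypothetical.
static void resource_lifecycle_example() {
  using baidu::paddle_serving::predictor::Resource;
  Resource& res = Resource::instance();
  // Process-level initialization.
  if (res.initialize("./conf", "resource.prototxt") != 0) return;
  if (res.general_model_initialize("./conf", "general_model.prototxt") != 0) {
    return;
  }
  res.thread_initialize();  // once per worker thread (see call_back() above)
  // ... serve requests ...
  res.thread_clear();  // per-thread teardown
  res.finalize();      // process-level teardown
}
// --- end of sketch ----------------------------------------------------------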
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/cascade_rcnn_r50_fpx_1x_serving.tar.gz
tar xf cascade_rcnn_r50_fpx_1x_serving.tar.gz
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco_serving.tar.gz
tar xf cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco_serving.tar.gz
--port=8027
--dict_split=1
--in_mem=true
--log_dir=./log/