Commit 770b6c26 authored by bjjwwang

Merge branch 'develop' of https://github.com/paddlepaddle/serving into develop

@@ -188,7 +188,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
| `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
| `use_calib` | bool | False | Use TRT int8 calibration |
| `gpu_multi_stream` | bool | False | EnableGpuMultiStream to get larger QPS |
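For illustration, a minimal sketch of launching the server with the `precision` option from the table above, driven from Python. The model, thread, and port values mirror the uci_housing example shown elsewhere in this commit; boolean options such as `use_calib` and `gpu_multi_stream` are omitted because their exact command-line syntax may differ between Serving versions, so treat this as an assumption-laden sketch rather than the canonical invocation.

```python
# Hedged sketch: start the serving process with a precision setting
# (FP32 / FP16 / INT8, as documented in the table above).
import subprocess

cmd = [
    "python3", "-m", "paddle_serving_server.serve",
    "--model", "uci_housing_model",
    "--thread", "10",
    "--port", "9393",
    "--precision", "FP16",
]
subprocess.run(cmd, check=True)
```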
#### Description of asynchronous model
......
@@ -187,7 +187,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
| `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
| `use_calib` | bool | False | Use TRT int8 calibration |
| `gpu_multi_stream` | bool | False | EnableGpuMultiStream to get larger QPS |
#### Description of asynchronous model
......
@@ -61,8 +61,11 @@ else()
endif()
if(CUDNN_FOUND)
  if(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_version.h")
    file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
  elseif(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn.h")
    file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
  endif()
  get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
  string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
......
@@ -27,52 +27,54 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/
message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
SET(PADDLE_VERSION "2.2.0-rc0")
if (WITH_GPU)
  message("CUDA: ${CUDA_VERSION}, CUDNN_MAJOR_VERSION: ${CUDNN_MAJOR_VERSION}")
  # cuda 11.0 is not supported, 11.2 would be added.
  if(CUDA_VERSION EQUAL 10.1)
    set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5")
    set(WITH_TRT ON)
  elseif(CUDA_VERSION EQUAL 10.2)
    if(CUDNN_MAJOR_VERSION EQUAL 7)
      set(CUDA_SUFFIX "x86-64_gcc5.4_avx_mkl_cuda10.2_cudnn7.6.5_trt6.0.1.5")
      set(WITH_TRT ON)
    elseif(CUDNN_MAJOR_VERSION EQUAL 8)
      set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda10.2_cudnn8.1.1_trt7.2.3.4")
      set(WITH_TRT ON)
    endif()
  elseif(CUDA_VERSION EQUAL 11.2)
    set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda11.2_cudnn8.2.1_trt8.0.3.4")
    set(WITH_TRT ON)
  endif()
else()
  set(WITH_TRT OFF)
endif()
if (WITH_GPU)
  SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/GPU/${CUDA_SUFFIX}")
elseif (WITH_LITE)
  if (WITH_XPU)
    SET(PADDLE_LIB_VERSION "arm64_gcc7.3_openblas")
  else()
    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}")
  endif()
else()
  if (WITH_AVX)
    if (WITH_MKLML)
      SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_avx_mkl")
    else()
      SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_avx_openblas")
    endif()
  else()
    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_openblas")
  endif()
endif()
if(WITH_LITE)
  SET(PADDLE_LIB_PATH "https://paddle-inference-lib.bj.bcebos.com/2.2.0-rc0/cxx_c/Linux/XPU/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tar.gz ")
else()
  SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
endif()
......
@@ -12,41 +12,97 @@
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package baidu.paddle_serving.predictor.general_model;
option java_multiple_files = true;
option cc_generic_services = true;

message Tensor {
  // VarType: INT64
  repeated int64 int64_data = 1;

  // VarType: FP32
  repeated float float_data = 2;

  // VarType: INT32
  repeated int32 int_data = 3;

  // VarType: FP64
  repeated double float64_data = 4;

  // VarType: UINT32
  repeated uint32 uint32_data = 5;

  // VarType: BOOL
  repeated bool bool_data = 6;

  // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
  // represents the imaginary part
  repeated float complex64_data = 7;

  // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
  // represents the imaginary part
  repeated double complex128_data = 8;

  // VarType: STRING
  repeated string data = 9;

  // Element types:
  // 0 => INT64
  // 1 => FP32
  // 2 => INT32
  // 3 => FP64
  // 4 => INT16
  // 5 => FP16
  // 6 => BF16
  // 7 => UINT8
  // 8 => INT8
  // 9 => BOOL
  // 10 => COMPLEX64
  // 11 => COMPLEX128
  // 20 => STRING
  int32 elem_type = 10;

  // Shape of the tensor, including batch dimensions.
  repeated int32 shape = 11;

  // Level of data(LOD), support variable length data, only for fetch tensor
  // currently.
  repeated int32 lod = 12;

  // Correspond to the variable 'name' in the model description prototxt.
  string name = 13;

  // Correspond to the variable 'alias_name' in the model description prototxt.
  string alias_name = 14; // get from the Model prototxt

  // VarType: FP16, INT16, INT8, BF16, UINT8
  bytes tensor_content = 15;
};

message Request {
  repeated Tensor tensor = 1;
  repeated string fetch_var_names = 2;
  bool profile_server = 3;
  uint64 log_id = 4;
};

message Response {
  repeated ModelOutput outputs = 1;
  repeated int64 profile_time = 2;
  // Error code
  int32 err_no = 3;
  // Error messages
  string err_msg = 4;
};

message ModelOutput {
  repeated Tensor tensor = 1;
  string engine_name = 2;
}

service GeneralModelService {
  rpc inference(Request) returns (Response);
  rpc debug(Request) returns (Response);
};
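As a quick reference for filling `elem_type` from a client, the sketch below mirrors the codes and typed fields documented in the `Tensor` message above. It is a convenience mapping written for this document (the name `ELEM_TYPE_TO_FIELD` is hypothetical, not generated from the proto); FP16/INT16/INT8/BF16/UINT8 payloads travel in `tensor_content`.

```python
# Convenience sketch: elem_type codes from the Tensor comment above, paired
# with the proto field that carries the payload for that element type.
ELEM_TYPE_TO_FIELD = {
    0: ("INT64", "int64_data"),
    1: ("FP32", "float_data"),
    2: ("INT32", "int_data"),
    3: ("FP64", "float64_data"),
    4: ("INT16", "tensor_content"),
    5: ("FP16", "tensor_content"),
    6: ("BF16", "tensor_content"),
    7: ("UINT8", "tensor_content"),
    8: ("INT8", "tensor_content"),
    9: ("BOOL", "bool_data"),
    10: ("COMPLEX64", "complex64_data"),
    11: ("COMPLEX128", "complex128_data"),
    20: ("STRING", "data"),
}
```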
@@ -22,11 +22,8 @@ message EngineDesc {
  required string reloadable_type = 4;
  required string model_dir = 5;
  repeated int32 gpu_ids = 6;
  optional string version_file = 7;
  optional string version_type = 8;

  /*
   * Sparse Parameter Service type. Valid types are:
@@ -39,17 +36,34 @@ message EngineDesc {
    LOCAL = 1;
    REMOTE = 2;
  }
  optional SparseParamServiceType sparse_param_service_type = 10;
  optional string sparse_param_service_table_name = 11;
  optional bool enable_memory_optimization = 12;
  optional bool enable_ir_optimization = 13;
  optional bool use_trt = 14;
  optional bool use_lite = 15;
  optional bool use_xpu = 16;
  optional bool use_gpu = 17;
  optional bool combined_model = 18;
  optional bool encrypted_model = 19;
  optional bool gpu_multi_stream = 20;

  /*
   * "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
   * mode.
   * n > 0 means how many Predictor for this engine in Asynchronous task
   * scheduling mode.
   * "batch_infer_size": the max batch for this engine in Asynchronous task
   * scheduling mode.
   * "enable_overrun": always put a whole task into the TaskQueue even if the
   * total batch is bigger than "batch_infer_size".
   * "allow_split_request": allow to split task(which is corresponding to
   * request).
   */
  optional int32 runtime_thread_num = 30 [ default = 0 ];
  optional int32 batch_infer_size = 31 [ default = 32 ];
  optional bool enable_overrun = 32 [ default = false ];
  optional bool allow_split_request = 33 [ default = true ];
};

// model_toolkit conf
@@ -61,11 +75,14 @@ message ResourceConf {
  repeated string model_toolkit_file = 2;
  repeated string general_model_path = 3;
  repeated string general_model_file = 4;

  optional string cube_config_path = 10;
  optional string cube_config_file = 11;
  optional int32 cube_quant_bits = 12;
  optional string cube_cache_path = 13;

  optional string auth_product_name = 20;
  optional string auth_container_id = 21;
};

// DAG node depency info
......
[{
"dict_name": "test",
"shard": 2,
"nodes": [{
"ip": "127.0.0.1",
"port": 8731
},{
"ip": "127.0.0.1",
"port": 8730
}]
}]
package main
import (
"encoding/json"
"flag"
"fmt"
"io/ioutil"
)
func main() {
dict_name := flag.String("n", "test", "cube name")
conf_path := flag.String("c", "./conf/cube.conf", "cube conf path")
input_path := flag.String("i", "./input.json", "keys to seek")
output_path := flag.String("o", "./output.json", "result to save")
flag.Parse()
bytes, err := ioutil.ReadFile(*conf_path)
if err != nil {
fmt.Println("读取配置文件失败", err)
return
}
var meta Meta
err = json.Unmarshal(bytes, &meta.Servers)
if err != nil {
fmt.Println("解析数据失败", err)
return
}
err = meta.Seek(*dict_name, *input_path, *output_path)
if err != nil {
fmt.Println(err)
}
return
}
{"keys": [0,1,2,3,4,5,6,7]}
{"keys": [1]}
package main
import "fmt"
type Meta struct {
Servers []CubeServer `json:"servers,omitempty"`
}
func (meta *Meta) Seek(dict_name string, input string, output string) (err error) {
var server CubeServer
for _, s := range meta.Servers {
if s.Name == dict_name {
server = s
break
}
}
if server.Name != dict_name {
err = fmt.Errorf("%s server not exist", dict_name)
return err
}
err = server.Seek(input, output)
return err
}
package main
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"os"
)
type Input struct {
Keys []uint64 `json:"keys"`
}
type SingleValue struct {
Status uint32 `json:"status"`
Value string `json:"value"`
}
type Output struct {
Values []SingleValue `json:"values"`
}
type ServerNode struct {
Ip string `json:"ip"`
Port uint64 `json:"port"`
}
type CubeServer struct {
Name string `json:"dict_name"`
Shard uint64 `json:"shard"`
Nodes []ServerNode `json:"nodes"`
}
func (server *CubeServer) SplitKeys(keys []uint64) (splited_keys map[uint64]Input, offset map[uint64][]uint64) {
splited_keys = make(map[uint64]Input)
offset = make(map[uint64][]uint64)
for i, key := range keys {
shard_id := key % server.Shard
temp_split, _ := splited_keys[shard_id]
temp_split.Keys = append(temp_split.Keys, key)
splited_keys[shard_id] = temp_split
temp_offset, _ := offset[shard_id]
temp_offset = append(temp_offset, uint64(i))
offset[shard_id] = temp_offset
}
return splited_keys, offset
}
func (server *CubeServer) Seek(input string, output_path string) (err error) {
file, err := os.Open(input)
if err != nil {
return err
}
defer file.Close()
buf := bufio.NewReader(file)
for {
line, err := buf.ReadBytes('\n')
//line = strings.TrimSpace(line)
if err != nil || io.EOF == err {
break
}
var temp_input Input
json.Unmarshal(line, &temp_input)
key_nums := len(temp_input.Keys)
var output Output
output.Values = make([]SingleValue, key_nums+1)
splited_keys, offset := server.SplitKeys(temp_input.Keys)
for shard_id, keys := range splited_keys {
cur_output, _ := server.Post(shard_id, keys)
for index, single_value := range cur_output.Values {
output.Values[offset[shard_id][index]] = single_value
}
}
json_str, _ := json.Marshal(output)
fp, err := os.OpenFile(output_path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0755)
if err != nil {
log.Fatal(err)
}
defer fp.Close()
_, err = fp.Write(json_str)
}
return err
}
func (server *CubeServer) Post(shard_id uint64, input Input) (output Output, err error) {
if shard_id >= uint64(len(server.Nodes)) {
err = fmt.Errorf("have no shard:%v", shard_id)
return output, err
}
json_str, _ := json.Marshal(input)
URL := fmt.Sprintf("http://%s:%v/DictService/seek", server.Nodes[shard_id].Ip, server.Nodes[shard_id].Port)
req, err := http.NewRequest("POST", URL, bytes.NewBuffer(json_str))
if err != nil {
return output, err
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return output, err
}
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return output, err
}
err = json.Unmarshal(body, &output)
return output, err
}
[{
"dict_name": "test",
"shard": 2,
"nodes": [{
"ip": "127.0.0.1",
"port": 8731
},{
"ip": "127.0.0.1",
"port": 8730
}]
}]
#coding=utf-8
import requests
import sys
import json

class Meta(object):
    """Routing info for the sharded cube servers."""
    def __init__(self, conf_path):
        """Initialize the routing info from the config file."""
        self.server_api = "/DictService/seek"
        self.server_meta = {}
        with open(conf_path, "r", encoding="utf8") as fp:
            cube_servers = json.load(fp)
            for server in cube_servers:
                self.server_meta[server["dict_name"]] = server

    def seek(self, dict_name, keys_path, save_path):
        """Query the keys and save the results."""
        save_file = open(save_path, 'w')
        with open(keys_path, "r", encoding="utf8") as fp:
            lines = fp.readlines()
            for line in lines:
                json_line = json.loads(line)
                values = [{} for i in range(len(json_line["keys"]))]
                splited_keys, offset = self.split_keys(dict_name, json_line)
                for shard_id, keys in splited_keys.items():
                    results = self.post(dict_name, shard_id, keys)
                    for i, result in enumerate(results["values"]):
                        values[offset[shard_id][i]] = result
                cur_line_results = {}
                cur_line_results["values"] = values
                json.dump(cur_line_results, save_file)
                save_file.write("\n")
        save_file.close()

    def split_keys(self, dict_name, json_line):
        """Decide, from the key value and the shard count, which shard each key goes to."""
        keys_split = {}
        offset = {}
        i = 0
        for key in json_line["keys"]:
            shard_id = key % self.server_meta[dict_name]["shard"]
            if shard_id not in keys_split:
                keys_split[shard_id] = []
            keys_split[shard_id].append(key)
            if shard_id not in offset:
                offset[shard_id] = []
            offset[shard_id].append(i)
            i += 1
        return keys_split, offset

    def post(self, dict_name, shard_id, keys):
        """Send a POST request to the shard server."""
        api = "http://%s:%s%s" % (self.server_meta[dict_name]["nodes"][shard_id]["ip"],
                                  self.server_meta[dict_name]["nodes"][shard_id]["port"],
                                  self.server_api)
        data = {"keys": keys}
        response = requests.post(api, json.dumps(data))
        return response.json()

if __name__ == '__main__':
    if len(sys.argv) != 5:
        print('usage: python demo.py conf_path dict_name keys_path save_path')
        exit(0)
    conf_path = sys.argv[1]
    dict_name = sys.argv[2]
    keys_path = sys.argv[3]
    save_path = sys.argv[4]
    meta = Meta(conf_path)
    meta.seek(dict_name, keys_path, save_path)
{"keys": [0,1,2,3,4,5,6,7]}
{"keys": [1]}
# Cube Python API documentation
Deploy cube by following the [Deployment and Usage of the Large-Scale Sparse Parameter Service Cube](https://github.com/PaddlePaddle/Serving/blob/master/doc/DEPLOY.md#2-大规模稀疏参数服务cube的部署和使用) document.
The Python API can replace the deployment and usage of the prediction service described in Section 3 of that document.
## Configuration
conf/cube.conf is a JSON file that sets the ip and port of each shard's cube server; `shard` must equal the number of shards. Example:
```bash
[{
"dict_name": "test",
"shard": 2,
"nodes": [{
"ip": "127.0.0.1",
"port": 8731
},{
"ip": "127.0.0.1",
"port": 8730
}]
}]
```
## Data format
```bash
{"keys": [0,1,2,3,4,5,6,7]}
{"keys": [1]}
```
Batch queries are supported; each line is one query.
## Usage
```bash
cd ./python-api
python3 demo.py conf/cube.conf test input.json result.json
```
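Besides the command line above, the `Meta` class from demo.py can also be used directly from Python. A minimal sketch, assuming the working directory is `./python-api` so that demo.py, conf/cube.conf, and input.json are all reachable:

```python
# Minimal sketch: call the cube python api programmatically instead of via
# the command line. `Meta` and its seek() method come from demo.py above.
from demo import Meta

meta = Meta("conf/cube.conf")                   # load shard routing from the config
meta.seek("test", "input.json", "result.json")  # dict_name, keys file, output file
```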
{"values": [{"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}]}
{"values": [{"status": 4294967295, "value": ""}]}
@@ -3,3 +3,24 @@ add_subdirectory(pybind11)
pybind11_add_module(serving_client src/general_model.cpp src/pybind_general_model.cpp)
target_link_libraries(serving_client PRIVATE -Wl,--whole-archive utils sdk-cpp pybind python -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
endif()
if(CLIENT)
FILE(GLOB client_srcs include/*.h src/client.cpp src/brpc_client.cpp)
add_library(client ${client_srcs})
add_dependencies(client utils sdk-cpp)
target_link_libraries(client utils sdk-cpp)
endif()
if(CLIENT)
include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../../)
add_executable(simple_client example/simple_client.cpp)
add_dependencies(simple_client utils sdk-cpp client)
target_link_libraries(simple_client -Wl,--whole-archive
-Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
target_link_libraries(simple_client utils)
target_link_libraries(simple_client sdk-cpp)
target_link_libraries(simple_client client)
endif()
# C++ client for Paddle Serving
(Simplified Chinese | [English](./README.md))
## Requesting the BRPC-Server
### Start the server
Take the fit_a_line model as an example; the server is started with the same command as a regular BRPC-Server.
```
cd ../../python/examples/fit_a_line
sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
```
### Client prediction
The client currently supports BRPC.
The BRPC wrapper is already implemented; see [brpc_client.cpp](./src/brpc_client.cpp) for details.
```
./simple_client --client_conf="uci_housing_client/serving_client_conf.prototxt" --server_port="127.0.0.1:9393" --test_type="brpc" --sample_type="fit_a_line"
```
For more examples, see [simple_client.cpp](./example/simple_client.cpp).
| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------------------------------------ | ----------------------------------------------------- |
| `client_conf` | str | `"serving_client_conf.prototxt"` | Path of client conf |
| `server_port` | str | `"127.0.0.1:9393"` | Exposed ip:port of server |
| `test_type` | str | `"brpc"` | Request mode, currently only "brpc" |
| `sample_type` | str | `"fit_a_line"` | Sample type, one of "fit_a_line", "bert" |
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <vector>
#include "core/general-client/include/brpc_client.h"
using baidu::paddle_serving::client::ServingClient;
using baidu::paddle_serving::client::ServingBrpcClient;
using baidu::paddle_serving::client::PredictorInputs;
using baidu::paddle_serving::client::PredictorOutputs;
DEFINE_string(server_port, "127.0.0.1:9292", "ip:port");
DEFINE_string(client_conf, "serving_client_conf.prototxt", "Path of client conf");
DEFINE_string(test_type, "brpc", "brpc");
// fit_a_line, bert
DEFINE_string(sample_type, "fit_a_line", "List: fit_a_line, bert");
namespace {
int prepare_fit_a_line(PredictorInputs& input, std::vector<std::string>& fetch_name) {
std::vector<float> float_feed = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
0.0582f, -0.0727f, -0.1583f, -0.0584f,
0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
std::vector<int> float_shape = {1, 13};
std::string feed_name = "x";
fetch_name = {"price"};
std::vector<int> lod;
input.add_float_data(float_feed, feed_name, float_shape, lod);
return 0;
}
int prepare_bert(PredictorInputs& input, std::vector<std::string>& fetch_name) {
{
std::vector<float> float_feed(128, 0.0f);
float_feed[0] = 1.0f;
std::vector<int> float_shape = {1, 128, 1};
std::string feed_name = "input_mask";
std::vector<int> lod;
input.add_float_data(float_feed, feed_name, float_shape, lod);
}
{
std::vector<int64_t> feed(128, 0);
std::vector<int> shape = {1, 128, 1};
std::string feed_name = "position_ids";
std::vector<int> lod;
input.add_int64_data(feed, feed_name, shape, lod);
}
{
std::vector<int64_t> feed(128, 0);
feed[0] = 101;
std::vector<int> shape = {1, 128, 1};
std::string feed_name = "input_ids";
std::vector<int> lod;
input.add_int64_data(feed, feed_name, shape, lod);
}
{
std::vector<int64_t> feed(128, 0);
std::vector<int> shape = {1, 128, 1};
std::string feed_name = "segment_ids";
std::vector<int> lod;
input.add_int64_data(feed, feed_name, shape, lod);
}
fetch_name = {"pooled_output"};
return 0;
}
} // namespace
int main(int argc, char* argv[]) {
google::ParseCommandLineFlags(&argc, &argv, true);
std::string url = FLAGS_server_port;
std::string conf = FLAGS_client_conf;
std::string test_type = FLAGS_test_type;
std::string sample_type = FLAGS_sample_type;
LOG(INFO) << "url = " << url << ";"
<< "client_conf = " << conf << ";"
<< "test_type = " << test_type
<< "sample_type = " << sample_type;
std::unique_ptr<ServingClient> client;
// default type is brpc
// will add grpc&http in the future
if (test_type == "brpc") {
client.reset(new ServingBrpcClient());
} else {
client.reset(new ServingBrpcClient());
}
std::vector<std::string> confs;
confs.push_back(conf);
if (client->init(confs, url) != 0) {
LOG(ERROR) << "Failed to init client!";
return 0;
}
PredictorInputs input;
PredictorOutputs output;
std::vector<std::string> fetch_name;
if (sample_type == "fit_a_line") {
prepare_fit_a_line(input, fetch_name);
}
else if (sample_type == "bert") {
prepare_bert(input, fetch_name);
}
else {
prepare_fit_a_line(input, fetch_name);
}
if (client->predict(input, output, fetch_name, 0) != 0) {
LOG(ERROR) << "Failed to predict!";
}
else {
LOG(INFO) << output.print();
}
return 0;
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "core/general-client/include/client.h"
#include "core/sdk-cpp/include/predictor_sdk.h"
using baidu::paddle_serving::sdk_cpp::Predictor;
using baidu::paddle_serving::sdk_cpp::PredictorApi;
namespace baidu {
namespace paddle_serving {
namespace client {
class ServingBrpcClient : public ServingClient {
public:
ServingBrpcClient() {};
~ServingBrpcClient() {};
virtual int connect(const std::string server_port);
int predict(const PredictorInputs& inputs,
PredictorOutputs& outputs,
const std::vector<std::string>& fetch_name,
const uint64_t log_id);
private:
// generate default SDKConf
std::string gen_desc(const std::string server_port);
private:
PredictorApi _api;
Predictor* _predictor;
};
} // namespace client
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include <map>
#include <sstream>
#include <memory>
namespace baidu {
namespace paddle_serving {
namespace predictor {
namespace general_model {
class Request;
class Response;
}
}
namespace client {
class PredictorInputs;
class PredictorOutputs;
class ServingClient {
public:
ServingClient() {};
virtual ~ServingClient() = default;
int init(const std::vector<std::string>& client_conf,
const std::string server_port);
int load_client_config(const std::vector<std::string>& client_conf);
virtual int connect(const std::string server_port) = 0;
virtual int predict(const PredictorInputs& inputs,
PredictorOutputs& outputs,
const std::vector<std::string>& fetch_name,
const uint64_t log_id) = 0;
protected:
std::map<std::string, int> _feed_name_to_idx;
std::vector<std::string> _feed_name;
std::map<std::string, int> _fetch_name_to_idx;
std::map<std::string, std::string> _fetch_name_to_var_name;
std::map<std::string, int> _fetch_name_to_type;
std::vector<std::vector<int>> _shape;
std::vector<int> _type;
std::vector<int64_t> _last_request_ts;
};
class PredictorData {
public:
PredictorData() {};
virtual ~PredictorData() {};
void add_float_data(const std::vector<float>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 1);
void add_int64_data(const std::vector<int64_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 0);
void add_int32_data(const std::vector<int32_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 2);
void add_string_data(const std::string& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype = 20);
const std::map<std::string, std::vector<float>>& float_data_map() const {
return _float_data_map;
};
std::map<std::string, std::vector<float>>* mutable_float_data_map() {
return &_float_data_map;
};
const std::map<std::string, std::vector<int64_t>>& int64_data_map() const {
return _int64_data_map;
};
std::map<std::string, std::vector<int64_t>>* mutable_int64_data_map() {
return &_int64_data_map;
};
const std::map<std::string, std::vector<int32_t>>& int_data_map() const {
return _int32_data_map;
};
std::map<std::string, std::vector<int32_t>>* mutable_int_data_map() {
return &_int32_data_map;
};
const std::map<std::string, std::string>& string_data_map() const {
return _string_data_map;
};
std::map<std::string, std::string>* mutable_string_data_map() {
return &_string_data_map;
};
const std::map<std::string, std::vector<int>>& shape_map() const {
return _shape_map;
};
std::map<std::string, std::vector<int>>* mutable_shape_map() {
return &_shape_map;
};
const std::map<std::string, std::vector<int>>& lod_map() const {
return _lod_map;
};
std::map<std::string, std::vector<int>>* mutable_lod_map() {
return &_lod_map;
};
int get_datatype(std::string name) const;
void set_datatype(std::string name, int type);
std::string print();
private:
// used to print vector data map e.g. _float_data_map
template<typename T1, typename T2>
std::string map2string(const std::map<T1, std::vector<T2>>& map) {
std::ostringstream oss;
oss.str("");
oss.precision(6);
oss.setf(std::ios::fixed);
std::string key_seg = ":";
std::string val_seg = ",";
std::string end_seg = "\n";
typename std::map<T1, std::vector<T2>>::const_iterator it = map.begin();
typename std::map<T1, std::vector<T2>>::const_iterator itEnd = map.end();
for (; it != itEnd; it++) {
oss << "{";
oss << it->first << key_seg;
const std::vector<T2>& v = it->second;
oss << v.size() << key_seg;
for (size_t i = 0; i < v.size(); ++i) {
if (i != v.size() - 1) {
oss << v[i] << val_seg;
}
else {
oss << v[i];
}
}
oss << "}";
}
return oss.str();
};
// used to print data map without vector e.g. _string_data_map
template<typename T1, typename T2>
std::string map2string(const std::map<T1, T2>& map) {
std::ostringstream oss;
oss.str("");
std::string key_seg = ":";
std::string val_seg = ",";
std::string end_seg = "\n";
typename std::map<T1, T2>::const_iterator it = map.begin();
typename std::map<T1, T2>::const_iterator itEnd = map.end();
for (; it != itEnd; it++) {
oss << "{";
oss << it->first << key_seg
<< "size=" << it->second.size() << key_seg
<< "type=" << this->get_datatype(it->first);
oss << "}";
}
return oss.str();
};
protected:
std::map<std::string, std::vector<float>> _float_data_map;
std::map<std::string, std::vector<int64_t>> _int64_data_map;
std::map<std::string, std::vector<int32_t>> _int32_data_map;
std::map<std::string, std::string> _string_data_map;
std::map<std::string, std::vector<int>> _shape_map;
std::map<std::string, std::vector<int>> _lod_map;
std::map<std::string, int> _datatype_map;
};
class PredictorInputs : public PredictorData {
public:
PredictorInputs() {};
virtual ~PredictorInputs() {};
// generate proto from inputs
// feed_name_to_idx: mapping alias name to idx
// feed_name: mapping idx to name
static int GenProto(const PredictorInputs& inputs,
const std::map<std::string, int>& feed_name_to_idx,
const std::vector<std::string>& feed_name,
predictor::general_model::Request& req);
};
class PredictorOutputs {
public:
struct PredictorOutput {
std::string engine_name;
PredictorData data;
};
PredictorOutputs() {};
virtual ~PredictorOutputs() {};
const std::vector<std::shared_ptr<PredictorOutputs::PredictorOutput>>& datas() {
return _datas;
};
std::vector<std::shared_ptr<PredictorOutputs::PredictorOutput>>* mutable_datas() {
return &_datas;
};
void add_data(const std::shared_ptr<PredictorOutputs::PredictorOutput>& data) {
_datas.push_back(data);
};
std::string print();
void clear();
// Parse proto to outputs
// fetch_name: name of data to be output
// fetch_name_to_type: mapping of fetch_name to datatype
static int ParseProto(const predictor::general_model::Response& res,
const std::vector<std::string>& fetch_name,
std::map<std::string, int>& fetch_name_to_type,
PredictorOutputs& outputs);
protected:
std::vector<std::shared_ptr<PredictorOutputs::PredictorOutput>> _datas;
};
} // namespace client
} // namespace paddle_serving
} // namespace baidu
@@ -51,8 +51,13 @@ class ModelRes {
                            res._float_value_map.end());
    _int32_value_map.insert(res._int32_value_map.begin(),
                            res._int32_value_map.end());
    _string_value_map.insert(res._string_value_map.begin(),
                             res._string_value_map.end());
    _shape_map.insert(res._shape_map.begin(), res._shape_map.end());
    _lod_map.insert(res._lod_map.begin(), res._lod_map.end());
    _tensor_alias_names.insert(_tensor_alias_names.end(),
                               res._tensor_alias_names.begin(),
                               res._tensor_alias_names.end());
  }
  ModelRes(ModelRes&& res) {
    _engine_name = std::move(res._engine_name);
@@ -65,10 +70,17 @@ class ModelRes {
    _int32_value_map.insert(
        std::make_move_iterator(std::begin(res._int32_value_map)),
        std::make_move_iterator(std::end(res._int32_value_map)));
    _string_value_map.insert(
        std::make_move_iterator(std::begin(res._string_value_map)),
        std::make_move_iterator(std::end(res._string_value_map)));
    _shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)),
                      std::make_move_iterator(std::end(res._shape_map)));
    _lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)),
                    std::make_move_iterator(std::end(res._lod_map)));
    _tensor_alias_names.insert(
        _tensor_alias_names.end(),
        std::make_move_iterator(std::begin(res._tensor_alias_names)),
        std::make_move_iterator(std::end(res._tensor_alias_names)));
  }
  ~ModelRes() {}
  const std::vector<int64_t>& get_int64_by_name(const std::string& name) {
@@ -89,6 +101,12 @@ class ModelRes {
  std::vector<int32_t>&& get_int32_by_name_with_rv(const std::string& name) {
    return std::move(_int32_value_map[name]);
  }
  const std::string& get_string_by_name(const std::string& name) {
    return _string_value_map[name];
  }
  std::string&& get_string_by_name_with_rv(const std::string& name) {
    return std::move(_string_value_map[name]);
  }
  const std::vector<int>& get_shape_by_name(const std::string& name) {
    return _shape_map[name];
  }
@@ -105,6 +123,10 @@ class ModelRes {
    _engine_name = engine_name;
  }
  const std::string& engine_name() { return _engine_name; }
  const std::vector<std::string>& tensor_alias_names() {
    return _tensor_alias_names;
  }
  ModelRes& operator=(ModelRes&& res) {
    if (this != &res) {
      _engine_name = std::move(res._engine_name);
@@ -117,10 +139,17 @@ class ModelRes {
      _int32_value_map.insert(
          std::make_move_iterator(std::begin(res._int32_value_map)),
          std::make_move_iterator(std::end(res._int32_value_map)));
      _string_value_map.insert(
          std::make_move_iterator(std::begin(res._string_value_map)),
          std::make_move_iterator(std::end(res._string_value_map)));
      _shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)),
                        std::make_move_iterator(std::end(res._shape_map)));
      _lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)),
                      std::make_move_iterator(std::end(res._lod_map)));
      _tensor_alias_names.insert(
          _tensor_alias_names.end(),
          std::make_move_iterator(std::begin(res._tensor_alias_names)),
          std::make_move_iterator(std::end(res._tensor_alias_names)));
    }
    return *this;
  }
@@ -130,8 +159,10 @@ class ModelRes {
  std::map<std::string, std::vector<int64_t>> _int64_value_map;
  std::map<std::string, std::vector<float>> _float_value_map;
  std::map<std::string, std::vector<int32_t>> _int32_value_map;
  std::map<std::string, std::string> _string_value_map;
  std::map<std::string, std::vector<int>> _shape_map;
  std::map<std::string, std::vector<int>> _lod_map;
  std::vector<std::string> _tensor_alias_names;
};

class PredictorRes {
@@ -168,6 +199,14 @@ class PredictorRes {
                                               const std::string& name) {
    return std::move(_models[model_idx].get_int32_by_name_with_rv(name));
  }
  const std::string& get_string_by_name(const int model_idx,
                                        const std::string& name) {
    return _models[model_idx].get_string_by_name(name);
  }
  std::string&& get_string_by_name_with_rv(const int model_idx,
                                           const std::string& name) {
    return std::move(_models[model_idx].get_string_by_name_with_rv(name));
  }
  const std::vector<int>& get_shape_by_name(const int model_idx,
                                            const std::string& name) {
    return _models[model_idx].get_shape_by_name(name);
@@ -193,11 +232,16 @@ class PredictorRes {
  }
  const std::string& variant_tag() { return _variant_tag; }
  const std::vector<std::string>& get_engine_names() { return _engine_names; }
  const std::vector<std::string>& get_tensor_alias_names(const int model_idx) {
    _tensor_alias_names = _models[model_idx].tensor_alias_names();
    return _tensor_alias_names;
  }

 private:
  std::vector<ModelRes> _models;
  std::string _variant_tag;
  std::vector<std::string> _engine_names;
  std::vector<std::string> _tensor_alias_names;
};

class PredictorClient {
@@ -222,10 +266,14 @@ class PredictorClient {
      const std::vector<std::string>& float_feed_name,
      const std::vector<std::vector<int>>& float_shape,
      const std::vector<std::vector<int>>& float_lod_slot_batch,
      const std::vector<py::array_t<int32_t>> &int32_feed,
      const std::vector<std::string> &int32_feed_name,
      const std::vector<std::vector<int>> &int32_shape,
      const std::vector<std::vector<int>> &int32_lod_slot_batch,
      const std::vector<py::array_t<int64_t>> &int64_feed,
      const std::vector<std::string> &int64_feed_name,
      const std::vector<std::vector<int>> &int64_shape,
      const std::vector<std::vector<int>> &int64_lod_slot_batch,
      const std::vector<std::string>& string_feed,
      const std::vector<std::string>& string_feed_name,
      const std::vector<std::vector<int>>& string_shape,
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-client/include/brpc_client.h"
#include "core/sdk-cpp/include/common.h"
#include "core/util/include/timer.h"
#include "core/sdk-cpp/builtin_format.pb.h"
#include "core/sdk-cpp/general_model_service.pb.h"
DEFINE_bool(profile_client, false, "");
DEFINE_bool(profile_server, false, "");
#define BRPC_MAX_BODY_SIZE 512 * 1024 * 1024
namespace baidu {
namespace paddle_serving {
namespace client {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
using configure::SDKConf;
using configure::VariantConf;
using configure::Predictor;
using configure::VariantConf;
int ServingBrpcClient::connect(const std::string server_port) {
brpc::fLU64::FLAGS_max_body_size = BRPC_MAX_BODY_SIZE;
if (_api.create(gen_desc(server_port)) != 0) {
LOG(ERROR) << "Predictor Creation Failed";
return -1;
}
// _api.thrd_initialize();
return 0;
}
std::string ServingBrpcClient::gen_desc(const std::string server_port) {
// default config for brpc
SDKConf sdk_conf;
Predictor* predictor = sdk_conf.add_predictors();
predictor->set_name("general_model");
predictor->set_service_name("baidu.paddle_serving.predictor.general_model.GeneralModelService");
predictor->set_endpoint_router("WeightedRandomRender");
predictor->mutable_weighted_random_render_conf()->set_variant_weight_list("100");
VariantConf* predictor_var = predictor->add_variants();
predictor_var->set_tag("default_tag_1");
std::string cluster = "list://" + server_port;
predictor_var->mutable_naming_conf()->set_cluster(cluster);
VariantConf* var = sdk_conf.mutable_default_variant_conf();
var->set_tag("default");
var->mutable_connection_conf()->set_connect_timeout_ms(2000);
var->mutable_connection_conf()->set_rpc_timeout_ms(200000);
var->mutable_connection_conf()->set_connect_retry_count(2);
var->mutable_connection_conf()->set_max_connection_per_host(100);
var->mutable_connection_conf()->set_hedge_request_timeout_ms(-1);
var->mutable_connection_conf()->set_hedge_fetch_retry_count(2);
var->mutable_connection_conf()->set_connection_type("pooled");
var->mutable_connection_conf()->set_connect_timeout_ms(2000);
var->mutable_naming_conf()->set_cluster_filter_strategy("Default");
var->mutable_naming_conf()->set_load_balance_strategy("la");
var->mutable_rpc_parameter()->set_compress_type(0);
var->mutable_rpc_parameter()->set_package_size(20);
var->mutable_rpc_parameter()->set_protocol("baidu_std");
var->mutable_rpc_parameter()->set_max_channel_per_request(3);
return sdk_conf.SerializePartialAsString();
}
int ServingBrpcClient::predict(const PredictorInputs& inputs,
PredictorOutputs& outputs,
const std::vector<std::string>& fetch_name,
const uint64_t log_id) {
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
// thread initialize for StubTLS
_api.thrd_initialize();
std::string variant_tag;
// predictor is bound to request with brpc::Controller
_predictor = _api.fetch_predictor("general_model", &variant_tag);
if (_predictor == NULL) {
LOG(ERROR) << "Failed fetch predictor so predict error!";
return -1;
}
// predict_res_batch.set_variant_tag(variant_tag);
VLOG(2) << "fetch general model predictor done.";
VLOG(2) << "variant_tag:" << variant_tag;
VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size;
Request req;
req.set_log_id(log_id);
for (auto &name : fetch_name) {
req.add_fetch_var_names(name);
}
if (PredictorInputs::GenProto(inputs, _feed_name_to_idx, _feed_name, req) != 0) {
LOG(ERROR) << "Failed to preprocess req!";
return -1;
}
int64_t preprocess_end = timeline.TimeStampUS();
int64_t client_infer_start = timeline.TimeStampUS();
Response res;
int64_t client_infer_end = 0;
int64_t postprocess_start = 0;
int64_t postprocess_end = 0;
if (FLAGS_profile_server) {
req.set_profile_server(true);
}
res.Clear();
if (_predictor->inference(&req, &res) != 0) {
LOG(ERROR) << "failed call predictor with req: " << req.ShortDebugString();
return -1;
}
client_infer_end = timeline.TimeStampUS();
postprocess_start = client_infer_end;
if (PredictorOutputs::ParseProto(res, fetch_name, _fetch_name_to_type, outputs) != 0) {
LOG(ERROR) << "Failed to post_process res!";
return -1;
}
postprocess_end = timeline.TimeStampUS();
if (FLAGS_profile_client) {
std::ostringstream oss;
oss << "PROFILE\t"
<< "pid:" << getpid() << "\t"
<< "prepro_0:" << preprocess_start << " "
<< "prepro_1:" << preprocess_end << " "
<< "client_infer_0:" << client_infer_start << " "
<< "client_infer_1:" << client_infer_end << " ";
if (FLAGS_profile_server) {
int op_num = res.profile_time_size() / 2;
for (int i = 0; i < op_num; ++i) {
oss << "op" << i << "_0:" << res.profile_time(i * 2) << " ";
oss << "op" << i << "_1:" << res.profile_time(i * 2 + 1) << " ";
}
}
oss << "postpro_0:" << postprocess_start << " ";
oss << "postpro_1:" << postprocess_end;
fprintf(stderr, "%s\n", oss.str().c_str());
}
// release predictor
_api.thrd_clear();
std::ostringstream oss;
oss << "[client]"
<< "logid=" << log_id <<",";
if (FLAGS_profile_client) {
double pre_cost = (preprocess_end - preprocess_start) / 1000.0;
double infer_cost = (client_infer_end - client_infer_start) / 1000.0;
double post_cost = (postprocess_end - postprocess_start) / 1000.0;
oss << "client_pre_cost=" << pre_cost << "ms,"
<< "client_infer_cost=" << infer_cost << "ms,"
<< "client_post_cost=" << post_cost << "ms,";
}
double client_cost = (postprocess_end - preprocess_start) / 1000.0;
oss << "client_cost=" << client_cost << "ms,";
int op_num = res.profile_time_size() / 2;
if (FLAGS_profile_server) {
for (int i = 0; i < op_num - 1; ++i) {
double t = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "op" << i << "=" << t << "ms,";
}
}
if (op_num > 0) {
int i = op_num - 1;
double server_cost = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "server_cost=" << server_cost << "ms.";
}
LOG(INFO) << oss.str();
return 0;
}
}  // namespace client
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-client/include/client.h"
#include "core/sdk-cpp/include/common.h"
#include "core/sdk-cpp/general_model_service.pb.h"
namespace baidu {
namespace paddle_serving {
namespace client {
using configure::GeneralModelConfig;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
// support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
enum ProtoDataType {
P_INT64 = 0,
P_FLOAT32,
P_INT32,
P_FP64,
P_INT16,
P_FP16,
P_BF16,
P_UINT8,
P_INT8,
P_BOOL,
P_COMPLEX64,
P_COMPLEX128,
P_STRING = 20,
};
int ServingClient::init(const std::vector<std::string>& client_conf,
const std::string server_port) {
if (load_client_config(client_conf) != 0) {
LOG(ERROR) << "Failed to load client config";
return -1;
}
// pure virtual func, subclass implementation
if (connect(server_port) != 0) {
LOG(ERROR) << "Failed to connect";
return -1;
}
return 0;
}
int ServingClient::load_client_config(const std::vector<std::string> &conf_file) {
try {
GeneralModelConfig model_config;
if (configure::read_proto_conf(conf_file[0].c_str(), &model_config) != 0) {
LOG(ERROR) << "Failed to load general model config"
<< ", file path: " << conf_file[0];
return -1;
}
_feed_name_to_idx.clear();
_fetch_name_to_idx.clear();
_shape.clear();
int feed_var_num = model_config.feed_var_size();
_feed_name.clear();
VLOG(2) << "feed var num: " << feed_var_num;
for (int i = 0; i < feed_var_num; ++i) {
_feed_name_to_idx[model_config.feed_var(i).alias_name()] = i;
VLOG(2) << "feed [" << i << "]"
<< " name: " << model_config.feed_var(i).name();
_feed_name.push_back(model_config.feed_var(i).name());
VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name()
<< " index: " << i;
std::vector<int> tmp_feed_shape;
VLOG(2) << "feed"
<< "[" << i << "] shape:";
for (int j = 0; j < model_config.feed_var(i).shape_size(); ++j) {
tmp_feed_shape.push_back(model_config.feed_var(i).shape(j));
VLOG(2) << "shape[" << j << "]: " << model_config.feed_var(i).shape(j);
}
_type.push_back(model_config.feed_var(i).feed_type());
VLOG(2) << "feed"
<< "[" << i
<< "] feed type: " << model_config.feed_var(i).feed_type();
_shape.push_back(tmp_feed_shape);
}
if (conf_file.size() > 1) {
model_config.Clear();
if (configure::read_proto_conf(conf_file[conf_file.size() - 1].c_str(),
&model_config) != 0) {
LOG(ERROR) << "Failed to load general model config"
<< ", file path: " << conf_file[conf_file.size() - 1];
return -1;
}
}
int fetch_var_num = model_config.fetch_var_size();
VLOG(2) << "fetch_var_num: " << fetch_var_num;
for (int i = 0; i < fetch_var_num; ++i) {
_fetch_name_to_idx[model_config.fetch_var(i).alias_name()] = i;
VLOG(2) << "fetch [" << i << "]"
<< " alias name: " << model_config.fetch_var(i).alias_name();
_fetch_name_to_var_name[model_config.fetch_var(i).alias_name()] =
model_config.fetch_var(i).name();
_fetch_name_to_type[model_config.fetch_var(i).alias_name()] =
model_config.fetch_var(i).fetch_type();
}
} catch (std::exception &e) {
LOG(ERROR) << "Failed load general model config" << e.what();
return -1;
}
return 0;
}
void PredictorData::add_float_data(const std::vector<float>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_float_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
void PredictorData::add_int64_data(const std::vector<int64_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_int64_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
void PredictorData::add_int32_data(const std::vector<int32_t>& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_int32_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
void PredictorData::add_string_data(const std::string& data,
const std::string& name,
const std::vector<int>& shape,
const std::vector<int>& lod,
const int datatype) {
_string_data_map[name] = data;
_shape_map[name] = shape;
_lod_map[name] = lod;
_datatype_map[name] = datatype;
}
int PredictorData::get_datatype(std::string name) const {
std::map<std::string, int>::const_iterator it = _datatype_map.find(name);
if (it != _datatype_map.end()) {
return it->second;
}
return 0;
}
void PredictorData::set_datatype(std::string name, int type) {
_datatype_map[name] = type;
}
std::string PredictorData::print() {
std::string res;
res.append(map2string<std::string, float>(_float_data_map));
res.append(map2string<std::string, int64_t>(_int64_data_map));
res.append(map2string<std::string, int32_t>(_int32_data_map));
res.append(map2string<std::string, std::string>(_string_data_map));
return res;
}
int PredictorInputs::GenProto(const PredictorInputs& inputs,
const std::map<std::string, int>& feed_name_to_idx,
const std::vector<std::string>& feed_name,
Request& req) {
const std::map<std::string, std::vector<float>>& float_feed_map = inputs.float_data_map();
const std::map<std::string, std::vector<int64_t>>& int64_feed_map = inputs.int64_data_map();
const std::map<std::string, std::vector<int32_t>>& int32_feed_map = inputs.int_data_map();
const std::map<std::string, std::string>& string_feed_map = inputs.string_data_map();
const std::map<std::string, std::vector<int>>& shape_map = inputs.shape_map();
const std::map<std::string, std::vector<int>>& lod_map = inputs.lod_map();
VLOG(2) << "float feed name size: " << float_feed_map.size();
VLOG(2) << "int feed name size: " << int64_feed_map.size();
VLOG(2) << "string feed name size: " << string_feed_map.size();
// batch is already in Tensor.
for (std::map<std::string, std::vector<float>>::const_iterator iter = float_feed_map.begin();
iter != float_feed_map.end();
++iter) {
std::string name = iter->first;
const std::vector<float>& float_data = iter->second;
const std::vector<int>& float_shape = shape_map.at(name);
const std::vector<int>& float_lod = lod_map.at(name);
// default datatype = P_FLOAT32
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
VLOG(2) << "prepare float feed " << name << " idx " << idx;
int total_number = float_data.size();
Tensor *tensor = req.add_tensor();
VLOG(2) << "prepare float feed " << name << " shape size "
<< float_shape.size();
for (uint32_t j = 0; j < float_shape.size(); ++j) {
tensor->add_shape(float_shape[j]);
}
for (uint32_t j = 0; j < float_lod.size(); ++j) {
tensor->add_lod(float_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_float_data()->Resize(total_number, 0);
memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float));
}
for (std::map<std::string, std::vector<int64_t>>::const_iterator iter = int64_feed_map.begin();
iter != int64_feed_map.end();
++iter) {
std::string name = iter->first;
const std::vector<int64_t>& int64_data = iter->second;
const std::vector<int>& int64_shape = shape_map.at(name);
const std::vector<int>& int64_lod = lod_map.at(name);
// default datatype = P_INT64
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
Tensor *tensor = req.add_tensor();
int total_number = int64_data.size();
for (uint32_t j = 0; j < int64_shape.size(); ++j) {
tensor->add_shape(int64_shape[j]);
}
for (uint32_t j = 0; j < int64_lod.size(); ++j) {
tensor->add_lod(int64_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_int64_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int64_data()->mutable_data(), int64_data.data(), total_number * sizeof(int64_t));
}
for (std::map<std::string, std::vector<int32_t>>::const_iterator iter = int32_feed_map.begin();
iter != int32_feed_map.end();
++iter) {
std::string name = iter->first;
const std::vector<int32_t>& int32_data = iter->second;
const std::vector<int>& int32_shape = shape_map.at(name);
const std::vector<int>& int32_lod = lod_map.at(name);
// default datatype = P_INT32
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
Tensor *tensor = req.add_tensor();
int total_number = int32_data.size();
for (uint32_t j = 0; j < int32_shape.size(); ++j) {
tensor->add_shape(int32_shape[j]);
}
for (uint32_t j = 0; j < int32_lod.size(); ++j) {
tensor->add_lod(int32_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_int_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int_data()->mutable_data(), int32_data.data(), total_number * sizeof(int32_t));
}
for (std::map<std::string, std::string>::const_iterator iter = string_feed_map.begin();
iter != string_feed_map.end();
++iter) {
std::string name = iter->first;
const std::string& string_data = iter->second;
const std::vector<int>& string_shape = shape_map.at(name);
const std::vector<int>& string_lod = lod_map.at(name);
// default datatype = P_STRING
int datatype = inputs.get_datatype(name);
std::map<std::string, int>::const_iterator feed_name_it = feed_name_to_idx.find(name);
if (feed_name_it == feed_name_to_idx.end()) {
LOG(ERROR) << "Do not find [" << name << "] in feed_map!";
return -1;
}
int idx = feed_name_to_idx.at(name);
Tensor *tensor = req.add_tensor();
for (uint32_t j = 0; j < string_shape.size(); ++j) {
tensor->add_shape(string_shape[j]);
}
for (uint32_t j = 0; j < string_lod.size(); ++j) {
tensor->add_lod(string_lod[j]);
}
tensor->set_elem_type(datatype);
tensor->set_name(feed_name[idx]);
tensor->set_alias_name(name);
if (datatype == P_STRING) {
const int string_shape_size = string_shape.size();
// string_shape[vec_idx] = [1]; because numpy has no string datatype,
// we pass strings via vector<vector<string> >.
if (string_shape_size != 1) {
LOG(ERROR) << "string_shape should be 1-D, but its size is: "
<< string_shape_size;
return -1;
}
switch (string_shape_size) {
case 1: {
tensor->add_data(string_data);
break;
}
}
} else {
tensor->set_tensor_content(string_data);
}
}
return 0;
}
std::string PredictorOutputs::print() {
std::string res = "";
for (size_t i = 0; i < _datas.size(); ++i) {
res.append(_datas[i]->engine_name);
res.append(":");
res.append(_datas[i]->data.print());
res.append("\n");
}
return res;
}
void PredictorOutputs::clear() {
_datas.clear();
}
int PredictorOutputs::ParseProto(const Response& res,
const std::vector<std::string>& fetch_name,
std::map<std::string, int>& fetch_name_to_type,
PredictorOutputs& outputs) {
VLOG(2) << "get model output num";
uint32_t model_num = res.outputs_size();
VLOG(2) << "model num: " << model_num;
for (uint32_t m_idx = 0; m_idx < model_num; ++m_idx) {
VLOG(2) << "process model output index: " << m_idx;
auto& output = res.outputs(m_idx);
std::shared_ptr<PredictorOutputs::PredictorOutput> predictor_output =
std::make_shared<PredictorOutputs::PredictorOutput>();
predictor_output->engine_name = output.engine_name();
PredictorData& predictor_data = predictor_output->data;
std::map<std::string, std::vector<float>>& float_data_map = *predictor_output->data.mutable_float_data_map();
std::map<std::string, std::vector<int64_t>>& int64_data_map = *predictor_output->data.mutable_int64_data_map();
std::map<std::string, std::vector<int32_t>>& int32_data_map = *predictor_output->data.mutable_int_data_map();
std::map<std::string, std::string>& string_data_map = *predictor_output->data.mutable_string_data_map();
std::map<std::string, std::vector<int>>& shape_map = *predictor_output->data.mutable_shape_map();
std::map<std::string, std::vector<int>>& lod_map = *predictor_output->data.mutable_lod_map();
int idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
int shape_size = output.tensor(idx).shape_size();
VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
<< shape_size;
shape_map[name].resize(shape_size);
for (int i = 0; i < shape_size; ++i) {
shape_map[name][i] = output.tensor(idx).shape(i);
}
int lod_size = output.tensor(idx).lod_size();
if (lod_size > 0) {
lod_map[name].resize(lod_size);
for (int i = 0; i < lod_size; ++i) {
lod_map[name][i] = output.tensor(idx).lod(i);
}
}
idx += 1;
}
idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
if (fetch_name_to_type[name] == P_INT64) {
VLOG(2) << "fetch var " << name << "type int64";
int size = output.tensor(idx).int64_data_size();
int64_data_map[name] = std::vector<int64_t>(
output.tensor(idx).int64_data().begin(),
output.tensor(idx).int64_data().begin() + size);
} else if (fetch_name_to_type[name] == P_FLOAT32) {
VLOG(2) << "fetch var " << name << "type float";
int size = output.tensor(idx).float_data_size();
float_data_map[name] = std::vector<float>(
output.tensor(idx).float_data().begin(),
output.tensor(idx).float_data().begin() + size);
} else if (fetch_name_to_type[name] == P_INT32) {
VLOG(2) << "fetch var " << name << "type int32";
int size = output.tensor(idx).int_data_size();
int32_data_map[name] = std::vector<int32_t>(
output.tensor(idx).int_data().begin(),
output.tensor(idx).int_data().begin() + size);
} else if (fetch_name_to_type[name] == P_UINT8
|| fetch_name_to_type[name] == P_INT8
|| fetch_name_to_type[name] == P_FP16) {
VLOG(2) << "fetch var [" << name << "]type="
<< fetch_name_to_type[name];
string_data_map[name] = output.tensor(idx).tensor_content();
}
predictor_data.set_datatype(name, output.tensor(idx).elem_type());
idx += 1;
}
outputs.add_data(predictor_output);
}
return 0;
}
} // namespace client
} // namespace paddle_serving
} // namespace baidu
...@@ -25,7 +25,22 @@ using baidu::paddle_serving::Timer; ...@@ -25,7 +25,22 @@ using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; // support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
enum ProtoDataType {
P_INT64 = 0,
P_FLOAT32,
P_INT32,
P_FP64,
P_INT16,
P_FP16,
P_BF16,
P_UINT8,
P_INT8,
P_BOOL,
P_COMPLEX64,
P_COMPLEX128,
P_STRING = 20,
};
std::once_flag gflags_init_flag; std::once_flag gflags_init_flag;
namespace py = pybind11; namespace py = pybind11;
...@@ -152,10 +167,14 @@ int PredictorClient::numpy_predict( ...@@ -152,10 +167,14 @@ int PredictorClient::numpy_predict(
const std::vector<std::string> &float_feed_name, const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape, const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<int>> &float_lod_slot_batch, const std::vector<std::vector<int>> &float_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int_feed, const std::vector<py::array_t<int32_t>> &int32_feed,
const std::vector<std::string> &int_feed_name, const std::vector<std::string> &int32_feed_name,
const std::vector<std::vector<int>> &int_shape, const std::vector<std::vector<int>> &int32_shape,
const std::vector<std::vector<int>> &int_lod_slot_batch, const std::vector<std::vector<int>> &int32_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int64_feed,
const std::vector<std::string> &int64_feed_name,
const std::vector<std::vector<int>> &int64_shape,
const std::vector<std::vector<int>> &int64_lod_slot_batch,
const std::vector<std::string> &string_feed, const std::vector<std::string> &string_feed,
const std::vector<std::string> &string_feed_name, const std::vector<std::string> &string_feed_name,
const std::vector<std::vector<int>> &string_shape, const std::vector<std::vector<int>> &string_shape,
...@@ -168,15 +187,14 @@ int PredictorClient::numpy_predict( ...@@ -168,15 +187,14 @@ int PredictorClient::numpy_predict(
Timer timeline; Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS(); int64_t preprocess_start = timeline.TimeStampUS();
int fetch_name_num = fetch_name.size();
_api.thrd_initialize(); _api.thrd_initialize();
std::string variant_tag; std::string variant_tag;
_predictor = _api.fetch_predictor("general_model", &variant_tag); _predictor = _api.fetch_predictor("general_model", &variant_tag);
predict_res_batch.set_variant_tag(variant_tag); predict_res_batch.set_variant_tag(variant_tag);
VLOG(2) << "fetch general model predictor done."; VLOG(2) << "fetch general model predictor done.";
VLOG(2) << "float feed name size: " << float_feed_name.size(); VLOG(2) << "float feed name size: " << float_feed_name.size();
VLOG(2) << "int feed name size: " << int_feed_name.size(); VLOG(2) << "int feed name size: " << int32_feed_name.size();
VLOG(2) << "int feed name size: " << int64_feed_name.size();
VLOG(2) << "string feed name size: " << string_feed_name.size(); VLOG(2) << "string feed name size: " << string_feed_name.size();
VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size; VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size;
Request req; Request req;
...@@ -193,7 +211,11 @@ int PredictorClient::numpy_predict( ...@@ -193,7 +211,11 @@ int PredictorClient::numpy_predict(
tensor_vec.push_back(req.add_tensor()); tensor_vec.push_back(req.add_tensor());
} }
for (auto &name : int_feed_name) { for (auto &name : int32_feed_name) {
tensor_vec.push_back(req.add_tensor());
}
for (auto &name : int64_feed_name) {
tensor_vec.push_back(req.add_tensor()); tensor_vec.push_back(req.add_tensor());
} }
...@@ -233,37 +255,63 @@ int PredictorClient::numpy_predict( ...@@ -233,37 +255,63 @@ int PredictorClient::numpy_predict(
} }
vec_idx = 0; vec_idx = 0;
for (auto &name : int_feed_name) { for (auto &name : int32_feed_name) {
int idx = _feed_name_to_idx[name]; int idx = _feed_name_to_idx[name];
if (idx >= tensor_vec.size()) { if (idx >= tensor_vec.size()) {
LOG(ERROR) << "idx > tensor_vec.size()"; LOG(ERROR) << "idx > tensor_vec.size()";
return -1; return -1;
} }
Tensor *tensor = tensor_vec[idx]; Tensor *tensor = tensor_vec[idx];
int nbytes = int_feed[vec_idx].nbytes(); int nbytes = int32_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0)); void *rawdata_ptr = (void *)(int32_feed[vec_idx].data(0));
int total_number = int_feed[vec_idx].size(); int total_number = int32_feed[vec_idx].size();
for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) { for (uint32_t j = 0; j < int32_shape[vec_idx].size(); ++j) {
tensor->add_shape(int_shape[vec_idx][j]); tensor->add_shape(int32_shape[vec_idx][j]);
} }
for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) { for (uint32_t j = 0; j < int32_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(int_lod_slot_batch[vec_idx][j]); tensor->add_lod(int32_lod_slot_batch[vec_idx][j]);
} }
tensor->set_elem_type(_type[idx]); tensor->set_elem_type(_type[idx]);
tensor->set_name(_feed_name[idx]); tensor->set_name(_feed_name[idx]);
tensor->set_alias_name(name); tensor->set_alias_name(name);
if (_type[idx] == P_INT64) { tensor->mutable_int_data()->Resize(total_number, 0);
tensor->mutable_int64_data()->Resize(total_number, 0); memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes); vec_idx++;
} else { }
tensor->mutable_int_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
// Fill the INT64 feed data from int64_feed in a separate loop
vec_idx = 0;
for (auto &name : int64_feed_name) {
int idx = _feed_name_to_idx[name];
if (idx >= tensor_vec.size()) {
LOG(ERROR) << "idx > tensor_vec.size()";
return -1;
}
Tensor *tensor = tensor_vec[idx];
int nbytes = int64_feed[vec_idx].nbytes();
void *rawdata_ptr = (void *)(int64_feed[vec_idx].data(0));
int total_number = int64_feed[vec_idx].size();
for (uint32_t j = 0; j < int64_shape[vec_idx].size(); ++j) {
tensor->add_shape(int64_shape[vec_idx][j]);
}
for (uint32_t j = 0; j < int64_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(int64_lod_slot_batch[vec_idx][j]);
} }
tensor->set_elem_type(_type[idx]);
tensor->set_name(_feed_name[idx]);
tensor->set_alias_name(name);
tensor->mutable_int64_data()->Resize(total_number, 0);
memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
vec_idx++; vec_idx++;
} }
// Add non-string (!P_STRING) feed data from string_input to tensor_content:
// UINT8, INT8 and FLOAT16 payloads are carried as raw bytes.
vec_idx = 0; vec_idx = 0;
for (auto &name : string_feed_name) { for (auto &name : string_feed_name) {
int idx = _feed_name_to_idx[name]; int idx = _feed_name_to_idx[name];
...@@ -279,22 +327,27 @@ int PredictorClient::numpy_predict( ...@@ -279,22 +327,27 @@ int PredictorClient::numpy_predict(
for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) { for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) {
tensor->add_lod(string_lod_slot_batch[vec_idx][j]); tensor->add_lod(string_lod_slot_batch[vec_idx][j]);
} }
tensor->set_elem_type(P_STRING);
tensor->set_name(_feed_name[idx]); tensor->set_name(_feed_name[idx]);
tensor->set_alias_name(name); tensor->set_alias_name(name);
const int string_shape_size = string_shape[vec_idx].size(); if (_type[idx] != P_STRING) {
// string_shape[vec_idx] = [1];cause numpy has no datatype of string. tensor->set_elem_type(_type[idx]);
// we pass string via vector<vector<string> >. tensor->set_tensor_content(string_feed[vec_idx]);
if (string_shape_size != 1) { } else {
LOG(ERROR) << "string_shape_size should be 1-D, but received is : " tensor->set_elem_type(P_STRING);
<< string_shape_size; const int string_shape_size = string_shape[vec_idx].size();
return -1; // string_shape[vec_idx] = [1];cause numpy has no datatype of string.
} // we pass string via vector<vector<string> >.
switch (string_shape_size) { if (string_shape_size != 1) {
case 1: { LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
tensor->add_data(string_feed[vec_idx]); << string_shape_size;
break; return -1;
}
switch (string_shape_size) {
case 1: {
tensor->add_data(string_feed[vec_idx]);
break;
}
} }
} }
vec_idx++; vec_idx++;
...@@ -308,10 +361,8 @@ int PredictorClient::numpy_predict( ...@@ -308,10 +361,8 @@ int PredictorClient::numpy_predict(
int64_t postprocess_start = 0; int64_t postprocess_start = 0;
int64_t postprocess_end = 0; int64_t postprocess_end = 0;
if (FLAGS_profile_client) { if (FLAGS_profile_server) {
if (FLAGS_profile_server) { req.set_profile_server(true);
req.set_profile_server(true);
}
} }
res.Clear(); res.Clear();
...@@ -329,10 +380,12 @@ int PredictorClient::numpy_predict( ...@@ -329,10 +380,12 @@ int PredictorClient::numpy_predict(
auto output = res.outputs(m_idx); auto output = res.outputs(m_idx);
ModelRes model; ModelRes model;
model.set_engine_name(output.engine_name()); model.set_engine_name(output.engine_name());
// In ResponseOp, the output data has already been arranged according to fetch_name,
int idx = 0; // so the output tensors correspond to fetch_name one-to-one and can be processed in order.
for (auto &name : fetch_name) { for (int idx = 0; idx < output.tensor_size(); ++idx) {
// int idx = _fetch_name_to_idx[name]; // int idx = _fetch_name_to_idx[name];
const std::string name = output.tensor(idx).alias_name();
model._tensor_alias_names.push_back(name);
int shape_size = output.tensor(idx).shape_size(); int shape_size = output.tensor(idx).shape_size();
VLOG(2) << "fetch var " << name << " index " << idx << " shape size " VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
<< shape_size; << shape_size;
...@@ -347,13 +400,7 @@ int PredictorClient::numpy_predict( ...@@ -347,13 +400,7 @@ int PredictorClient::numpy_predict(
model._lod_map[name][i] = output.tensor(idx).lod(i); model._lod_map[name][i] = output.tensor(idx).lod(i);
} }
} }
idx += 1;
}
idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
if (_fetch_name_to_type[name] == P_INT64) { if (_fetch_name_to_type[name] == P_INT64) {
VLOG(2) << "ferch var " << name << "type int64"; VLOG(2) << "ferch var " << name << "type int64";
int size = output.tensor(idx).int64_data_size(); int size = output.tensor(idx).int64_data_size();
...@@ -372,8 +419,16 @@ int PredictorClient::numpy_predict( ...@@ -372,8 +419,16 @@ int PredictorClient::numpy_predict(
model._int32_value_map[name] = std::vector<int32_t>( model._int32_value_map[name] = std::vector<int32_t>(
output.tensor(idx).int_data().begin(), output.tensor(idx).int_data().begin(),
output.tensor(idx).int_data().begin() + size); output.tensor(idx).int_data().begin() + size);
} else if (_fetch_name_to_type[name] == P_UINT8) {
VLOG(2) << "fetch var " << name << "type uint8";
model._string_value_map[name] = output.tensor(idx).tensor_content();
} else if (_fetch_name_to_type[name] == P_INT8) {
VLOG(2) << "fetch var " << name << "type int8";
model._string_value_map[name] = output.tensor(idx).tensor_content();
} else if (_fetch_name_to_type[name] == P_FP16) {
VLOG(2) << "fetch var " << name << "type float16";
model._string_value_map[name] = output.tensor(idx).tensor_content();
} }
idx += 1;
} }
predict_res_batch.add_model_res(std::move(model)); predict_res_batch.add_model_res(std::move(model));
} }
...@@ -403,6 +458,36 @@ int PredictorClient::numpy_predict( ...@@ -403,6 +458,36 @@ int PredictorClient::numpy_predict(
} }
_api.thrd_clear(); _api.thrd_clear();
std::ostringstream oss;
oss << "[client]"
<< "logid=" << log_id <<",";
if (FLAGS_profile_client) {
double pre_cost = (preprocess_end - preprocess_start) / 1000.0;
double infer_cost = (client_infer_end - client_infer_start) / 1000.0;
double post_cost = (postprocess_end - postprocess_start) / 1000.0;
oss << "client_pre_cost=" << pre_cost << "ms,"
<< "client_infer_cost=" << infer_cost << "ms,"
<< "client_post_cost=" << post_cost << "ms,";
}
double client_cost = (postprocess_end - preprocess_start) / 1000.0;
oss << "client_cost=" << client_cost << "ms,";
int op_num = res.profile_time_size() / 2;
if (FLAGS_profile_server) {
for (int i = 0; i < op_num - 1; ++i) {
double t = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "op" << i << "=" << t << "ms,";
}
}
if (op_num > 0) {
int i = op_num - 1;
double server_cost = (res.profile_time(i * 2 + 1)
- res.profile_time(i * 2)) / 1000.0;
oss << "server_cost=" << server_cost << "ms.";
}
LOG(INFO) << oss.str();
return 0; return 0;
} }
} // namespace general_model } // namespace general_model
......
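The hunks above split the generic int feed into separate int32 and int64 paths and add a client-side profiling summary: pre/infer/post costs are appended only when FLAGS_profile_client is set, per-op server costs only when FLAGS_profile_server is set, and the last profile_time pair is reported as server_cost. A self-contained sketch of the same arithmetic with made-up microsecond timestamps (illustrative values, not output captured from Serving):
#include <cstdint>
#include <iostream>
#include <sstream>
int main() {
  // Hypothetical timestamps in microseconds, mirroring preprocess/infer/postprocess.
  int64_t preprocess_start = 0, preprocess_end = 200;
  int64_t client_infer_start = 200, client_infer_end = 5300;
  int64_t postprocess_start = 5300, postprocess_end = 5600;
  std::ostringstream oss;
  oss << "[client]" << "logid=" << 100 << ",";
  oss << "client_pre_cost=" << (preprocess_end - preprocess_start) / 1000.0 << "ms,"
      << "client_infer_cost=" << (client_infer_end - client_infer_start) / 1000.0 << "ms,"
      << "client_post_cost=" << (postprocess_end - postprocess_start) / 1000.0 << "ms,";
  oss << "client_cost=" << (postprocess_end - preprocess_start) / 1000.0 << "ms,";
  std::cout << oss.str() << std::endl;  // [client]logid=100,client_pre_cost=0.2ms,...
  return 0;
}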
...@@ -49,6 +49,19 @@ PYBIND11_MODULE(serving_client, m) { ...@@ -49,6 +49,19 @@ PYBIND11_MODULE(serving_client, m) {
}); });
return py::array(ptr->size(), ptr->data(), capsule); return py::array(ptr->size(), ptr->data(), capsule);
}) })
.def("get_int32_by_name",
[](PredictorRes &self, int model_idx, std::string &name) {
std::vector<int32_t> *ptr = new std::vector<int32_t>(
std::move(self.get_int32_by_name_with_rv(model_idx, name)));
auto capsule = py::capsule(ptr, [](void *p) {
delete reinterpret_cast<std::vector<int32_t> *>(p);
});
return py::array(ptr->size(), ptr->data(), capsule);
})
.def("get_string_by_name",
[](PredictorRes &self, int model_idx, std::string &name) {
return self.get_string_by_name_with_rv(model_idx, name);
})
.def("get_shape", .def("get_shape",
[](PredictorRes &self, int model_idx, std::string &name) { [](PredictorRes &self, int model_idx, std::string &name) {
std::vector<int> *ptr = new std::vector<int>( std::vector<int> *ptr = new std::vector<int>(
...@@ -69,7 +82,10 @@ PYBIND11_MODULE(serving_client, m) { ...@@ -69,7 +82,10 @@ PYBIND11_MODULE(serving_client, m) {
}) })
.def("variant_tag", [](PredictorRes &self) { return self.variant_tag(); }) .def("variant_tag", [](PredictorRes &self) { return self.variant_tag(); })
.def("get_engine_names", .def("get_engine_names",
[](PredictorRes &self) { return self.get_engine_names(); }); [](PredictorRes &self) { return self.get_engine_names(); })
.def("get_tensor_alias_names", [](PredictorRes &self, int model_idx) {
return self.get_tensor_alias_names(model_idx);
});
py::class_<PredictorClient>(m, "PredictorClient", py::buffer_protocol()) py::class_<PredictorClient>(m, "PredictorClient", py::buffer_protocol())
.def(py::init()) .def(py::init())
...@@ -101,10 +117,14 @@ PYBIND11_MODULE(serving_client, m) { ...@@ -101,10 +117,14 @@ PYBIND11_MODULE(serving_client, m) {
const std::vector<std::string> &float_feed_name, const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape, const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<int>> &float_lod_slot_batch, const std::vector<std::vector<int>> &float_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int_feed, const std::vector<py::array_t<int32_t>> &int32_feed,
const std::vector<std::string> &int_feed_name, const std::vector<std::string> &int32_feed_name,
const std::vector<std::vector<int>> &int_shape, const std::vector<std::vector<int>> &int32_shape,
const std::vector<std::vector<int>> &int_lod_slot_batch, const std::vector<std::vector<int>> &int32_lod_slot_batch,
const std::vector<py::array_t<int64_t>> &int64_feed,
const std::vector<std::string> &int64_feed_name,
const std::vector<std::vector<int>> &int64_shape,
const std::vector<std::vector<int>> &int64_lod_slot_batch,
const std::vector<std::string> &string_feed, const std::vector<std::string> &string_feed,
const std::vector<std::string> &string_feed_name, const std::vector<std::string> &string_feed_name,
const std::vector<std::vector<int>> &string_shape, const std::vector<std::vector<int>> &string_shape,
...@@ -117,10 +137,14 @@ PYBIND11_MODULE(serving_client, m) { ...@@ -117,10 +137,14 @@ PYBIND11_MODULE(serving_client, m) {
float_feed_name, float_feed_name,
float_shape, float_shape,
float_lod_slot_batch, float_lod_slot_batch,
int_feed, int32_feed,
int_feed_name, int32_feed_name,
int_shape, int32_shape,
int_lod_slot_batch, int32_lod_slot_batch,
int64_feed,
int64_feed_name,
int64_shape,
int64_lod_slot_batch,
string_feed, string_feed,
string_feed_name, string_feed_name,
string_shape, string_shape,
......
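The new bindings above (get_int32_by_name, get_string_by_name, get_tensor_alias_names) reuse the ownership pattern of the existing getters: the result vector is moved onto the heap and a py::capsule deletes it once the returned numpy array is garbage-collected, so no extra copy into Python is needed. A generic sketch of that pattern, detached from PredictorRes:
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <cstdint>
#include <vector>
namespace py = pybind11;
// Wrap a C++ vector as a numpy array whose lifetime is tied to a capsule.
py::array make_owned_array(std::vector<int32_t>&& v) {
  auto* ptr = new std::vector<int32_t>(std::move(v));
  auto capsule = py::capsule(ptr, [](void* p) {
    delete reinterpret_cast<std::vector<int32_t>*>(p);
  });
  return py::array(ptr->size(), ptr->data(), capsule);
}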
...@@ -191,42 +191,64 @@ int GeneralDetectionOp::inference() { ...@@ -191,42 +191,64 @@ int GeneralDetectionOp::inference() {
boxes = post_processor_.FilterTagDetRes(boxes, ratio_h, ratio_w, srcimg); boxes = post_processor_.FilterTagDetRes(boxes, ratio_h, ratio_w, srcimg);
for (int i = boxes.size() - 1; i >= 0; i--) { float max_wh_ratio = 0.0f;
crop_img = GetRotateCropImage(img, boxes[i]); std::vector<cv::Mat> crop_imgs;
std::vector<cv::Mat> resize_imgs;
float wh_ratio = float(crop_img.cols) / float(crop_img.rows); int max_resize_w = 0;
int max_resize_h = 0;
int box_num = boxes.size();
std::vector<std::vector<float>> output_rec;
for (int i = 0; i < box_num; ++i) {
cv::Mat line_img = GetRotateCropImage(img, boxes[i]);
float wh_ratio = float(line_img.cols) / float(line_img.rows);
max_wh_ratio = max_wh_ratio > wh_ratio ? max_wh_ratio : wh_ratio;
crop_imgs.push_back(line_img);
}
for (int i = 0; i < box_num; ++i) {
cv::Mat resize_img;
crop_img = crop_imgs[i];
this->resize_op_rec.Run( this->resize_op_rec.Run(
crop_img, resize_img_rec, wh_ratio, this->use_tensorrt_); crop_img, resize_img, max_wh_ratio, this->use_tensorrt_);
this->normalize_op_.Run( this->normalize_op_.Run(
&resize_img_rec, this->mean_rec, this->scale_rec, this->is_scale_); &resize_img, this->mean_rec, this->scale_rec, this->is_scale_);
std::vector<float> output_rec( max_resize_w = std::max(max_resize_w, resize_img.cols);
1 * 3 * resize_img_rec.rows * resize_img_rec.cols, 0.0f); max_resize_h = std::max(max_resize_h, resize_img.rows);
resize_imgs.push_back(resize_img);
this->permute_op_.Run(&resize_img_rec, output_rec.data()); }
int buf_size = 3 * max_resize_h * max_resize_w;
// Inference. output_rec = std::vector<std::vector<float>>(box_num,
output_shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols}; std::vector<float>(buf_size, 0.0f));
out_num = std::accumulate( for (int i = 0; i < box_num; ++i) {
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>()); resize_img_rec = resize_imgs[i];
databuf_size_out = out_num * sizeof(float);
databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out); this->permute_op_.Run(&resize_img_rec, output_rec[i].data());
if (!databuf_data_out) { }
LOG(ERROR) << "Malloc failed, size: " << databuf_size_out;
return -1; // Inference.
} output_shape = {box_num, 3, max_resize_h, max_resize_w};
memcpy(databuf_data_out, output_rec.data(), databuf_size_out); out_num = std::accumulate(
databuf_char_out = reinterpret_cast<char*>(databuf_data_out); output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out); databuf_size_out = out_num * sizeof(float);
paddle::PaddleTensor tensor_out; databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out);
tensor_out.name = "image"; if (!databuf_data_out) {
tensor_out.dtype = paddle::PaddleDType::FLOAT32; LOG(ERROR) << "Malloc failed, size: " << databuf_size_out;
tensor_out.shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols}; return -1;
tensor_out.data = paddleBuf; }
out->push_back(tensor_out); int offset = buf_size * sizeof(float);
for (int i = 0; i < box_num; ++i) {
memcpy(databuf_data_out + i * offset, output_rec[i].data(), offset);
} }
databuf_char_out = reinterpret_cast<char*>(databuf_data_out);
paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out);
paddle::PaddleTensor tensor_out;
tensor_out.name = "image";
tensor_out.dtype = paddle::PaddleDType::FLOAT32;
tensor_out.shape = output_shape;
tensor_out.data = paddleBuf;
out->push_back(tensor_out);
} }
out->erase(out->begin(), out->begin() + infer_outnum); out->erase(out->begin(), out->begin() + infer_outnum);
......
...@@ -63,7 +63,7 @@ class GeneralDetectionOp ...@@ -63,7 +63,7 @@ class GeneralDetectionOp
double det_db_thresh_ = 0.3; double det_db_thresh_ = 0.3;
double det_db_box_thresh_ = 0.5; double det_db_box_thresh_ = 0.5;
double det_db_unclip_ratio_ = 2.0; double det_db_unclip_ratio_ = 1.5;
std::vector<float> mean_det = {0.485f, 0.456f, 0.406f}; std::vector<float> mean_det = {0.485f, 0.456f, 0.406f};
std::vector<float> scale_det = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f}; std::vector<float> scale_det = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include "core/cube/cube-api/include/cube_api.h" #include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/framework/cache.h"
#include "core/predictor/framework/infer.h" #include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h" #include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h" #include "core/predictor/framework/resource.h"
...@@ -36,10 +37,11 @@ using baidu::paddle_serving::predictor::general_model::Response; ...@@ -36,10 +37,11 @@ using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
using baidu::paddle_serving::predictor::CubeCache;
// DistKV Infer Op: seek cube and then call paddle inference // DistKV Infer Op: seek cube and then call paddle inference
// op seq: general_reader-> dist_kv_infer -> general_response // op seq: general_reader-> dist_kv_infer -> general_response
int GeneralDistKVInferOp::inference() { int GeneralDistKVInferOp::inference() {
VLOG(2) << "Going to run inference"; VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names(); const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) { if (pre_node_names.size() != 1) {
...@@ -60,8 +62,8 @@ int GeneralDistKVInferOp::inference() { ...@@ -60,8 +62,8 @@ int GeneralDistKVInferOp::inference() {
GeneralBlob *output_blob = mutable_data<GeneralBlob>(); GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!output_blob) { if (!output_blob) {
LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error"; LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error";
return -1; return -1;
} }
output_blob->SetLogId(log_id); output_blob->SetLogId(log_id);
...@@ -70,21 +72,30 @@ int GeneralDistKVInferOp::inference() { ...@@ -70,21 +72,30 @@ int GeneralDistKVInferOp::inference() {
<< ") Failed mutable depended argument, op:" << pre_name; << ") Failed mutable depended argument, op:" << pre_name;
return -1; return -1;
} }
Timer timeline;
timeline.Start();
const TensorVector *in = &input_blob->tensor_vector; const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector; TensorVector *out = &output_blob->tensor_vector;
std::vector<uint64_t> keys; std::vector<uint64_t> keys;
std::vector<uint64_t> unique_keys;
std::unordered_map<uint64_t, rec::mcube::CubeValue *> key_map;
std::vector<rec::mcube::CubeValue> values; std::vector<rec::mcube::CubeValue> values;
int sparse_count = 0; // sparse inputs counts, sparse would seek cube // sparse inputs counts, sparse would seek cube
int dense_count = 0; // dense inputs counts, dense would directly call paddle infer int sparse_count = 0;
// dense inputs counts, dense would directly call paddle infer
int dense_count = 0;
std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs; std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
size_t key_len = 0; size_t key_len = 0;
for (size_t i = 0; i < in->size(); ++i) { for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) { if (in->at(i).dtype != paddle::PaddleDType::INT64) {
// dense input type is not int64
++dense_count; ++dense_count;
continue; continue;
} }
// sparse input type is int64
++sparse_count; ++sparse_count;
size_t elem_num = 1; size_t elem_num = 1;
for (size_t s = 0; s < in->at(i).shape.size(); ++s) { for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
elem_num *= in->at(i).shape[s]; elem_num *= in->at(i).shape[s];
...@@ -94,7 +105,8 @@ int GeneralDistKVInferOp::inference() { ...@@ -94,7 +105,8 @@ int GeneralDistKVInferOp::inference() {
dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num)); dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
} }
keys.resize(key_len); keys.resize(key_len);
VLOG(3) << "(logid=" << log_id << ") cube number of keys to look up: " << key_len; unique_keys.resize(key_len);
int key_idx = 0; int key_idx = 0;
for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) { for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
std::copy(dataptr_size_pairs[i].first, std::copy(dataptr_size_pairs[i].first,
...@@ -102,20 +114,81 @@ int GeneralDistKVInferOp::inference() { ...@@ -102,20 +114,81 @@ int GeneralDistKVInferOp::inference() {
keys.begin() + key_idx); keys.begin() + key_idx);
key_idx += dataptr_size_pairs[i].second; key_idx += dataptr_size_pairs[i].second;
} }
// filter duplicate keys
int unique_keys_count = 0;
for (size_t i = 0; i < keys.size(); ++i) {
if (key_map.find(keys[i]) == key_map.end()) {
key_map[keys[i]] = nullptr;
unique_keys[unique_keys_count++] = keys[i];
}
}
unique_keys.resize(unique_keys_count);
VLOG(1) << "(logid=" << log_id
<< ") cube number of keys to look up: " << key_len
<< " uniq keys: " << unique_keys_count;
// filter cache keys
size_t hit_counts = 0;
int64_t seek_cache_start = timeline.TimeStampUS();
CubeCache *p_cube_cache =
InferManager::instance().get_cube_cache(engine_name().c_str());
if (p_cube_cache != nullptr) {
for (size_t i = 0; i < unique_keys_count; ++i) {
rec::mcube::CubeValue *hit_val = p_cube_cache->get_data(unique_keys[i]);
if (hit_val) {
// LOG(WARNING) << "Hit one cache. key:" << unique_keys[i];
key_map[unique_keys[i]] = hit_val;
if (hit_counts % 100 == 0) {
LOG(WARNING) << "hit cache! key:" << unique_keys[i]
<< " value:" << hit_val->buff;
}
unique_keys[i] = 0;
++hit_counts;
}
}
} else {
LOG(WARNING) << "get cube cache fail. model: " << engine_name();
}
// clear unique keys which hit caches
if (hit_counts > 0) {
for (auto it = unique_keys.begin(); it < unique_keys.end();) {
if (*it == 0) {
it = unique_keys.erase(it);
--unique_keys_count;
} else {
++it;
}
}
}
int64_t seek_cache_end = timeline.TimeStampUS();
VLOG(2) << "cache hit " << hit_counts
<< " keys in cube cache, last unique_keys:" << unique_keys.size()
<< " , seek_time:" << seek_cache_end - seek_cache_start;
// seek sparse params
rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance(); rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
std::vector<std::string> table_names = cube->get_table_names(); std::vector<std::string> table_names = cube->get_table_names();
if (table_names.size() == 0) { if (table_names.size() == 0) {
LOG(ERROR) << "cube init error or cube config not given."; LOG(ERROR) << "cube init error or cube config not given.";
return -1; return -1;
} }
// gather keys and seek cube servers, put results in values int64_t seek_start = timeline.TimeStampUS();
int ret = cube->seek(table_names[0], keys, &values); int ret = cube->seek(table_names[0], unique_keys, &values);
VLOG(3) << "(logid=" << log_id << ") cube seek status: " << ret; int64_t seek_end = timeline.TimeStampUS();
VLOG(2) << "(logid=" << log_id << ") cube seek status: " << ret
<< " , unique_key: " << unique_keys.size()
<< " , seek_time: " << seek_end - seek_start;
for (size_t i = 0; i < unique_keys.size(); ++i) {
key_map[unique_keys[i]] = &values[i];
}
if (values.size() != keys.size() || values[0].buff.size() == 0) { if (values.size() != keys.size() || values[0].buff.size() == 0) {
LOG(ERROR) << "cube value return null"; LOG(ERROR) << "cube value return null";
} }
// EMBEDDING_SIZE means the length of sparse vector, user can define length here. size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float); // size_t EMBEDDING_SIZE = (values[0].buff.size() - 10) / sizeof(float);
//size_t EMBEDDING_SIZE = 9;
TensorVector sparse_out; TensorVector sparse_out;
sparse_out.resize(sparse_count); sparse_out.resize(sparse_count);
TensorVector dense_out; TensorVector dense_out;
...@@ -126,8 +199,10 @@ int GeneralDistKVInferOp::inference() { ...@@ -126,8 +199,10 @@ int GeneralDistKVInferOp::inference() {
std::unordered_map<int, int> in_out_map; std::unordered_map<int, int> in_out_map;
baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource &resource =
baidu::paddle_serving::predictor::Resource::instance(); baidu::paddle_serving::predictor::Resource::instance();
std::shared_ptr<PaddleGeneralModelConfig> model_config = resource.get_general_model_config().front(); std::shared_ptr<PaddleGeneralModelConfig> model_config =
//copy data to tnsor resource.get_general_model_config().front();
int cube_key_found = 0;
int cube_key_miss = 0;
for (size_t i = 0; i < in->size(); ++i) { for (size_t i = 0; i < in->size(); ++i) {
if (in->at(i).dtype != paddle::PaddleDType::INT64) { if (in->at(i).dtype != paddle::PaddleDType::INT64) {
dense_out[dense_idx] = in->at(i); dense_out[dense_idx] = in->at(i);
...@@ -142,43 +217,75 @@ int GeneralDistKVInferOp::inference() { ...@@ -142,43 +217,75 @@ int GeneralDistKVInferOp::inference() {
sparse_out[sparse_idx].lod[x].begin()); sparse_out[sparse_idx].lod[x].begin());
} }
sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32; sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
sparse_out[sparse_idx].shape.push_back(sparse_out[sparse_idx].lod[0].back()); sparse_out[sparse_idx].shape.push_back(
sparse_out[sparse_idx].lod[0].back());
sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE); sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
sparse_out[sparse_idx].name = model_config->_feed_name[i]; sparse_out[sparse_idx].name = model_config->_feed_name[i];
sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() * sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
EMBEDDING_SIZE * sizeof(float)); EMBEDDING_SIZE * sizeof(float));
float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data()); float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
if (!dst_ptr) {
VLOG(2) << "dst_ptr is null. sparse_idx:" << sparse_idx;
continue;
}
for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) { for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
float *data_ptr = dst_ptr + x * EMBEDDING_SIZE; float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
memcpy(data_ptr, uint64_t cur_key = keys[cube_val_idx];
values[cube_val_idx].buff.data(), rec::mcube::CubeValue *cur_val = key_map[cur_key];
values[cube_val_idx].buff.size()); if (cur_val->buff.size() == 0) {
cube_val_idx++; memset(data_ptr, (float)0.0, sizeof(float) * EMBEDDING_SIZE);
++cube_key_miss;
++cube_val_idx;
continue;
}
// The data generated by pslib has 10 bytes of information to be filtered
// out
memcpy(data_ptr, cur_val->buff.data(), cur_val->buff.size() );
// VLOG(3) << keys[cube_val_idx] << ":" << data_ptr[0] << ", " <<
// data_ptr[1] << ", " <<data_ptr[2] << ", " <<data_ptr[3] << ", "
// <<data_ptr[4] << ", " <<data_ptr[5] << ", " <<data_ptr[6] << ", "
// <<data_ptr[7] << ", " <<data_ptr[8];
++cube_key_found;
++cube_val_idx;
} }
++sparse_idx; ++sparse_idx;
} }
VLOG(3) << "(logid=" << log_id << ") sparse tensor load success."; bool cube_fail = (cube_key_found == 0);
if (cube_fail) {
LOG(WARNING) << "(logid=" << log_id << ") cube seek fail";
}
VLOG(2) << "(logid=" << log_id << ") cube key found: " << cube_key_found
<< " , cube key miss: " << cube_key_miss;
VLOG(2) << "(logid=" << log_id << ") sparse tensor load success.";
timeline.Pause();
VLOG(2) << "dist kv, cube and datacopy time: " << timeline.ElapsedUS();
TensorVector infer_in; TensorVector infer_in;
infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end()); infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end()); infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
int batch_size = input_blob->_batch_size; int batch_size = input_blob->_batch_size;
output_blob->_batch_size = batch_size; output_blob->_batch_size = batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS(); int64_t start = timeline.TimeStampUS();
timeline.Start(); timeline.Start();
// call paddle inference here // call paddle inference here
if (InferManager::instance().infer( if (InferManager::instance().infer(
engine_name().c_str(), &infer_in, out, batch_size)) { engine_name().c_str(), &infer_in, out, batch_size)) {
LOG(ERROR) << "(logid=" << log_id << ") Failed do infer in fluid model: " << engine_name(); LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name();
return -1; return -1;
} }
int64_t end = timeline.TimeStampUS(); int64_t end = timeline.TimeStampUS();
if (cube_fail) {
float *out_ptr = static_cast<float *>(out->at(0).data.data());
out_ptr[0] = 0.0;
}
timeline.Pause();
VLOG(2) << "dist kv, pure paddle infer time: " << timeline.ElapsedUS();
CopyBlobInfo(input_blob, output_blob); CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start); AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end); AddBlobInfo(output_blob, end);
return 0; return 0;
} }
DEFINE_OP(GeneralDistKVInferOp); DEFINE_OP(GeneralDistKVInferOp);
......
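The DistKV change above deduplicates the raw key stream, answers what it can from the per-engine CubeCache, and only sends the remaining unique keys to the cube service; key_map then lets every original (possibly repeated) key find its CubeValue during the copy into the sparse tensors. A minimal sketch of that dedup-then-filter idea with plain STL containers (CubeValue is stubbed out here; the real types come from cube_api.h and cache.h):
#include <algorithm>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>
struct FakeValue { std::string buff; };  // stand-in for rec::mcube::CubeValue
int main() {
  std::vector<uint64_t> keys = {7, 3, 7, 9, 3};        // raw keys, with repeats
  std::unordered_map<uint64_t, FakeValue*> key_map;
  std::vector<uint64_t> unique_keys;
  for (uint64_t k : keys) {
    if (key_map.find(k) == key_map.end()) {            // first occurrence of k
      key_map[k] = nullptr;
      unique_keys.push_back(k);
    }
  }
  // Pretend the cache already holds key 7; only {3, 9} would be sent to cube.
  static FakeValue cached{"embedding-bytes"};
  key_map[7] = &cached;
  unique_keys.erase(std::remove(unique_keys.begin(), unique_keys.end(), 7),
                    unique_keys.end());
  return static_cast<int>(unique_keys.size());         // 2 keys left to seek
}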
...@@ -31,7 +31,22 @@ using baidu::paddle_serving::predictor::MempoolWrapper; ...@@ -31,7 +31,22 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Tensor;
using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; // support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16
enum ProtoDataType {
P_INT64 = 0,
P_FLOAT32,
P_INT32,
P_FP64,
P_INT16,
P_FP16,
P_BF16,
P_UINT8,
P_INT8,
P_BOOL,
P_COMPLEX64,
P_COMPLEX128,
P_STRING = 20,
};
int GeneralReaderOp::inference() { int GeneralReaderOp::inference() {
// read request from client // read request from client
...@@ -78,6 +93,7 @@ int GeneralReaderOp::inference() { ...@@ -78,6 +93,7 @@ int GeneralReaderOp::inference() {
int64_t elem_type = 0; int64_t elem_type = 0;
int64_t elem_size = 0; int64_t elem_size = 0;
int64_t databuf_size = 0; int64_t databuf_size = 0;
const void* src_ptr = nullptr;
for (int i = 0; i < var_num; ++i) { for (int i = 0; i < var_num; ++i) {
paddle::PaddleTensor paddleTensor; paddle::PaddleTensor paddleTensor;
const Tensor &tensor = req->tensor(i); const Tensor &tensor = req->tensor(i);
...@@ -86,19 +102,38 @@ int GeneralReaderOp::inference() { ...@@ -86,19 +102,38 @@ int GeneralReaderOp::inference() {
elem_size = 0; elem_size = 0;
databuf_size = 0; databuf_size = 0;
elem_type = tensor.elem_type(); elem_type = tensor.elem_type();
VLOG(2) << "var[" << i << "] has elem type: " << elem_type; src_ptr = nullptr ;
if (elem_type == P_INT64) { // int64 if (elem_type == P_INT64) { // int64
elem_size = sizeof(int64_t); elem_size = sizeof(int64_t);
paddleTensor.dtype = paddle::PaddleDType::INT64; paddleTensor.dtype = paddle::PaddleDType::INT64;
data_len = tensor.int64_data_size(); data_len = tensor.int64_data_size();
src_ptr = tensor.int64_data().data();
} else if (elem_type == P_FLOAT32) { } else if (elem_type == P_FLOAT32) {
elem_size = sizeof(float); elem_size = sizeof(float);
paddleTensor.dtype = paddle::PaddleDType::FLOAT32; paddleTensor.dtype = paddle::PaddleDType::FLOAT32;
data_len = tensor.float_data_size(); data_len = tensor.float_data_size();
src_ptr = tensor.float_data().data();
} else if (elem_type == P_INT32) { } else if (elem_type == P_INT32) {
elem_size = sizeof(int32_t); elem_size = sizeof(int32_t);
paddleTensor.dtype = paddle::PaddleDType::INT32; paddleTensor.dtype = paddle::PaddleDType::INT32;
data_len = tensor.int_data_size(); data_len = tensor.int_data_size();
src_ptr = tensor.int_data().data();
} else if (elem_type == P_UINT8) {
elem_size = sizeof(uint8_t);
paddleTensor.dtype = paddle::PaddleDType::UINT8;
data_len = tensor.tensor_content().size();
src_ptr = tensor.tensor_content().data();
} else if (elem_type == P_INT8) {
elem_size = sizeof(int8_t);
paddleTensor.dtype = paddle::PaddleDType::INT8;
data_len = tensor.tensor_content().size();
src_ptr = tensor.tensor_content().data();
} else if (elem_type == P_FP16) {
// copy bytes from tensor content to TensorVector
elem_size = 1;
paddleTensor.dtype = paddle::PaddleDType::FLOAT16;
data_len = tensor.tensor_content().size();
src_ptr = tensor.tensor_content().data();
} else if (elem_type == P_STRING) { } else if (elem_type == P_STRING) {
// use paddle::PaddleDType::UINT8 as for String. // use paddle::PaddleDType::UINT8 as for String.
elem_size = sizeof(char); elem_size = sizeof(char);
...@@ -109,8 +144,18 @@ int GeneralReaderOp::inference() { ...@@ -109,8 +144,18 @@ int GeneralReaderOp::inference() {
// now only support single string // now only support single string
for (int idx = 0; idx < tensor.data_size(); idx++) { for (int idx = 0; idx < tensor.data_size(); idx++) {
data_len += tensor.data()[idx].length() + 1; data_len += tensor.data()[idx].length() + 1;
src_ptr = tensor.data()[idx].data();
} }
} }
VLOG(2) << "var[" << i << "] has elem type: " << elem_type << ";"
<< "elem_size=" << elem_size << ";"
<< "dtype=" << paddleTensor.dtype << ";"
<< "data_len=" << data_len;
if (src_ptr == nullptr) {
LOG(ERROR) << "Not support var[" << i << "] with elem_type["
<< elem_type << "]";
continue;
}
// implement lod tensor here // implement lod tensor here
// only support 1-D lod // only support 1-D lod
// TODO(HexToString): support 2-D lod // TODO(HexToString): support 2-D lod
...@@ -141,44 +186,17 @@ int GeneralReaderOp::inference() { ...@@ -141,44 +186,17 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") var[" << i VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] has lod_tensor and len=" << out->at(i).lod[0].back(); << "] has lod_tensor and len=" << out->at(i).lod[0].back();
} }
if (elem_type == P_INT64) { void* dst_ptr = out->at(i).data.data();
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data()); if (!dst_ptr) {
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i LOG(ERROR) << "dst_ptr is nullptr";
<< "] is " << tensor.int64_data(0); return -1;
if (!dst_ptr) { }
LOG(ERROR) << "dst_ptr is nullptr";
return -1; // For common data, we just copy from src to dst
} // For string data, we need to iterate through all str
memcpy(dst_ptr, tensor.int64_data().data(), databuf_size); if (elem_type != P_STRING) {
/* memcpy(dst_ptr, src_ptr, databuf_size);
int elem_num = tensor.int64_data_size(); } else {
for (int k = 0; k < elem_num; ++k) {
dst_ptr[k] = tensor.int64_data(k);
}
*/
} else if (elem_type == P_FLOAT32) {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << tensor.float_data(0);
if (!dst_ptr) {
LOG(ERROR) << "dst_ptr is nullptr";
return -1;
}
memcpy(dst_ptr, tensor.float_data().data(), databuf_size);
/*int elem_num = tensor.float_data_size();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[k] = tensor.float_data(k);
}*/
} else if (elem_type == P_INT32) {
int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << tensor.int_data(0);
if (!dst_ptr) {
LOG(ERROR) << "dst_ptr is nullptr";
return -1;
}
memcpy(dst_ptr, tensor.int_data().data(), databuf_size);
} else if (elem_type == P_STRING) {
char *dst_ptr = static_cast<char *>(out->at(i).data.data()); char *dst_ptr = static_cast<char *>(out->at(i).data.data());
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << tensor.data(0); << "] is " << tensor.data(0);
......
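The reader rewrite above collapses the per-type copy branches into one path: derive (elem_size, PaddleDType, data_len, src_ptr) from elem_type, then do a single memcpy for every non-string input, while strings keep their char-by-char handling. The element-size mapping it implements, written out as a small lookup sketch (the P_* codes follow the ProtoDataType enum added at the top of this file; FP16 is copied as raw bytes, so its element size is 1):
#include <cstddef>
#include <cstdint>
size_t elem_size_of(int elem_type) {
  switch (elem_type) {
    case 0:  return sizeof(int64_t);  // P_INT64
    case 1:  return sizeof(float);    // P_FLOAT32
    case 2:  return sizeof(int32_t);  // P_INT32
    case 5:  return 1;                // P_FP16: raw bytes from tensor_content
    case 7:  return sizeof(uint8_t);  // P_UINT8
    case 8:  return sizeof(int8_t);   // P_INT8
    case 20: return sizeof(char);     // P_STRING, copied character by character
    default: return 0;                // mirrors the "Unsupported var" branch above
  }
}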
...@@ -74,10 +74,19 @@ int GeneralResponseOp::inference() { ...@@ -74,10 +74,19 @@ int GeneralResponseOp::inference() {
// and the order of Output is the same as the prototxt FetchVar. // and the order of Output is the same as the prototxt FetchVar.
// otherwise, you can only get the Output by the corresponding of // otherwise, you can only get the Output by the corresponding of
// Name -- Alias_name. // Name -- Alias_name.
fetch_index.resize(req->fetch_var_names_size()); if (req->fetch_var_names_size() > 0) {
for (int i = 0; i < req->fetch_var_names_size(); ++i) { fetch_index.resize(req->fetch_var_names_size());
fetch_index[i] = for (int i = 0; i < req->fetch_var_names_size(); ++i) {
model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; fetch_index[i] =
model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
}
} else {
fetch_index.resize(model_config->_fetch_alias_name.size());
for (int i = 0; i < model_config->_fetch_alias_name.size(); ++i) {
fetch_index[i] =
model_config
->_fetch_alias_name_to_index[model_config->_fetch_alias_name[i]];
}
} }
for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
...@@ -105,7 +114,7 @@ int GeneralResponseOp::inference() { ...@@ -105,7 +114,7 @@ int GeneralResponseOp::inference() {
// fetch_index is the real index in FetchVar of Fetchlist // fetch_index is the real index in FetchVar of Fetchlist
// for example, FetchVar = {0:A, 1:B, 2:C} // for example, FetchVar = {0:A, 1:B, 2:C}
// FetchList = {0:C,1:A}, at this situation. // FetchList = {0:C,1:A}, at this situation.
// fetch_index = [2,0], C's index = 2 and A's index = 0 // fetch_index = [2,0], C's index = 2 and A's index = 0
for (auto &idx : fetch_index) { for (auto &idx : fetch_index) {
Tensor *tensor = output->add_tensor(); Tensor *tensor = output->add_tensor();
tensor->set_name(in->at(idx).name); tensor->set_name(in->at(idx).name);
...@@ -159,6 +168,21 @@ int GeneralResponseOp::inference() { ...@@ -159,6 +168,21 @@ int GeneralResponseOp::inference() {
google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr, google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
data_ptr + cap); data_ptr + cap);
output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data); output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data);
} else if (dtype == paddle::PaddleDType::UINT8) {
tensor->set_elem_type(7);
VLOG(2) << "(logid=" << log_id << ")Prepare uint8 var ["
<< model_config->_fetch_name[idx] << "].";
tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
} else if (dtype == paddle::PaddleDType::INT8) {
tensor->set_elem_type(8);
VLOG(2) << "(logid=" << log_id << ")Prepare int8 var ["
<< model_config->_fetch_name[idx] << "].";
tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
} else if (dtype == paddle::PaddleDType::FLOAT16) {
tensor->set_elem_type(5);
VLOG(2) << "(logid=" << log_id << ")Prepare float16 var ["
<< model_config->_fetch_name[idx] << "].";
tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
} }
VLOG(2) << "(logid=" << log_id << ") fetch var [" VLOG(2) << "(logid=" << log_id << ") fetch var ["
......
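The three new branches above serialize UINT8, INT8 and FLOAT16 fetch outputs as raw bytes in tensor_content and tag them with elem_type 7, 8 and 5. Those magic numbers are meant to line up with the ProtoDataType enum introduced in general_reader_op.cpp; a compile-time sketch of that assumption (the enum is copied from that file):
enum ProtoDataType {
  P_INT64 = 0, P_FLOAT32, P_INT32, P_FP64, P_INT16, P_FP16, P_BF16,
  P_UINT8, P_INT8, P_BOOL, P_COMPLEX64, P_COMPLEX128, P_STRING = 20
};
static_assert(P_FP16 == 5 && P_UINT8 == 7 && P_INT8 == 8,
              "elem_type codes set by GeneralResponseOp must match ProtoDataType");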
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
syntax = "proto2"; syntax = "proto3";
import "pds_option.proto"; import "pds_option.proto";
import "builtin_format.proto"; import "builtin_format.proto";
package baidu.paddle_serving.predictor.general_model; package baidu.paddle_serving.predictor.general_model;
...@@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model; ...@@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model;
option cc_generic_services = true; option cc_generic_services = true;
message Tensor { message Tensor {
repeated string data = 1; // VarType: INT64
repeated int32 int_data = 2; repeated int64 int64_data = 1;
repeated int64 int64_data = 3;
repeated float float_data = 4; // VarType: FP32
optional int32 elem_type = repeated float float_data = 2;
5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
repeated int32 shape = 6; // shape should include batch // VarType: INT32
repeated int32 lod = 7; // only for fetch tensor currently repeated int32 int_data = 3;
optional string name = 8; // get from the Model prototxt
optional string alias_name = 9; // get from the Model prototxt // VarType: FP64
repeated double float64_data = 4;
// VarType: UINT32
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (Not supported) VarType: COMPLEX64, index 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (Not supported) VarType: COMPLEX128, index 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string data = 9;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 20 => STRING
int32 elem_type = 10;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 11;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 12;
// Correspond to the variable 'name' in the model description prototxt.
string name = 13;
// Correspond to the variable 'alias_name' in the model description prototxt.
string alias_name = 14; // get from the Model prototxt
// VarType: FP16, INT16, INT8, BF16, UINT8
bytes tensor_content = 15;
}; };
message Request { message Request {
repeated Tensor tensor = 1; repeated Tensor tensor = 1;
repeated string fetch_var_names = 2; repeated string fetch_var_names = 2;
optional bool profile_server = 3 [ default = false ]; bool profile_server = 3;
required uint64 log_id = 4 [ default = 0 ]; uint64 log_id = 4;
}; };
message Response { message Response {
repeated ModelOutput outputs = 1; repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2; repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
// Error messages
string err_msg = 4;
}; };
message ModelOutput { message ModelOutput {
repeated Tensor tensor = 1; repeated Tensor tensor = 1;
optional string engine_name = 2; string engine_name = 2;
} }
service GeneralModelService { service GeneralModelService {
......
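The reworked Tensor message keeps one repeated field per dense type plus a bytes tensor_content fallback for FP16/BF16/INT16/INT8/UINT8. A sketch of filling it through the protoc-generated C++ accessors (the generated header name is an assumption; it is not shown in this diff):
// #include "general_model_service.pb.h"  // assumed name of the generated header
#include <string>
using baidu::paddle_serving::predictor::general_model::Tensor;
void fill_tensor_sketch(Tensor* t, const std::string& raw_fp16_bytes) {
  t->set_name("image");        // 'name' from the model description prototxt
  t->set_alias_name("image");  // 'alias_name' from the model description prototxt
  t->add_shape(1); t->add_shape(3); t->add_shape(224); t->add_shape(224);
  t->set_elem_type(1);         // 1 => FP32, see the mapping comment above
  for (int i = 0; i < 3 * 224 * 224; ++i) t->add_float_data(0.0f);
  // An FP16 payload would instead go into tensor_content as raw bytes:
  // t->set_elem_type(5);
  // t->set_tensor_content(raw_fp16_bytes);
}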
...@@ -276,43 +276,65 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -276,43 +276,65 @@ class PdsCodeGenerator : public CodeGenerator {
"output_name", "output_name",
google::protobuf::dots_to_colons(m->output_type()->full_name())); google::protobuf::dots_to_colons(m->output_type()->full_name()));
if (m->name() == "inference") { if (m->name() == "inference") {
std::string inference_body = "";
inference_body += " brpc::ClosureGuard done_guard(done);\n";
inference_body += " brpc::Controller* cntl = \n";
inference_body += " static_cast<brpc::Controller*>(cntl_base);\n";
inference_body += " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n";
inference_body += " uint64_t log_id = request->log_id();\n";
inference_body += " cntl->set_log_id(log_id);\n";
inference_body += " ::baidu::paddle_serving::predictor::InferService* svr = \n";
inference_body += " ";
inference_body += "::baidu::paddle_serving::predictor::InferServiceManager::instance(";
inference_body += ").item(\"$service$\");\n";
inference_body += " if (svr == NULL) {\n";
inference_body += " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: ";
inference_body += "$service$\";\n";
inference_body += " cntl->SetFailed(404, \"Not found service: $service$\");\n";
inference_body += " return ;\n";
inference_body += " }\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "remote_side=\[\" << cntl->remote_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "local_side=\[\" << cntl->local_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "service_name=\[\" << \"$name$\" << \"\]\";\n"; // NOLINT
inference_body += " int err_code = svr->inference(request, response, log_id);\n";
inference_body += " if (err_code != 0) {\n";
inference_body += " LOG(WARNING)\n";
inference_body += " << \"(logid=\" << log_id << \") Failed call ";
inference_body += "inferservice[$name$], name[$service$]\"\n";
inference_body += " << \", error_code: \" << err_code;\n";
inference_body += " cntl->SetFailed(err_code, \"InferService inference ";
inference_body += "failed!\");\n";
inference_body += " }\n";
inference_body += " gettimeofday(&tv, NULL);\n";
inference_body += " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n";
if (service_name == "GeneralModelService") {
inference_body += " std::ostringstream oss;\n";
inference_body += " oss << \"[serving]\"\n";
inference_body += " << \"logid=\" << log_id << \",\";\n";
inference_body += " int op_num = response->profile_time_size() / 2;\n";
inference_body += " for (int i = 0; i < op_num; ++i) {\n";
inference_body += " double t = (response->profile_time(i * 2 + 1)\n";
inference_body += " - response->profile_time(i * 2)) / 1000.0;\n";
inference_body += " oss << \"op\" << i << \"=\" << t << \"ms,\";\n";
inference_body += " }\n";
inference_body += " double total_time = (end - start) / 1000.0;\n";
inference_body += " oss << \"cost=\" << total_time << \"ms.\";\n";
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << oss.str();\n";
inference_body += " response->add_profile_time(start);\n";
inference_body += " response->add_profile_time(end);\n";
} else {
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - "; // NOLINT
inference_body += "start) << \"\]\";\n";
}
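      // Hedged illustration (not verbatim generated output): for GeneralModelService the
      // string assembled above makes the generated service log a line such as
      //   [serving]logid=1234,op0=2.31ms,op1=0.87ms,cost=3.40ms.
      // while every other service keeps the single "tc=[...]" total-cost line.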
printer->Print( printer->Print(
" baidu::rpc::ClosureGuard done_guard(done);\n" inference_body.c_str(),
" baidu::rpc::Controller* cntl = \n"
" static_cast<baidu::rpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = request->log_id();\n"
" cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n"
" "
"::baidu::paddle_serving::predictor::InferServiceManager::instance("
").item(\"$service$\");\n"
" if (svr == NULL) {\n"
" LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: "
"$service$\";\n"
" cntl->SetFailed(404, \"Not found service: $service$\");\n"
" return ;\n"
" }\n"
" LOG(INFO) << \"(logid=\" << log_id << \") remote_side=\[\" " // NOLINT
"<< cntl->remote_side() << \"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") local_side=\[\" " // NOLINT
"<< cntl->local_side() << \"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") service_name=\[\" " // NOLINT
"<< \"$name$\" << \"\]\";\n"
" int err_code = svr->inference(request, response, log_id);\n"
" if (err_code != 0) {\n"
" LOG(WARNING)\n"
" << \"(logid=\" << log_id << \") Failed call "
"inferservice[$name$], name[$service$]\"\n"
" << \", error_code: \" << err_code;\n"
" cntl->SetFailed(err_code, \"InferService inference "
"failed!\");\n"
" }\n"
" gettimeofday(&tv, NULL);\n"
" long end = tv.tv_sec * 1000000 + tv.tv_usec;\n"
" // flush notice log\n"
" LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT
"start) << \"\]\";\n", // NOLINT
"name", "name",
class_name, class_name,
"service", "service",
...@@ -1021,45 +1043,65 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -1021,45 +1043,65 @@ class PdsCodeGenerator : public CodeGenerator {
"output_name", "output_name",
google::protobuf::dots_to_colons(m->output_type()->full_name())); google::protobuf::dots_to_colons(m->output_type()->full_name()));
if (m->name() == "inference") { if (m->name() == "inference") {
std::string inference_body = "";
inference_body += " brpc::ClosureGuard done_guard(done);\n";
inference_body += " brpc::Controller* cntl = \n";
inference_body += " static_cast<brpc::Controller*>(cntl_base);\n";
inference_body += " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n";
inference_body += " uint64_t log_id = request->log_id();\n";
inference_body += " cntl->set_log_id(log_id);\n";
inference_body += " ::baidu::paddle_serving::predictor::InferService* svr = \n";
inference_body += " ";
inference_body += "::baidu::paddle_serving::predictor::InferServiceManager::instance(";
inference_body += ").item(\"$service$\");\n";
inference_body += " if (svr == NULL) {\n";
inference_body += " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: ";
inference_body += "$service$\";\n";
inference_body += " cntl->SetFailed(404, \"Not found service: $service$\");\n";
inference_body += " return ;\n";
inference_body += " }\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "remote_side=\[\" << cntl->remote_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "local_side=\[\" << cntl->local_side() << "; // NOLINT
inference_body += "\"\]\";\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") ";
inference_body += "service_name=\[\" << \"$name$\" << \"\]\";\n"; // NOLINT
inference_body += " int err_code = svr->inference(request, response, log_id);\n";
inference_body += " if (err_code != 0) {\n";
inference_body += " LOG(WARNING)\n";
inference_body += " << \"(logid=\" << log_id << \") Failed call ";
inference_body += "inferservice[$name$], name[$service$]\"\n";
inference_body += " << \", error_code: \" << err_code;\n";
inference_body += " cntl->SetFailed(err_code, \"InferService inference ";
inference_body += "failed!\");\n";
inference_body += " }\n";
inference_body += " gettimeofday(&tv, NULL);\n";
inference_body += " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n";
if (service_name == "GeneralModelService") {
inference_body += " std::ostringstream oss;\n";
inference_body += " oss << \"[serving]\"\n";
inference_body += " << \"logid=\" << log_id << \",\";\n";
inference_body += " int op_num = response->profile_time_size() / 2;\n";
inference_body += " for (int i = 0; i < op_num; ++i) {\n";
inference_body += " double t = (response->profile_time(i * 2 + 1)\n";
inference_body += " - response->profile_time(i * 2)) / 1000.0;\n";
inference_body += " oss << \"op\" << i << \"=\" << t << \"ms,\";\n";
inference_body += " }\n";
inference_body += " double total_time = (end - start) / 1000.0;\n";
inference_body += " oss << \"cost=\" << total_time << \"ms.\";\n";
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << oss.str();\n";
inference_body += " response->add_profile_time(start);\n";
inference_body += " response->add_profile_time(end);\n";
} else {
inference_body += " // flush notice log\n";
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - "; // NOLINT
inference_body += "start) << \"\]\";\n";
}
printer->Print( printer->Print(
" brpc::ClosureGuard done_guard(done);\n" inference_body.c_str(),
" brpc::Controller* cntl = \n"
" static_cast<brpc::Controller*>(cntl_base);\n"
" cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"
" uint64_t log_id = request->log_id();\n"
" cntl->set_log_id(log_id);\n"
" ::baidu::paddle_serving::predictor::InferService* svr = \n"
" "
"::baidu::paddle_serving::predictor::InferServiceManager::instance("
").item(\"$service$\");\n"
" if (svr == NULL) {\n"
" LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: "
"$service$\";\n"
" cntl->SetFailed(404, \"Not found service: $service$\");\n"
" return ;\n"
" }\n"
" LOG(INFO) << \"(logid=\" << log_id << \") "
"remote_side=\[\" << cntl->remote_side() << " // NOLINT
"\"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") "
"local_side=\[\" << cntl->local_side() << " // NOLINT
"\"\]\";\n"
" LOG(INFO) << \"(logid=\" << log_id << \") "
"service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT
" int err_code = svr->inference(request, response, log_id);\n"
" if (err_code != 0) {\n"
" LOG(WARNING)\n"
" << \"(logid=\" << log_id << \") Failed call "
"inferservice[$name$], name[$service$]\"\n"
" << \", error_code: \" << err_code;\n"
" cntl->SetFailed(err_code, \"InferService inference "
"failed!\");\n"
" }\n"
" gettimeofday(&tv, NULL);\n"
" long end = tv.tv_sec * 1000000 + tv.tv_usec;\n"
" // flush notice log\n"
" LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT
"start) << \"\]\";\n", // NOLINT
"name", "name",
class_name, class_name,
"service", "service",
...@@ -1492,11 +1534,6 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -1492,11 +1534,6 @@ class PdsCodeGenerator : public CodeGenerator {
const FieldDescriptor* fd = in_shared_fields[si]; const FieldDescriptor* fd = in_shared_fields[si];
std::string field_name = fd->name(); std::string field_name = fd->name();
printer->Print("\n/////$field_name$\n", "field_name", field_name); printer->Print("\n/////$field_name$\n", "field_name", field_name);
if (fd->is_optional()) {
printer->Print(
"if (req->has_$field_name$()) {\n", "field_name", field_name);
printer->Indent();
}
if (fd->cpp_type() == if (fd->cpp_type() ==
google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE || google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE ||
fd->is_repeated()) { fd->is_repeated()) {
...@@ -1509,10 +1546,6 @@ class PdsCodeGenerator : public CodeGenerator { ...@@ -1509,10 +1546,6 @@ class PdsCodeGenerator : public CodeGenerator {
"field_name", "field_name",
field_name); field_name);
} }
if (fd->is_optional()) {
printer->Outdent();
printer->Print("}\n");
}
} }
printer->Print( printer->Print(
......
...@@ -25,7 +25,7 @@ DEFINE_int32(port, 8010, ""); ...@@ -25,7 +25,7 @@ DEFINE_int32(port, 8010, "");
DEFINE_string(workflow_path, "./conf", ""); DEFINE_string(workflow_path, "./conf", "");
DEFINE_string(workflow_file, "workflow.prototxt", ""); DEFINE_string(workflow_file, "workflow.prototxt", "");
DEFINE_string(inferservice_path, "./conf", ""); DEFINE_string(inferservice_path, "./conf", "");
DEFINE_string(inferservice_file, "service.prototxt", ""); DEFINE_string(inferservice_file, "infer_service.prototxt", "");
DEFINE_string(logger_path, "./conf", ""); DEFINE_string(logger_path, "./conf", "");
DEFINE_string(logger_file, "log.conf", ""); DEFINE_string(logger_file, "log.conf", "");
DEFINE_string(resource_path, "./conf", ""); DEFINE_string(resource_path, "./conf", "");
......
FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp) FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../cube/cube-builder/src/seqfile_reader.cpp)
LIST(APPEND pdserving_srcs ${framework_srcs}) LIST(APPEND pdserving_srcs ${framework_srcs})
LIST(APPEND pclient_srcs ${framework_srcs}) LIST(APPEND pclient_srcs ${framework_srcs})
...@@ -26,9 +26,90 @@ ...@@ -26,9 +26,90 @@
#include "core/predictor/common/inner_common.h" #include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/memory.h" #include "core/predictor/framework/memory.h"
// this file is included by bsf.h
namespace im { namespace im {
namespace bsf { namespace bsf {
template <typename InItemT, typename OutItemT>
bool Task<InItemT, OutItemT>::task_fetch_init(BatchTasks<TaskT>& batchTask) {
  // Double-checked locking, to reduce the granularity of locking.
if (!fetch_init) {
if (taskmeta_num > 1) {
      // If the task was split into multiple taskmetas, a lock is required.
AutoMutex lock(task_mut);
task_fetch_create(batchTask);
} else {
      // If the task has only one taskmeta, no lock is needed.
task_fetch_create(batchTask);
}
}
return true;
}
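// A generic sketch of the double-checked initialization pattern described above,
// using std::mutex purely for illustration (the real code relies on AutoMutex and
// THREAD_MUTEX_T; the names below are assumptions).
#include <mutex>

struct LazyFetchBuffers {
  bool init_done = false;
  std::mutex mu;

  void ensure_init(bool shared_across_threads) {
    if (init_done) return;                // first, unlocked check
    if (shared_across_threads) {
      std::lock_guard<std::mutex> lock(mu);
      if (!init_done) {                   // second check under the lock
        // allocate output tensors here
        init_done = true;
      }
    } else {
      // single owner: no lock needed
      init_done = true;
    }
  }
};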
template <typename InItemT, typename OutItemT>
bool Task<InItemT, OutItemT>::task_fetch_create(BatchTasks<TaskT>& batchTask) {
if (!fetch_init) {
vector_fetch_lod_index = batchTask.vector_fetch_lod_index;
set_fetch_nobatch_index = batchTask.set_fetch_nobatch_index;
OutVectorT taskMetaOutLodTensor;
size_t fetchvar_num = batchTask._batch_out.size();
for (size_t fetchvar_index = 0; fetchvar_index < fetchvar_num;
++fetchvar_index) {
size_t fetchvar_bytesize_index =
batchTask.fetchvar_bytesize(fetchvar_index);
size_t fetchvar_batch = 0;
      // 1. nobatch fetchvar case
if (set_fetch_nobatch_index.size() > 0 &&
set_fetch_nobatch_index.find(fetchvar_index) !=
set_fetch_nobatch_index.end()) {
fetchvar_batch = 1;
} else if (vector_fetch_lod_index.size() > 0 &&
std::find(vector_fetch_lod_index.begin(),
vector_fetch_lod_index.end(),
fetchvar_index) != vector_fetch_lod_index.end()) {
        // 2. lod fetchvar case: the total shape[0] cannot be determined yet.
        // Allocate taskmeta_num temporary buffers according to the task's taskmeta count;
        // each lod fetchvar is copied into its own temporary buffer, and once all of them
        // arrive the totals are computed and the fetchvar data and lod are merged.
fetchvar_batch = 0;
} else {
        // 3. Ordinary fetchvar case: the task's total fetchvar_batch equals
        // the total input batch_size().
fetchvar_batch = batch_size();
}
paddle::PaddleTensor tensor_out;
tensor_out.name = batchTask._batch_out[fetchvar_index].name;
tensor_out.dtype =
paddle::PaddleDType(batchTask._batch_out[fetchvar_index].dtype);
tensor_out.shape = batchTask._batch_out[fetchvar_index].shape;
tensor_out.shape[0] = fetchvar_batch;
if (fetchvar_batch != 0) {
        // The lod is empty in this case.
tensor_out.lod = batchTask._batch_out[fetchvar_index].lod;
// resize all batch memory at one time
size_t databuf_size = fetchvar_batch * fetchvar_bytesize_index;
tensor_out.data.Resize(databuf_size);
} else {
        // When taskmeta_num == 1, only one taskMeta touches the task at a time, so there
        // is no thread-safety issue and taskMeta->task can resize and copy directly.
        // When the task is split into several taskMetas, temporary objects record the
        // partial results, which are merged once all of them have arrived.
if (taskmeta_num > 1) {
taskMetaOutLodTensor.push_back(tensor_out);
}
}
outVectorT_ptr->push_back(tensor_out);
}
    // outLodTensorVector is effectively a two-level vector whose shape is
    // taskmeta_num * vector_fetch_lod_index.size();
outLodTensorVector.resize(taskmeta_num, taskMetaOutLodTensor);
fetch_init = true;
}
return true;
}
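// A hedged sketch of the per-fetchvar sizing rule applied above; the helper and its
// numbers are illustrative, not Serving API:
//   nobatch  -> shape[0] = 1, buffer = 1 * bytesize
//   lod      -> shape[0] = 0, allocation deferred until all taskmetas have returned
//   ordinary -> shape[0] = batch_size(), buffer = batch_size() * bytesize
#include <cstddef>

inline size_t planned_databuf_bytes(bool is_nobatch, bool is_lod,
                                    size_t batch, size_t bytes_per_row) {
  if (is_nobatch) return bytes_per_row;      // single shared row
  if (is_lod) return 0;                      // unknown until prediction finishes
  return batch * bytes_per_row;              // e.g. 4 * (340 * 340 * 4) bytes
}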
template <typename TaskT> template <typename TaskT>
void* TaskExecutor<TaskT>::thread_entry(void* args) { void* TaskExecutor<TaskT>::thread_entry(void* args) {
ThreadContext<TaskT>* context = static_cast<ThreadContext<TaskT>*>(args); ThreadContext<TaskT>* context = static_cast<ThreadContext<TaskT>*>(args);
...@@ -134,9 +215,10 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule( ...@@ -134,9 +215,10 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
LOG(ERROR) << "Failed get TaskT from object pool"; LOG(ERROR) << "Failed get TaskT from object pool";
return TaskHandler<TaskT>::valid_handle(); return TaskHandler<TaskT>::valid_handle();
} }
task->clear();
/* /*
if (!BatchTasks<TaskT>::check_valid(in, out, _batch_align)) { if (!BatchTasks<TaskT>::check_valid(in, out, _overrun)) {
LOG(ERROR) << "Invalid input & output"; LOG(ERROR) << "Invalid input & output";
return TaskHandler<TaskT>::valid_handle(); return TaskHandler<TaskT>::valid_handle();
} }
...@@ -156,9 +238,11 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule( ...@@ -156,9 +238,11 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr; task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr;
task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr; task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr;
if (!task->task_init()) {
LOG(ERROR) << "task->init() failed";
}
task->rem = task->batch_size(); task->rem = task->batch_size();
task->index.store(0, butil::memory_order_relaxed); task->index.store(0, butil::memory_order_relaxed);
AutoMutex lock(_mut); AutoMutex lock(_mut);
_task_queue.push_back(task); _task_queue.push_back(task);
THREAD_COND_SIGNAL(&_cond); THREAD_COND_SIGNAL(&_cond);
...@@ -168,11 +252,12 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule( ...@@ -168,11 +252,12 @@ TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
// this function is accessed by multi thread. // this function is accessed by multi thread.
// so AutoMutex at first. // so AutoMutex at first.
// so batch.append_task is thread safe. // so batchTask.append_task is thread safe.
// you dont need to add extra lock in append_task() // you dont need to add extra lock in append_task()
  // The task has already been initialized at this point.
template <typename TaskT> template <typename TaskT>
bool TaskExecutor<TaskT>::move_task_to_batch( bool TaskExecutor<TaskT>::move_task_to_batch(
BatchTasks<TaskT>& batch) { // NOLINT BatchTasks<TaskT>& batchTask) { // NOLINT
AutoMutex lock(_mut); AutoMutex lock(_mut);
while (_task_queue.empty()) { while (_task_queue.empty()) {
THREAD_COND_WAIT(&_cond, &_mut); THREAD_COND_WAIT(&_cond, &_mut);
...@@ -183,15 +268,65 @@ bool TaskExecutor<TaskT>::move_task_to_batch( ...@@ -183,15 +268,65 @@ bool TaskExecutor<TaskT>::move_task_to_batch(
return false; return false;
} }
TaskT* previous_task = nullptr;
while (!_task_queue.empty()) { while (!_task_queue.empty()) {
TaskT* task = _task_queue.front(); TaskT* task = _task_queue.front();
size_t rem = batch.append_task(task);
    // It cannot be known in advance whether a fetchvar is lod (even if the input is
    // not lod, the output may still be lod).
    // The simple approach: never split a task, i.e. user requests may be merged and
    // predicted together, but a single request is never split into two smaller parts.
    // That only requires setting the engine attribute allow_split_request = false.
    // The complex approach: allow a Task to be split, whether or not lod is involved.
    // The difficulty: before prediction we know how many taskmetas the task was split
    // into, but only after prediction do we know how many fetchvars there are and how
    // many of them are lod.
    // So the task must first create taskmeta_num * (number of lod-type fetchvars)
    // temporary PaddleTensors (holding data and lod).
    // Because the scheduling unit of the worker threads is the taskmeta, these can only
    // be created in notify_task via taskmeta->task.
    // Since several taskmetas map to one task, there is multi-thread contention, so the
    // task must hold a lock.
    // Atomic operations are not enough: every thread has to wait until the PaddleTensors
    // above have been created before it can continue.
    // Ordinary fetchvars also need the lock to create the PaddleTensor before data can
    // be copied into it.
    // _overrun controls whether the asynchronous BatchTasks may temporarily exceed its
    // capacity for a single task.
    // If _overrun is true, even when only 1 batch of room is left, a whole Task is still
    // placed into the BatchTasks, temporarily exceeding the limit.
    // If _overrun is false, this is not allowed.
    // If the model itself has a maximum batch limit, keep it false (the default).
    // If the model has no such limit but you cap the BatchTasks batch yourself, setting
    // it to true can be considered.
    // _allow_split_request == true allows splitting a task: when 1 batch of room is left
    // in the BatchTasks, 1 batch is carved off the next Task.
    // _allow_split_request == false means no task is ever split and the remaining 1 batch
    // of room is wasted.
    // The default is true, so tasks are split to maximize space utilization.
if (!batchTask.get_allow_split_request()) {
if (task->batch_size() > batchTask.get_rem_size() &&
!batchTask.get_overrun()) {
break;
}
}
    // combine_task_valid decides whether two tasks can be merged.
    // Apart from the outermost dimension, the inner shapes must match for a merge;
    // otherwise we break out of the loop and the task goes into the next batchTask.
    // This guarantees that every task passed to batch.append_task(task) has the same
    // inner shape.
    // For feedvars with shape[0] == 1 that do not scale with the batch, the merge keeps
    // only one of the values, so those feedvars must be equal for the tasks to merge;
    // otherwise we break out of the loop and the task goes into the next batchTask.
    // PaddleTensor and PaddleBuf do not overload operator==, so only the raw memory can
    // be compared.
    // TODO(HexToString): AutoPadding may be supported later.
if (previous_task != nullptr) {
if (!task->combine_task_valid(previous_task)) {
break;
}
}
size_t rem = batchTask.append_task(task);
previous_task = task;
if (task->rem <= 0) { if (task->rem <= 0) {
_task_queue.pop_front(); _task_queue.pop_front();
} }
if (rem <= 0) break; if (rem <= 0) break;
} }
LOG(INFO) << "Number of tasks remaining in _task_queue is"
<< _task_queue.size();
return true; return true;
} }
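// A worked illustration of the admission rules above, under assumed numbers: with
// _batch_size = 8 and queued tasks of batch 5 and 6, allow_split_request = false and
// overrun = false admit only the first task (the second waits for the next BatchTasks),
// while allow_split_request = true would let the second contribute a 3-batch slice.
// The helper below is a sketch, not part of the framework.
#include <cstddef>

inline size_t admitted_batch(size_t task_batch, size_t rem_size,
                             bool overrun, bool allow_split) {
  if (overrun) return task_batch;                        // take the whole task regardless
  if (!allow_split && task_batch > rem_size) return 0;   // defer to the next BatchTasks
  return task_batch < rem_size ? task_batch : rem_size;  // possibly a partial slice
}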
...@@ -201,11 +336,12 @@ bool TaskExecutor<TaskT>::move_task_to_batch( ...@@ -201,11 +336,12 @@ bool TaskExecutor<TaskT>::move_task_to_batch(
// TaskT is from the SingleTon TaskExecutor`s _task_queue // TaskT is from the SingleTon TaskExecutor`s _task_queue
// although TaskMeta is a local variable, but several TaskMeta may points to // although TaskMeta is a local variable, but several TaskMeta may points to
// the same TaskT which is get from the SingleTon TaskExecutor`s _task_queue. // the same TaskT which is get from the SingleTon TaskExecutor`s _task_queue.
// put TaskMeta to the local variable BatchTasks<TaskT> batch. // put TaskMeta to the local variable BatchTasks<TaskT> batchTask.
// batch.merge_tasks() and batch.notify_tasks() has no lock. // batchTask.merge_tasks() and batchTask.notify_tasks() has no lock.
// BatchTasks<TaskT> batch itself is a local variable, it`s thread safe. // BatchTasks<TaskT> batchTask itself is a local variable, it`s thread safe.
// If batch.merge_tasks() and batch.notify_tasks() do something to TaskMeta // If batchTask.merge_tasks() and batchTask.notify_tasks() do something to
// TaskMeta
// you need to pay attention to that. // you need to pay attention to that.
// Multi-Thread deal with different TaskMeta(cause it`s created as local // Multi-Thread deal with different TaskMeta(cause it`s created as local
// variable) // variable)
...@@ -242,11 +378,23 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) { ...@@ -242,11 +378,23 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
return -1; return -1;
} }
    BatchTasks<TaskT> batch(_batch_size, _batch_align); // move_task_to_batch() takes the original task from the `_task_queue`
if (move_task_to_batch(batch)) { // put the original task into its own Vector<taskmeta>
batch.merge_tasks(); // the capacity of its own Vector<taskmeta> is decided by `_batch_size` or
_fn(&batch.in(), &batch.out()); // `_overrun`
batch.notify_tasks();
    // merge_tasks() moves the input-data into `_batch_in` from its own
// Vector<taskmeta>.
// because the predictor`s input is the `_batch_in`
    // notify_tasks() moves the output-data into every single taskmeta from
// `_batch_out`.
// because the predictor`s output is the `_batch_out`
BatchTasks<TaskT> batchTask(_batch_size, _overrun, _allow_split_request);
if (move_task_to_batch(batchTask)) {
batchTask.merge_tasks();
_fn(&batchTask.in(), &batchTask.out());
batchTask.notify_tasks();
} }
} }
......
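// Summary sketch of the reworked asynchronous flow (descriptive only):
//   schedule()           - client thread: task->clear(), task->task_init(), push onto _task_queue
//   move_task_to_batch() - worker thread: pop compatible tasks into a local BatchTasks
//   merge_tasks()        - copy every taskmeta's slice of input into _batch_in
//   _fn(in, out)         - run the predictor once on the merged batch
//   notify_tasks()       - scatter _batch_out back to each task and combine lod pieces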
...@@ -16,7 +16,9 @@ ...@@ -16,7 +16,9 @@
#include <errno.h> #include <errno.h>
#include <algorithm> #include <algorithm>
#include <cstring>
#include <list> #include <list>
#include <set>
#include <vector> #include <vector>
#ifdef BCLOUD #ifdef BCLOUD
...@@ -46,7 +48,8 @@ static const size_t DEFAULT_BATCH_SIZE = 100; ...@@ -46,7 +48,8 @@ static const size_t DEFAULT_BATCH_SIZE = 100;
// `rem` don`t need to be atomic, cause the operation `put` is synchronous. // `rem` don`t need to be atomic, cause the operation `put` is synchronous.
// actually, the reason is that lock have been added outside the operation // actually, the reason is that lock have been added outside the operation
// `put`. // `put`.
template <typename TaskT>
class BatchTasks;
// size_t `index` records how many batch have been processing completed. // size_t `index` records how many batch have been processing completed.
// `index` need to be atomic, cause the operation 'notify' is asynchronous. // `index` need to be atomic, cause the operation 'notify' is asynchronous.
template <typename InItemT, typename OutItemT> template <typename InItemT, typename OutItemT>
...@@ -56,7 +59,7 @@ struct Task { ...@@ -56,7 +59,7 @@ struct Task {
typedef InItemT InType; typedef InItemT InType;
typedef OutItemT OutType; typedef OutItemT OutType;
typedef Task<InItemT, OutItemT> TaskT; typedef Task<InItemT, OutItemT> TaskT;
typedef std::vector<int> ShapeVector; typedef std::vector<size_t> ShapeVector;
typedef std::vector<ShapeVector> VectorOfShapeVector; typedef std::vector<ShapeVector> VectorOfShapeVector;
int read_fd; int read_fd;
...@@ -65,7 +68,17 @@ struct Task { ...@@ -65,7 +68,17 @@ struct Task {
const InVectorT* inVectorT_ptr; const InVectorT* inVectorT_ptr;
OutVectorT* outVectorT_ptr; OutVectorT* outVectorT_ptr;
size_t rem; size_t rem;
size_t total_feed_batch;
std::set<size_t> set_feed_lod_index;
std::set<size_t> set_feed_nobatch_index;
std::vector<size_t> vector_fetch_lod_index;
std::set<size_t> set_fetch_nobatch_index;
butil::atomic<size_t> index; butil::atomic<size_t> index;
size_t taskmeta_num;
THREAD_MUTEX_T task_mut;
bool fetch_init;
// taskmeta_num * set_feed_lod_index.size()
std::vector<OutVectorT> outLodTensorVector;
Task() { Task() {
read_fd = -1; read_fd = -1;
...@@ -73,11 +86,57 @@ struct Task { ...@@ -73,11 +86,57 @@ struct Task {
owner_tid = -1; owner_tid = -1;
inVectorT_ptr = NULL; inVectorT_ptr = NULL;
outVectorT_ptr = NULL; outVectorT_ptr = NULL;
set_feed_lod_index.clear();
set_feed_nobatch_index.clear();
vector_fetch_lod_index.clear();
set_fetch_nobatch_index.clear();
rem = -1; rem = -1;
total_feed_batch = 0;
taskmeta_num = 0;
index.store(0, butil::memory_order_relaxed); index.store(0, butil::memory_order_relaxed);
THREAD_MUTEX_INIT(&task_mut, NULL);
fetch_init = false;
outLodTensorVector.clear();
}
~Task() {
read_fd = -1;
write_fd = -1;
owner_tid = -1;
inVectorT_ptr = NULL;
outVectorT_ptr = NULL;
set_feed_lod_index.clear();
set_feed_nobatch_index.clear();
vector_fetch_lod_index.clear();
set_fetch_nobatch_index.clear();
rem = -1;
total_feed_batch = 0;
taskmeta_num = 0;
index.store(0, butil::memory_order_relaxed);
THREAD_MUTEX_DESTROY(&task_mut);
fetch_init = false;
outLodTensorVector.clear();
} }
bool check_feedvar_valid(int feedvar_index) { void clear(){
read_fd = -1;
write_fd = -1;
owner_tid = -1;
inVectorT_ptr = NULL;
outVectorT_ptr = NULL;
set_feed_lod_index.clear();
set_feed_nobatch_index.clear();
vector_fetch_lod_index.clear();
set_fetch_nobatch_index.clear();
rem = -1;
total_feed_batch = 0;
taskmeta_num = 0;
index.store(0, butil::memory_order_relaxed);
THREAD_MUTEX_INIT(&task_mut, NULL);
fetch_init = false;
outLodTensorVector.clear();
}
bool check_feedvar_valid(size_t feedvar_index) {
if (feedvar_index < 0 || inVectorT_ptr->size() <= feedvar_index) { if (feedvar_index < 0 || inVectorT_ptr->size() <= feedvar_index) {
LOG(ERROR) << "feedvar doesnt exsit or feedvar_index error"; LOG(ERROR) << "feedvar doesnt exsit or feedvar_index error";
return 0; return 0;
...@@ -91,20 +150,47 @@ struct Task { ...@@ -91,20 +150,47 @@ struct Task {
return 1; return 1;
} }
// Now, it simply assume that the first dimension of data is batch. bool combine_task_valid(Task* other_task) {
// so the batch is PaddleTensor.shape[0] // TODO(HexToString): auto-padding
  // Apart from the outermost dimension, the inner shapes must match for a merge;
  // otherwise the caller breaks out of its loop and the task goes into the next batchTask.
  // This guarantees that every task in batch.append_task(task) has the same inner shape.
if (other_task->feedvar_shape_nobatch() != feedvar_shape_nobatch()) {
return false;
}
    // For feedvars with shape[0] == 1 that do not scale with the batch, the merge keeps
    // only one of the values, so those feedvars must be equal for the tasks to merge.
    // PaddleTensor and PaddleBuf do not overload operator==, so only the raw memory can be compared.
for (size_t feedvar_index = 0;
feedvar_index < set_feed_nobatch_index.size();
++feedvar_index) {
int result =
std::memcmp((*inVectorT_ptr)[feedvar_index].data.data(),
(*(other_task->inVectorT_ptr))[feedvar_index].data.data(),
(*inVectorT_ptr)[feedvar_index].data.length());
if (result != 0) return false;
}
return true;
}
// If batch information is added into feedvar.prototxt. size_t feedvar_batch_size(size_t feedvar_index) {
// we can get the information from the feedvar.prototxt instead of assume.
size_t feedvar_batch_size(int feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) { if (!check_feedvar_valid(feedvar_index)) {
return 0; return 0;
} }
// if lod, 'lod[0].size()-1' is batch.
// for PaddleTensor lod is vector<vector<size_t>>, so lod[0] is real lod.
// for example, lod = [0,3,4,6], shape = [6,340,340], batch is 3 actually.
// for lod, the batch < shape[0].
if ((*inVectorT_ptr)[feedvar_index].lod.size() > 0 &&
(*inVectorT_ptr)[feedvar_index].lod[0].size() > 0) {
return (*inVectorT_ptr)[feedvar_index].lod[0].size() - 1;
}
// if not lod, the first dimension of data `PaddleTensor.shape[0]` is batch.
return (*inVectorT_ptr)[feedvar_index].shape[0]; return (*inVectorT_ptr)[feedvar_index].shape[0];
} }
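  // Worked illustration of the rule above (a sketch, not used anywhere): for a lod
  // tensor with lod = [0,3,4,6] and shape = [6,340,340], the batch is
  // lod[0].size() - 1 = 3 even though shape[0] = 6; without lod the batch is shape[0].
  static size_t batch_rule_example(const std::vector<std::vector<size_t>>& lod,
                                   const std::vector<int>& shape) {
    if (!lod.empty() && !lod[0].empty()) return lod[0].size() - 1;
    return static_cast<size_t>(shape[0]);
  }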
size_t feedvar_element_bytesize(int feedvar_index) { size_t feedvar_element_bytesize(size_t feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) { if (!check_feedvar_valid(feedvar_index)) {
return 0; return 0;
} }
...@@ -126,7 +212,7 @@ struct Task { ...@@ -126,7 +212,7 @@ struct Task {
// Now, the implementation of this function is based on assumption // Now, the implementation of this function is based on assumption
// that shape [0] = batch_size. // that shape [0] = batch_size.
size_t feedvar_element_num(int feedvar_index) { size_t feedvar_element_num(size_t feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) { if (!check_feedvar_valid(feedvar_index)) {
return 0; return 0;
} }
...@@ -138,18 +224,18 @@ struct Task { ...@@ -138,18 +224,18 @@ struct Task {
return 1; return 1;
} }
// start from shape[1], cause shape[0] = batch_size. // start from shape[1], cause shape[0] = batch_size.
for (int i = 1; i < (*inVectorT_ptr)[feedvar_index].shape.size(); ++i) { for (size_t i = 1; i < (*inVectorT_ptr)[feedvar_index].shape.size(); ++i) {
element_num *= (*inVectorT_ptr)[feedvar_index].shape[i]; element_num *= (*inVectorT_ptr)[feedvar_index].shape[i];
} }
return element_num; return element_num;
} }
size_t feedvar_bytesize(int feedvar_index) { size_t feedvar_bytesize(size_t feedvar_index) {
return feedvar_element_num(feedvar_index) * return feedvar_element_num(feedvar_index) *
feedvar_element_bytesize(feedvar_index); feedvar_element_bytesize(feedvar_index);
} }
ShapeVector feedvar_shape_nobatch(int feedvar_index) { ShapeVector feedvar_shape_nobatch(size_t feedvar_index) {
if (!check_feedvar_valid(feedvar_index)) { if (!check_feedvar_valid(feedvar_index)) {
return ShapeVector(); return ShapeVector();
} }
...@@ -158,40 +244,167 @@ struct Task { ...@@ -158,40 +244,167 @@ struct Task {
} }
VectorOfShapeVector feedvar_shape_nobatch() { VectorOfShapeVector feedvar_shape_nobatch() {
VectorOfShapeVector vector_of_feedvar_shape_nobatch(inVectorT_ptr->size()); VectorOfShapeVector vector_of_feedvar_shape_nobatch;
for (int index = 0; index < inVectorT_ptr->size(); ++index) { for (size_t feedvar_index = 0; feedvar_index < inVectorT_ptr->size();
vector_of_feedvar_shape_nobatch.push_back(feedvar_shape_nobatch(index)); ++feedvar_index) {
vector_of_feedvar_shape_nobatch.push_back(
feedvar_shape_nobatch(feedvar_index));
} }
return vector_of_feedvar_shape_nobatch; return vector_of_feedvar_shape_nobatch;
} }
// At present, it is considered that the batch of all feedvar is consistent. // For each feedvar, batch should be 1 or batch_size.
// so for each feedvar, PaddleTensor.shape[0] should be the same. // if feedvar-1: batch_size = 1 (always not batch).
bool check_batch_align() { // feedvar-2: batch_size = n, batch = n.
int batch_size_align = feedvar_batch_size(0); // this function is not thread safe. only called when task is creating.
for (int feedvar_index = 0; feedvar_index < inVectorT_ptr->size(); bool task_init() {
total_feed_batch = feedvar_batch_size(0);
// which means error.
if (total_feed_batch <= 0) return false;
for (size_t feedvar_index = 0; feedvar_index < inVectorT_ptr->size();
++feedvar_index) { ++feedvar_index) {
if (feedvar_batch_size(feedvar_index) != batch_size_align) { // TODO(HexToString): Distinguish between nobatch and batch =
return 0; // 1(By:HexToString)
      // When feedvar-1 in the data has a batch dimension with batch = 1, shape[0] = 1,
      // while feedvar-2 carries no batch dimension, which also forces shape[0] = 1,
      // the two cases cannot be told apart here, so set_feed_nobatch_index may miss entries.
      // Hopefully the two can be distinguished elsewhere later.
if (feedvar_batch_size(feedvar_index) != total_feed_batch) {
// which means error.
if (feedvar_batch_size(feedvar_index) != 1 && total_feed_batch != 1) {
return false;
} else {
// which means feedvar shape[0] = 1.
// shape[0] does not change with batch
set_feed_nobatch_index.insert(feedvar_index);
total_feed_batch =
std::max(feedvar_batch_size(feedvar_index), total_feed_batch);
}
}
      // Record the index of this lod feedvar.
if ((*inVectorT_ptr)[feedvar_index].lod.size() > 0 &&
(*inVectorT_ptr)[feedvar_index].lod[0].size() > 0) {
set_feed_lod_index.insert(feedvar_index);
} }
} }
/* return true;
for(int fetchvar_index = 0; fetchvar_index < outVectorT_ptr->size(); }
++fetchvar_index) {
if(fetchvar_batch_size(fetchvar_index) != batch_size_align) { size_t batch_size() { return total_feed_batch; }
return 0;
// start_batch range is 0~batch_size, end_batch range is 1~batch_size
// start_batch should not be included, end_batch > start_batch
// return is (start_batch, end_batch] = [start_batch+1,end_batch]
// for not lod, shape0_index = [(start_batch+1)-1,end_batch-1] =
// [start_batch,end_batch-1] = [start_batch,end_batch)
// for lod, shape0_index = [lod[start_batch],lod[end_batch]-1] =
// [lod[start_batch],lod[end_batch])
// for nobatch, shape0_index = [0,1)
  // For the caller: after obtaining shape0_index, iterate
  // for (size_t myindex = shape0_index[0]; myindex < shape0_index[1]; myindex++).
  // Example: original lod = [0,3,4,6], selected batches (start_batch = 1, end_batch = 3],
  // i.e. batches 2 and 3.
  // The sliced lod is [3,4,6], which becomes [1,3] after processing.
  // Handled this way, merging lods is easy: just add the last value of the previous lod.
std::vector<std::vector<size_t>> get_feature_by_batch(size_t feedvar_index,
size_t start_batch,
size_t end_batch) {
std::vector<std::vector<size_t>> feature_vector;
    // feature_vector is a two-level vector, designed so a single pass handles all features.
    // feature_vector[0] holds the shape0_index range: two elements, the minimum and maximum.
    // feature_vector[1] holds the lod information for the selected batches.
    // feature_vector[2] is a single-element vector whose value 1 marks a nobatch feedvar.
    // The `if` branch handles nobatch feedvars,
    // the `else if` branch handles lod feedvars,
    // and the `else` branch handles ordinary non-lod feedvars.
if (set_feed_nobatch_index.size() > 0 &&
set_feed_nobatch_index.find(feedvar_index) !=
set_feed_nobatch_index.end()) {
feature_vector = {{0, 1}, {}, {1}};
} else if (set_feed_lod_index.size() > 0 &&
set_feed_lod_index.find(feedvar_index) !=
set_feed_lod_index.end()) {
std::vector<size_t> feed_lod_vector(end_batch - start_batch);
for (size_t lod_index = start_batch + 1, vector_index = 0;
lod_index < end_batch + 1;
++lod_index, ++vector_index) {
feed_lod_vector[vector_index] =
(*inVectorT_ptr)[feedvar_index].lod[0][lod_index] -
(*inVectorT_ptr)[feedvar_index].lod[0][start_batch];
} }
size_t shape0_start = (*inVectorT_ptr)[feedvar_index].lod[0][start_batch];
size_t shape0_end = (*inVectorT_ptr)[feedvar_index].lod[0][end_batch];
feature_vector = {{shape0_start, shape0_end}, feed_lod_vector};
// feature_vector.push_back(feed_lod_vector);
} else {
feature_vector = {{start_batch, end_batch}};
} }
*/ return feature_vector;
return 1;
} }
size_t batch_size() { bool combine_taskmeta() {
    if (check_batch_align()) { // Only when the fetch outputs include lod types and the task was split into multiple taskmetas
      return feedvar_batch_size(0); // does the data need to be moved from outLodTensorVector back into outVectorT_ptr.
if (vector_fetch_lod_index.size() > 0 && taskmeta_num > 1) {
for (size_t index = 0; index < vector_fetch_lod_index.size(); ++index) {
size_t data_length = 0;
size_t lod_length = 0;
size_t total_shape0 = 0;
size_t feedvar_index = vector_fetch_lod_index[index];
        // PaddleTensor's Resize clears the buffer each time, so the total length must be computed first.
for (size_t taskmeta_index = 0; taskmeta_index < taskmeta_num;
++taskmeta_index) {
data_length +=
outLodTensorVector[taskmeta_index][index].data.length();
lod_length += outLodTensorVector[taskmeta_index][index].lod[0].size();
total_shape0 += outLodTensorVector[taskmeta_index][index].shape[0];
}
        // Grow the PaddleTensor's data and lod in a single allocation.
paddle::PaddleTensor& fetchVarTensor = (*outVectorT_ptr)[feedvar_index];
fetchVarTensor.data.Resize(data_length);
        // Pad the task's lod with a leading 0.
if (fetchVarTensor.lod.size() <= 0) {
fetchVarTensor.lod.push_back({0});
} else if (fetchVarTensor.lod[0].size() <= 0) {
fetchVarTensor.lod[0].push_back(0);
}
fetchVarTensor.lod[0].resize(lod_length + 1, 0);
//
size_t data_length_offset = 0;
size_t lod_length_offset = 0;
size_t once_data_length = 0;
size_t once_lod_length = 0;
size_t last_lod_value = fetchVarTensor.lod[0][lod_length_offset];
for (size_t taskmeta_index = 0; taskmeta_index < taskmeta_num;
++taskmeta_index) {
void* dst_ptr = fetchVarTensor.data.data() + data_length_offset;
void* source_ptr =
outLodTensorVector[taskmeta_index][index].data.data();
once_data_length =
outLodTensorVector[taskmeta_index][index].data.length();
memcpy(dst_ptr, source_ptr, once_data_length);
once_lod_length =
outLodTensorVector[taskmeta_index][index].lod[0].size();
for (size_t once_index = 0; once_index < once_lod_length;
++once_index) {
fetchVarTensor.lod[0][lod_length_offset + 1] =
last_lod_value +
outLodTensorVector[taskmeta_index][index].lod[0][once_index];
}
data_length_offset += once_data_length;
lod_length_offset += once_lod_length;
}
}
} }
return 0; return true;
} }
bool task_fetch_init(BatchTasks<TaskT>& batchTask);
bool task_fetch_create(BatchTasks<TaskT>& batchTask);
}; };
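// A hedged worked example of the lod bookkeeping described in the comments above:
// two tasks of batch 2 with lods [0,2,5] and [0,2,5] merge into [0,2,5,7,10] (every
// entry of the second lod is shifted by the last value of the first), and splitting
// the merged lod back out subtracts that same offset again. merge_lod_example is an
// illustrative helper, not part of the framework.
inline std::vector<size_t> merge_lod_example(const std::vector<size_t>& a,
                                             const std::vector<size_t>& b) {
  std::vector<size_t> out = a.empty() ? std::vector<size_t>{0} : a;
  const size_t base = out.back();
  for (size_t i = 1; i < b.size(); ++i) {
    out.push_back(base + b[i]);
  }
  return out;  // merge_lod_example({0,2,5}, {0,2,5}) == {0,2,5,7,10}
}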
// `Several Task` or `part of batch in Task` can be a TaskMeta. // `Several Task` or `part of batch in Task` can be a TaskMeta.
...@@ -206,61 +419,164 @@ struct Task { ...@@ -206,61 +419,164 @@ struct Task {
// TaskMeta is necessary. // TaskMeta is necessary.
// cause we need know the the corresponding relationship between // cause we need know the the corresponding relationship between
// `batch_out`(which is in BatchTasks) and `outVectorT_ptr`(which is in Task). // `_batch_out`(which is in BatchTasks) and `outVectorT_ptr`(which is in Task).
// especially when 1 Task be divided into several TaskMeta and be put into // especially when 1 Task be divided into several TaskMeta and be put into
// several different BatchTasks. // several different BatchTasks.
// begin, add, and end refer to batches, not shape[0].
// if not lod, batch == shape[0]; if lod, batch != shape[0].
// for example, lod = [0,3,4,6], shape = [6,340,340]:
// there are actually 3 batches, so add = 3, but shape[0] = 6.
template <typename TaskT> template <typename TaskT>
struct TaskMeta { struct TaskMeta {
TaskMeta(TaskT* ptr, size_t start, size_t add) TaskMeta(TaskT* ptr, size_t start, size_t add, size_t taskmeta_index)
: task(ptr), begin(start), end(start + add) {} : task(ptr),
begin(start),
end(start + add),
taskmeta_index(taskmeta_index) {
feedvar_num = ptr->inVectorT_ptr->size();
for (size_t feedvar_index = 0; feedvar_index < feedvar_num;
++feedvar_index) {
std::vector<std::vector<size_t>> feature =
ptr->get_feature_by_batch(feedvar_index, start, start + add);
feed_shape0_range.push_back(feature[0]);
feedvar_type.push_back(feature.size());
if (feature.size() == 1) {
feed_lod_vector.push_back({});
} else if (feature.size() == 2) {
feed_lod_vector.push_back(feature[1]);
} else {
feed_lod_vector.push_back({});
}
}
}
TaskT* task; TaskT* task;
size_t begin; size_t begin;
size_t end; size_t end;
size_t feedvar_num;
size_t taskmeta_index;
std::vector<std::vector<size_t>> feed_shape0_range;
std::vector<std::vector<size_t>> feed_lod_vector;
std::vector<size_t> feedvar_type;
}; };
// each TaskT is already include batch in itself // each TaskT is already include batch in itself
// BatchTasks need to combine several `small TaskMeta` into a new `big TaskT`. // BatchTasks need to combine several `small TaskMeta` into a new `big TaskT`.
// The only difference between the `big TaskT` and `small TaskT` is that // The only difference between the `big TaskT` and `small TaskT` is that
// the TaskT.inVectorT_ptr->[feedvar_index].shape[0] // the TaskT.inVectorT_ptr->[feedvar_index].shape[0] is different
// which is actually batch_size is different. // `big TaskT`.inVectorT_ptr->[feedvar_index].shape[0] is actually batch_size .
template <typename TaskT> template <typename TaskT>
class BatchTasks { class BatchTasks {
public: public:
typedef typename TaskT::InType InType; typedef typename TaskT::InType InType;
typedef typename TaskT::OutType OutType; typedef typename TaskT::OutType OutType;
typedef TaskMeta<TaskT> TaskMetaT; typedef TaskMeta<TaskT> TaskMetaT;
typedef std::vector<size_t> ShapeVector;
typedef std::vector<ShapeVector> VectorOfShapeVector;
typedef std::vector<size_t> LodVector;
typedef std::vector<LodVector> PaddleTensorLod;
friend TaskT;
explicit BatchTasks(size_t batch_size, bool batch_align = true) explicit BatchTasks(size_t batch_size,
bool overrun = false,
bool allow_split_request = true)
: _batch_size(batch_size), : _batch_size(batch_size),
_rem_size(batch_size), _rem_size(batch_size),
_batch_align(batch_align) { _overrun(overrun),
_allow_split_request(allow_split_request) {
_batch_in.clear(); _batch_in.clear();
_batch_in_offset.clear(); _batch_in_offset.clear();
_total_shape0_batch_in.clear();
_total_feed_batch = 0;
_batch_in_lod.clear();
_batch_out.clear(); _batch_out.clear();
_batch_out_offset.clear(); _batch_out_offset.clear();
_total_fetch_batch = 0;
_taskmeta_vector.clear(); _taskmeta_vector.clear();
set_fetch_nobatch_index.clear();
vector_fetch_lod_index.clear();
} }
~BatchTasks() { ~BatchTasks() {
_batch_in.clear(); _batch_in.clear();
_batch_in_offset.clear(); _batch_in_offset.clear();
_total_shape0_batch_in.clear();
_total_feed_batch = 0;
_batch_in_lod.clear();
_batch_out.clear(); _batch_out.clear();
_batch_out_offset.clear(); _batch_out_offset.clear();
_total_fetch_batch = 0;
_taskmeta_vector.clear(); _taskmeta_vector.clear();
set_fetch_nobatch_index.clear();
vector_fetch_lod_index.clear();
} }
// synchronized operation // synchronized operation
// because Upper level callers of this function have already locked. // because Upper level callers of this function have already locked.
  // Every task that reaches this function is of the same kind; that is guaranteed before the call.
size_t append_task(TaskT* task) { size_t append_task(TaskT* task) {
size_t add = std::min(task->rem, _rem_size); size_t add = std::min(task->rem, _rem_size);
if (!_batch_align) { // when _overrun == true, it means always take a whole task as TaskMeta
    // we can temporarily break through the limit of BatchTask`s capacity
// BatchTask`s capacity is _batch_size or _rem_size
if (_overrun) {
add = task->rem; add = task->rem;
} }
int start_index = task->batch_size() - task->rem; int start_index = task->batch_size() - task->rem;
TaskMetaT tm(task, start_index, add); TaskMetaT tm(task, start_index, add, task->taskmeta_num);
task->taskmeta_num += 1;
_taskmeta_vector.push_back(tm); _taskmeta_vector.push_back(tm);
if (_batch_in_offset.size() == 0) {
_batch_in_offset.resize(tm.feedvar_num, 0);
}
if (_total_shape0_batch_in.size() == 0) {
_total_shape0_batch_in.resize(tm.feedvar_num, 0);
}
if (_batch_in_lod.size() == 0) {
PaddleTensorLod null_lod;
_batch_in_lod.resize(tm.feedvar_num, null_lod);
}
_total_feed_batch += add;
for (size_t feedvar_index = 0; feedvar_index < tm.feedvar_num;
++feedvar_index) {
if (tm.feedvar_type[feedvar_index] == 1) {
        // Ordinary non-lod feedvar.
        // Accumulate the shape[0] total in preparation for initializing the PaddleTensor later.
_total_shape0_batch_in[feedvar_index] +=
tm.feed_shape0_range[feedvar_index][1] -
tm.feed_shape0_range[feedvar_index][0];
} else if (tm.feedvar_type[feedvar_index] == 2) {
        // Lod-type feedvar.
        // Accumulate the shape[0] total in preparation for initializing the PaddleTensor later.
_total_shape0_batch_in[feedvar_index] +=
tm.feed_shape0_range[feedvar_index][1] -
tm.feed_shape0_range[feedvar_index][0];
        // Prepend a 0 at the front of the lod.
if (_batch_in_lod[feedvar_index].size() <= 0) {
_batch_in_lod[feedvar_index].push_back({0});
} else if (_batch_in_lod[feedvar_index][0].size() <= 0) {
_batch_in_lod[feedvar_index][0].push_back(0);
}
        // Shift this lod by the last value of the previous group's lod and append, composing the combined lod.
size_t last_lod_value = _batch_in_lod[feedvar_index][0].back();
for (size_t lod_index = 0;
lod_index < tm.feed_lod_vector[feedvar_index].size();
++lod_index) {
_batch_in_lod[feedvar_index][0].push_back(
last_lod_value + tm.feed_lod_vector[feedvar_index][lod_index]);
}
} else {
// tm.feedvar_type[feedvar_index] == 3
        // Nobatch-type feedvar.
        // No accumulation here; the value should be 1.
_total_shape0_batch_in[feedvar_index] =
tm.feed_shape0_range[feedvar_index][1] -
tm.feed_shape0_range[feedvar_index][0];
}
}
task->rem -= add; task->rem -= add;
_rem_size -= add; _rem_size -= add;
return _rem_size; return _rem_size;
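  // Sketch of how _total_shape0_batch_in grows per feedvar type in append_task above
  // (illustrative helper only): plain (type 1) and lod (type 2) feedvars accumulate the
  // rows of each slice, while nobatch (type 3) feedvars keep the single shared row count.
  static size_t shape0_accumulate_example(size_t current, size_t slice_rows,
                                          size_t feedvar_type) {
    return (feedvar_type == 3) ? slice_rows : current + slice_rows;
  }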
...@@ -281,72 +597,56 @@ class BatchTasks { ...@@ -281,72 +597,56 @@ class BatchTasks {
// cause maybe next time we don`t need to do the extra copy. // cause maybe next time we don`t need to do the extra copy.
// directly copy the every Task into the Predictor. // directly copy the every Task into the Predictor.
// lod is not supported.
// if lod is set, we should not allow to use the bsf task.
// batch.merge_tasks() is thread-safe function // batch.merge_tasks() is thread-safe function
// cause batch is a local variable and Task is just read, not written. // cause batch is a local variable and Task is just read, not written.
void merge_tasks() { void merge_tasks() {
if (_taskmeta_vector.size() <= 0) { if (_taskmeta_vector.size() <= 0) {
return; return;
} }
// Temporarily, the batch of each feedvar is consistent
// If not consistent, use feedvar_batch_size instead of task->batch_size().
int temp_batch = 0;
for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
TaskMetaT& tm = _taskmeta_vector[ti];
temp_batch += tm.task->batch_size();
}
if (temp_batch > _batch_size) {
LOG(ERROR) << "_realNumber_batch_in >_batch_size, error.";
return;
}
int feedvar_num = _taskmeta_vector[0].task->inVectorT_ptr->size();
if (_batch_in_offset.size() == 0) {
_batch_in_offset.resize(feedvar_num, 0);
_realNumber_batch_in.resize(feedvar_num, temp_batch);
}
for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) { for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) {
TaskMetaT& tm = _taskmeta_vector[ti]; TaskMetaT& tm = _taskmeta_vector[ti];
for (int index = 0; index < feedvar_num; ++index) { for (size_t feedvar_index = 0; feedvar_index < tm.feedvar_num;
++feedvar_index) {
const paddle::PaddleTensor& feedVarTensor = const paddle::PaddleTensor& feedVarTensor =
(*tm.task->inVectorT_ptr)[index]; (*tm.task->inVectorT_ptr)[feedvar_index];
size_t feedvar_bytesize = tm.task->feedvar_bytesize(index); size_t feedvar_bytesize = tm.task->feedvar_bytesize(feedvar_index);
if (ti == 0) { if (ti == 0) {
if (feedVarTensor.lod.size() > 0 && feedVarTensor.lod[0].size() > 0) { // Create the entire tensor at once
LOG(ERROR) << "lod Tensor is not supported now.";
return;
}
// for now, we assume that every task feedvar_bytesize is the same. // for now, we assume that every task feedvar_bytesize is the same.
// which means we dont support auto embedding. // which means we dont support auto embedding.
// but for different feedvar, it is different. // but for different feedvar, it is different.
paddle::PaddleTensor paddleTensor; paddle::PaddleTensor paddleTensor;
paddleTensor.dtype = feedVarTensor.dtype; paddleTensor.dtype = feedVarTensor.dtype;
paddleTensor.name = feedVarTensor.name; paddleTensor.name = feedVarTensor.name;
paddleTensor.lod = feedVarTensor.lod; paddleTensor.lod = _batch_in_lod[feedvar_index];
paddleTensor.shape = feedVarTensor.shape; paddleTensor.shape = feedVarTensor.shape;
paddleTensor.shape[0] = _realNumber_batch_in[index]; paddleTensor.shape[0] = _total_shape0_batch_in[feedvar_index];
paddleTensor.data.Resize(feedvar_bytesize * paddleTensor.data.Resize(feedvar_bytesize *
_realNumber_batch_in[index]); _total_shape0_batch_in[feedvar_index]);
_batch_in.push_back(paddleTensor); _batch_in.push_back(paddleTensor);
} }
void* dst_ptr = _batch_in[index].data.data() + _batch_in_offset[index]; void* dst_ptr = _batch_in[feedvar_index].data.data() +
_batch_in_offset[feedvar_index];
void* source_ptr = void* source_ptr =
feedVarTensor.data.data() + feedvar_bytesize * tm.begin; feedVarTensor.data.data() +
size_t length = feedvar_bytesize * (tm.end - tm.begin); feedvar_bytesize * tm.feed_shape0_range[feedvar_index][0];
size_t length =
feedvar_bytesize * (tm.feed_shape0_range[feedvar_index][1] -
tm.feed_shape0_range[feedvar_index][0]);
memcpy(dst_ptr, source_ptr, length); memcpy(dst_ptr, source_ptr, length);
        _batch_in_offset[index] += length; // Nobatch-type feedvars do not accumulate the offset.
if (tm.feedvar_type[feedvar_index] != 3)
_batch_in_offset[feedvar_index] += length;
} }
} }
} }
bool check_fetchvar_valid(int fetchvar_index) { bool check_fetchvar_valid(size_t fetchvar_index) {
if (fetchvar_index < 0 || _batch_out.size() <= fetchvar_index) { if (fetchvar_index < 0 || _batch_out.size() <= fetchvar_index) {
LOG(ERROR) << "fetchvar doesnt exsit or fetchvar_index error"; LOG(ERROR) << "fetchvar doesnt exsit or fetchvar_index error";
return 0; return 0;
...@@ -360,19 +660,11 @@ class BatchTasks { ...@@ -360,19 +660,11 @@ class BatchTasks {
return 1; return 1;
} }
size_t fetchvar_batch_size(int fetchvar_index) { size_t fetchvar_element_bytesize(size_t fetchvar_index) {
if (!check_fetchvar_valid(fetchvar_index)) {
return 0;
}
return _batch_out[fetchvar_index].shape[0];
}
size_t fetchvar_element_bytesize(int fetchvar_index) {
if (!check_fetchvar_valid(fetchvar_index)) { if (!check_fetchvar_valid(fetchvar_index)) {
return 0; return 0;
} }
int dtype = _batch_out[fetchvar_index].dtype; size_t dtype = _batch_out[fetchvar_index].dtype;
if (dtype == paddle::PaddleDType::INT64) { if (dtype == paddle::PaddleDType::INT64) {
return sizeof(int64_t); return sizeof(int64_t);
} }
...@@ -390,7 +682,7 @@ class BatchTasks { ...@@ -390,7 +682,7 @@ class BatchTasks {
// Now, the implementation of this function is based on assumption // Now, the implementation of this function is based on assumption
// that shape [0] = batch_size. // that shape [0] = batch_size.
size_t fetchvar_element_num(int fetchvar_index) { size_t fetchvar_element_num(size_t fetchvar_index) {
if (!check_fetchvar_valid(fetchvar_index)) { if (!check_fetchvar_valid(fetchvar_index)) {
return 0; return 0;
} }
...@@ -400,35 +692,66 @@ class BatchTasks { ...@@ -400,35 +692,66 @@ class BatchTasks {
return 1; return 1;
} }
// start from shape[1], cause shape[0] = batch_size. // start from shape[1], cause shape[0] = batch_size.
for (int i = 1; i < _batch_out[fetchvar_index].shape.size(); ++i) { for (size_t i = 1; i < _batch_out[fetchvar_index].shape.size(); ++i) {
element_num *= _batch_out[fetchvar_index].shape[i]; element_num *= _batch_out[fetchvar_index].shape[i];
} }
return element_num; return element_num;
} }
size_t fetchvar_bytesize(int fetchvar_index) { size_t fetchvar_bytesize(size_t fetchvar_index) {
return fetchvar_element_num(fetchvar_index) * return fetchvar_element_num(fetchvar_index) *
fetchvar_element_bytesize(fetchvar_index); fetchvar_element_bytesize(fetchvar_index);
} }
bool check_fetchvar_batch_align() { size_t fetchvar_batch_size(size_t fetchvar_index) {
int batch_size_align = fetchvar_batch_size(0); if (!check_fetchvar_valid(fetchvar_index)) {
return 0;
for (int fetchvar_index = 0; fetchvar_index < _batch_out.size();
++fetchvar_index) {
if (fetchvar_batch_size(fetchvar_index) != batch_size_align) {
return 0;
}
} }
// if lod, 'lod[0].size()-1' is batch.
return 1; // for PaddleTensor lod is vector<vector<size_t>>, so lod[0] is real lod.
// for example, lod = [0,3,4,6], shape = [6,340,340], batch is 3 actually.
// for lod, the batch < shape[0].
if (_batch_out[fetchvar_index].lod.size() > 0 &&
_batch_out[fetchvar_index].lod[0].size() > 0) {
return _batch_out[fetchvar_index].lod[0].size() - 1;
}
// if not lod, the first dimension of data `PaddleTensor.shape[0]` is batch.
return _batch_out[fetchvar_index].shape[0];
} }
size_t fetchvar_batch_size() { size_t fetchvar_batch_size() { return _total_fetch_batch; }
if (check_fetchvar_batch_align()) {
return fetchvar_batch_size(0); bool deal_batch_out() {
_total_fetch_batch = fetchvar_batch_size(0);
if (_total_fetch_batch <= 0) return false;
for (size_t fetchvar_index = 0; fetchvar_index < _batch_out.size();
++fetchvar_index) {
// TODO(HexToString): Distinguish between nobatch and batch =
// 1(By:HexToString)
      // When fetchvar-1 in the data has a batch dimension with batch = 1, shape[0] = 1,
      // while fetchvar-2 carries no batch dimension, which also forces shape[0] = 1,
      // the two cases cannot be told apart here, so set_fetch_nobatch_index may miss entries.
      // Hopefully the two can be distinguished elsewhere later.
if (fetchvar_batch_size(fetchvar_index) != _total_fetch_batch) {
// which means error.
if (fetchvar_batch_size(fetchvar_index) != 1 &&
_total_fetch_batch != 1) {
return false;
} else {
// which means fetchvar shape[0] = 1.
// shape[0] does not change with batch
set_fetch_nobatch_index.insert(fetchvar_index);
_total_fetch_batch =
std::max(fetchvar_batch_size(fetchvar_index), _total_fetch_batch);
}
}
      // Record the index of this lod fetchvar.
if (_batch_out[fetchvar_index].lod.size() > 0 &&
_batch_out[fetchvar_index].lod[0].size() > 0) {
vector_fetch_lod_index.push_back(fetchvar_index);
}
} }
return 0; return true;
} }
void notify_tasks() { void notify_tasks() {
...@@ -436,12 +759,16 @@ class BatchTasks { ...@@ -436,12 +759,16 @@ class BatchTasks {
LOG(ERROR) << "_taskmeta_vector.size() <=0, error."; LOG(ERROR) << "_taskmeta_vector.size() <=0, error.";
return; return;
} }
    if (_realNumber_batch_in[0] != fetchvar_batch_size()) { // Derive the overall output batch from _batch_out,
      // and record the indices of lod-type and nobatch-type fetchvars in sets for later lookup.
deal_batch_out();
      // If the output batch is neither 1 nor equal to the input batch, it is an error.
if (_total_feed_batch != _total_fetch_batch && _total_fetch_batch != 1) {
LOG(ERROR) << "_batch_out`s batch != _batch_in`s batch, error."; LOG(ERROR) << "_batch_out`s batch != _batch_in`s batch, error.";
return; return;
} }
int fetchvar_num = _batch_out.size(); size_t fetchvar_num = _batch_out.size();
if (_batch_out_offset.size() == 0) { if (_batch_out_offset.size() == 0) {
_batch_out_offset.resize(fetchvar_num, 0); _batch_out_offset.resize(fetchvar_num, 0);
} }
...@@ -451,44 +778,132 @@ class BatchTasks { ...@@ -451,44 +778,132 @@ class BatchTasks {
size_t begin = _taskmeta_vector[ti].begin; size_t begin = _taskmeta_vector[ti].begin;
size_t end = _taskmeta_vector[ti].end; size_t end = _taskmeta_vector[ti].end;
size_t add = end - begin; size_t add = end - begin;
size_t taskmeta_index = _taskmeta_vector[ti].taskmeta_index;
      for (int index = 0; index < fetchvar_num; ++index) { // Initialize the task's outVectorT_ptr.
        // the task->outVectorT_ptr is null before core->run(). // For lod outputs split across multiple taskmetas, outLodTensorVector must also be initialized.
// first time we should copy from _batch_out if (!task->task_fetch_init(*this)) {
// so we need init. LOG(ERROR) << " task_fetch_init error.";
size_t fetchvar_bytesize_index = fetchvar_bytesize(index); return;
if (task->outVectorT_ptr->size() <= index) { }
paddle::PaddleTensor tensor_out; size_t fetch_lod_index = 0;
tensor_out.name = _batch_out[index].name;
tensor_out.dtype = paddle::PaddleDType(_batch_out[index].dtype); for (size_t fetchvar_index = 0; fetchvar_index < fetchvar_num;
tensor_out.shape = _batch_out[index].shape; ++fetchvar_index) {
tensor_out.shape[0] = task->batch_size(); size_t fetchvar_bytesize_index = fetchvar_bytesize(fetchvar_index);
tensor_out.lod = _batch_out[index].lod;
// resize all batch memory at one time if (set_fetch_nobatch_index.size() > 0 &&
size_t databuf_size = task->batch_size() * fetchvar_bytesize_index; set_fetch_nobatch_index.find(fetchvar_index) !=
tensor_out.data.Resize(databuf_size); set_fetch_nobatch_index.end()) {
          task->outVectorT_ptr->push_back(tensor_out); // Nobatch fetchvar case:
        } // regardless of the input batch, this fetchvar always has shape[0] = 1.
paddle::PaddleTensor& fetchVarTensor =
paddle::PaddleTensor& fetchVarTensor = (*task->outVectorT_ptr)[index]; (*task->outVectorT_ptr)[fetchvar_index];
void* dst_ptr = fetchVarTensor.data.data();
void* dst_ptr = size_t length = fetchvar_bytesize_index * 1;
fetchVarTensor.data.data() + fetchvar_bytesize_index * begin; void* source_ptr = _batch_out[fetchvar_index].data.data();
size_t length = fetchvar_bytesize_index * add; memcpy(dst_ptr, source_ptr, length);
if (_batch_out_offset[index] + length > } else if (vector_fetch_lod_index.size() > 0 &&
fetchvar_batch_size() * fetchvar_bytesize(index)) { std::find(vector_fetch_lod_index.begin(),
LOG(ERROR) << "_batch_out is less than taskmeta, error."; vector_fetch_lod_index.end(),
return; fetchvar_index) != vector_fetch_lod_index.end()) {
          // lod fetchvar case: the total shape[0] cannot be determined yet.
          // Allocate taskmeta_num temporary buffers according to the task's taskmeta count;
          // each lod fetchvar is copied into its own temporary buffer, and once all of
          // them arrive the totals are computed and the fetchvar data and lod are merged.
size_t last_batch = _batch_out_offset[fetchvar_index];
size_t shape0_index_start =
_batch_out[fetchvar_index].lod[0][last_batch];
size_t shape0_index_end =
_batch_out[fetchvar_index].lod[0][last_batch + add];
size_t shape0_length = shape0_index_end - shape0_index_start;
          // When the task is split into several taskmetas, the data cannot be copied
          // directly into task->outVectorT_ptr; it first goes into
          // task->outLodTensorVector[taskmeta_index], and once every taskmeta of the task
          // is done it is copied back into task->outVectorT_ptr in order.
if (task->taskmeta_num > 1) {
paddle::PaddleTensor& fetchVarTensor =
task->outLodTensorVector[taskmeta_index][fetch_lod_index];
size_t length = fetchvar_bytesize_index * shape0_length;
fetchVarTensor.shape[0] = shape0_length;
fetchVarTensor.data.Resize(length);
void* dst_ptr = fetchVarTensor.data.data();
void* source_ptr = _batch_out[fetchvar_index].data.data() +
shape0_index_start * fetchvar_bytesize_index;
memcpy(dst_ptr, source_ptr, length);
            // These are the per-taskmeta lod slices; do not pad with the leading 0 here, it is added when merging back into the Task's outVectorT_ptr.
if (fetchVarTensor.lod.size() <= 0) {
fetchVarTensor.lod.push_back({});
}
fetchVarTensor.lod[0].resize(add, 0);
size_t last_lod_value =
_batch_out[fetchvar_index].lod[0][last_batch];
for (size_t lod_index = last_batch + 1, my_index = 0;
lod_index < last_batch + add + 1;
++lod_index, ++my_index) {
fetchVarTensor.lod[0][my_index] =
(_batch_out[fetchvar_index].lod[0][lod_index] -
last_lod_value);
}
} else {
          // The task was not split into several taskmetas, so only one thread's taskmeta
          // touches it and there is no contention; after resizing, the data can be written
          // directly into task->outVectorT_ptr.
paddle::PaddleTensor& fetchVarTensor =
(*task->outVectorT_ptr)[fetchvar_index];
size_t length = fetchvar_bytesize_index * shape0_length;
fetchVarTensor.shape[0] = shape0_length;
fetchVarTensor.data.Resize(length);
void* dst_ptr = fetchVarTensor.data.data();
void* source_ptr = _batch_out[fetchvar_index].data.data() +
shape0_index_start * fetchvar_bytesize_index;
memcpy(dst_ptr, source_ptr, length);
            // Pad the task's lod with a leading 0.
if (fetchVarTensor.lod.size() <= 0) {
fetchVarTensor.lod.push_back({0});
} else if (fetchVarTensor.lod[0].size() <= 0) {
fetchVarTensor.lod[0].push_back(0);
}
            // Split the merged lod back into the batches belonging to this task.
            // Note that the offset accumulated by the preceding lods must be removed.
            // Example: the merged lod [0,2,5;7,10] was predicted from two tasks of batch 2.
            // When splitting, the first group subtracts 0, giving [2,5], which together
            // with the 0 already padded in front becomes [0,2,5].
            // The second group must subtract 5, also giving [2,5]; only then is it correct.
fetchVarTensor.lod[0].resize(add + 1, 0);
size_t last_lod_value =
_batch_out[fetchvar_index].lod[0][last_batch];
for (size_t lod_index = last_batch + 1, my_index = 1;
lod_index < last_batch + add + 1;
++lod_index, ++my_index) {
fetchVarTensor.lod[0][my_index] =
(_batch_out[fetchvar_index].lod[0][lod_index] -
last_lod_value);
}
}
fetch_lod_index++;
} else {
          // Ordinary fetchvar case: the task's total fetchvar_batch equals the total
          // input batch_size(), and the output batch corresponds one-to-one to the
          // input batch.
paddle::PaddleTensor& fetchVarTensor =
(*task->outVectorT_ptr)[fetchvar_index];
void* dst_ptr =
fetchVarTensor.data.data() + fetchvar_bytesize_index * begin;
size_t length = fetchvar_bytesize_index * add;
void* source_ptr =
_batch_out[fetchvar_index].data.data() +
_batch_out_offset[fetchvar_index] * fetchvar_bytesize_index;
memcpy(dst_ptr, source_ptr, length);
} }
void* source_ptr = _batch_out_offset[fetchvar_index] += add;
_batch_out[index].data.data() + _batch_out_offset[index];
memcpy(dst_ptr, source_ptr, length);
_batch_out_offset[index] += length;
} }
// 'index' is a local variable; fetch_add is atomic and returns the previous
// value on success. Only after the last taskmeta has finished can this
// thread's index + add reach task->batch_size(), so exactly one thread can
// enter the if block and there is no multi-thread race.
size_t index = task->index.fetch_add(add); size_t index = task->index.fetch_add(add);
if ((index + add) >= task->batch_size()) { if ((index + add) >= task->batch_size()) {
task->combine_taskmeta();
char c = 0; char c = 0;
while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) { while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) {
} }
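The comments above walk through how a merged lod is re-based when a task has been split into several taskmetas. The following standalone sketch (not part of the framework; `split_lod` and the sample values are hypothetical and simply mirror the [0,2,5,7,10] example from the comments) reproduces that arithmetic:

```
#include <cstddef>
#include <iostream>
#include <vector>

// Rebase `count` consecutive lod intervals starting at position `first` of a
// merged lod so that they start from 0 again, re-adding the leading 0.
std::vector<std::size_t> split_lod(const std::vector<std::size_t>& merged,
                                   std::size_t first, std::size_t count) {
  std::vector<std::size_t> out(count + 1, 0);
  std::size_t base = merged[first];
  for (std::size_t i = 1; i <= count; ++i) {
    out[i] = merged[first + i] - base;
  }
  return out;
}

int main() {
  // Merged lod produced by two batch=2 sub-tasks, as in the comment above.
  std::vector<std::size_t> merged = {0, 2, 5, 7, 10};
  std::vector<std::size_t> first_task = split_lod(merged, 0, 2);   // {0, 2, 5}
  std::vector<std::size_t> second_task = split_lod(merged, 2, 2);  // {0, 2, 5}
  std::cout << first_task[2] << " " << second_task[2] << std::endl;  // "5 5"
  return 0;
}
```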
...@@ -503,17 +918,32 @@ class BatchTasks { ...@@ -503,17 +918,32 @@ class BatchTasks {
size_t task_size() { return _taskmeta_vector.size(); } size_t task_size() { return _taskmeta_vector.size(); }
const size_t get_rem_size() { return _rem_size; }
bool get_overrun() { return _overrun; }
bool get_allow_split_request() { return _allow_split_request; }
private: private:
std::vector<TaskMetaT> _taskmeta_vector; std::vector<TaskMetaT> _taskmeta_vector;
typename TaskT::InVectorT _batch_in; typename TaskT::InVectorT _batch_in;
std::vector<size_t> _batch_in_offset; std::vector<size_t> _batch_in_offset;
std::vector<size_t> _realNumber_batch_in; std::vector<size_t> _total_shape0_batch_in;
size_t _total_feed_batch;
std::vector<PaddleTensorLod> _batch_in_lod;
typename TaskT::OutVectorT _batch_out; typename TaskT::OutVectorT _batch_out;
std::vector<size_t> _batch_out_offset; std::vector<size_t> _batch_out_offset;
std::vector<size_t> _realNumber_batch_out; // std::vector<size_t> _total_shape0_batch_out;
size_t _total_fetch_batch;
// std::vector<PaddleTensorLod> _batch_out_lod;
std::set<size_t> set_fetch_nobatch_index;
std::vector<size_t> vector_fetch_lod_index;
size_t _rem_size; size_t _rem_size;
size_t _batch_size; size_t _batch_size;
bool _batch_align; bool _overrun;
bool _allow_split_request;
}; };
// BSF task handle // BSF task handle
...@@ -589,6 +1019,8 @@ class TaskExecutor { ...@@ -589,6 +1019,8 @@ class TaskExecutor {
typedef typename TaskT::OutVectorT OutVectorT; typedef typename TaskT::OutVectorT OutVectorT;
typedef std::vector<TaskT> TaskArrayT; typedef std::vector<TaskT> TaskArrayT;
typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper; typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper;
typedef std::vector<size_t> ShapeVector;
typedef std::vector<ShapeVector> VectorOfShapeVector;
TaskExecutor() TaskExecutor()
: _stop(false), : _stop(false),
...@@ -596,7 +1028,7 @@ class TaskExecutor { ...@@ -596,7 +1028,7 @@ class TaskExecutor {
_thread_reset_fn(NULL), _thread_reset_fn(NULL),
_user_thread_contexts(NULL), _user_thread_contexts(NULL),
_batch_size(DEFAULT_BATCH_SIZE), _batch_size(DEFAULT_BATCH_SIZE),
_batch_align(false), _overrun(false),
_fn(NULL) { _fn(NULL) {
THREAD_MUTEX_INIT(&_mut, NULL); THREAD_MUTEX_INIT(&_mut, NULL);
THREAD_COND_INIT(&_cond, NULL); THREAD_COND_INIT(&_cond, NULL);
...@@ -617,7 +1049,11 @@ class TaskExecutor { ...@@ -617,7 +1049,11 @@ class TaskExecutor {
void set_batch_size(size_t batch_size) { _batch_size = batch_size; } void set_batch_size(size_t batch_size) { _batch_size = batch_size; }
void set_batch_align(size_t batch_align) { _batch_align = batch_align; } void set_overrun(bool overrun) { _overrun = overrun; }
void set_allow_split_request(bool allow_split_request) {
_allow_split_request = allow_split_request;
}
void set_thread_init_fn(boost::function<int(void*)> init_fn, void set_thread_init_fn(boost::function<int(void*)> init_fn,
void** contexts = NULL) { void** contexts = NULL) {
...@@ -642,7 +1078,7 @@ class TaskExecutor { ...@@ -642,7 +1078,7 @@ class TaskExecutor {
TaskHandler<TaskT> schedule(const void*, void*); TaskHandler<TaskT> schedule(const void*, void*);
bool move_task_to_batch(BatchTasks<TaskT>& batch); // NOLINT bool move_task_to_batch(BatchTasks<TaskT>& batchTask); // NOLINT
private: private:
TaskExecutor(TaskExecutor<TaskT> const& other) = delete; TaskExecutor(TaskExecutor<TaskT> const& other) = delete;
...@@ -669,7 +1105,8 @@ class TaskExecutor { ...@@ -669,7 +1105,8 @@ class TaskExecutor {
std::vector<ThreadContext<TaskT>*> _thread_contexts; std::vector<ThreadContext<TaskT>*> _thread_contexts;
size_t _batch_size; size_t _batch_size;
bool _batch_align; bool _overrun;
bool _allow_split_request;
boost::function<void(const void*, void*)> _fn; boost::function<void(const void*, void*)> _fn;
}; };
...@@ -687,12 +1124,12 @@ class TaskExecutorVector { ...@@ -687,12 +1124,12 @@ class TaskExecutorVector {
void resize(int size) { _vector_executor.resize(size); } void resize(int size) { _vector_executor.resize(size); }
TaskExecutor<TaskT>& operator[](int index) { TaskExecutor<TaskT>& operator[](int task_index) {
if (_vector_executor.size() <= index || index <= -1) { if (_vector_executor.size() <= task_index || task_index <= -1) {
LOG(ERROR) << "_vector_executor.size() <= index or <= -1"; LOG(ERROR) << "_vector_executor.size() <= task_index or <= -1";
throw "_vector_executor.size() <= index or <= -1"; throw "_vector_executor.size() <= task_index or <= -1";
} }
return _vector_executor[index]; return _vector_executor[task_index];
} }
private: private:
...@@ -717,8 +1154,8 @@ class TaskManager { ...@@ -717,8 +1154,8 @@ class TaskManager {
typedef typename TaskT::InVectorT InVectorT; typedef typename TaskT::InVectorT InVectorT;
typedef typename TaskT::OutVectorT OutVectorT; typedef typename TaskT::OutVectorT OutVectorT;
explicit TaskManager(uint32_t index) // NOLINT explicit TaskManager(uint32_t model_index) // NOLINT
: _model_index(index) {} : _model_index(model_index) {}
~TaskManager() { wait(); } ~TaskManager() { wait(); }
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#include "core/predictor/framework/cache.h"
#include <dirent.h>
#include <sys/stat.h>
#include <fstream>
#include <string>
#include <utility>
#include "core/cube/cube-builder/include/cube-builder/seqfile_reader.h"
namespace baidu {
namespace paddle_serving {
namespace predictor {
int CubeCache::clear() {
for (auto it = _map_cache.begin(); it != _map_cache.end(); ++it) {
if (it->second) {
delete (it->second);
it->second = nullptr;
}
}
_map_cache.clear();
return 0;
}
rec::mcube::CubeValue* CubeCache::get_data(uint64_t key) {
auto it = _map_cache.find(key);
if (it != _map_cache.end()) {
return it->second;
}
return nullptr;
}
int CubeCache::reload_data(const std::string& cache_path) {
LOG(INFO) << "cube cache is loading data, path: " << cache_path;
DIR* dp = nullptr;
struct dirent* dirp = nullptr;
struct stat st;
// clear cache data
clear();
// loading data from cache files
if (stat(cache_path.c_str(), &st) < 0 || !S_ISDIR(st.st_mode)) {
LOG(ERROR) << "invalid cache path " << cache_path;
return -1;
}
if ((dp = opendir(cache_path.c_str())) == nullptr) {
LOG(ERROR) << "opendir " << cache_path << " fail.";
return -1;
}
while ((dirp = readdir(dp)) != nullptr) {
// filtering by file type.
if (dirp->d_type != DT_REG) {
continue;
}
// Filter upper-level directories and hidden files
if ((!strncmp(dirp->d_name, ".", 1)) || (!strncmp(dirp->d_name, "..", 2))) {
continue;
}
// Match files whose names contain 'part-'
if (std::string(dirp->d_name).find("part-") != std::string::npos) {
SequenceFileRecordReader reader(cache_path + "/" + dirp->d_name);
if (reader.open() != 0) {
LOG(ERROR) << "open file failed! " << dirp->d_name;
continue;
}
if (reader.read_header() != 0) {
LOG(ERROR) << "read header error! " << dirp->d_name;
reader.close();
continue;
}
Record record(reader.get_header());
while (reader.next(&record) == 0) {
uint64_t key =
*reinterpret_cast<uint64_t*>(const_cast<char*>(record.key.data()));
auto it_find = _map_cache.find(key);
if (it_find != _map_cache.end()) {
// load duplicate key
LOG(WARNING) << "Load duplicate key:" << key
<< " from file:" << dirp->d_name;
continue;
}
rec::mcube::CubeValue* new_value = new rec::mcube::CubeValue();
new_value->error = 0;
new_value->buff.swap(record.value);
_map_cache.insert(std::make_pair(key, new_value));
}
LOG(WARNING) << "Load cube cache file " << dirp->d_name << " done.";
}
LOG(WARNING) << "Load all cube cache files done";
}
return 0;
}
} // namespace predictor
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sys/types.h>
#include <numeric>
#include <string>
#include <unordered_map>
#include "core/cube/cube-api/include/cube_api.h"
namespace baidu {
namespace paddle_serving {
namespace predictor {
// Large models that use sparse parameters may use cube cache.
// When the cube cache exists, the model is required to be
// consistent with the version of the cube cache. Therefore,
// when the model is updated, the model and the cube cache are
// required to be reloaded at the same time.
// All cached data is loaded in one shot and never updated in place; switching
// between the two cube caches is lock-free.
class CubeCache {
public:
CubeCache() {}
~CubeCache() { clear(); }
// clear cache data.
int clear();
// get cache data by key
rec::mcube::CubeValue* get_data(uint64_t key);
// reload all cache files from cache_path
int reload_data(const std::string& cache_path);
private:
// lock-free switching; key type is uint64_t, value type is CubeValue*
std::unordered_map<uint64_t, rec::mcube::CubeValue*> _map_cache;
};
} // namespace predictor
} // namespace paddle_serving
} // namespace baidu
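To make the interface above concrete, here is a minimal, hypothetical usage sketch built only from what this header declares; the cache directory and the lookup key are placeholders:

```
#include <cstdint>
#include <iostream>

#include "core/predictor/framework/cache.h"

int main() {
  baidu::paddle_serving::predictor::CubeCache cache;
  // Load every "part-*" sequence file under the cache directory
  // (placeholder path; reload_data() returns 0 on success).
  if (cache.reload_data("./uci_housing_model/cube_cache") != 0) {
    std::cerr << "reload cube cache failed" << std::endl;
    return -1;
  }
  // Look up one sparse-parameter key; nullptr means a cache miss.
  uint64_t key = 12345;
  rec::mcube::CubeValue* value = cache.get_data(key);
  std::cout << (value != nullptr ? "hit" : "miss") << std::endl;
  return 0;
}
```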
...@@ -21,6 +21,15 @@ ...@@ -21,6 +21,15 @@
#include <string> #include <string>
#include "core/predictor/common/inner_common.h" #include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/op_repository.h" #include "core/predictor/framework/op_repository.h"
#ifdef BCLOUD
#include <base/atomicops.h>
#else
#include <butil/atomicops.h>
#endif
#include <errno.h>
#include "core/predictor/framework/resource.h"
using baidu::paddle_serving::predictor::Resource;
namespace baidu { namespace baidu {
namespace paddle_serving { namespace paddle_serving {
...@@ -238,6 +247,77 @@ const Channel* DagView::get_response_channel(const uint64_t log_id) const { ...@@ -238,6 +247,77 @@ const Channel* DagView::get_response_channel(const uint64_t log_id) const {
return last_op->mutable_channel(); return last_op->mutable_channel();
} }
void* call_back(void* ori_args) {
Resource::instance().thread_initialize();
Args* args = (Args*)ori_args;
Op* op = static_cast<Op*>(args->_op);
uint64_t log_id = static_cast<uint64_t>(args->_log_id);
bool debug = static_cast<bool>(args->_debug);
args->errcode = op->process(log_id, debug);
return nullptr;
}
int ParallelDagView::execute_one_stage(ViewStage* vstage,
const uint64_t log_id,
butil::IOBufBuilder* debug_os) {
butil::Timer stage_time(butil::Timer::STARTED);
uint32_t node_size = vstage->nodes.size();
std::vector<THREAD_T> tids(node_size);
Args* args = new Args[node_size];
VLOG(2) << "(logid=" << log_id << ") vstage->nodes.size(): " << node_size;
for (uint32_t ni = 0; ni < node_size; ni++) {
ViewNode* vnode = vstage->nodes[ni];
DagNode* conf = vnode->conf;
Op* op = vnode->op;
TRACEPRINTF(
"(logid=%" PRIu64 ") start to execute op[%s]", log_id, op->name());
args[ni]._op = op;
args[ni]._log_id = log_id;
args[ni]._debug = (debug_os != NULL);
int rc = THREAD_CREATE(&tids[ni], NULL, call_back, (void*)(args + ni));
if (rc != 0) {
LOG(ERROR) << "failed to create ParallelDagView worker thread: index="
<< ni << ", rc=" << rc << ", errno=" << errno << ":"
<< strerror(errno);
delete[] args;
return -1;
}
}
for (uint32_t ni = 0; ni < node_size; ni++) {
THREAD_JOIN(tids[ni], NULL);
int errcode = args[ni].errcode;
Op* op = args[ni]._op;
TRACEPRINTF(
"(logid=%" PRIu64 ") finish to execute op[%s]", log_id, op->name());
if (errcode < 0) {
LOG(ERROR) << "(logid=" << log_id
<< ") Execute failed, Op:" << op->debug_string();
delete[] args;
return errcode;
}
if (errcode > 0) {
LOG(INFO) << "(logid=" << log_id
<< ") Execute ignore, Op:" << op->debug_string();
continue;
}
if (debug_os) {
(*debug_os) << "(logid=" << log_id << ") {\"op_name\": \"" << op->name()
<< "\", \"debug_str:\": \"" << op->debug_string()
<< "\", \"time_info\": \"" << op->time_info() << "\"}";
}
// LOG(DEBUG) << "Execute succ, Op:" << op->debug_string();
}
stage_time.stop();
PredictorMetric::GetInstance()->update_latency_metric(
STAGE_METRIC_PREFIX + vstage->full_name, stage_time.u_elapsed());
delete[] args;
return ERR_OK;
}
} // namespace predictor } // namespace predictor
} // namespace paddle_serving } // namespace paddle_serving
} // namespace baidu } // namespace baidu
...@@ -24,7 +24,7 @@ namespace baidu { ...@@ -24,7 +24,7 @@ namespace baidu {
namespace paddle_serving { namespace paddle_serving {
namespace predictor { namespace predictor {
class Op; // class Op;
struct ViewNode { struct ViewNode {
Op* op; // op->full_name == service_workflow_stageindex_opname Op* op; // op->full_name == service_workflow_stageindex_opname
...@@ -75,11 +75,20 @@ class DagView { ...@@ -75,11 +75,20 @@ class DagView {
Bus* _bus; Bus* _bus;
}; };
struct Args {
Op* _op;
uint64_t _log_id;
bool _debug;
int errcode;
};
// The derived DagView supports parallel execution // The derived DagView supports parallel execution
// strategy, by implementing the execute_one_stage(). // strategy, by implementing the execute_one_stage().
class ParallelDagView : public DagView { class ParallelDagView : public DagView {
public: public:
int execute_one_stage(ViewStage* vstage, butil::IOBufBuilder*) { return 0; } virtual int execute_one_stage(ViewStage* vstage,
const uint64_t log_id,
butil::IOBufBuilder* debug_os);
}; };
} // namespace predictor } // namespace predictor
......
...@@ -25,7 +25,8 @@ int ReloadableInferEngine::proc_initialize_impl( ...@@ -25,7 +25,8 @@ int ReloadableInferEngine::proc_initialize_impl(
_model_dir = conf.model_dir(); _model_dir = conf.model_dir();
_infer_thread_num = conf.runtime_thread_num(); _infer_thread_num = conf.runtime_thread_num();
_infer_batch_size = conf.batch_infer_size(); _infer_batch_size = conf.batch_infer_size();
_infer_batch_align = conf.enable_batch_align(); _infer_overrun = conf.enable_overrun();
_allow_split_request = conf.allow_split_request();
_conf = conf; _conf = conf;
...@@ -56,9 +57,6 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, ...@@ -56,9 +57,6 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
} }
// init bsf framework // init bsf framework
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index] im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_init_fn( .set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this)); boost::bind(&InferEngine::thrd_initialize_impl, this));
...@@ -69,8 +67,10 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, ...@@ -69,8 +67,10 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
boost::bind(&InferEngine::task_infer_impl, this, _1, _2)); boost::bind(&InferEngine::task_infer_impl, this, _1, _2));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_size( im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_size(
_infer_batch_size); _infer_batch_size);
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_align( im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_overrun(
_infer_batch_align); _infer_overrun);
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_allow_split_request(_allow_split_request);
if (im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].start( if (im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].start(
_infer_thread_num) != 0) { _infer_thread_num) != 0) {
LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num; LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num;
...@@ -79,7 +79,8 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, ...@@ -79,7 +79,8 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
LOG(WARNING) << "Enable batch schedule framework, thread_num:" LOG(WARNING) << "Enable batch schedule framework, thread_num:"
<< _infer_thread_num << ", batch_size:" << _infer_batch_size << _infer_thread_num << ", batch_size:" << _infer_batch_size
<< ", enable_batch_align:" << _infer_batch_align; << ", enable_overrun:" << _infer_overrun
<< ", allow_split_request:" << _allow_split_request;
return 0; return 0;
} }
...@@ -348,7 +349,7 @@ T* VersionedInferEngine::get_core() { ...@@ -348,7 +349,7 @@ T* VersionedInferEngine::get_core() {
} }
template <typename T> template <typename T>
T* VersionedInferEngine::get_core(uint64_t version) { T* VersionedInferEngine::get_core(const uint64_t version) {
auto iter = _versions.find(version); auto iter = _versions.find(version);
if (iter == _versions.end()) { if (iter == _versions.end()) {
LOG(ERROR) << "Not found version engine: " << version; LOG(ERROR) << "Not found version engine: " << version;
...@@ -363,6 +364,15 @@ T* VersionedInferEngine::get_core(uint64_t version) { ...@@ -363,6 +364,15 @@ T* VersionedInferEngine::get_core(uint64_t version) {
return NULL; return NULL;
} }
CubeCache* VersionedInferEngine::get_cube_cache() {
InferEngine* engine = default_engine();
if (!engine) {
LOG(WARNING) << "fail to get default engine";
return nullptr;
}
return engine->get_cube_cache();
}
int VersionedInferEngine::proc_initialize_impl( int VersionedInferEngine::proc_initialize_impl(
const configure::EngineDesc& conf, bool) { const configure::EngineDesc& conf, bool) {
return -1; return -1;
...@@ -382,6 +392,11 @@ int VersionedInferEngine::task_infer_impl(const void* in, ...@@ -382,6 +392,11 @@ int VersionedInferEngine::task_infer_impl(const void* in,
return -1; return -1;
} }
int InferManager::set_taskexecutor_num(size_t total_engine_num) {
im::bsf::TaskExecutorVector<TaskT>::instance().resize(total_engine_num);
return 0;
}
int InferManager::proc_initialize(const char* path, int InferManager::proc_initialize(const char* path,
const char* file, const char* file,
std::shared_ptr<int> engine_index_ptr) { std::shared_ptr<int> engine_index_ptr) {
...@@ -391,8 +406,6 @@ int InferManager::proc_initialize(const char* path, ...@@ -391,8 +406,6 @@ int InferManager::proc_initialize(const char* path,
return -1; return -1;
} }
uint32_t engine_num = model_toolkit_conf.engines_size(); uint32_t engine_num = model_toolkit_conf.engines_size();
im::bsf::TaskExecutorVector<TaskT>::instance().resize(*engine_index_ptr +
engine_num);
for (uint32_t ei = 0; ei < engine_num; ++ei) { for (uint32_t ei = 0; ei < engine_num; ++ei) {
LOG(INFO) << "model_toolkit_conf.engines(" << ei LOG(INFO) << "model_toolkit_conf.engines(" << ei
<< ").name: " << model_toolkit_conf.engines(ei).name(); << ").name: " << model_toolkit_conf.engines(ei).name();
...@@ -502,6 +515,15 @@ T* InferManager::get_core(const char* model_name) { ...@@ -502,6 +515,15 @@ T* InferManager::get_core(const char* model_name) {
return NULL; return NULL;
} }
CubeCache* InferManager::get_cube_cache(const char* model_name) {
auto it = _map.find(model_name);
if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
return nullptr;
}
return it->second->get_cube_cache();
}
// Versioned inference interface // Versioned inference interface
int InferManager::infer(const char* model_name, int InferManager::infer(const char* model_name,
const void* in, const void* in,
...@@ -517,7 +539,7 @@ int InferManager::infer(const char* model_name, ...@@ -517,7 +539,7 @@ int InferManager::infer(const char* model_name,
} }
template <typename T> template <typename T>
T* InferManager::get_core(const char* model_name, uint64_t version) { T* InferManager::get_core(const char* model_name, const uint64_t version) {
auto it = _map.find(model_name); auto it = _map.find(model_name);
if (it == _map.end()) { if (it == _map.end()) {
LOG(WARNING) << "Cannot find engine in map, model name:" << model_name; LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <pthread.h> #include <pthread.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <functional> #include <functional>
...@@ -25,16 +26,19 @@ ...@@ -25,16 +26,19 @@
#include <vector> #include <vector>
#include "core/predictor/common/inner_common.h" #include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/bsf.h" #include "core/predictor/framework/bsf.h"
#include "core/predictor/framework/cache.h"
#include "core/predictor/framework/factory.h" #include "core/predictor/framework/factory.h"
#include "core/predictor/framework/infer_data.h" #include "core/predictor/framework/infer_data.h"
#include "core/predictor/framework/memory.h" #include "core/predictor/framework/memory.h"
#include "paddle_inference_api.h" // NOLINT #include "paddle_inference_api.h" // NOLINT
#include "experimental/float16.h"
namespace baidu { namespace baidu {
namespace paddle_serving { namespace paddle_serving {
namespace predictor { namespace predictor {
using configure::ModelToolkitConf; using configure::ModelToolkitConf;
// Auto mutex lock
class AutoLock { class AutoLock {
public: public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) { explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
...@@ -46,6 +50,7 @@ class AutoLock { ...@@ -46,6 +50,7 @@ class AutoLock {
pthread_mutex_t& _mut; pthread_mutex_t& _mut;
}; };
// Global singleton mutex lock
class GlobalCreateMutex { class GlobalCreateMutex {
public: public:
pthread_mutex_t& mutex() { return _mut; } pthread_mutex_t& mutex() { return _mut; }
...@@ -60,6 +65,7 @@ class GlobalCreateMutex { ...@@ -60,6 +65,7 @@ class GlobalCreateMutex {
pthread_mutex_t _mut; pthread_mutex_t _mut;
}; };
// InferEngine
class InferEngine { class InferEngine {
public: public:
virtual ~InferEngine() {} virtual ~InferEngine() {}
...@@ -90,11 +96,13 @@ class InferEngine { ...@@ -90,11 +96,13 @@ class InferEngine {
void* out, void* out,
uint32_t batch_size = -1) = 0; uint32_t batch_size = -1) = 0;
virtual int task_infer_impl(const void* in, void* out) = 0; // NOLINT virtual int task_infer_impl(const void* in, void* out) = 0; // NOLINT
virtual CubeCache* get_cube_cache() = 0;
protected: protected:
uint32_t _model_index; uint32_t _model_index;
// end: framework inner call // end: framework inner call
}; };
typedef im::bsf::Task<paddle::PaddleTensor, paddle::PaddleTensor> TaskT; typedef im::bsf::Task<paddle::PaddleTensor, paddle::PaddleTensor> TaskT;
class ReloadableInferEngine : public InferEngine { class ReloadableInferEngine : public InferEngine {
public: public:
...@@ -163,28 +171,37 @@ class ReloadableInferEngine : public InferEngine { ...@@ -163,28 +171,37 @@ class ReloadableInferEngine : public InferEngine {
uint32_t _infer_batch_size; uint32_t _infer_batch_size;
// Need to align batch_size in inferring // Need to align batch_size in inferring
bool _infer_batch_align; bool _infer_overrun;
// allow to split request in inferring
bool _allow_split_request;
// model version // model version
uint64_t _version; uint64_t _version;
}; };
// Lock-free switching between two models // Lock-free switching between two models and cube caches
template <typename EngineCore> template <typename EngineCore>
struct ModelData { struct ModelData {
ModelData() : current_idx(1) { ModelData() : current_idx(1) {
cores[0] = NULL; cores[0] = nullptr;
cores[1] = NULL; cores[1] = nullptr;
caches[0] = nullptr;
caches[1] = nullptr;
} }
~ModelData() { ~ModelData() {
delete cores[0]; delete cores[0];
delete cores[1]; delete cores[1];
delete caches[0];
delete caches[1];
} }
void* get() { return cores[current_idx]->get(); } void* get_core() { return cores[current_idx]->get(); }
CubeCache* get_cache() { return caches[current_idx]; }
EngineCore* cores[2]; EngineCore* cores[2];
CubeCache* caches[2];
uint32_t current_idx; uint32_t current_idx;
}; };
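The struct above realizes the lock-free switch by always loading into the idle slot and then flipping `current_idx`. The toy program below illustrates just that pattern; `Engine` and `DoubleBuffer` are made-up stand-ins and deliberately ignore the thread-local bookkeeping the real code layers on top:

```
#include <cstdint>
#include <iostream>

struct Engine { int version; };  // stand-in for EngineCore / CubeCache

struct DoubleBuffer {
  Engine slots[2];
  uint32_t current_idx = 1;
  Engine* get() { return &slots[current_idx]; }
  void reload(int new_version) {
    uint32_t next_idx = (current_idx + 1) % 2;  // prepare the idle slot
    slots[next_idx].version = new_version;      // load into it
    current_idx = next_idx;                     // publish with one index flip
  }
};

int main() {
  DoubleBuffer buf;
  buf.reload(1);
  std::cout << buf.get()->version << std::endl;  // 1
  buf.reload(2);
  std::cout << buf.get()->version << std::endl;  // 2
  return 0;
}
```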
...@@ -196,7 +213,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -196,7 +213,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
int proc_initialize(const configure::EngineDesc& conf, bool version) { int proc_initialize(const configure::EngineDesc& conf, bool version) {
THREAD_KEY_CREATE(&_skey, NULL); THREAD_KEY_CREATE(&_skey, NULL);
THREAD_MUTEX_INIT(&_mutex, NULL); THREAD_MUTEX_INIT(&_mutex, NULL);
gpu_index = 0; _gpu_index = 0;
return ReloadableInferEngine::proc_initialize(conf, version); return ReloadableInferEngine::proc_initialize(conf, version);
} }
...@@ -209,7 +226,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -209,7 +226,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
if (_reload_vec.empty()) { if (_reload_vec.empty()) {
return 0; return 0;
} }
gpu_index = 0; _gpu_index = 0;
for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) { for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) {
if (load_data(_reload_vec[ti], conf) != 0) { if (load_data(_reload_vec[ti], conf) != 0) {
LOG(ERROR) << "Failed reload engine model: " << ti; LOG(ERROR) << "Failed reload engine model: " << ti;
...@@ -224,26 +241,56 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -224,26 +241,56 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
virtual int load_data(ModelData<EngineCore>* md, virtual int load_data(ModelData<EngineCore>* md,
const configure::EngineDesc& conf) { const configure::EngineDesc& conf) {
uint32_t next_idx = (md->current_idx + 1) % 2; uint32_t next_idx = (md->current_idx + 1) % 2;
// reload engine core
if (md->cores[next_idx]) { if (md->cores[next_idx]) {
delete md->cores[next_idx]; delete md->cores[next_idx];
} }
md->cores[next_idx] = new (std::nothrow) EngineCore; md->cores[next_idx] = new (std::nothrow) EngineCore;
if (nullptr == md->cores[next_idx]) {
// params.dump(); LOG(ERROR) << "Allocating memory failed. ";
return -1;
}
size_t gpu_ids_num = conf.gpu_ids_size(); size_t gpu_ids_num = conf.gpu_ids_size();
im::bsf::AutoMutex lock(_mutex); im::bsf::AutoMutex lock(_mutex);
int gpu_id = -1; int gpu_id = -1;
if (gpu_ids_num > 0) { if (gpu_ids_num > 0) {
gpu_id = conf.gpu_ids(gpu_index % gpu_ids_num); gpu_id = conf.gpu_ids(_gpu_index % gpu_ids_num);
} }
LOG(WARNING) << "Loading EngineCore[" << next_idx << "] ...";
if (!md->cores[next_idx] || if (!md->cores[next_idx] ||
md->cores[next_idx]->create(conf, gpu_id) != 0) { md->cores[next_idx]->create(conf, gpu_id) != 0) {
LOG(ERROR) << "Failed create model, path: " << conf.model_dir(); LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
return -1; return -1;
} }
gpu_index++; _gpu_index++;
LOG(WARNING) << "Loading EngineCore[" << next_idx << "] done.";
// reload cube cache
if (nullptr == md->caches[next_idx]) {
md->caches[next_idx] = new (std::nothrow) CubeCache;
}
if (nullptr == md->caches[next_idx]) {
LOG(ERROR) << "Allocating memory failed.";
return -1;
}
LOG(WARNING) << "Loading cube cache[" << next_idx << "] ...";
std::string model_path = conf.model_dir();
if (access(model_path.c_str(), F_OK) == 0) {
std::string cube_cache_path = model_path + "/cube_cache";
int reload_cache_ret = md->caches[next_idx]->reload_data(cube_cache_path);
LOG(WARNING) << "Loading cube cache[" << next_idx << "] done.";
} else {
LOG(ERROR) << "model_path " << model_path
<< " is not exits. Ignore cube cache!";
}
// switch current_idx
md->current_idx = next_idx; md->current_idx = next_idx;
LOG(WARNING)
<< "Reload model and cube cache done. switching to current_idx["
<< next_idx << "]";
return 0; return 0;
} }
...@@ -309,11 +356,25 @@ class DBReloadableInferEngine : public ReloadableInferEngine { ...@@ -309,11 +356,25 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
return md->cores[md->current_idx]; return md->cores[md->current_idx];
} }
CubeCache* get_cube_cache() {
ModelData<EngineCore>* md =
(ModelData<EngineCore>*)THREAD_GETSPECIFIC(_skey);
if (!md) {
LOG(ERROR) << "Failed get thread specific data";
return NULL;
}
return md->get_cache();
}
protected: protected:
THREAD_KEY_T _skey; THREAD_KEY_T _skey;
THREAD_MUTEX_T _mutex; THREAD_MUTEX_T _mutex;
// vector of all model engines
std::vector<ModelData<EngineCore>*> _reload_vec; std::vector<ModelData<EngineCore>*> _reload_vec;
int gpu_index = 0;
// gpu card id
int _gpu_index = 0;
}; };
// Multiple EngineCores share the same model data // Multiple EngineCores share the same model data
...@@ -331,12 +392,20 @@ class CloneDBReloadableInferEngine ...@@ -331,12 +392,20 @@ class CloneDBReloadableInferEngine
virtual int load_data(ModelData<EngineCore>* md, virtual int load_data(ModelData<EngineCore>* md,
const configure::EngineDesc& conf) { const configure::EngineDesc& conf) {
int tid = syscall(SYS_gettid);
uint32_t next_idx = (md->current_idx + 1) % 2; uint32_t next_idx = (md->current_idx + 1) % 2;
if (md->cores[next_idx]) { if (md->cores[next_idx]) {
delete md->cores[next_idx]; delete md->cores[next_idx];
} }
md->cores[next_idx] = new (std::nothrow) EngineCore; md->cores[next_idx] = new (std::nothrow) EngineCore;
if (nullptr == md->caches[next_idx]) {
md->caches[next_idx] = new (std::nothrow) CubeCache;
}
if (nullptr == md->cores[next_idx] || nullptr == md->caches[next_idx]) {
LOG(ERROR) << "Allocating memory fail.";
return -1;
}
// params.dump(); // params.dump();
// gpu_ids_num > 0 is always true. // gpu_ids_num > 0 is always true.
// if use CPU, gpu_ids = [-1]. // if use CPU, gpu_ids = [-1].
...@@ -347,46 +416,70 @@ class CloneDBReloadableInferEngine ...@@ -347,46 +416,70 @@ class CloneDBReloadableInferEngine
im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex); im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex);
int gpu_id = -1; int gpu_id = -1;
if (gpu_ids_num > 0) { if (gpu_ids_num > 0) {
gpu_id = conf.gpu_ids(DBReloadableInferEngine<EngineCore>::gpu_index % gpu_id = conf.gpu_ids(DBReloadableInferEngine<EngineCore>::_gpu_index %
gpu_ids_num); gpu_ids_num);
} else { } else {
gpu_ids_num = 1; gpu_ids_num = 1;
} }
// gpu_index will be set to 0 when load() or proc_initialize() is called.
// gpu_index < gpu_ids_num means there are predictors still not created // _gpu_index will be set to 0 when load() or proc_initialize() is called.
// _gpu_index < gpu_ids_num means there are predictors still not created
// on some GPU card. // on some GPU card.
// so we need to create the predictor. // so we need to create the predictor.
// gpu_index >= gpu_ids_num means each GPU card has already created one. // _gpu_index >= gpu_ids_num means each GPU card has already created one.
// so we need to clone the predictor. // so we need to clone the predictor.
if (DBReloadableInferEngine<EngineCore>::gpu_index < gpu_ids_num) { LOG(WARNING) << "tid:" << tid << " Loading clone model ...";
if (!md->cores[next_idx] || if (DBReloadableInferEngine<EngineCore>::_gpu_index < gpu_ids_num) {
md->cores[next_idx]->create(conf, gpu_id) != 0) { // create cores
if (md->cores[next_idx]->create(conf, gpu_id) != 0) {
LOG(ERROR) << "Failed create model, path: " << conf.model_dir(); LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
return -1; return -1;
} }
DBReloadableInferEngine<EngineCore>::gpu_index++; // create caches
md->current_idx = next_idx; std::string model_path = conf.model_dir();
if (access(model_path.c_str(), F_OK) == 0) {
std::string cube_cache_path = model_path + "/cube_cache";
int reload_cache_ret =
md->caches[next_idx]->reload_data(cube_cache_path);
LOG(WARNING) << "create cube cache[" << next_idx << "] done.";
} else {
LOG(WARNING) << "model_path " << model_path
<< " is not exits. Ignore cube cache!";
}
DBReloadableInferEngine<EngineCore>::_gpu_index++;
// md->current_idx = next_idx;
if (_cloneTemplate.size() < if (_cloneTemplate.size() <
DBReloadableInferEngine<EngineCore>::gpu_index) { DBReloadableInferEngine<EngineCore>::_gpu_index) {
_cloneTemplate.push_back(md); _cloneTemplate.push_back(md);
} else { } else {
_cloneTemplate[DBReloadableInferEngine<EngineCore>::gpu_index - 1] = md; _cloneTemplate[DBReloadableInferEngine<EngineCore>::_gpu_index - 1] =
md;
} }
} else { } else {
int template_index = DBReloadableInferEngine<EngineCore>::gpu_index % int template_index = DBReloadableInferEngine<EngineCore>::_gpu_index %
_cloneTemplate.size(); _cloneTemplate.size();
if (!md->cores[next_idx] ||
md->cores[next_idx]->clone(_cloneTemplate[template_index]->get()) != // clone cores
0) { if (md->cores[next_idx]->clone(
_cloneTemplate[template_index]->get_core()) != 0) {
LOG(ERROR) << "Failed clone model from core"; LOG(ERROR) << "Failed clone model from core";
return -1; return -1;
} }
DBReloadableInferEngine<EngineCore>::gpu_index++; // clone caches
md->current_idx = next_idx; md->caches[next_idx] = _cloneTemplate[template_index]->get_cache();
LOG(WARNING) << "core clone model succ, cur_idx[" << md->current_idx LOG(WARNING) << "tid:" << tid << " clone caches done";
<< "].";
DBReloadableInferEngine<EngineCore>::_gpu_index++;
} }
// switch current_idx
md->current_idx = next_idx;
LOG(WARNING)
<< "[" << tid
<< "] Reload clone model and cube cache done. switching to current_idx["
<< next_idx << "]";
return 0; return 0;
} }
...@@ -441,7 +534,28 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> { ...@@ -441,7 +534,28 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> {
paddle::PaddleDType::INT32) { paddle::PaddleDType::INT32) {
int32_t* data = static_cast<int32_t*>(origin_data); int32_t* data = static_cast<int32_t*>(origin_data);
lod_tensor_in->CopyFromCpu(data); lod_tensor_in->CopyFromCpu(data);
} else if ((*tensorVector_in_pointer)[i].dtype ==
paddle::PaddleDType::UINT8) {
uint8_t* data = static_cast<uint8_t*>(origin_data);
lod_tensor_in->CopyFromCpu(data);
} else if ((*tensorVector_in_pointer)[i].dtype ==
paddle::PaddleDType::INT8) {
int8_t* data = static_cast<int8_t*>(origin_data);
lod_tensor_in->CopyFromCpu(data);
} else if ((*tensorVector_in_pointer)[i].dtype ==
paddle::PaddleDType::FLOAT16) {
paddle::platform::float16* data =
static_cast<paddle::platform::float16*>(origin_data);
lod_tensor_in->CopyFromCpu(data);
} else {
LOG(ERROR) << "Inference not support type["
<< (*tensorVector_in_pointer)[i].dtype << "],name["
<< (*tensorVector_in_pointer)[i].name << "]"
<< " copy into core failed!";
} }
VLOG(2) << "Tensor:name=" << (*tensorVector_in_pointer)[i].name
<< ";in_dtype=" << (*tensorVector_in_pointer)[i].dtype
<< ";tensor_dtype=" << lod_tensor_in->type();
} }
// After the input data is passed in, // After the input data is passed in,
// call 'core->Run()' perform the prediction process. // call 'core->Run()' perform the prediction process.
...@@ -506,7 +620,39 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> { ...@@ -506,7 +620,39 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> {
int32_t* data_out = reinterpret_cast<int32_t*>(databuf_data); int32_t* data_out = reinterpret_cast<int32_t*>(databuf_data);
lod_tensor_out->CopyToCpu(data_out); lod_tensor_out->CopyToCpu(data_out);
databuf_char = reinterpret_cast<char*>(data_out); databuf_char = reinterpret_cast<char*>(data_out);
} else if (dataType == paddle::PaddleDType::UINT8) {
databuf_size = out_num * sizeof(uint8_t);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
uint8_t* data_out = reinterpret_cast<uint8_t*>(databuf_data);
lod_tensor_out->CopyToCpu(data_out);
databuf_char = reinterpret_cast<char*>(data_out);
} else if (dataType == paddle::PaddleDType::INT8) {
databuf_size = out_num * sizeof(int8_t);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
int8_t* data_out = reinterpret_cast<int8_t*>(databuf_data);
lod_tensor_out->CopyToCpu(data_out);
databuf_char = reinterpret_cast<char*>(data_out);
} else if (dataType == paddle::PaddleDType::FLOAT16) {
databuf_size = out_num * sizeof(paddle::platform::float16);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
paddle::platform::float16* data_out =
reinterpret_cast<paddle::platform::float16*>(databuf_data);
lod_tensor_out->CopyToCpu(data_out);
databuf_char = reinterpret_cast<char*>(data_out);
} }
// Because task scheduling requires OPs to use 'Channel' // Because task scheduling requires OPs to use 'Channel'
// (which is a data structure) to transfer data between OPs. // (which is a data structure) to transfer data between OPs.
// We need to copy the processed data to the 'Channel' for the next OP. // We need to copy the processed data to the 'Channel' for the next OP.
...@@ -532,6 +678,10 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> { ...@@ -532,6 +678,10 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> {
int task_infer_impl(const void* in, void* out) { // NOLINT int task_infer_impl(const void* in, void* out) { // NOLINT
return infer_impl(in, out); return infer_impl(in, out);
} }
CubeCache* get_cube_cache() {
return DBReloadableInferEngine<EngineCore>::get_cube_cache();
}
}; };
typedef FactoryPool<InferEngine> StaticInferFactory; typedef FactoryPool<InferEngine> StaticInferFactory;
...@@ -565,11 +715,13 @@ class VersionedInferEngine : public InferEngine { ...@@ -565,11 +715,13 @@ class VersionedInferEngine : public InferEngine {
template <typename T> template <typename T>
T* get_core(); T* get_core();
CubeCache* get_cube_cache();
// versioned inference interface // versioned inference interface
int infer(const void* in, void* out, uint32_t batch_size, uint64_t version); int infer(const void* in, void* out, uint32_t batch_size, uint64_t version);
template <typename T> template <typename T>
T* get_core(uint64_t version); T* get_core(const uint64_t version);
int proc_initialize_impl(const configure::EngineDesc& conf, bool); int proc_initialize_impl(const configure::EngineDesc& conf, bool);
...@@ -600,6 +752,8 @@ class InferManager { ...@@ -600,6 +752,8 @@ class InferManager {
const char* file, const char* file,
std::shared_ptr<int> engine_index_ptr); std::shared_ptr<int> engine_index_ptr);
int set_taskexecutor_num(size_t total_engine_num);
int thrd_initialize(); int thrd_initialize();
int thrd_clear(); int thrd_clear();
...@@ -616,9 +770,13 @@ class InferManager { ...@@ -616,9 +770,13 @@ class InferManager {
void* out, void* out,
uint32_t batch_size = -1); uint32_t batch_size = -1);
// get engine core
template <typename T> template <typename T>
T* get_core(const char* model_name); T* get_core(const char* model_name);
// get cube cache
CubeCache* get_cube_cache(const char* model_name);
// Versioned inference interface // Versioned inference interface
int infer(const char* model_name, int infer(const char* model_name,
const void* in, const void* in,
...@@ -626,9 +784,11 @@ class InferManager { ...@@ -626,9 +784,11 @@ class InferManager {
uint32_t batch_size, uint32_t batch_size,
uint64_t version); uint64_t version);
// Versioned get engine core
template <typename T> template <typename T>
T* get_core(const char* model_name, uint64_t version); T* get_core(const char* model_name, const uint64_t version);
// query model version
int query_version(const std::string& model, uint64_t& version); int query_version(const std::string& model, uint64_t& version);
private: private:
......
...@@ -135,6 +135,17 @@ int Resource::initialize(const std::string& path, const std::string& file) { ...@@ -135,6 +135,17 @@ int Resource::initialize(const std::string& path, const std::string& file) {
if (FLAGS_enable_model_toolkit) { if (FLAGS_enable_model_toolkit) {
size_t model_toolkit_num = resource_conf.model_toolkit_path_size(); size_t model_toolkit_num = resource_conf.model_toolkit_path_size();
// For now we assume each model_toolkit contains exactly one engine,
// so model_toolkit_num == the total number of engines.
// If a model_toolkit may contain multiple engines later, first loop over the
// toolkits to count the total number of engines, then call set_taskexecutor_num.
// Never resize im::bsf::TaskExecutorVector<TaskT>::instance() dynamically:
// TaskExecutor is a thread pool that holds a lock, and its workers already run
// their locked loop once the engine process is initialized. A later resize
// moves the memory, so the workers keep using the old lock while the lock of
// the relocated TaskExecutor lives at a different address.
if (InferManager::instance().set_taskexecutor_num(model_toolkit_num) != 0) {
LOG(ERROR) << "failed set_taskexecutor_num";
return -1;
}
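As an illustration of the hazard spelled out in the comment above, the toy program below (entirely separate from the codebase; `FakeExecutor` is a made-up stand-in for a lock-holding TaskExecutor) shows why the executor vector must be sized exactly once, before any worker starts:

```
#include <pthread.h>
#include <thread>
#include <vector>

struct FakeExecutor {
  pthread_mutex_t mu;  // stands in for the mutex inside TaskExecutor
  FakeExecutor() { pthread_mutex_init(&mu, nullptr); }
};

int main() {
  std::vector<FakeExecutor> pool(2);    // sized exactly once, up front
  pthread_mutex_t* held = &pool[0].mu;  // a worker keeps this address in its loop
  std::thread worker([held] {
    pthread_mutex_lock(held);
    pthread_mutex_unlock(held);
  });
  // pool.resize(64);  // WRONG: reallocation moves the elements, so `held`
  //                   // would point at freed memory while the worker still
  //                   // locks through it; hence the one-shot sizing above.
  worker.join();
  return 0;
}
```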
std::shared_ptr<int> engine_index_ptr(new int(0)); std::shared_ptr<int> engine_index_ptr(new int(0));
for (size_t mi = 0; mi < model_toolkit_num; ++mi) { for (size_t mi = 0; mi < model_toolkit_num; ++mi) {
std::string model_toolkit_path = resource_conf.model_toolkit_path(mi); std::string model_toolkit_path = resource_conf.model_toolkit_path(mi);
...@@ -165,18 +176,18 @@ int Resource::initialize(const std::string& path, const std::string& file) { ...@@ -165,18 +176,18 @@ int Resource::initialize(const std::string& path, const std::string& file) {
rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance(); rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() + std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() +
"/" + resource_conf.cube_config_file(); "/" + resource_conf.cube_config_file();
this->cube_config_fullpath = cube_config_fullpath; this->_cube_config_fullpath = cube_config_fullpath;
this->cube_quant_bits = resource_conf.has_cube_quant_bits() this->_cube_quant_bits = resource_conf.has_cube_quant_bits()
? resource_conf.cube_quant_bits() ? resource_conf.cube_quant_bits()
: 0; : 0;
if (this->cube_quant_bits != 0 && this->cube_quant_bits != 8) { if (this->_cube_quant_bits != 0 && this->_cube_quant_bits != 8) {
LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8."; LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8.";
return -1; return -1;
} }
if (this->cube_quant_bits == 0) { if (this->_cube_quant_bits == 0) {
LOG(INFO) << "cube quant mode OFF"; LOG(INFO) << "cube quant mode OFF";
} else { } else {
LOG(INFO) << "cube quant mode ON, quant bits: " << this->cube_quant_bits; LOG(INFO) << "cube quant mode ON, quant bits: " << this->_cube_quant_bits;
} }
} }
...@@ -187,10 +198,10 @@ int Resource::initialize(const std::string& path, const std::string& file) { ...@@ -187,10 +198,10 @@ int Resource::initialize(const std::string& path, const std::string& file) {
// model config // model config
int Resource::general_model_initialize(const std::string& path, int Resource::general_model_initialize(const std::string& path,
const std::string& file) { const std::string& file) {
if (this->cube_config_fullpath.size() != 0) { if (this->_cube_config_fullpath.size() != 0) {
LOG(INFO) << "init cube by config file : " << this->cube_config_fullpath; LOG(INFO) << "init cube by config file : " << this->_cube_config_fullpath;
rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance(); rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
int ret = cube->init(this->cube_config_fullpath.c_str()); int ret = cube->init(this->_cube_config_fullpath.c_str());
if (ret != 0) { if (ret != 0) {
LOG(ERROR) << "cube init error"; LOG(ERROR) << "cube init error";
return -1; return -1;
...@@ -315,7 +326,7 @@ int Resource::thread_clear() { ...@@ -315,7 +326,7 @@ int Resource::thread_clear() {
} }
return 0; return 0;
} }
size_t Resource::get_cube_quant_bits() { return this->cube_quant_bits; } size_t Resource::get_cube_quant_bits() { return this->_cube_quant_bits; }
int Resource::reload() { int Resource::reload() {
if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) { if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) {
......
...@@ -16,8 +16,10 @@ ...@@ -16,8 +16,10 @@
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "core/cube/cube-api/include/cube_api.h" #include "core/cube/cube-api/include/cube_api.h"
#include "core/predictor/common/inner_common.h" #include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/infer.h" #include "core/predictor/framework/infer.h"
...@@ -27,6 +29,8 @@ namespace baidu { ...@@ -27,6 +29,8 @@ namespace baidu {
namespace paddle_serving { namespace paddle_serving {
namespace predictor { namespace predictor {
// Paddle general model configuration; reads the model configuration
// information from the general_model_config.proto file
class PaddleGeneralModelConfig { class PaddleGeneralModelConfig {
public: public:
PaddleGeneralModelConfig() {} PaddleGeneralModelConfig() {}
...@@ -34,23 +38,47 @@ class PaddleGeneralModelConfig { ...@@ -34,23 +38,47 @@ class PaddleGeneralModelConfig {
~PaddleGeneralModelConfig() {} ~PaddleGeneralModelConfig() {}
public: public:
// feed/fetch name and alias_name
std::vector<std::string> _feed_name; std::vector<std::string> _feed_name;
std::vector<std::string> _feed_alias_name; std::vector<std::string> _feed_alias_name;
std::vector<int> _feed_type; // 0 int64, 1 float
std::vector<bool> _is_lod_feed; // true lod tensor
std::vector<bool> _is_lod_fetch; // whether a fetch var is lod_tensor
std::vector<int> _capacity; // capacity for each tensor
/*
feed_shape_ for feeded variable
feed_shape_[i][j] represents the jth dim for ith input Tensor
if is_lod_feed_[i] == False, feed_shape_[i][0] = -1
*/
std::vector<std::vector<int>> _feed_shape;
std::vector<std::string> _fetch_name; std::vector<std::string> _fetch_name;
std::vector<std::string> _fetch_alias_name; std::vector<std::string> _fetch_alias_name;
// Be consistent with model saving interface var type conversion
// (python/paddle serving client/io/__init__)
// int64 => 0;
// float32 => 1;
// int32 => 2;
// float64 => 3;
// int16 => 4;
// float16 => 5;
// bfloat16 => 6;
// uint8 => 7;
// int8 => 8;
// bool => 9;
// complex64 => 10,
// complex128 => 11;
std::vector<int> _feed_type;
// whether a feed or fetch var is lod_tensor.
std::vector<bool> _is_lod_feed;
std::vector<bool> _is_lod_fetch;
// capacity for each tensor
std::vector<int> _capacity;
// _feed_shape and _fetch_shape are used to represent the dimensional
// information of tensor.
// for example, feed_shape_[i][j] represents the j(th) dim of the i(th) input
// tensor.
// if is_lod_feed_[i] == False, feed_shape_[i][0] = -1
std::vector<std::vector<int>> _feed_shape;
std::vector<std::vector<int>> _fetch_shape; std::vector<std::vector<int>> _fetch_shape;
// fetch name -> index of fetch_name vector.
std::map<std::string, int> _fetch_name_to_index; std::map<std::string, int> _fetch_name_to_index;
// fetch alias name -> index of fetch_alias_name vector.
std::map<std::string, int> _fetch_alias_name_to_index; std::map<std::string, int> _fetch_alias_name_to_index;
}; };
...@@ -73,33 +101,50 @@ class Resource { ...@@ -73,33 +101,50 @@ class Resource {
return ins; return ins;
} }
// initialize resource
int initialize(const std::string& path, const std::string& file); int initialize(const std::string& path, const std::string& file);
// load all model configurations from prototxt
int general_model_initialize(const std::string& path, int general_model_initialize(const std::string& path,
const std::string& file); const std::string& file);
// initialize thread local data
int thread_initialize(); int thread_initialize();
// clear thread local data
int thread_clear(); int thread_clear();
// reload resources
int reload(); int reload();
// finalize
int finalize(); int finalize();
// get all model configs
std::vector<std::shared_ptr<PaddleGeneralModelConfig>> std::vector<std::shared_ptr<PaddleGeneralModelConfig>>
get_general_model_config(); get_general_model_config();
// print all configurations of all models
void print_general_model_config( void print_general_model_config(
const std::shared_ptr<PaddleGeneralModelConfig>& config); const std::shared_ptr<PaddleGeneralModelConfig>& config);
// get cube quantity bit size
size_t get_cube_quant_bits(); size_t get_cube_quant_bits();
private: private:
int thread_finalize() { return 0; } int thread_finalize() { return 0; }
private:
// configuration information of all models, loaded from prototxt files
std::vector<std::shared_ptr<PaddleGeneralModelConfig>> _configs; std::vector<std::shared_ptr<PaddleGeneralModelConfig>> _configs;
std::string cube_config_fullpath;
int cube_quant_bits; // 0 if no empty
// full path of cube configuration file.
std::string _cube_config_fullpath;
// cube quantization bit size; supports 0/8, set 0 for no quantization.
size_t _cube_quant_bits;
// bthread local key
THREAD_KEY_T _tls_bspec_key; THREAD_KEY_T _tls_bspec_key;
}; };
......
...@@ -82,14 +82,14 @@ void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img, ...@@ -82,14 +82,14 @@ void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img,
else if (resize_h / 32 < 1 + 1e-5) else if (resize_h / 32 < 1 + 1e-5)
resize_h = 32; resize_h = 32;
else else
resize_h = (resize_h / 32) * 32; resize_h = (resize_h / 32 - 1) * 32;
if (resize_w % 32 == 0) if (resize_w % 32 == 0)
resize_w = resize_w; resize_w = resize_w;
else if (resize_w / 32 < 1 + 1e-5) else if (resize_w / 32 < 1 + 1e-5)
resize_w = 32; resize_w = 32;
else else
resize_w = (resize_w / 32) * 32; resize_w = (resize_w / 32 - 1) * 32;
if (!use_tensorrt) { if (!use_tensorrt) {
cv::resize(img, resize_img, cv::Size(resize_w, resize_h)); cv::resize(img, resize_img, cv::Size(resize_w, resize_h));
ratio_h = float(resize_h) / float(h); ratio_h = float(resize_h) / float(h);
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
syntax = "proto2"; syntax = "proto3";
import "pds_option.proto"; import "pds_option.proto";
import "builtin_format.proto"; import "builtin_format.proto";
package baidu.paddle_serving.predictor.general_model; package baidu.paddle_serving.predictor.general_model;
...@@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model; ...@@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model;
option cc_generic_services = true; option cc_generic_services = true;
message Tensor { message Tensor {
repeated string data = 1; // VarType: INT64
repeated int32 int_data = 2; repeated int64 int64_data = 1;
repeated int64 int64_data = 3;
repeated float float_data = 4; // VarType: FP32
optional int32 elem_type = repeated float float_data = 2;
5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
repeated int32 shape = 6; // shape should include batch // VarType: INT32
repeated int32 lod = 7; // only for fetch tensor currently repeated int32 int_data = 3;
optional string name = 8; // get from the Model prototxt
optional string alias_name = 9; // get from the Model prototxt // VarType: FP64
repeated double float64_data = 4;
// VarType: UINT32
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string data = 9;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 20 => STRING
int32 elem_type = 10;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 11;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 12;
// Correspond to the variable 'name' in the model description prototxt.
string name = 13;
// Correspond to the variable 'alias_name' in the model description prototxt.
string alias_name = 14; // get from the Model prototxt
// VarType: FP16, INT16, INT8, BF16, UINT8
bytes tensor_content = 15;
}; };
message Request { message Request {
repeated Tensor tensor = 1; repeated Tensor tensor = 1;
repeated string fetch_var_names = 2; repeated string fetch_var_names = 2;
optional bool profile_server = 3 [ default = false ]; bool profile_server = 3;
required uint64 log_id = 4 [ default = 0 ]; uint64 log_id = 4;
}; };
message Response { message Response {
repeated ModelOutput outputs = 1; repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2; repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
// Error messages
string err_msg = 4;
}; };
message ModelOutput { message ModelOutput {
repeated Tensor tensor = 1; repeated Tensor tensor = 1;
optional string engine_name = 2; string engine_name = 2;
} }
service GeneralModelService { service GeneralModelService {
......
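As a sketch of how the revised Tensor/Request layout above might be populated from C++, assuming the protoc-generated header is named `general_model_service.pb.h` (the accessors follow the standard protobuf naming for the fields shown in this diff; values are placeholders):

```
#include <iostream>

#include "general_model_service.pb.h"  // assumed name of the generated header

int main() {
  baidu::paddle_serving::predictor::general_model::Request req;
  auto* tensor = req.add_tensor();
  tensor->set_name("x");            // variable name from the model prototxt
  tensor->set_alias_name("x");
  tensor->set_elem_type(1);         // 1 => FP32, per the mapping above
  tensor->add_shape(1);             // shape includes the batch dimension
  tensor->add_shape(13);
  for (int i = 0; i < 13; ++i) {
    tensor->add_float_data(0.5f);   // payload goes into the typed repeated field
  }
  req.add_fetch_var_names("price");
  req.set_log_id(0);
  std::cout << req.DebugString();
  return 0;
}
```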
...@@ -12,7 +12,7 @@ BRPC-Server会尝试去JSON字符串中再去反序列化出Proto格式的数据 ...@@ -12,7 +12,7 @@ BRPC-Server会尝试去JSON字符串中再去反序列化出Proto格式的数据
### Http+protobuf method ### Http+protobuf method
ProtoBuf is supported by every major language. If you are familiar with it, you can serialize the data with ProtoBuf first, put the serialized bytes into the HTTP request body, and set Content-Type: application/proto, so that the service is accessed with an http/h2+protobuf binary payload. ProtoBuf is supported by every major language. If you are familiar with it, you can serialize the data with ProtoBuf first, put the serialized bytes into the HTTP request body, and set Content-Type: application/proto, so that the service is accessed with an http/h2+protobuf binary payload.
In practice, as the data volume grows, the payload size and deserialization time of JSON-over-HTTP increase sharply, so the Http+protobuf method is recommended when your data volume is large; we will add this capability to the framework's HttpClient later, it is not supported yet. In practice, as the data volume grows, the payload size and deserialization time of JSON-over-HTTP increase sharply, so the Http+protobuf method is recommended when your data volume is large; it is already supported in the Java and Python clients.
**In theory, serialization/deserialization performance ranks from highest to lowest as: protobuf > http/h2+protobuf > http** **In theory, serialization/deserialization performance ranks from highest to lowest as: protobuf > http/h2+protobuf > http**
...@@ -42,7 +42,7 @@ python3.6 -m paddle_serving_server.serve --model uci_housing_model --thread 10 - ...@@ -42,7 +42,7 @@ python3.6 -m paddle_serving_server.serve --model uci_housing_model --thread 10 -
To let users quickly request the server-side prediction service over HTTP, the common HTTP request-body construction, compression, and request-encryption features have been wrapped into an HttpClient class for convenient use. To let users quickly request the server-side prediction service over HTTP, the common HTTP request-body construction, compression, and request-encryption features have been wrapped into an HttpClient class for convenient use.
In the simplest case, using HttpClient takes only three steps: 1. create an HttpClient object; 2. load the client-side prototxt configuration file (in this example uci_housing_client/serving_client_conf.prototxt under the python/examples/fit_a_line/ directory); 3. call the Predict function to request the prediction service over HTTP. In the simplest case, using HttpClient takes only four steps: 1. create an HttpClient object; 2. load the client-side prototxt configuration file (in this example uci_housing_client/serving_client_conf.prototxt under the python/examples/fit_a_line/ directory); 3. call the connect function; 4. call the Predict function to request the prediction service over HTTP.
In addition, you can configure the server IP, port, and service name as needed (the service name must match the Service name and rpc method name in [`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto), i.e. the `GeneralModelService` and `inference` fields), enable compression of the Request body, enable compressed Response transfer, use encrypted-model prediction (the server must be configured for model encryption), set the response timeout, and so on. In addition, you can configure the server IP, port, and service name as needed (the service name must match the Service name and rpc method name in [`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto), i.e. the `GeneralModelService` and `inference` fields), enable compression of the Request body, enable compressed Response transfer, use encrypted-model prediction (the server must be configured for model encryption), set the response timeout, and so on.
...@@ -52,7 +52,9 @@ Java的HttpClient使用示例见[`java/examples/src/main/java/PaddleServingClien ...@@ -52,7 +52,9 @@ Java的HttpClient使用示例见[`java/examples/src/main/java/PaddleServingClien
If this does not meet your needs, you can also add features on top of it. If this does not meet your needs, you can also add features on top of it.
To support https or customize the Status Code of the Response, some secondary development of the C++ brpc server is required; please refer to https://github.com/apache/incubator-brpc/blob/master/docs/cn/http_service.md, and if demand turns out to be strong we will also add this functionality to the Server, so stay tuned. To support https or customize the Status Code of the Response, some secondary development of the C++ brpc server is required; please refer to https://github.com/apache/incubator-brpc/blob/master/docs/cn/http_service.md
If demand turns out to be strong, we will also add this functionality to the Server, so stay tuned.
### Sending HTTP requests with curl (basic principle) ### Sending HTTP requests with curl (basic principle)
...@@ -101,7 +103,7 @@ repeated int32 numbers = 1; ...@@ -101,7 +103,7 @@ repeated int32 numbers = 1;
``` ```
#### elem_type #### elem_type
表示数据类型,0 means int64, 1 means float32, 2 means int32, 3 means bytes(string) 表示数据类型,0 means int64, 1 means float32, 2 means int32, 20 means bytes(string)
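For the curl/JSON route, the request body mirrors the `Request`/`Tensor` proto fields; a sketch of such a body follows (the feed name, shape, and fetch name are placeholders, not values mandated by the framework):

```python
# Sketch of a JSON request body for the curl-based route; field names follow the
# Request/Tensor proto, values are placeholders.
import json

request_body = {
    "tensor": [{
        "name": "x",
        "alias_name": "x",
        "elem_type": 1,          # 1 means float32, per the mapping above
        "shape": [1, 13],
        "float_data": [0.0] * 13,
    }],
    "fetch_var_names": ["price"],
    "log_id": 0,
}
print(json.dumps(request_body))
```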
#### fetch_var_names #### fetch_var_names
......
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
为了方便用户使用java进行开发,我们提供了编译好的Serving工程放置在java镜像当中,获取镜像并进入开发环境的方式是 为了方便用户使用java进行开发,我们提供了编译好的Serving工程放置在java镜像当中,获取镜像并进入开发环境的方式是
``` ```
docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-java docker pull registry.baidubce.com/paddlepaddle/serving:0.6.0-java
docker run --rm -dit --name java_serving registry.baidubce.com/paddlepaddle/serving:0.5.0-java docker run --rm -dit --name java_serving registry.baidubce.com/paddlepaddle/serving:0.6.0-java
docker exec -it java_serving bash docker exec -it java_serving bash
cd Serving/java cd Serving/java
``` ```
...@@ -29,7 +29,7 @@ mvn install ...@@ -29,7 +29,7 @@ mvn install
## 请求BRPC-Server ## 请求BRPC-Server
###服务端启动 ### 服务端启动
以fit_a_line模型为例,服务端启动与常规BRPC-Server端启动命令一样。 以fit_a_line模型为例,服务端启动与常规BRPC-Server端启动命令一样。
...@@ -39,7 +39,7 @@ sh get_data.sh ...@@ -39,7 +39,7 @@ sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
``` ```
###客户端预测 ### 客户端预测
客户端目前支持多种请求方式,目前支持HTTP(数据为JSON格式)、HTTP(数据为PROTO格式)、GRPC 客户端目前支持多种请求方式,目前支持HTTP(数据为JSON格式)、HTTP(数据为PROTO格式)、GRPC
推荐您使用HTTP(数据为PROTO格式),此时数据体为PROTO格式,传输的数据量小,速度快,目前已经帮用户实现了HTTP/GRPC的数据体(JSON/PROTO)的封装函数,详见[Client.java](./src/main/java/io/paddle/serving/client/Client.java) 推荐您使用HTTP(数据为PROTO格式),此时数据体为PROTO格式,传输的数据量小,速度快,目前已经帮用户实现了HTTP/GRPC的数据体(JSON/PROTO)的封装函数,详见[Client.java](./src/main/java/io/paddle/serving/client/Client.java)
...@@ -47,14 +47,14 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po ...@@ -47,14 +47,14 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
cd ../../../java/examples/target cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample http_proto <configPath> java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample http_proto <configPath>
``` ```
**注意 <configPath>为客户端配置文件,一般是名为serving_client_conf.prototxt的文件。** **注意 `<configPath>`为客户端配置文件,一般是名为serving_client_conf.prototxt的文件。**
更多示例详见[PaddleServingClientExample.java](./examples/src/main/java/PaddleServingClientExample.java) 更多示例详见[PaddleServingClientExample.java](./examples/src/main/java/PaddleServingClientExample.java)
## 请求Pipeline-Server ## 请求Pipeline-Server
###服务端启动 ### 服务端启动
对于input data type = string类型,以IMDB model ensemble模型为例,服务端启动 对于input data type = string类型,以IMDB model ensemble模型为例,服务端启动
...@@ -66,14 +66,14 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow. ...@@ -66,14 +66,14 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.
python test_pipeline_server.py &>pipeline.log & python test_pipeline_server.py &>pipeline.log &
``` ```
客户端预测(同步) ### 客户端预测(同步)
``` ```
cd ../../../java/examples/target cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict
``` ```
客户端预测(异步) ### 客户端预测(异步)
``` ```
cd ../../../java/examples/target cd ../../../java/examples/target
...@@ -81,7 +81,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli ...@@ -81,7 +81,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli
``` ```
对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动 ### 对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动
``` ```
cd ../../python/examples/pipeline/simple_web_service cd ../../python/examples/pipeline/simple_web_service
...@@ -89,7 +89,7 @@ sh get_data.sh ...@@ -89,7 +89,7 @@ sh get_data.sh
python web_service_java.py &>log.txt & python web_service_java.py &>log.txt &
``` ```
客户端预测(同步) ### 客户端预测(同步)
``` ```
cd ../../../java/examples/target cd ../../../java/examples/target
...@@ -98,7 +98,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli ...@@ -98,7 +98,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli
### 注意事项 ### 注意事项
1.在示例中,端口号都是9393,ip默认设置为了0.0.0.0表示本机,注意ip和port需要与Server端对应。 1.在示例中,端口号都是9393,ip默认设置为了127.0.0.1表示本机,注意ip和port需要与Server端对应。
2.目前Serving已推出Pipeline模式(原理详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),面向Java的Pipeline Serving Client已发布。 2.目前Serving已推出Pipeline模式(原理详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),面向Java的Pipeline Serving Client已发布。
......
...@@ -25,7 +25,7 @@ public class PaddleServingClientExample { ...@@ -25,7 +25,7 @@ public class PaddleServingClientExample {
List<String> fetch = Arrays.asList("price"); List<String> fetch = Arrays.asList("price");
Client client = new Client(); Client client = new Client();
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
String result = client.predict(feed_data, fetch, true, 0); String result = client.predict(feed_data, fetch, true, 0);
...@@ -49,7 +49,7 @@ public class PaddleServingClientExample { ...@@ -49,7 +49,7 @@ public class PaddleServingClientExample {
Client client = new Client(); Client client = new Client();
//注意:跨docker,需要设置--net-host或直接访问另一个docker的ip //注意:跨docker,需要设置--net-host或直接访问另一个docker的ip
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.set_http_proto(false); client.set_http_proto(false);
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
...@@ -73,7 +73,7 @@ public class PaddleServingClientExample { ...@@ -73,7 +73,7 @@ public class PaddleServingClientExample {
List<String> fetch = Arrays.asList("price"); List<String> fetch = Arrays.asList("price");
Client client = new Client(); Client client = new Client();
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
client.set_use_grpc_client(true); client.set_use_grpc_client(true);
...@@ -97,7 +97,7 @@ public class PaddleServingClientExample { ...@@ -97,7 +97,7 @@ public class PaddleServingClientExample {
List<String> fetch = Arrays.asList("price"); List<String> fetch = Arrays.asList("price");
Client client = new Client(); Client client = new Client();
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
client.use_key(keyFilePath); client.use_key(keyFilePath);
...@@ -125,7 +125,7 @@ public class PaddleServingClientExample { ...@@ -125,7 +125,7 @@ public class PaddleServingClientExample {
List<String> fetch = Arrays.asList("price"); List<String> fetch = Arrays.asList("price");
Client client = new Client(); Client client = new Client();
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
client.set_request_compress(true); client.set_request_compress(true);
...@@ -176,7 +176,7 @@ public class PaddleServingClientExample { ...@@ -176,7 +176,7 @@ public class PaddleServingClientExample {
}}; }};
List<String> fetch = Arrays.asList("save_infer_model/scale_0.tmp_0"); List<String> fetch = Arrays.asList("save_infer_model/scale_0.tmp_0");
Client client = new Client(); Client client = new Client();
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
String result = client.predict(feed_data, fetch, true, 0); String result = client.predict(feed_data, fetch, true, 0);
...@@ -198,7 +198,7 @@ public class PaddleServingClientExample { ...@@ -198,7 +198,7 @@ public class PaddleServingClientExample {
}}; }};
List<String> fetch = Arrays.asList("pooled_output"); List<String> fetch = Arrays.asList("pooled_output");
Client client = new Client(); Client client = new Client();
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
String result = client.predict(feed_data, fetch, true, 0); String result = client.predict(feed_data, fetch, true, 0);
...@@ -268,7 +268,7 @@ public class PaddleServingClientExample { ...@@ -268,7 +268,7 @@ public class PaddleServingClientExample {
}}; }};
List<String> fetch = Arrays.asList("prob"); List<String> fetch = Arrays.asList("prob");
Client client = new Client(); Client client = new Client();
client.setIP("0.0.0.0"); client.setIP("127.0.0.1");
client.setPort("9393"); client.setPort("9393");
client.loadClientConfig(model_config_path); client.loadClientConfig(model_config_path);
String result = client.predict(feed_data, fetch, true, 0); String result = client.predict(feed_data, fetch, true, 0);
......
...@@ -59,9 +59,20 @@ import java.util.zip.GZIPInputStream; ...@@ -59,9 +59,20 @@ import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream; import java.util.zip.GZIPOutputStream;
enum ElementType class ElementType {
{ public static final int Int64_type = 0;
Int64_type, Float32_type, Int32_type, Bytes_type; public static final int Float32_type = 1;
public static final int Int32_type = 2;
public static final int String_type = 20;
public static final Map<Integer, String> feedTypeToDataKey_;
static
{
feedTypeToDataKey_ = new HashMap<Integer, String>();
feedTypeToDataKey_.put(ElementType.Int64_type, "int64_data");
feedTypeToDataKey_.put(ElementType.Float32_type, "float_data");
feedTypeToDataKey_.put(ElementType.Int32_type, "int_data");
feedTypeToDataKey_.put(ElementType.String_type, "data");
}
} }
class Profiler { class Profiler {
...@@ -104,7 +115,6 @@ public class Client { ...@@ -104,7 +115,6 @@ public class Client {
private Map<String, Integer> feedTypes_; private Map<String, Integer> feedTypes_;
private Map<String, List<Integer>> feedShapes_; private Map<String, List<Integer>> feedShapes_;
private Map<String, Integer> feedNameToIndex_; private Map<String, Integer> feedNameToIndex_;
private Map<Integer, String> feedTypeToDataKey_;
private List<String> fetchNames_; private List<String> fetchNames_;
private Map<String, Integer> fetchTypes_; private Map<String, Integer> fetchTypes_;
private Set<String> lodTensorSet_; private Set<String> lodTensorSet_;
...@@ -134,7 +144,7 @@ public class Client { ...@@ -134,7 +144,7 @@ public class Client {
feedTensorLen_ = null; feedTensorLen_ = null;
feedNameToIndex_ = null; feedNameToIndex_ = null;
timeoutS_ = 200000; timeoutS_ = 200000;
ip = "0.0.0.0"; ip = "127.0.0.1";
port = "9393"; port = "9393";
serverPort = "9393"; serverPort = "9393";
serviceName = "/GeneralModelService/inference"; serviceName = "/GeneralModelService/inference";
...@@ -147,12 +157,6 @@ public class Client { ...@@ -147,12 +157,6 @@ public class Client {
channel_ = null; channel_ = null;
blockingStub_ = null; blockingStub_ = null;
feedTypeToDataKey_ = new HashMap<Integer, String>();
feedTypeToDataKey_.put(0, "int64_data");
feedTypeToDataKey_.put(1, "float_data");
feedTypeToDataKey_.put(2, "int_data");
feedTypeToDataKey_.put(3, "data");
profiler_ = new Profiler(); profiler_ = new Profiler();
boolean is_profile = false; boolean is_profile = false;
String FLAGS_profile_client = System.getenv("FLAGS_profile_client"); String FLAGS_profile_client = System.getenv("FLAGS_profile_client");
...@@ -525,7 +529,7 @@ public class Client { ...@@ -525,7 +529,7 @@ public class Client {
jsonTensor.put("elem_type", element_type); jsonTensor.put("elem_type", element_type);
// 处理数据与shape // 处理数据与shape
String protoDataKey = feedTypeToDataKey_.get(element_type); String protoDataKey = ElementType.feedTypeToDataKey_.get(element_type);
// 如果是INDArray类型,先转为一维. // 如果是INDArray类型,先转为一维.
// 此时shape为INDArray的shape // 此时shape为INDArray的shape
if(objectValue instanceof INDArray){ if(objectValue instanceof INDArray){
...@@ -535,11 +539,11 @@ public class Client { ...@@ -535,11 +539,11 @@ public class Client {
for(long dim:indarrayShape){ for(long dim:indarrayShape){
shape.add((int)dim); shape.add((int)dim);
} }
if(element_type == ElementType.Int64_type.ordinal()){ if(element_type == ElementType.Int64_type){
objectValue = tempIndArray.data().asLong(); objectValue = tempIndArray.data().asLong();
}else if(element_type == ElementType.Int32_type.ordinal()){ }else if(element_type == ElementType.Int32_type){
objectValue = tempIndArray.data().asInt(); objectValue = tempIndArray.data().asInt();
}else if(element_type == ElementType.Float32_type.ordinal()){ }else if(element_type == ElementType.Float32_type){
objectValue = tempIndArray.data().asFloat(); objectValue = tempIndArray.data().asFloat();
}else{ }else{
throw new Exception("INDArray 类型不支持"); throw new Exception("INDArray 类型不支持");
...@@ -564,11 +568,11 @@ public class Client { ...@@ -564,11 +568,11 @@ public class Client {
// 此时无法获取batch信息,故对shape不处理 // 此时无法获取batch信息,故对shape不处理
// 由于Proto中为Repeated,需要把数据包装成list // 由于Proto中为Repeated,需要把数据包装成list
if(objectValue instanceof String){ if(objectValue instanceof String){
if(feedTypes_.get(protoDataKey)!= ElementType.Bytes_type.ordinal()){ if(feedTypes_.get(protoDataKey)!= ElementType.String_type){
throw new Exception("feedvar is not string-type,feed can`t be a single string."); throw new Exception("feedvar is not string-type,feed can`t be a single string.");
} }
}else{ }else{
if(feedTypes_.get(protoDataKey)== ElementType.Bytes_type.ordinal()){ if(feedTypes_.get(protoDataKey)== ElementType.String_type){
throw new Exception("feedvar is string-type,feed, feed can`t be a single int or others."); throw new Exception("feedvar is string-type,feed, feed can`t be a single int or others.");
} }
} }
...@@ -662,17 +666,17 @@ public class Client { ...@@ -662,17 +666,17 @@ public class Client {
for(long dim:indarrayShape){ for(long dim:indarrayShape){
shape.add((int)dim); shape.add((int)dim);
} }
if(element_type == ElementType.Int64_type.ordinal()){ if(element_type == ElementType.Int64_type){
List<Long> iter = Arrays.stream(tempIndArray.data().asLong()).boxed().collect(Collectors.toList()); List<Long> iter = Arrays.stream(tempIndArray.data().asLong()).boxed().collect(Collectors.toList());
tensor_builder.addAllInt64Data(iter); tensor_builder.addAllInt64Data(iter);
}else if(element_type == ElementType.Int32_type.ordinal()){ }else if(element_type == ElementType.Int32_type){
List<Integer> iter = Arrays.stream(tempIndArray.data().asInt()).boxed().collect(Collectors.toList()); List<Integer> iter = Arrays.stream(tempIndArray.data().asInt()).boxed().collect(Collectors.toList());
tensor_builder.addAllIntData(iter); tensor_builder.addAllIntData(iter);
}else if(element_type == ElementType.Float32_type.ordinal()){ }else if(element_type == ElementType.Float32_type){
List<Float> iter = Arrays.asList(ArrayUtils.toObject(tempIndArray.data().asFloat())); List<Float> iter = Arrays.asList(ArrayUtils.toObject(tempIndArray.data().asFloat()));
tensor_builder.addAllFloatData(iter); tensor_builder.addAllFloatData(iter);
...@@ -684,13 +688,13 @@ public class Client { ...@@ -684,13 +688,13 @@ public class Client {
// 如果是数组类型,则无须处理,直接使用即可。 // 如果是数组类型,则无须处理,直接使用即可。
// 且数组无法嵌套,此时batch无法从数据中获取 // 且数组无法嵌套,此时batch无法从数据中获取
// 默认batch维度为1,或者feedVar的shape信息中已包含batch // 默认batch维度为1,或者feedVar的shape信息中已包含batch
if(element_type == ElementType.Int64_type.ordinal()){ if(element_type == ElementType.Int64_type){
List<Long> iter = Arrays.stream((long[])objectValue).boxed().collect(Collectors.toList()); List<Long> iter = Arrays.stream((long[])objectValue).boxed().collect(Collectors.toList());
tensor_builder.addAllInt64Data(iter); tensor_builder.addAllInt64Data(iter);
}else if(element_type == ElementType.Int32_type.ordinal()){ }else if(element_type == ElementType.Int32_type){
List<Integer> iter = Arrays.stream((int[])objectValue).boxed().collect(Collectors.toList()); List<Integer> iter = Arrays.stream((int[])objectValue).boxed().collect(Collectors.toList());
tensor_builder.addAllIntData(iter); tensor_builder.addAllIntData(iter);
}else if(element_type == ElementType.Float32_type.ordinal()){ }else if(element_type == ElementType.Float32_type){
List<Float> iter = Arrays.asList(ArrayUtils.toObject((float[])objectValue)); List<Float> iter = Arrays.asList(ArrayUtils.toObject((float[])objectValue));
tensor_builder.addAllFloatData(iter); tensor_builder.addAllFloatData(iter);
}else{ }else{
...@@ -707,11 +711,11 @@ public class Client { ...@@ -707,11 +711,11 @@ public class Client {
// 在index=0处,加上batch // 在index=0处,加上batch
shape.add(0, list.size()); shape.add(0, list.size());
} }
if(element_type == ElementType.Int64_type.ordinal()){ if(element_type == ElementType.Int64_type){
tensor_builder.addAllInt64Data((List<Long>)(List)recursiveExtract(objectValue)); tensor_builder.addAllInt64Data((List<Long>)(List)recursiveExtract(objectValue));
}else if(element_type == ElementType.Int32_type.ordinal()){ }else if(element_type == ElementType.Int32_type){
tensor_builder.addAllIntData((List<Integer>)(List)recursiveExtract(objectValue)); tensor_builder.addAllIntData((List<Integer>)(List)recursiveExtract(objectValue));
}else if(element_type == ElementType.Float32_type.ordinal()){ }else if(element_type == ElementType.Float32_type){
tensor_builder.addAllFloatData((List<Float>)(List)recursiveExtract(objectValue)); tensor_builder.addAllFloatData((List<Float>)(List)recursiveExtract(objectValue));
}else{ }else{
// 看接口是String还是Bytes // 看接口是String还是Bytes
...@@ -723,11 +727,11 @@ public class Client { ...@@ -723,11 +727,11 @@ public class Client {
// 由于Proto中为Repeated,需要把数据包装成list // 由于Proto中为Repeated,需要把数据包装成list
List<Object> tempList = new ArrayList<>(); List<Object> tempList = new ArrayList<>();
tempList.add(objectValue); tempList.add(objectValue);
if(element_type == ElementType.Int64_type.ordinal()){ if(element_type == ElementType.Int64_type){
tensor_builder.addAllInt64Data((List<Long>)(List)tempList); tensor_builder.addAllInt64Data((List<Long>)(List)tempList);
}else if(element_type == ElementType.Int32_type.ordinal()){ }else if(element_type == ElementType.Int32_type){
tensor_builder.addAllIntData((List<Integer>)(List)tempList); tensor_builder.addAllIntData((List<Integer>)(List)tempList);
}else if(element_type == ElementType.Float32_type.ordinal()){ }else if(element_type == ElementType.Float32_type){
tensor_builder.addAllFloatData((List<Float>)(List)tempList); tensor_builder.addAllFloatData((List<Float>)(List)tempList);
}else{ }else{
// 看接口是String还是Bytes // 看接口是String还是Bytes
......
...@@ -12,41 +12,96 @@ ...@@ -12,41 +12,96 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
syntax = "proto2"; syntax = "proto3";
package baidu.paddle_serving.predictor.general_model; package baidu.paddle_serving.predictor.general_model;
option java_multiple_files = true; option java_multiple_files = true;
message Tensor { message Tensor {
repeated string data = 1; // VarType: INT64
repeated int32 int_data = 2; repeated int64 int64_data = 1;
repeated int64 int64_data = 3;
repeated float float_data = 4; // VarType: FP32
optional int32 elem_type = repeated float float_data = 2;
5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
repeated int32 shape = 6; // shape should include batch // VarType: INT32
repeated int32 lod = 7; // only for fetch tensor currently repeated int32 int_data = 3;
optional string name = 8; // get from the Model prototxt
optional string alias_name = 9; // get from the Model prototxt // VarType: FP64
repeated double float64_data = 4;
// VarType: UINT32
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string data = 9;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 20 => STRING
int32 elem_type = 10;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 11;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 12;
// Correspond to the variable 'name' in the model description prototxt.
string name = 13;
// Correspond to the variable 'alias_name' in the model description prototxt.
string alias_name = 14; // get from the Model prototxt
// VarType: FP16, INT16, INT8, BF16, UINT8
bytes tensor_content = 15;
}; };
message Request { message Request {
repeated Tensor tensor = 1; repeated Tensor tensor = 1;
repeated string fetch_var_names = 2; repeated string fetch_var_names = 2;
optional bool profile_server = 3 [ default = false ]; bool profile_server = 3;
required uint64 log_id = 4 [ default = 0 ]; uint64 log_id = 4;
}; };
message Response { message Response {
repeated ModelOutput outputs = 1; repeated ModelOutput outputs = 1;
repeated int64 profile_time = 2; repeated int64 profile_time = 2;
// Error code
int32 err_no = 3;
// Error messages
string err_msg = 4;
}; };
message ModelOutput { message ModelOutput {
repeated Tensor tensor = 1; repeated Tensor tensor = 1;
optional string engine_name = 2; string engine_name = 2;
} }
service GeneralModelService { service GeneralModelService {
rpc inference(Request) returns (Response) {} rpc inference(Request) returns (Response);
rpc debug(Request) returns (Response) {} rpc debug(Request) returns (Response);
}; };
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <dirent.h>
#include <pthread.h> #include <pthread.h>
#include <fstream> #include <fstream>
#include <map> #include <map>
...@@ -69,7 +70,33 @@ PrecisionType GetPrecision(const std::string& precision_data) { ...@@ -69,7 +70,33 @@ PrecisionType GetPrecision(const std::string& precision_data) {
return PrecisionType::kFloat32; return PrecisionType::kFloat32;
} }
// Engine Base const std::string getFileBySuffix(
const std::string& path, const std::vector<std::string>& suffixVector) {
DIR* dp = nullptr;
std::string fileName = "";
struct dirent* dirp = nullptr;
if ((dp = opendir(path.c_str())) == nullptr) {
return fileName;
}
while ((dirp = readdir(dp)) != nullptr) {
if (dirp->d_type == DT_REG) {
for (int idx = 0; idx < suffixVector.size(); ++idx) {
if (std::string(dirp->d_name).find(suffixVector[idx]) !=
std::string::npos) {
fileName = static_cast<std::string>(dirp->d_name);
break;
}
}
}
if (fileName.length() != 0) break;
}
closedir(dp);
return fileName;
}
// Engine Core is the base class of inference engines, which can be derived from
// paddle Inference Engine, or inference engines of other machine learning
// platforms
class EngineCore { class EngineCore {
public: public:
virtual ~EngineCore() {} virtual ~EngineCore() {}
...@@ -116,6 +143,11 @@ class EngineCore { ...@@ -116,6 +143,11 @@ class EngineCore {
virtual void* get() { return _predictor.get(); } virtual void* get() { return _predictor.get(); }
protected: protected:
// _predictor is a prediction instance of Paddle Inference.
// when inferring on the CPU, _predictor is bound to a model.
// when inferring on the GPU, _predictor is bound to a model and a GPU card.
// Therefore, when using GPU multi-card inference, you need to create multiple
// EngineCore.
std::shared_ptr<Predictor> _predictor; std::shared_ptr<Predictor> _predictor;
}; };
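To illustrate the comment above in Python terms: with the Paddle Inference Python API, binding one predictor per GPU card would look roughly like this (model file names, directory, and memory pool size are placeholders, not the server's actual configuration):

```python
# Sketch: one Paddle Inference predictor per GPU card, mirroring the comment that a
# GPU-bound predictor is tied to both a model and a card.
import paddle.inference as paddle_infer

def create_predictor_for_card(model_dir, gpu_id):
    config = paddle_infer.Config(model_dir + "/model.pdmodel",
                                 model_dir + "/model.pdiparams")  # combined-model layout assumed
    config.enable_use_gpu(100, gpu_id)  # 100 MB initial GPU memory pool on card `gpu_id`
    return paddle_infer.create_predictor(config)

# One predictor per card, so multi-card inference needs multiple EngineCore-like objects.
predictors = [create_predictor_for_card("serving_server", gpu_id) for gpu_id in (0, 1)]
```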
...@@ -131,9 +163,21 @@ class PaddleInferenceEngine : public EngineCore { ...@@ -131,9 +163,21 @@ class PaddleInferenceEngine : public EngineCore {
} }
Config config; Config config;
// todo, auto config(zhangjun) std::vector<std::string> suffixParaVector = {".pdiparams", "__params__"};
if (engine_conf.has_encrypted_model() && engine_conf.encrypted_model()) { std::vector<std::string> suffixModelVector = {".pdmodel", "__model__"};
std::string paraFileName = getFileBySuffix(model_path, suffixParaVector);
std::string modelFileName = getFileBySuffix(model_path, suffixModelVector);
std::string encryParaPath = model_path + "/encrypt_model";
std::string encryModelPath = model_path + "/encrypt_params";
std::string encryKeyPath = model_path + "/key";
// encrypt model
if (access(encryParaPath.c_str(), F_OK) != -1 &&
access(encryModelPath.c_str(), F_OK) != -1 &&
access(encryKeyPath.c_str(), F_OK) != -1) {
// decrypt model // decrypt model
std::string model_buffer, params_buffer, key_buffer; std::string model_buffer, params_buffer, key_buffer;
predictor::ReadBinaryFile(model_path + "/encrypt_model", &model_buffer); predictor::ReadBinaryFile(model_path + "/encrypt_model", &model_buffer);
predictor::ReadBinaryFile(model_path + "/encrypt_params", &params_buffer); predictor::ReadBinaryFile(model_path + "/encrypt_params", &params_buffer);
...@@ -147,16 +191,11 @@ class PaddleInferenceEngine : public EngineCore { ...@@ -147,16 +191,11 @@ class PaddleInferenceEngine : public EngineCore {
real_model_buffer.size(), real_model_buffer.size(),
&real_params_buffer[0], &real_params_buffer[0],
real_params_buffer.size()); real_params_buffer.size());
} else if (engine_conf.has_combined_model()) { } else if (paraFileName.length() != 0 && modelFileName.length() != 0) {
if (!engine_conf.combined_model()) { config.SetParamsFile(model_path + "/" + paraFileName);
config.SetModel(model_path); config.SetProgFile(model_path + "/" + modelFileName);
} else {
config.SetParamsFile(model_path + "/__params__");
config.SetProgFile(model_path + "/__model__");
}
} else { } else {
config.SetParamsFile(model_path + "/__params__"); config.SetModel(model_path);
config.SetProgFile(model_path + "/__model__");
} }
config.SwitchSpecifyInputNames(true); config.SwitchSpecifyInputNames(true);
......
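A small Python rendering of the auto-detection logic above, for readers who skip the C++: it looks up parameter and model files by suffix (`.pdiparams`/`__params__` and `.pdmodel`/`__model__`); the directory name is a placeholder.

```python
# Python sketch of the suffix-based model/parameter file lookup shown above.
import os

def get_file_by_suffix(path, suffixes):
    """Return the first regular file in `path` whose name contains one of `suffixes`, else ""."""
    for entry in sorted(os.listdir(path)):
        if os.path.isfile(os.path.join(path, entry)) and any(s in entry for s in suffixes):
            return entry
    return ""

model_path = "uci_housing_model"  # placeholder directory
param_file = get_file_by_suffix(model_path, [".pdiparams", "__params__"])
model_file = get_file_by_suffix(model_path, [".pdmodel", "__model__"])
# If both are found, the combined-model branch sets the files explicitly;
# otherwise the directory is passed to SetModel as a whole.
print(param_file, model_file)
```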
## Bert as service Http## Bert as service
([简体中文](./README_CN.md)|English) ([简体中文](./README_CN.md)|English)
...@@ -42,48 +42,36 @@ sh get_data.sh ...@@ -42,48 +42,36 @@ sh get_data.sh
``` ```
This script will download the Chinese dictionary file vocab.txt and the Chinese sample data data-c.txt This script will download the Chinese dictionary file vocab.txt and the Chinese sample data data-c.txt
### RPC Inference Service ### Inference Service (supports BRPC-Client, GRPC-Client, and HTTP-Client)
To start the CPU inference service, run To start the CPU inference service, run
``` ```
python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service
``` ```
Or, to start the GPU inference service, run Or, to start the GPU inference service, run
``` ```
python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0 python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
``` ```
### RPC Inference ### BRPC-Client Inference
Before prediction we should install paddle_serving_app, which provides data preprocessing for the BERT model. Before prediction we should install paddle_serving_app, which provides data preprocessing for the BERT model.
``` ```
pip install paddle_serving_app pip3 install paddle_serving_app
``` ```
Run Run
``` ```
head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt head data-c.txt | python3 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
``` ```
The client reads data from data-c.txt and sends prediction requests; the prediction is returned as a word vector. (Due to the large amount of data in the word vector, we do not print it.) The client reads data from data-c.txt and sends prediction requests; the prediction is returned as a word vector. (Due to the large amount of data in the word vector, we do not print it.)
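A minimal sketch of what such a BRPC client does (a simplification, not the exact `bert_client.py`): read lines from stdin, preprocess them with `ChineseBertReader`, and fetch `pooled_output`.

```python
# Minimal BRPC-client sketch for the BERT demo (simplified from bert_client.py).
import sys
import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import ChineseBertReader

reader = ChineseBertReader({"max_seq_len": 128})
client = Client()
client.load_client_config("bert_seq128_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

for line in sys.stdin:
    feed_dict = reader.process(line)
    for key in feed_dict:
        # each feed slot is reshaped to (seq_len, 1), matching the HTTP client example in this commit
        feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
    result = client.predict(feed=feed_dict, fetch=["pooled_output"], batch=False)
    print(result)
```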
### HTTP Inference Service #### GRPC-Client/HTTP-Client
start cpu HTTP inference service,Run Run
```
python bert_web_service.py bert_seq128_model/ 9292 #launch cpu inference service
``` ```
head data-c.txt | python3 bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt
Or,start gpu HTTP inference service,Run
```
export CUDA_VISIBLE_DEVICES=0,1
```
set environmental variable to specify which gpus are used, the command above means gpu 0 and gpu 1 is used.
``` ```
python bert_web_service_gpu.py bert_seq128_model/ 9292 #launch gpu inference service
```
### HTTP Inference
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
```
## Benchmark ## Benchmark
``` shell ``` shell
......
...@@ -40,15 +40,15 @@ sh get_data.sh ...@@ -40,15 +40,15 @@ sh get_data.sh
``` ```
脚本将下载中文词典vocab.txt和中文样例数据data-c.txt 脚本将下载中文词典vocab.txt和中文样例数据data-c.txt
### 启动RPC预测服务 ### 启动预测服务(支持BRPC-Client、GRPC-Client、HTTP-Client三种方式访问)
启动cpu预测服务,执行 启动cpu预测服务,执行
``` ```
python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #启动cpu预测服务 python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #启动cpu预测服务
``` ```
或者,启动gpu预测服务,执行 或者,启动gpu预测服务,执行
``` ```
python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务
``` ```
...@@ -56,37 +56,22 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --g ...@@ -56,37 +56,22 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --g
执行预测前需要安装paddle_serving_app,模块中提供了BERT模型的数据预处理方法。 执行预测前需要安装paddle_serving_app,模块中提供了BERT模型的数据预处理方法。
``` ```
pip install paddle_serving_app pip3 install paddle_serving_app
``` ```
#### BRPC-Client
执行 执行
``` ```
head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt head data-c.txt | python3 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
``` ```
启动client读取data-c.txt中的数据进行预测,预测结果为文本的向量表示(由于数据较多,脚本中没有将输出进行打印),server端的地址在脚本中修改。 启动client读取data-c.txt中的数据进行预测,预测结果为文本的向量表示(由于数据较多,脚本中没有将输出进行打印),server端的地址在脚本中修改。
#### GRPC-Client/HTTP-Client
执行
### 启动HTTP预测服务
启动cpu HTTP预测服务,执行
```
python bert_web_service.py bert_seq128_model/ 9292 #启动CPU预测服务
```
或者,启动gpu HTTP预测服务,执行
```
export CUDA_VISIBLE_DEVICES=0,1
```
通过环境变量指定gpu预测服务使用的gpu,示例中指定索引为0和1的两块gpu
```
python bert_web_service_gpu.py bert_seq128_model/ 9292 #启动gpu预测服务
``` ```
head data-c.txt | python3 bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt
### 执行预测
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction
``` ```
## 性能测试 ## 性能测试
......
# coding=utf-8 # coding:utf-8
# pylint: disable=doc-string-missing
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -12,37 +13,46 @@ ...@@ -12,37 +13,46 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader import ChineseBertReader
import sys import sys
import os from paddle_serving_client import HttpClient
from paddle_serving_client.utils import benchmark_args
from paddle_serving_app.reader import ChineseBertReader
import numpy as np import numpy as np
args = benchmark_args()
reader = ChineseBertReader({"max_seq_len": 128})
fetch = ["pooled_output"]
endpoint_list = ['127.0.0.1:9292']
client = HttpClient()
client.load_client_config(args.model)
'''
if you want use GRPC-client, set_use_grpc_client(True)
or you can directly use client.grpc_client_predict(...)
as for HTTP-client,set_use_grpc_client(False)(which is default)
or you can directly use client.http_client_predict(...)
'''
#client.set_use_grpc_client(True)
'''
if you want to enable Encrypt Module,uncommenting the following line
'''
#client.use_key("./key")
'''
if you want to compress,uncommenting the following line
'''
#client.set_response_compress(True)
#client.set_request_compress(True)
'''
we recommend use Proto data format in HTTP-body, set True(which is default)
if you want use JSON data format in HTTP-body, set False
'''
#client.set_http_proto(True)
client.connect(endpoint_list)
class BertService(WebService): for line in sys.stdin:
def load(self): feed_dict = reader.process(line)
self.reader = ChineseBertReader({ for key in feed_dict.keys():
"vocab_file": "vocab.txt", feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
"max_seq_len": 128 #print(feed_dict)
}) result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
print(result)
def preprocess(self, feed=[], fetch=[]):
feed_res = []
is_batch = False
for ins in feed:
feed_dict = self.reader.process(ins["words"].encode("utf-8"))
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape(
(len(feed_dict[key]), 1))
feed_res.append(feed_dict)
return feed_res, fetch, is_batch
bert_service = BertService(name="bert")
bert_service.load()
bert_service.load_model_config(sys.argv[1])
bert_service.prepare_server(
workdir="workdir", port=int(sys.argv[2]), device="cpu")
bert_service.run_rpc_service()
bert_service.run_web_service()
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader import ChineseBertReader
import sys
import os
import numpy as np
class BertService(WebService):
def load(self):
self.reader = ChineseBertReader({
"vocab_file": "vocab.txt",
"max_seq_len": 128
})
def preprocess(self, feed=[], fetch=[]):
feed_res = []
is_batch = False
for ins in feed:
feed_dict = self.reader.process(ins["words"].encode("utf-8"))
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape(
(len(feed_dict[key]), 1))
feed_res.append(feed_dict)
return feed_res, fetch, is_batch
bert_service = BertService(name="bert")
bert_service.load()
bert_service.load_model_config(sys.argv[1])
bert_service.set_gpus("0")
bert_service.prepare_server(
workdir="workdir", port=int(sys.argv[2]), device="gpu")
bert_service.run_rpc_service()
bert_service.run_web_service()
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model blazeface python3 -m paddle_serving_app.package --get_model blazeface
tar -xf blazeface.tar.gz tar -xf blazeface.tar.gz
``` ```
...@@ -11,13 +11,13 @@ tar -xf blazeface.tar.gz ...@@ -11,13 +11,13 @@ tar -xf blazeface.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server.serve --model serving_server --port 9494 python3 -m paddle_serving_server.serve --model serving_server --port 9494
``` ```
### Client Prediction ### Client Prediction
``` ```
python test_client.py serving_client/serving_client_conf.prototxt test.jpg python3 test_client.py serving_client/serving_client_conf.prototxt test.jpg
``` ```
The result is in the `output` folder, including a JSON file and an image file with bounding boxes. The result is in the `output` folder, including a JSON file and an image file with bounding boxes.
...@@ -10,12 +10,12 @@ If you want to have more detection models, please refer to [Paddle Detection Mod ...@@ -10,12 +10,12 @@ If you want to have more detection models, please refer to [Paddle Detection Mod
### Start the service ### Start the service
``` ```
python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 python3 -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0
``` ```
### Perform prediction ### Perform prediction
``` ```
python test_client.py python3 test_client.py 000000570688.jpg
``` ```
The image with bounding boxes and the JSON result will be saved in the `output` folder. The image with bounding boxes and the JSON result will be saved in the `output` folder.
...@@ -10,12 +10,12 @@ sh get_data.sh ...@@ -10,12 +10,12 @@ sh get_data.sh
### 启动服务 ### 启动服务
``` ```
python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 python3 -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0
``` ```
### 执行预测 ### 执行预测
``` ```
python test_client.py python3 test_client.py 000000570688.jpg
``` ```
客户端已经为图片做好了后处理,在`output`文件夹下存放各个框的json格式信息还有后处理结果图片。 客户端已经为图片做好了后处理,在`output`文件夹下存放各个框的json格式信息还有后处理结果图片。
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/cascade_rcnn_r50_fpx_1x_serving.tar.gz wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco_serving.tar.gz
tar xf cascade_rcnn_r50_fpx_1x_serving.tar.gz tar xf cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco_serving.tar.gz
background
person person
bicycle bicycle
car car
......
...@@ -12,29 +12,35 @@ ...@@ -12,29 +12,35 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import numpy as np
from paddle_serving_client import Client from paddle_serving_client import Client
from paddle_serving_app.reader import * from paddle_serving_app.reader import *
import numpy as np import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), Div(255.0), DetectionFile2Image(),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), DetectionResize((800, 1333), True, interpolation=2),
Resize(800, 1333), Transpose((2, 0, 1)), PadStride(32) DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True),
DetectionTranspose((2,0,1)),
DetectionPadStride(32)
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
client = Client() client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9292']) client.connect(['127.0.0.1:9292'])
im = preprocess('000000570688.jpg')
im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"im_info": np.array(list(im.shape[1:]) + [1.0]), "im_shape": np.array(list(im.shape[1:])).reshape(-1),
"im_shape": np.array(list(im.shape[1:]) + [1.0]) "scale_factor": im_info['scale_factor'],
}, },
fetch=["multiclass_nms_0.tmp_0"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
fetch_map["image"] = '000000570688.jpg'
print(fetch_map) print(fetch_map)
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map) postprocess(fetch_map)
print(fetch_map)
...@@ -19,13 +19,13 @@ the directories like `ctr_serving_model` and `ctr_client_conf` will appear. ...@@ -19,13 +19,13 @@ the directories like `ctr_serving_model` and `ctr_client_conf` will appear.
### Start RPC Inference Service ### Start RPC Inference Service
``` ```
python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service
python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0 python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0
``` ```
### RPC Infer ### RPC Infer
``` ```
python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 python3 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0
``` ```
The latency will be displayed at the end. The latency will be displayed at the end.
...@@ -19,13 +19,13 @@ mv models/ctr_serving_model . ...@@ -19,13 +19,13 @@ mv models/ctr_serving_model .
### 启动RPC预测服务 ### 启动RPC预测服务
``` ```
python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务 python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务
python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务 python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务
``` ```
### 执行预测 ### 执行预测
``` ```
python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 python3 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0
``` ```
预测完毕会输出预测过程的耗时。 预测完毕会输出预测过程的耗时。
...@@ -32,13 +32,13 @@ Here, the sparse parameter is loaded by cube sparse parameter indexing service C ...@@ -32,13 +32,13 @@ Here, the sparse parameter is loaded by cube sparse parameter indexing service C
### Start the RPC Predictor; the number of serving threads is 4 (configurable in test_server.py) ### Start the RPC Predictor; the number of serving threads is 4 (configurable in test_server.py)
``` ```
python test_server.py ctr_serving_model_kv python3 test_server.py ctr_serving_model_kv
``` ```
### Run Prediction ### Run Prediction
``` ```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data python3 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
``` ```
### Benchmark ### Benchmark
......
...@@ -30,13 +30,13 @@ sh cube_prepare.sh & ...@@ -30,13 +30,13 @@ sh cube_prepare.sh &
### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置) ### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置)
``` ```
python test_server.py ctr_serving_model_kv python3 test_server.py ctr_serving_model_kv
``` ```
### 执行预测 ### 执行预测
``` ```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data python3 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
``` ```
### Benchmark ### Benchmark
......
[{
"dict_name": "test_dict",
"shard": 1,
"dup": 1,
"timeout": 200,
"retry": 3,
"backup_request": 100,
"type": "ipport_list",
"load_balancer": "rr",
"nodes": [{
"ipport_list": "list://127.0.0.1:8027"
}]
}]
--port=8027
--dict_split=1
--in_mem=true
--log_dir=./log/
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
from paddle_serving_client import Client from paddle_serving_client import Client
import sys import sys
import os import os
import criteo as criteo import criteo_reader as criteo
import time import time
from paddle_serving_client.metric import auc from paddle_serving_client.metric import auc
import numpy as np import numpy as np
...@@ -35,22 +35,23 @@ reader = dataset.infer_reader(test_filelists, batch, buf_size) ...@@ -35,22 +35,23 @@ reader = dataset.infer_reader(test_filelists, batch, buf_size)
label_list = [] label_list = []
prob_list = [] prob_list = []
start = time.time() start = time.time()
for ei in range(10000): for ei in range(100):
if py_version == 2: if py_version == 2:
data = reader().next() data = reader().next()
else: else:
data = reader().__next__() data = reader().__next__()
feed_dict = {} feed_dict = {}
feed_dict['dense_input'] = data[0][0] feed_dict['dense_input'] = np.array(data[0][0]).reshape(1, len(data[0][0]))
for i in range(1, 27): for i in range(1, 27):
feed_dict["embedding_{}.tmp_0".format(i - 1)] = np.array(data[0][i]).reshape(-1) feed_dict["embedding_{}.tmp_0".format(i - 1)] = np.array(data[0][i]).reshape(len(data[0][i]))
feed_dict["embedding_{}.tmp_0.lod".format(i - 1)] = [0, len(data[0][i])] feed_dict["embedding_{}.tmp_0.lod".format(i - 1)] = [0, len(data[0][i])]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"]) fetch_map = client.predict(feed=feed_dict, fetch=["prob"],batch=True)
print(fetch_map) print(fetch_map)
prob_list.append(fetch_map['prob'][0][1]) prob_list.append(fetch_map['prob'][0][1])
label_list.append(data[0][-1][0]) label_list.append(data[0][-1][0])
print(auc(label_list, prob_list))
end = time.time() end = time.time()
print(end - start) print(end - start)
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model deeplabv3 python3 -m paddle_serving_app.package --get_model deeplabv3
tar -xzvf deeplabv3.tar.gz tar -xzvf deeplabv3.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf deeplabv3.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf deeplabv3.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 python3 -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494
``` ```
### Client Prediction ### Client Prediction
``` ```
python deeplabv3_client.py python3 deeplabv3_client.py
``` ```
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## 获取模型 ## 获取模型
``` ```
python -m paddle_serving_app.package --get_model deeplabv3 python3 -m paddle_serving_app.package --get_model deeplabv3
tar -xzvf deeplabv3.tar.gz tar -xzvf deeplabv3.tar.gz
``` ```
...@@ -12,10 +12,10 @@ tar -xzvf deeplabv3.tar.gz ...@@ -12,10 +12,10 @@ tar -xzvf deeplabv3.tar.gz
### 启动服务端 ### 启动服务端
``` ```
python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 python3 -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494
``` ```
### 客户端预测 ### 客户端预测
``` ```
python deeplabv3_client.py python3 deeplabv3_client.py
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
### Get The Faster RCNN HRNet Model ### Get The Faster RCNN HRNet Model
``` ```
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/faster_rcnn_hrnetv2p_w18_1x.tar wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/faster_rcnn_hrnetv2p_w18_1x.tar.gz
``` ```
### Start the service ### Start the service
``` ```
tar xf faster_rcnn_hrnetv2p_w18_1x.tar tar xf faster_rcnn_hrnetv2p_w18_1x.tar.gz
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT; if you want faster inference, please use `--use_trt`, but you need to do some extra work. This model supports TensorRT; if you want faster inference, please use `--use_trt`, but you need to do some extra work.
...@@ -19,5 +19,5 @@ Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/m ...@@ -19,5 +19,5 @@ Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/m
### Prediction ### Prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
...@@ -4,19 +4,19 @@ ...@@ -4,19 +4,19 @@
## 获得Faster RCNN HRNet模型 ## 获得Faster RCNN HRNet模型
``` ```
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/faster_rcnn_hrnetv2p_w18_1x.tar wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/faster_rcnn_hrnetv2p_w18_1x.tar.gz
``` ```
### 启动服务 ### 启动服务
``` ```
tar xf faster_rcnn_hrnetv2p_w18_1x.tar tar xf faster_rcnn_hrnetv2p_w18_1x.tar.gz
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape. 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape.
请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### 执行预测 ### 执行预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
from paddle_serving_client import Client # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
from paddle_serving_app.reader import * #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import numpy as np import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), Div(255.0), DetectionFile2Image(),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), DetectionResize((800, 1333), True, interpolation=2),
Resize(640, 640), Transpose((2, 0, 1)) DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True),
DetectionTranspose((2,0,1)),
DetectionPadStride(32)
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
...@@ -15,13 +32,15 @@ client = Client() ...@@ -15,13 +32,15 @@ client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9494']) client.connect(['127.0.0.1:9494'])
im = preprocess(sys.argv[1]) im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"im_info": np.array(list(im.shape[1:]) + [1.0]), "im_shape": np.array(list(im.shape[1:])).reshape(-1),
"im_shape": np.array(list(im.shape[1:]) + [1.0]) "scale_factor": im_info['scale_factor'],
}, },
fetch=["multiclass_nms_0.tmp_0"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
print(fetch_map) print(fetch_map)
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)
...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf faster_rcnn_r50_fpn_1x_coco.tar tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT; if you want faster inference, please use `--use_trt`, but you need to do some extra work. This model supports TensorRT; if you want faster inference, please use `--use_trt`, but you need to do some extra work.
...@@ -19,7 +19,7 @@ Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/m ...@@ -19,7 +19,7 @@ Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/m
### Perform prediction ### Perform prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
## 3. Result analysis ## 3. Result analysis
......
...@@ -11,14 +11,14 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,14 +11,14 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf faster_rcnn_r50_fpn_1x_coco.tar tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape. 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape.
请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### 执行预测 ### 执行预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
## 3. 结果分析 ## 3. 结果分析
......
...@@ -12,15 +12,19 @@ ...@@ -12,15 +12,19 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import sys import sys
import numpy as np import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), Div(255.0), DetectionFile2Image(),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True),
Resize(640, 640), Transpose((2, 0, 1)) DetectionResize(
(800, 1333), True, interpolation=cv2.INTER_LINEAR),
DetectionTranspose((2,0,1)),
DetectionPadStride(128)
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
...@@ -29,15 +33,14 @@ client = Client() ...@@ -29,15 +33,14 @@ client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9494']) client.connect(['127.0.0.1:9494'])
im = preprocess(sys.argv[1]) im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"im_shape": np.array(list(im.shape[1:])).reshape(-1), "im_shape": np.array(list(im.shape[1:])).reshape(-1),
"scale_factor": np.array([1.0, 1.0]).reshape(-1), "scale_factor": im_info['scale_factor'],
}, },
fetch=["save_infer_model/scale_0.tmp_1"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
print(fetch_map)
fetch_map["image"] = sys.argv[1] fetch_map["image"] = sys.argv[1]
postprocess(fetch_map) postprocess(fetch_map)
...@@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf fcos_dcn_r50_fpn_1x_coco.tar tar xf fcos_dcn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT; if you want faster inference, please use `--use_trt`. This model supports TensorRT; if you want faster inference, please use `--use_trt`.
### Perform prediction ### Perform prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000014439.jpg
``` ```
...@@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf fcos_dcn_r50_fpn_1x_coco.tar tar xf fcos_dcn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
### 执行预测 ### 执行预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000014439.jpg
``` ```
...@@ -12,15 +12,19 @@ ...@@ -12,15 +12,19 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import sys import sys
import numpy as np import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), Div(255.0), DetectionFile2Image(),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True),
Resize(640, 640), Transpose((2, 0, 1)) DetectionResize(
(800, 1333), True, interpolation=cv2.INTER_LINEAR),
DetectionTranspose((2,0,1)),
DetectionPadStride(128)
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
...@@ -29,12 +33,14 @@ client = Client() ...@@ -29,12 +33,14 @@ client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9494']) client.connect(['127.0.0.1:9494'])
im = preprocess(sys.argv[1]) im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"scale_factor": np.array([1.0, 1.0]).reshape(-1), "scale_factor": im_info['scale_factor'],
}, },
fetch=["save_infer_model/scale_0.tmp_1"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
print(fetch_map) print(fetch_map)
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)
...@@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf ppyolo_r50vd_dcn_1x_coco.tar tar xf ppyolo_r50vd_dcn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT; if you want faster inference, please use `--use_trt`. This model supports TensorRT; if you want faster inference, please use `--use_trt`.
### Perform prediction ### Perform prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
...@@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf ppyolo_r50vd_dcn_1x_coco.tar tar xf ppyolo_r50vd_dcn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
### 执行预测 ### 执行预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
...@@ -12,15 +12,18 @@ ...@@ -12,15 +12,18 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import sys import sys
import numpy as np import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), Div(255.0), DetectionFile2Image(),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True),
Resize((608, 608)), Transpose((2, 0, 1)) DetectionResize(
(608, 608), False, interpolation=2),
DetectionTranspose((2,0,1))
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
...@@ -29,15 +32,14 @@ client = Client() ...@@ -29,15 +32,14 @@ client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9494']) client.connect(['127.0.0.1:9494'])
im = preprocess(sys.argv[1]) im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"im_shape": np.array(list(im.shape[1:])).reshape(-1), "im_shape": np.array(list(im.shape[1:])).reshape(-1),
"scale_factor": np.array([1.0, 1.0]).reshape(-1), "scale_factor": im_info['scale_factor'],
}, },
fetch=["save_infer_model/scale_0.tmp_1"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
print(fetch_map)
fetch_map["image"] = sys.argv[1] fetch_map["image"] = sys.argv[1]
postprocess(fetch_map) postprocess(fetch_map)
...@@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf ssd_vgg16_300_240e_voc.tar tar xf ssd_vgg16_300_240e_voc.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT; if you want faster inference, please use `--use_trt`. This model supports TensorRT; if you want faster inference, please use `--use_trt`.
### Perform prediction ### Perform prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000014439.jpg
``` ```
...@@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf ssd_vgg16_300_240e_voc.tar tar xf ssd_vgg16_300_240e_voc.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
### 执行预测 ### 执行预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000014439.jpg
``` ```
person aeroplane
bicycle bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird bird
boat
bottle
bus
car
cat cat
chair
cow
diningtable
dog dog
horse horse
motorbike
person
pottedplant
sheep sheep
cow sofa
elephant train
bear tvmonitor
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
...@@ -12,15 +12,18 @@ ...@@ -12,15 +12,18 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import sys import sys
import numpy as np import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), DetectionFile2Image(),
Normalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False), DetectionResize(
Resize((512, 512)), Transpose((2, 0, 1)) (300, 300), False, interpolation=cv2.INTER_LINEAR),
DetectionNormalize([104.0, 117.0, 123.0], [1.0, 1.0, 1.0], False),
DetectionTranspose((2,0,1)),
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
...@@ -29,13 +32,15 @@ client = Client() ...@@ -29,13 +32,15 @@ client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9494']) client.connect(['127.0.0.1:9494'])
im = preprocess(sys.argv[1]) im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"im_shape": np.array([512, 512]), "im_shape": np.array(list(im.shape[1:])).reshape(-1),
"scale_factor": np.array([1.0, 1.0]).reshape(-1), "scale_factor": im_info['scale_factor'],
}, },
fetch=["save_infer_model/scale_0.tmp_1"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
print(fetch_map) print(fetch_map)
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)
...@@ -4,18 +4,17 @@ ...@@ -4,18 +4,17 @@
### Get Model ### Get Model
``` ```
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ttfnet_darknet53_1x_coco.tar wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/ttfnet_darknet53_1x_coco.tar
``` ```
### Start the service ### Start the service
``` ```
tar xf ttfnet_darknet53_1x_coco.tar tar xf ttfnet_darknet53_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT; if you want faster inference, please use `--use_trt`. This model supports TensorRT; if you want faster inference, please use `--use_trt`.
### Perform prediction ### Perform prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
...@@ -4,20 +4,19 @@ ...@@ -4,20 +4,19 @@
## 获得模型 ## 获得模型
``` ```
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ttfnet_darknet53_1x_coco.tar wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/ttfnet_darknet53_1x_coco.tar
``` ```
### 启动服务 ### 启动服务
``` ```
tar xf ttfnet_darknet53_1x_coco.tar tar xf ttfnet_darknet53_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
### 执行预测 ### 执行预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
...@@ -11,16 +11,18 @@ ...@@ -11,16 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import sys import sys
import numpy as np import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), DetectionFile2Image(),
Normalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False), DetectionResize(
Resize((512, 512)), Transpose((2, 0, 1)) (512, 512), False, interpolation=cv2.INTER_LINEAR),
DetectionNormalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False),
DetectionTranspose((2,0,1))
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
...@@ -29,11 +31,14 @@ client = Client() ...@@ -29,11 +31,14 @@ client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9494']) client.connect(['127.0.0.1:9494'])
im = preprocess(sys.argv[1]) im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"scale_factor": np.array([1.0, 1.0]).reshape(-1), "im_shape": np.array(list(im.shape[1:])).reshape(-1),
"scale_factor": im_info['scale_factor'],
}, },
fetch=["save_infer_model/scale_0.tmp_1"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
......
...@@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf yolov3_darknet53_270e_coco.tar tar xf yolov3_darknet53_270e_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT; if you want faster inference, please use `--use_trt`. This model supports TensorRT; if you want faster inference, please use `--use_trt`.
### Perform prediction ### Perform prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
...@@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf yolov3_darknet53_270e_coco.tar tar xf yolov3_darknet53_270e_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
### 执行预测 ### 执行预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
...@@ -12,15 +12,18 @@ ...@@ -12,15 +12,18 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import sys import sys
import numpy as np import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([ preprocess = DetectionSequential([
File2Image(), BGR2RGB(), Div(255.0), DetectionFile2Image(),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), DetectionResize(
Resize((608, 608)), Transpose((2, 0, 1)) (608, 608), False, interpolation=2),
DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True),
DetectionTranspose((2,0,1)),
]) ])
postprocess = RCNNPostprocess("label_list.txt", "output") postprocess = RCNNPostprocess("label_list.txt", "output")
...@@ -29,15 +32,14 @@ client = Client() ...@@ -29,15 +32,14 @@ client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt") client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9494']) client.connect(['127.0.0.1:9494'])
im = preprocess(sys.argv[1]) im, im_info = preprocess(sys.argv[1])
fetch_map = client.predict( fetch_map = client.predict(
feed={ feed={
"image": im, "image": im,
"im_shape": np.array(list(im.shape[1:])).reshape(-1), "im_shape": np.array(list(im.shape[1:])).reshape(-1),
"scale_factor": np.array([1.0, 1.0]).reshape(-1), "scale_factor": im_info['scale_factor'],
}, },
fetch=["save_infer_model/scale_0.tmp_1"], fetch=["save_infer_model/scale_0.tmp_1"],
batch=False) batch=False)
print(fetch_map)
fetch_map["image"] = sys.argv[1] fetch_map["image"] = sys.argv[1]
postprocess(fetch_map) postprocess(fetch_map)
...@@ -12,9 +12,9 @@ sh get_data.sh ...@@ -12,9 +12,9 @@ sh get_data.sh
## Encrypt Model ## Encrypt Model
The `paddlepaddle` package is used in this example; you may need to install it first (`pip install paddlepaddle`). The `paddlepaddle` package is used in this example; you may need to install it first (`pip3 install paddlepaddle`).
[python encrypt.py](./encrypt.py) [python3 encrypt.py](./encrypt.py)
[//file]:#encrypt.py [//file]:#encrypt.py
``` python ``` python
...@@ -35,14 +35,14 @@ client-side configuration file are stored in the `encrypt_client` directory. ...@@ -35,14 +35,14 @@ client-side configuration file are stored in the `encrypt_client` directory.
## Start Encryption Service ## Start Encryption Service
CPU Service CPU Service
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model
``` ```
GPU Service GPU Service
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0
``` ```
## Prediction ## Prediction
``` ```
python test_client.py encrypt_client/serving_client_conf.prototxt python3 test_client.py encrypt_client/serving_client_conf.prototxt
``` ```
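On the client side, the key file produced during encryption has to be supplied before connecting. A minimal sketch of the idea, reusing the `HttpClient` switches shown elsewhere in this commit (the `./key` path, and the assumption that `use_key` alone is enough for the encrypted service, are illustrative rather than taken from this example's client script):
```python
from paddle_serving_client.httpclient import HttpClient

client = HttpClient()
client.load_client_config("encrypt_client/serving_client_conf.prototxt")
client.use_key("./key")  # assumption: the key file written when the model was encrypted
client.connect(["127.0.0.1:9393"])
```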
...@@ -11,9 +11,9 @@ sh get_data.sh ...@@ -11,9 +11,9 @@ sh get_data.sh
``` ```
## 模型加密 ## 模型加密
本示例中使用了`paddlepaddle`包中的模块,需要进行下载(`pip install paddlepaddle`)。 本示例中使用了`paddlepaddle`包中的模块,需要进行下载(`pip3 install paddlepaddle`)。
运行[python encrypt.py](./encrypt.py)进行模型加密 运行[python3 encrypt.py](./encrypt.py)进行模型加密
[//file]:#encrypt.py [//file]:#encrypt.py
``` python ``` python
...@@ -36,14 +36,14 @@ def serving_encryption(): ...@@ -36,14 +36,14 @@ def serving_encryption():
## 启动加密预测服务 ## 启动加密预测服务
CPU预测服务 CPU预测服务
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model
``` ```
GPU预测服务 GPU预测服务
``` ```
python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0
``` ```
## 预测 ## 预测
``` ```
python test_client.py encrypt_client/serving_client_conf.prototxt python3 test_client.py encrypt_client/serving_client_conf.prototxt
``` ```
...@@ -15,22 +15,22 @@ sh get_data.sh ...@@ -15,22 +15,22 @@ sh get_data.sh
### Start server ### Start server
```shell ```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
``` ```
## Client prediction ## Client prediction
### RPC Client ### RPC Client
The `paddlepaddle` package is used in `test_client.py`; you may need to install it first (`pip install paddlepaddle`). The `paddlepaddle` package is used in `test_client.py`; you may need to install it first (`pip3 install paddlepaddle`).
``` shell ``` shell
python test_client.py uci_housing_client/serving_client_conf.prototxt python3 test_client.py uci_housing_client/serving_client_conf.prototxt
``` ```
### Http Client ### Http Client
``` shell ``` shell
python test_httpclient.py uci_housing_client/serving_client_conf.prototxt python3 test_httpclient.py uci_housing_client/serving_client_conf.prototxt
``` ```
......
...@@ -9,28 +9,26 @@ sh get_data.sh ...@@ -9,28 +9,26 @@ sh get_data.sh
``` ```
## 开启服务端(支持BRPC-Client/GRPC Client/Http-Client)
## 开启服务端
```shell ```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
``` ```
## 客户端预测 ## 客户端预测
### 客户端RPC ### BRPC-Client
`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 `test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip3 install paddlepaddle`)。
``` shell ``` shell
python test_client.py uci_housing_client/serving_client_conf.prototxt python3 test_client.py uci_housing_client/serving_client_conf.prototxt
``` ```
### 客户端Http预测 ### GRPC-Client/Http-Client
``` shell ``` shell
python test_httpclient.py uci_housing_client/serving_client_conf.prototxt python3 test_httpclient.py uci_housing_client/serving_client_conf.prototxt
``` ```
......
...@@ -13,12 +13,12 @@ ...@@ -13,12 +13,12 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_client.httpclient import GeneralClient from paddle_serving_client.httpclient import HttpClient
import sys import sys
import numpy as np import numpy as np
import time import time
client = GeneralClient() client = HttpClient()
client.load_client_config(sys.argv[1]) client.load_client_config(sys.argv[1])
''' '''
if you want use GRPC-client, set_use_grpc_client(True) if you want use GRPC-client, set_use_grpc_client(True)
...@@ -41,13 +41,14 @@ we recommend use Proto data format in HTTP-body, set True(which is default) ...@@ -41,13 +41,14 @@ we recommend use Proto data format in HTTP-body, set True(which is default)
if you want use JSON data format in HTTP-body, set False if you want use JSON data format in HTTP-body, set False
''' '''
#client.set_http_proto(True) #client.set_http_proto(True)
client.connect(["127.0.0.1:9393"])
fetch_list = client.get_fetch_names()
import paddle import paddle
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.uci_housing.test(), buf_size=500), paddle.dataset.uci_housing.test(), buf_size=500),
batch_size=1) batch_size=1)
fetch_list = client.get_fetch_names()
for data in test_reader(): for data in test_reader():
new_data = np.zeros((1, 13)).astype("float32") new_data = np.zeros((1, 13)).astype("float32")
new_data[0] = data[0][0] new_data[0] = data[0][0]
......
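The switches described in the comments of `test_httpclient.py` above are plain setter calls on the client; a minimal sketch with the defaults spelled out explicitly (the values are illustrative):
```python
from paddle_serving_client.httpclient import HttpClient

client = HttpClient()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.set_use_grpc_client(False)    # False (default) = HTTP transport, True = GRPC
client.set_http_proto(True)          # True (default) = Proto body, False = JSON body
client.set_request_compress(False)   # compress the request body if True
client.set_response_compress(False)  # ask the server to compress the response if True
client.connect(["127.0.0.1:9393"])
fetch_list = client.get_fetch_names()
```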
...@@ -12,38 +12,30 @@ sh get_model.sh ...@@ -12,38 +12,30 @@ sh get_model.sh
### Install preprocess module ### Install preprocess module
``` ```
pip install paddle_serving_app pip3 install paddle_serving_app
``` ```
### HTTP Service
launch server side
```
python resnet50_web_service.py ResNet50_vd_model cpu 9696 #cpu inference service
```
```
python resnet50_web_service.py ResNet50_vd_model gpu 9696 #gpu inference service
```
### Inference Service(Support BRPC-Client/GRPC-Client/Http-Client)
client send inference request launch server side
``` ```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:9696/image/prediction python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu inference service
``` ```
### RPC Service
launch server side
``` ```
python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu inference service python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service
``` ```
### BRPC-Client
The client sends an inference request
``` ```
python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service python3 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt
``` ```
*The server-side port in this example is 9696
### GRPC-Client/Http-Client
The client sends an inference request The client sends an inference request
``` ```
python resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt python3 resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt
``` ```
*The server-side port in this example is 9696
...@@ -12,38 +12,30 @@ sh get_model.sh ...@@ -12,38 +12,30 @@ sh get_model.sh
### 安装数据预处理模块 ### 安装数据预处理模块
``` ```
pip install paddle_serving_app pip3 install paddle_serving_app
``` ```
### HTTP服务 ### 启动服务端(支持BRPC-Client、GRPC-Client、Http-Client)
启动server端 启动server端
``` ```
python resnet50_web_service.py ResNet50_vd_model cpu 9696 #cpu预测服务 python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu预测服务
``` ```
```
python resnet50_web_service.py ResNet50_vd_model gpu 9696 #gpu预测服务
```
发送HTTP POST请求
``` ```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:9696/image/prediction python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务
``` ```
### RPC服务 ### BRPC-Client预测
client端进行预测
启动server端
``` ```
python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu预测服务 python3 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt
``` ```
*server端示例中服务端口为9696端口
```
python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务
```
### GRPC-Client/Http-Client预测
client端进行预测 client端进行预测
``` ```
python resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt python3 resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt
``` ```
*server端示例中服务端口为9696端口
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddle_serving_client import HttpClient
from paddle_serving_app.reader import Sequential, URL2Image, Resize
from paddle_serving_app.reader import CenterCrop, RGB2BGR, Transpose, Div, Normalize
import time
client = HttpClient()
client.load_client_config(sys.argv[1])
'''
if you want use GRPC-client, set_use_grpc_client(True)
or you can directly use client.grpc_client_predict(...)
as for HTTP-client,set_use_grpc_client(False)(which is default)
or you can directly use client.http_client_predict(...)
'''
#client.set_use_grpc_client(True)
'''
if you want to enable Encrypt Module,uncommenting the following line
'''
#client.use_key("./key")
'''
if you want to compress,uncommenting the following line
'''
#client.set_response_compress(True)
#client.set_request_compress(True)
'''
we recommend use Proto data format in HTTP-body, set True(which is default)
if you want use JSON data format in HTTP-body, set False
'''
#client.set_http_proto(True)
client.connect(["127.0.0.1:9696"])
label_dict = {}
label_idx = 0
with open("imagenet.label") as fin:
for line in fin:
label_dict[label_idx] = line.strip()
label_idx += 1
seq = Sequential([
URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True)
])
start = time.time()
image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"
for i in range(10):
img = seq(image_file)
fetch_map = client.predict(
feed={"image": img}, fetch=["score"], batch=False)
print(fetch_map)
end = time.time()
print(end - start)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddle_serving_client import Client
import numpy as np
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
if len(sys.argv) != 4:
print("python resnet50_web_service.py model device port")
sys.exit(-1)
device = sys.argv[2]
if device == "cpu":
from paddle_serving_server.web_service import WebService
else:
from paddle_serving_server.web_service import WebService
class ImageService(WebService):
def init_imagenet_setting(self):
self.seq = Sequential([
URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose(
(2, 0, 1)), Div(255), Normalize([0.485, 0.456, 0.406],
[0.229, 0.224, 0.225], True)
])
self.label_dict = {}
label_idx = 0
with open("imagenet.label") as fin:
for line in fin:
self.label_dict[label_idx] = line.strip()
label_idx += 1
def preprocess(self, feed=[], fetch=[]):
feed_batch = []
is_batch = True
for ins in feed:
if "image" not in ins:
raise ("feed data error!")
img = self.seq(ins["image"])
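# img[np.newaxis, :] adds a leading batch dimension, so each feed entry is shaped (1, C, H, W)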
feed_batch.append({"image": img[np.newaxis, :]})
return feed_batch, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map={}):
score_list = fetch_map["score"]
result = {"label": [], "prob": []}
for score in score_list:
score = score.tolist()
max_score = max(score)
result["label"].append(self.label_dict[score.index(max_score)]
.strip().replace(",", ""))
result["prob"].append(max_score)
return result
image_service = ImageService(name="image")
image_service.load_model_config(sys.argv[1])
image_service.init_imagenet_setting()
if device == "gpu":
image_service.set_gpus("0")
image_service.prepare_server(
workdir="workdir", port=int(sys.argv[3]), device=device)
image_service.run_rpc_service()
image_service.run_web_service()
...@@ -9,24 +9,20 @@ sh get_data.sh ...@@ -9,24 +9,20 @@ sh get_data.sh
``` ```
The downloaded package contains the cnn, lstm and bow model configs along with their test_data and train_data. The downloaded package contains the cnn, lstm and bow model configs along with their test_data and train_data.
### Start RPC inference service ### Start inference service(Support BRPC-Client/GRPC-Client/Http-Client)
``` ```
python -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292
``` ```
### RPC Infer ### BRPC-Client Infer
``` ```
head test_data/part-0 | python test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab head test_data/part-0 | python3 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
``` ```
It will print the prediction results of the first 10 test cases. It will print the prediction results of the first 10 test cases.
### Start HTTP inference service
```
python text_classify_service.py imdb_cnn_model/ workdir/ 9292 imdb.vocab
```
### HTTP Infer
### GRPC-Client/Http-Client Infer
``` ```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:9292/imdb/prediction head test_data/part-0 | python3 test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
``` ```
...@@ -9,23 +9,18 @@ sh get_data.sh ...@@ -9,23 +9,18 @@ sh get_data.sh
``` ```
脚本会下载和解压出cnn、lstm和bow三种模型的配置文文件以及test_data和train_data。 脚本会下载和解压出cnn、lstm和bow三种模型的配置文文件以及test_data和train_data。
### 启动RPC预测服务 ### 启动预测服务(支持BRPC-Client/GRPC-Client/Http-Client)
``` ```
python -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292
``` ```
### 执行预测 ### BRPC-Client预测
``` ```
head test_data/part-0 | python test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab head test_data/part-0 | python3 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
``` ```
预测test_data/part-0的前十个样例。 预测test_data/part-0的前十个样例。
### 启动HTTP预测服务 ### GRPC-Client/Http-Client预测
``` ```
python text_classify_service.py imdb_cnn_model/ workdir/ 9292 imdb.vocab head test_data/part-0 | python3 test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
```
### 执行预测
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:9292/imdb/prediction
``` ```
wget https://paddle-serving.bj.bcebos.com/imdb-demo/imdb_service.tar.gz
tar -xzf imdb_service.tar.gz
wget --no-check-certificate https://fleet.bj.bcebos.com/text_classification_data.tar.gz
tar -zxvf text_classification_data.tar.gz
python text_classify_service.py serving_server_model/ workdir imdb.vocab
...@@ -12,37 +12,50 @@ ...@@ -12,37 +12,50 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_client import HttpClient
from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader.imdb_reader import IMDBDataset from paddle_serving_app.reader.imdb_reader import IMDBDataset
import sys import sys
import numpy as np import numpy as np
client = HttpClient()
client.load_client_config(sys.argv[1])
'''
if you want use GRPC-client, set_use_grpc_client(True)
or you can directly use client.grpc_client_predict(...)
as for HTTP-client,set_use_grpc_client(False)(which is default)
or you can directly use client.http_client_predict(...)
'''
#client.set_use_grpc_client(True)
'''
if you want to enable Encrypt Module,uncommenting the following line
'''
#client.use_key("./key")
'''
if you want to compress,uncommenting the following line
'''
#client.set_response_compress(True)
#client.set_request_compress(True)
'''
we recommend use Proto data format in HTTP-body, set True(which is default)
if you want use JSON data format in HTTP-body, set False
'''
#client.set_http_proto(True)
client.connect(["127.0.0.1:9292"])
class IMDBService(WebService): # you can define any english sentence or dataset here
def prepare_dict(self, args={}): # This example reuses imdb reader in training, you
if len(args) == 0: # can define your own data preprocessing easily.
exit(-1) imdb_dataset = IMDBDataset()
self.dataset = IMDBDataset() imdb_dataset.load_resource(sys.argv[2])
self.dataset.load_resource(args["dict_file_path"])
def preprocess(self, feed={}, fetch=[]):
feed_batch = []
words_lod = [0]
is_batch = True
for ins in feed:
words = self.dataset.get_words_only(ins["words"])
words = np.array(words).reshape(len(words), 1)
words_lod.append(words_lod[-1] + len(words))
feed_batch.append(words)
feed = {"words": np.concatenate(feed_batch), "words.lod": words_lod}
return feed, fetch, is_batch
imdb_service = IMDBService(name="imdb") for line in sys.stdin:
imdb_service.load_model_config(sys.argv[1]) word_ids, label = imdb_dataset.get_words_and_label(line)
imdb_service.prepare_server( word_len = len(word_ids)
workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu") feed = {
imdb_service.prepare_dict({"dict_file_path": sys.argv[4]}) "words": np.array(word_ids).reshape(word_len, 1),
imdb_service.run_rpc_service() "words.lod": [0, word_len]
imdb_service.run_web_service() }
#print(feed)
fetch = ["prediction"]
fetch_map = client.predict(feed=feed, fetch=fetch, batch=True)
print(fetch_map)
...@@ -4,28 +4,23 @@ ...@@ -4,28 +4,23 @@
### Get Model ### Get Model
``` ```
python -m paddle_serving_app.package --get_model lac python3 -m paddle_serving_app.package --get_model lac
tar -xzvf lac.tar.gz tar -xzvf lac.tar.gz
``` ```
#### Start RPC inference service #### Start inference service(Support BRPC-Client/GRPC-Client/Http-Client)
``` ```
python -m paddle_serving_server.serve --model lac_model/ --port 9292 python3 -m paddle_serving_server.serve --model lac_model/ --port 9292
``` ```
### RPC Infer ### BRPC Infer
``` ```
echo "我爱北京天安门" | python lac_client.py lac_client/serving_client_conf.prototxt echo "我爱北京天安门" | python3 lac_client.py lac_client/serving_client_conf.prototxt
``` ```
It will get the segmentation result. It will get the segmentation result.
### Start HTTP inference service ### GRPC/Http Infer
``` ```
python lac_web_service.py lac_model/ lac_workdir 9292 echo "我爱北京天安门" | python3 lac_http_client.py lac_client/serving_client_conf.prototxt
```
### HTTP Infer
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction
``` ```
...@@ -4,28 +4,23 @@ ...@@ -4,28 +4,23 @@
### 获取模型 ### 获取模型
``` ```
python -m paddle_serving_app.package --get_model lac python3 -m paddle_serving_app.package --get_model lac
tar -xzvf lac.tar.gz tar -xzvf lac.tar.gz
``` ```
#### 开启RPC预测服务 #### 开启预测服务(支持BRPC-Client/GRPC-Client/Http-Client)
``` ```
python -m paddle_serving_server.serve --model lac_model/ --port 9292 python3 -m paddle_serving_server.serve --model lac_model/ --port 9292
``` ```
### 执行RPC预测 ### 执行BRPC预测
``` ```
echo "我爱北京天安门" | python lac_client.py lac_client/serving_client_conf.prototxt echo "我爱北京天安门" | python3 lac_client.py lac_client/serving_client_conf.prototxt
``` ```
我们就能得到分词结果 我们就能得到分词结果
### 开启HTTP预测服务 ### 执行GRPC/Http预测
``` ```
python lac_web_service.py lac_model/ lac_workdir 9292 echo "我爱北京天安门" | python3 lac_http_client.py lac_client/serving_client_conf.prototxt
```
### 执行HTTP预测
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction
``` ```
# encoding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -11,17 +12,55 @@ ...@@ -11,17 +12,55 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
#coding=utf-8 # pylint: disable=doc-string-missing
import requests
import json
import time
if __name__ == "__main__": from paddle_serving_client import HttpClient
server = "http://127.0.0.1:9280/lac/prediction" from paddle_serving_app.reader import LACReader
fin = open("jieba_test.txt", "r") import sys
start = time.time() import os
for line in fin: import io
req_data = {"words": line.strip(), "fetch": ["crf_decode"]} import numpy as np
r = requests.post(server, json=req_data)
end = time.time() client = HttpClient()
print(end - start) client.load_client_config(sys.argv[1])
'''
if you want use GRPC-client, set_use_grpc_client(True)
or you can directly use client.grpc_client_predict(...)
as for HTTP-client,set_use_grpc_client(False)(which is default)
or you can directly use client.http_client_predict(...)
'''
#client.set_use_grpc_client(True)
'''
if you want to enable Encrypt Module,uncommenting the following line
'''
#client.use_key("./key")
'''
if you want to compress,uncommenting the following line
'''
#client.set_response_compress(True)
#client.set_request_compress(True)
'''
we recommend use Proto data format in HTTP-body, set True(which is default)
if you want use JSON data format in HTTP-body, set False
'''
#client.set_http_proto(True)
client.connect(["127.0.0.1:9292"])
reader = LACReader()
for line in sys.stdin:
if len(line) <= 0:
continue
feed_data = reader.process(line)
if len(feed_data) <= 0:
continue
print(feed_data)
#fetch_map = client.predict(feed={"words": np.array(feed_data).reshape(len(feed_data), 1), "words.lod": [0, len(feed_data)]}, fetch=["crf_decode"], batch=True)
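# Note: the request below stacks the same sentence twice to form a batch of two;
# "words.lod" then holds the cumulative token offsets of each sample: [0, len, 2*len].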
fetch_map = client.predict(
feed={
"words": np.array(feed_data + feed_data).reshape(
len(feed_data) * 2, 1),
"words.lod": [0, len(feed_data), 2 * len(feed_data)]
},
fetch=["crf_decode"],
batch=True)
print(fetch_map)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle_serving_server.web_service import WebService
import sys
from paddle_serving_app.reader import LACReader
import numpy as np
class LACService(WebService):
def load_reader(self):
self.reader = LACReader()
def preprocess(self, feed={}, fetch=[]):
feed_batch = []
fetch = ["crf_decode"]
lod_info = [0]
is_batch = True
for ins in feed:
if "words" not in ins:
raise ("feed data error!")
feed_data = self.reader.process(ins["words"])
feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1))
lod_info.append(lod_info[-1] + len(feed_data))
feed_dict = {
"words": np.concatenate(
feed_batch, axis=0),
"words.lod": lod_info
}
return feed_dict, fetch, is_batch
def postprocess(self, feed={}, fetch=[], fetch_map={}):
batch_ret = []
for idx, ins in enumerate(feed):
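# 'crf_decode.lod' holds cumulative offsets per sample, so [begin, end) selects the tags belonging to sample idx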
begin = fetch_map['crf_decode.lod'][idx]
end = fetch_map['crf_decode.lod'][idx + 1]
segs = self.reader.parse_result(ins["words"],
fetch_map["crf_decode"][begin:end])
batch_ret.append({"word_seg": "|".join(segs)})
return batch_ret
lac_service = LACService(name="lac")
lac_service.load_model_config(sys.argv[1])
lac_service.load_reader()
lac_service.prepare_server(
workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu")
lac_service.run_rpc_service()
lac_service.run_web_service()
...@@ -11,15 +11,15 @@ Firstly, download the [Resnet50 int8 model](https://paddle-inference-dist.bj.bce ...@@ -11,15 +11,15 @@ Firstly, download the [Resnet50 int8 model](https://paddle-inference-dist.bj.bce
wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz
tar zxvf ResNet50_quant.tar.gz tar zxvf ResNet50_quant.tar.gz
python -m paddle_serving_client.convert --dirname ResNet50_quant python3 -m paddle_serving_client.convert --dirname ResNet50_quant
``` ```
Start RPC service, specify the GPU id and precision mode Start RPC service, specify the GPU id and precision mode
``` ```
python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 python3 -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8
``` ```
Request the serving service with Client Request the serving service with Client
``` ```
python resnet50_client.py python3 resnet50_client.py
``` ```
## Reference ## Reference
......
...@@ -10,15 +10,15 @@ ...@@ -10,15 +10,15 @@
wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz
tar zxvf ResNet50_quant.tar.gz tar zxvf ResNet50_quant.tar.gz
python -m paddle_serving_client.convert --dirname ResNet50_quant python3 -m paddle_serving_client.convert --dirname ResNet50_quant
``` ```
启动rpc服务, 设定所选GPU id、部署模型精度 启动rpc服务, 设定所选GPU id、部署模型精度
``` ```
python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 python3 -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8
``` ```
使用client进行请求 使用client进行请求
``` ```
python resnet50_client.py python3 resnet50_client.py
``` ```
## 参考文档 ## 参考文档
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model mobilenet_v2_imagenet python3 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet
tar -xzvf mobilenet_v2_imagenet.tar.gz tar -xzvf mobilenet_v2_imagenet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 python3 -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
``` ```
### Client Prediction ### Client Prediction
``` ```
python mobilenet_tutorial.py python3 mobilenet_tutorial.py
``` ```
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## 获取模型 ## 获取模型
``` ```
python -m paddle_serving_app.package --get_model mobilenet_v2_imagenet python3 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet
tar -xzvf mobilenet_v2_imagenet.tar.gz tar -xzvf mobilenet_v2_imagenet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz
### 启动服务端 ### 启动服务端
``` ```
python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 python3 -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
``` ```
### 客户端预测 ### 客户端预测
``` ```
python mobilenet_tutorial.py python3 mobilenet_tutorial.py
``` ```
...@@ -4,9 +4,9 @@ ...@@ -4,9 +4,9 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model ocr_rec python3 -m paddle_serving_app.package --get_model ocr_rec
tar -xzvf ocr_rec.tar.gz tar -xzvf ocr_rec.tar.gz
python -m paddle_serving_app.package --get_model ocr_det python3 -m paddle_serving_app.package --get_model ocr_det
tar -xzvf ocr_det.tar.gz tar -xzvf ocr_det.tar.gz
``` ```
...@@ -23,16 +23,16 @@ tar xf test_imgs.tar ...@@ -23,16 +23,16 @@ tar xf test_imgs.tar
``` ```
#choose one of cpu/gpu commands as following #choose one of cpu/gpu commands as following
#for cpu user #for cpu user
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293
python ocr_web_server.py cpu python3 ocr_web_server.py cpu
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0
python ocr_web_server.py gpu python3 ocr_web_server.py gpu
``` ```
### Client Prediction ### Client Prediction
``` ```
python ocr_web_client.py python3 ocr_web_client.py
``` ```
If you want a faster web service, please try Web LocalPredictor Service If you want a faster web service, please try Web LocalPredictor Service
...@@ -40,14 +40,14 @@ If you want a faster web service, please try Web LocalPredictor Service ...@@ -40,14 +40,14 @@ If you want a faster web service, please try Web LocalPredictor Service
``` ```
#choose one of cpu/gpu commands as following #choose one of cpu/gpu commands as following
#for cpu user #for cpu user
python ocr_debugger_server.py cpu python3 ocr_debugger_server.py cpu
#for gpu user #for gpu user
python ocr_debugger_server.py gpu python3 ocr_debugger_server.py gpu
``` ```
## Web LocalPredictor Client Prediction ## Web LocalPredictor Client Prediction
``` ```
python ocr_web_client.py python3 ocr_web_client.py
``` ```
## Benchmark ## Benchmark
...@@ -69,34 +69,34 @@ if you are going to detect images not recognize it or directly recognize the wor ...@@ -69,34 +69,34 @@ if you are going to detect images not recognize it or directly recognize the wor
### Det Server ### Det Server
``` ```
python det_web_server.py cpu #for cpu user python3 det_web_server.py cpu #for cpu user
python det_web_server.py gpu #for gpu user python3 det_web_server.py gpu #for gpu user
#or #or
python det_debugger_server.py cpu #for cpu user python3 det_debugger_server.py cpu #for cpu user
python det_debugger_server.py gpu #for gpu user python3 det_debugger_server.py gpu #for gpu user
``` ```
### Det Client ### Det Client
``` ```
# also use ocr_web_client.py # also use ocr_web_client.py
python ocr_web_client.py python3 ocr_web_client.py
``` ```
### Rec Server ### Rec Server
``` ```
python rec_web_server.py cpu #for cpu user python3 rec_web_server.py cpu #for cpu user
python rec_web_server.py gpu #for gpu user python3 rec_web_server.py gpu #for gpu user
#or #or
python rec_debugger_server.py cpu #for cpu user python3 rec_debugger_server.py cpu #for cpu user
python rec_debugger_server.py gpu #for gpu user python3 rec_debugger_server.py gpu #for gpu user
``` ```
### Rec Client ### Rec Client
``` ```
python rec_web_client.py python3 rec_web_client.py
``` ```
## C++ OCR Service ## C++ OCR Service
...@@ -109,9 +109,9 @@ Select a startup mode according to CPU / GPU device ...@@ -109,9 +109,9 @@ Select a startup mode according to CPU / GPU device
Pass the folder paths of multiple models after the --model parameter to start a prediction service that chains the models. Pass the folder paths of multiple models after the --model parameter to start a prediction service that chains the models.
``` ```
#for cpu user #for cpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0
``` ```
### Client Prediction ### Client Prediction
...@@ -119,9 +119,9 @@ The pre-processing and post-processing is in the C + + server part, the image's ...@@ -119,9 +119,9 @@ The pre-processing and post-processing is in the C + + server part, the image's
so the `feed_var` entry in the file `ocr_det_client/serving_client_conf.prototxt` needs to be changed. so the `feed_var` entry in the file `ocr_det_client/serving_client_conf.prototxt` needs to be changed.
For this case, `feed_type` should be 3 (which means the data type is string) and `shape` should be 1. For this case, `feed_type` should be 20 (which means the data type is string) and `shape` should be 1.
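In other words, after the change the corresponding `feed_var` entry in `ocr_det_client/serving_client_conf.prototxt` would look roughly like the sketch below (field names follow the usual client config layout; keep the variable name your generated config already contains):
```
feed_var {
  name: "image"
  alias_name: "image"
  is_lod_tensor: false
  feed_type: 20
  shape: 1
}
```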
By passing in multiple client folder paths, the client can be started for multi model prediction. By passing in multiple client folder paths, the client can be started for multi model prediction.
``` ```
python ocr_cpp_client.py ocr_det_client ocr_rec_client python3 ocr_cpp_client.py ocr_det_client ocr_rec_client
``` ```
...@@ -4,9 +4,9 @@ ...@@ -4,9 +4,9 @@
## 获取模型 ## 获取模型
``` ```
python -m paddle_serving_app.package --get_model ocr_rec python3 -m paddle_serving_app.package --get_model ocr_rec
tar -xzvf ocr_rec.tar.gz tar -xzvf ocr_rec.tar.gz
python -m paddle_serving_app.package --get_model ocr_det python3 -m paddle_serving_app.package --get_model ocr_det
tar -xzvf ocr_det.tar.gz tar -xzvf ocr_det.tar.gz
``` ```
## 获取数据集(可选) ## 获取数据集(可选)
...@@ -22,16 +22,16 @@ tar xf test_imgs.tar ...@@ -22,16 +22,16 @@ tar xf test_imgs.tar
``` ```
#根据CPU/GPU设备选择一种启动方式 #根据CPU/GPU设备选择一种启动方式
#for cpu user #for cpu user
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293
python ocr_web_server.py cpu python3 ocr_web_server.py cpu
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0
python ocr_web_server.py gpu python3 ocr_web_server.py gpu
``` ```
### 启动客户端 ### 启动客户端
``` ```
python ocr_web_client.py python3 ocr_web_client.py
``` ```
如果用户需要更快的执行速度,请尝试LocalPredictor版Web服务 如果用户需要更快的执行速度,请尝试LocalPredictor版Web服务
...@@ -39,14 +39,14 @@ python ocr_web_client.py ...@@ -39,14 +39,14 @@ python ocr_web_client.py
``` ```
#根据CPU/GPU设备选择一种启动方式 #根据CPU/GPU设备选择一种启动方式
#for cpu user #for cpu user
python ocr_debugger_server.py cpu python3 ocr_debugger_server.py cpu
#for gpu user #for gpu user
python ocr_debugger_server.py gpu python3 ocr_debugger_server.py gpu
``` ```
## 启动客户端 ## 启动客户端
``` ```
python ocr_web_client.py python3 ocr_web_client.py
``` ```
## 性能指标 ## 性能指标
...@@ -69,34 +69,34 @@ GPU: Nvidia Tesla V100单卡 ...@@ -69,34 +69,34 @@ GPU: Nvidia Tesla V100单卡
### 启动检测服务 ### 启动检测服务
``` ```
python det_web_server.py cpu #for cpu user python3 det_web_server.py cpu #for cpu user
python det_web_server.py gpu #for gpu user python3 det_web_server.py gpu #for gpu user
#or #or
python det_debugger_server.py cpu #for cpu user python3 det_debugger_server.py cpu #for cpu user
python det_debugger_server.py gpu #for gpu user python3 det_debugger_server.py gpu #for gpu user
``` ```
### 检测服务客户端 ### 检测服务客户端
``` ```
# also use ocr_web_client.py # also use ocr_web_client.py
python ocr_web_client.py python3 ocr_web_client.py
``` ```
### 启动识别服务 ### 启动识别服务
``` ```
python rec_web_server.py cpu #for cpu user python3 rec_web_server.py cpu #for cpu user
python rec_web_server.py gpu #for gpu user python3 rec_web_server.py gpu #for gpu user
#or #or
python rec_debugger_server.py cpu #for cpu user python3 rec_debugger_server.py cpu #for cpu user
python rec_debugger_server.py gpu #for gpu user python3 rec_debugger_server.py gpu #for gpu user
``` ```
### 识别服务客户端 ### 识别服务客户端
``` ```
python rec_web_client.py python3 rec_web_client.py
``` ```
## C++ OCR Service服务 ## C++ OCR Service服务
...@@ -108,9 +108,9 @@ python rec_web_client.py ...@@ -108,9 +108,9 @@ python rec_web_client.py
通过--model后,指定多个模型文件的文件夹路径来启动多模型串联的预测服务。 通过--model后,指定多个模型文件的文件夹路径来启动多模型串联的预测服务。
``` ```
#for cpu user #for cpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293
#for gpu user #for gpu user
python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0
``` ```
### 启动客户端 ### 启动客户端
...@@ -118,9 +118,9 @@ python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port ...@@ -118,9 +118,9 @@ python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port
`ocr_det_client/serving_client_conf.prototxt``feed_var`字段 `ocr_det_client/serving_client_conf.prototxt``feed_var`字段
对于本示例而言,`feed_type`应修改为3(数据类型为string),`shape`为1. 对于本示例而言,`feed_type`应修改为20(数据类型为string),`shape`为1.
通过在客户端启动后加入多个client模型的client配置文件夹路径,启动client进行预测。 通过在客户端启动后加入多个client模型的client配置文件夹路径,启动client进行预测。
``` ```
python ocr_cpp_client.py ocr_det_client ocr_rec_client python3 ocr_cpp_client.py ocr_det_client ocr_rec_client
``` ```
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
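Besides the RPC client, the pipeline service also exposes an HTTP endpoint; the URL below is the one used by the benchmark script later in this commit. A sketch of a raw HTTP request (the `key`/`value` field names follow the usual pipeline HTTP format and are an assumption here, as the request body is not shown in this commit):
```python
import base64
import json

import requests

with open("daisy.jpg", "rb") as f:
    image = base64.b64encode(f.read()).decode("utf8")

# one input variable named "image", carried as a Base64 string
data = {"key": ["image"], "value": [image]}
resp = requests.post("http://127.0.0.1:18080/imagenet/prediction",
                     data=json.dumps(data))
print(resp.json())
```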
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## 启动服务 ## 启动服务
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## 测试 ## 测试
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
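The `yaml.load(fin, yaml.FullLoader)` change above follows PyYAML 5.1+, where calling `yaml.load` without an explicit `Loader` warns and the full constructor set is unsafe for untrusted input. A short sketch of the pattern, assuming a `config.yml` like the one the benchmark edits:

```python
import yaml

# Explicit loader: parses standard YAML without constructing arbitrary Python
# objects (yaml.safe_load would be the stricter equivalent).
with open("config.yml", "r") as fin:
    config = yaml.load(fin, Loader=yaml.FullLoader)

config.setdefault("dag", {})["tracer"] = {"interval_s": 10}

with open("config2.yml", "w") as fout:
    yaml.dump(config, fout, default_flow_style=False)
```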
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
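`run_http` above posts a base64-encoded image to the pipeline's HTTP endpoint. A hedged single-request sketch, assuming the `key`/`value` JSON layout the pipeline web service expects and a local `daisy.jpg`:

```python
import base64
import json

import requests

url = "http://127.0.0.1:18080/imagenet/prediction"  # same URL as run_http

with open("daisy.jpg", "rb") as f:
    image = base64.b64encode(f.read()).decode("utf8")

# Parallel key/value lists; the benchmark repeats the pair batch_size times.
payload = {"key": ["image"], "value": [image]}
resp = requests.post(url, data=json.dumps(payload))
print(resp.json())
```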
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
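`gen_yml` above only rewrites a few keys of `config.yml` before dumping `config2.yml`. A hedged sketch of the GPU-side result, expressed as the dict it dumps; the `device_type` value of 1 for GPU is an assumption (the CPU branch explicitly sets 0), and every other field of `config.yml` passes through unchanged:

```python
# Keys touched by gen_yml("gpu", "0"); all other config.yml fields pass through.
config_gpu = {
    "dag": {"tracer": {"interval_s": 10}},
    "op": {
        "imagenet": {
            "local_service_conf": {
                "device_type": 1,  # assumed GPU value; gen_yml sets 0 for CPU
                "devices": "0",    # the gpu_id argument
            },
        },
    },
}
```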
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
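The benchmark's `run_http` path (partially elided in the hunk above) base64-encodes `daisy.jpg` and POSTs it to the imagenet pipeline endpoint. A minimal standalone sketch of that request pattern, assuming the pipeline web service is listening on 127.0.0.1:18080 and accepts the usual `{"key": [...], "value": [...]}` JSON payload:

```python
# Sketch of the HTTP request pattern used by run_http above.
# Assumptions: the imagenet pipeline serves HTTP on 127.0.0.1:18080 and
# takes the standard key/value JSON payload; daisy.jpg is in the working dir.
import base64
import json

import requests


def cv2_to_base64(image_bytes):
    # The service expects the raw image bytes as a base64 string.
    return base64.b64encode(image_bytes).decode("utf8")


if __name__ == "__main__":
    url = "http://127.0.0.1:18080/imagenet/prediction"
    with open("daisy.jpg", "rb") as f:
        image = cv2_to_base64(f.read())
    # One image per request; larger batches repeat the key/value pairs.
    payload = {"key": ["image"], "value": [image]}
    resp = requests.post(url=url, data=json.dumps(payload))
    print(resp.json())
```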
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
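`pipeline_rpc_client.py` itself is not reproduced in this diff. The following is a minimal sketch of such a client, assuming the pipeline's RPC port is 18080 (the address the benchmark code above connects to) and that the imagenet op returns `label`/`prob` keys; the feed and fetch names are assumptions, not taken from the file.

```python
# Minimal sketch of a pipeline RPC client (roughly what
# pipeline_rpc_client.py does). Feed/fetch key names are assumptions.
import base64

from paddle_serving_server.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18080'])  # rpc_port from config.yml (assumed 18080)

with open("daisy.jpg", "rb") as f:
    image = base64.b64encode(f.read()).decode("utf8")

# feed_dict keys must match what the service's preprocess expects.
ret = client.predict(feed_dict={"image": image}, fetch=["label", "prob"])
print(ret)
```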
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server

``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
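A change that recurs throughout these benchmark diffs is passing an explicit loader to `yaml.load`: PyYAML 5.1+ warns about a bare `yaml.load`, and PyYAML 6.0 makes the loader argument mandatory. A small sketch of the safe pattern the updated scripts use:

```python
# Safe YAML loading, as in the updated benchmark scripts.
# FullLoader handles standard YAML tags without constructing arbitrary
# Python objects; yaml.safe_load is a stricter alternative for plain
# config files like config.yml.
import yaml

with open("config.yml", "r") as fin:
    config = yaml.load(fin, yaml.FullLoader)
    # config = yaml.safe_load(fin)  # stricter option for untrusted input

print(config.get("dag", {}))
```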
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
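The `resnet50_web_service.py` hunks above show only the import change. For orientation, a rough sketch of how a pipeline web service is assembled from `WebService` and `Op`; method signatures vary between Paddle Serving releases and the fetch name used here is an assumption, so treat this as an illustration rather than the verbatim example file.

```python
# Illustrative pipeline web service built from WebService and Op.
# NOTE: preprocess/postprocess signatures differ across releases and the
# "score" fetch key is assumed; this is not the verbatim example file.
import numpy as np
from paddle_serving_server.web_service import WebService, Op


class ImagenetOp(Op):
    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        # Real code decodes and resizes the base64 image from input_dict;
        # a dummy tensor keeps the sketch short.
        img = np.zeros((1, 3, 224, 224), dtype="float32")
        return {"image": img}, False, None, ""

    def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
        score = fetch_dict["score"]  # fetch name is an assumption
        return {"label": str(np.argmax(score))}, None, ""


class ImageService(WebService):
    def get_pipeline_response(self, read_op):
        return ImagenetOp(name="imagenet", input_ops=[read_op])


service = ImageService(name="imagenet")
service.prepare_pipeline_config("config2.yml")
service.run_service()
```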
...@@ -4,17 +4,17 @@ This document will takes Imagenet service as an example to introduce how to use ...@@ -4,17 +4,17 @@ This document will takes Imagenet service as an example to introduce how to use
## Get model ## Get model
``` ```
python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet
tar -xzvf resnet_v2_50_imagenet.tar.gz tar -xzvf resnet_v2_50_imagenet.tar.gz
``` ```
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -4,18 +4,17 @@ ...@@ -4,18 +4,17 @@
## Get model ## Get model
``` ```
python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet
tar -xzvf resnet_v2_50_imagenet.tar.gz tar -xzvf resnet_v2_50_imagenet.tar.gz
``` ```
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18000/imagenet/prediction" url = "http://127.0.0.1:18000/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -11,10 +11,8 @@ ...@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -10,10 +10,10 @@ sh get_model.sh ...@@ -10,10 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## RPC test ## RPC test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
...@@ -10,11 +10,10 @@ sh get_model.sh ...@@ -10,11 +10,10 @@ sh get_model.sh
## Start server ## Start server
``` ```
python resnet50_web_service.py &>log.txt & python3 resnet50_web_service.py &>log.txt &
``` ```
## Test ## Test
``` ```
python pipeline_rpc_client.py python3 pipeline_rpc_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import base64 import base64
...@@ -5,16 +19,16 @@ import yaml ...@@ -5,16 +19,16 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): ...@@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): ...@@ -34,15 +49,17 @@ def gen_yml(device, gpu_id):
config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id
else: else:
config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18080/imagenet/prediction" url = "http://127.0.0.1:18080/imagenet/prediction"
start = time.time() start = time.time()
with open(os.path.join(".", "daisy.jpg"), 'rb') as file: with open(os.path.join(".", "daisy.jpg"), 'rb') as file:
...@@ -68,6 +85,7 @@ def run_http(idx, batch_size): ...@@ -68,6 +85,7 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): ...@@ -87,6 +105,7 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18080'])
...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): ...@@ -107,11 +126,12 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
if device == "gpu": if device == "gpu":
...@@ -120,7 +140,7 @@ if __name__ == "__main__": ...@@ -120,7 +140,7 @@ if __name__ == "__main__":
gpu_id = None gpu_id = None
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -131,4 +151,3 @@ if __name__ == "__main__": ...@@ -131,4 +151,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -11,10 +11,7 @@ ...@@ -11,10 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try: from paddle_serving_server.pipeline import PipelineClient
from paddle_serving_server_gpu.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import base64, cv2 import base64, cv2
......
...@@ -8,11 +8,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -8,11 +8,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf faster_rcnn_r50_fpn_1x_coco.tar tar xf faster_rcnn_r50_fpn_1x_coco.tar
python web_service.py python3 web_service.py
``` ```
### Perform prediction ### Perform prediction
``` ```
python pipeline_http_client.py python3 pipeline_http_client.py
``` ```
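`pipeline_http_client.py` for the detection example is referenced but not shown. A minimal sketch, assuming the faster_rcnn pipeline serves HTTP on port 18082 (the port used by the benchmark script further below) and the usual key/value payload; the test image name is a placeholder:

```python
# Sketch of an HTTP client for the faster_rcnn pipeline service.
# Assumptions: HTTP port 18082 (as in the benchmark below) and a local
# JPEG named test.jpg; the filename is only a placeholder.
import base64
import json

import requests

url = "http://127.0.0.1:18082/faster_rcnn/prediction"
with open("test.jpg", "rb") as f:
    image = base64.b64encode(f.read()).decode("utf8")

payload = {"key": ["image"], "value": [image]}
resp = requests.post(url=url, data=json.dumps(payload))
print(resp.json())
```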
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import yaml import yaml
...@@ -6,20 +20,20 @@ import time ...@@ -6,20 +20,20 @@ import time
import json import json
import cv2 import cv2
import base64 import base64
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout): ...@@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 30} config["dag"]["tracer"] = {"interval_s": 30}
if device == "gpu": if device == "gpu":
config["op"]["faster_rcnn"]["local_service_conf"]["device_type"] = 1 config["op"]["faster_rcnn"]["local_service_conf"]["device_type"] = 1
config["op"]["faster_rcnn"]["local_service_conf"]["devices"] = gpu_id config["op"]["faster_rcnn"]["local_service_conf"]["devices"] = gpu_id
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18082/faster_rcnn/prediction" url = "http://127.0.0.1:18082/faster_rcnn/prediction"
...@@ -65,6 +81,7 @@ def run_http(idx, batch_size): ...@@ -65,6 +81,7 @@ def run_http(idx, batch_size):
break break
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): ...@@ -84,22 +101,25 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
pass pass
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
gpu_id = sys.argv[5] gpu_id = sys.argv[5]
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -110,4 +130,3 @@ if __name__ == "__main__": ...@@ -110,4 +130,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
...@@ -25,7 +25,7 @@ class FasterRCNNOp(Op): ...@@ -25,7 +25,7 @@ class FasterRCNNOp(Op):
self.img_preprocess = Sequential([ self.img_preprocess = Sequential([
BGR2RGB(), Div(255.0), BGR2RGB(), Div(255.0),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False),
Resize((640, 640)), Transpose((2, 0, 1)) Resize(640, 640), Transpose((2, 0, 1))
]) ])
self.img_postprocess = RCNNPostprocess("label_list.txt", "output") self.img_postprocess = RCNNPostprocess("label_list.txt", "output")
......
...@@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf ppyolo_mbv3_large_coco.tar tar xf ppyolo_mbv3_large_coco.tar
python web_service.py python3 web_service.py
``` ```
### Perform prediction ### Perform prediction
``` ```
python pipeline_http_client.py python3 pipeline_http_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import yaml import yaml
...@@ -6,20 +20,20 @@ import time ...@@ -6,20 +20,20 @@ import time
import json import json
import cv2 import cv2
import base64 import base64
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout): ...@@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 30} config["dag"]["tracer"] = {"interval_s": 30}
if device == "gpu": if device == "gpu":
config["op"]["ppyolo_mbv3"]["local_service_conf"]["device_type"] = 1 config["op"]["ppyolo_mbv3"]["local_service_conf"]["device_type"] = 1
config["op"]["ppyolo_mbv3"]["local_service_conf"]["devices"] = gpu_id config["op"]["ppyolo_mbv3"]["local_service_conf"]["devices"] = gpu_id
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18082/ppyolo_mbv3/prediction" url = "http://127.0.0.1:18082/ppyolo_mbv3/prediction"
...@@ -65,6 +81,7 @@ def run_http(idx, batch_size): ...@@ -65,6 +81,7 @@ def run_http(idx, batch_size):
break break
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): ...@@ -84,22 +101,25 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
pass pass
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
gpu_id = sys.argv[5] gpu_id = sys.argv[5]
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -110,4 +130,3 @@ if __name__ == "__main__": ...@@ -110,4 +130,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
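Each of these benchmarks carries a `gen_yml` helper that rewrites `config.yml` into `config2.yml`, switching the target op onto GPU and pinning the card list. A compact standalone version of that pattern, assuming the same `op -> local_service_conf` layout shown in the hunks above:

```python
# Standalone version of the gen_yml pattern used by these benchmarks:
# read config.yml, flip the target op to GPU, pin the device list, and
# write config2.yml for the server to load.
import yaml


def gen_yml(op_name, device, gpu_ids="0", src="config.yml", dst="config2.yml"):
    with open(src, "r") as fin:
        config = yaml.load(fin, yaml.FullLoader)
    config["dag"]["tracer"] = {"interval_s": 30}
    if device == "gpu":
        # device_type 1 selects GPU in the local_service_conf schema
        # used by these examples; leaving it at 0 keeps the op on CPU.
        config["op"][op_name]["local_service_conf"]["device_type"] = 1
        config["op"][op_name]["local_service_conf"]["devices"] = gpu_ids
    with open(dst, "w") as fout:
        yaml.dump(config, fout, default_flow_style=False)


if __name__ == "__main__":
    gen_yml("ppyolo_mbv3", "gpu", "0,1")
```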
...@@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf yolov3_darknet53_270e_coco.tar tar xf yolov3_darknet53_270e_coco.tar
python web_service.py python3 web_service.py
``` ```
### Perform prediction ### Perform prediction
``` ```
python pipeline_http_client.py python3 pipeline_http_client.py
``` ```
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import yaml import yaml
...@@ -6,20 +20,20 @@ import time ...@@ -6,20 +20,20 @@ import time
import json import json
import cv2 import cv2
import base64 import base64
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def cv2_to_base64(image): def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8') return base64.b64encode(image).decode('utf8')
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout): ...@@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device, gpu_id): def gen_yml(device, gpu_id):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 30} config["dag"]["tracer"] = {"interval_s": 30}
if device == "gpu": if device == "gpu":
config["op"]["yolov3"]["local_service_conf"]["device_type"] = 1 config["op"]["yolov3"]["local_service_conf"]["device_type"] = 1
config["op"]["yolov3"]["local_service_conf"]["devices"] = gpu_id config["op"]["yolov3"]["local_service_conf"]["devices"] = gpu_id
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18082/yolov3/prediction" url = "http://127.0.0.1:18082/yolov3/prediction"
...@@ -65,6 +81,7 @@ def run_http(idx, batch_size): ...@@ -65,6 +81,7 @@ def run_http(idx, batch_size):
break break
return [[end - start], latency_list, [total_num]] return [[end - start], latency_list, [total_num]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
start = time.time() start = time.time()
...@@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): ...@@ -84,22 +101,25 @@ def multithread_http(thread, batch_size):
total_cost)) total_cost))
show_latency(result[1]) show_latency(result[1])
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
pass pass
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
gpu_id = sys.argv[5] gpu_id = sys.argv[5]
gen_yml(device, gpu_id) gen_yml(device, gpu_id)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -110,4 +130,3 @@ if __name__ == "__main__": ...@@ -110,4 +130,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import yaml import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
...@@ -38,9 +50,11 @@ from paddle_serving_client.utils import benchmark_args, show_latency ...@@ -38,9 +50,11 @@ from paddle_serving_client.utils import benchmark_args, show_latency
2021-03-16 10:26:01,840 chl0(In: ['@DAGExecutor'], Out: ['bert']) size[0/0] 2021-03-16 10:26:01,840 chl0(In: ['@DAGExecutor'], Out: ['bert']) size[0/0]
2021-03-16 10:26:01,841 chl1(In: ['bert'], Out: ['@DAGExecutor']) size[0/0] 2021-03-16 10:26:01,841 chl1(In: ['bert'], Out: ['@DAGExecutor']) size[0/0]
''' '''
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -50,20 +64,22 @@ def parse_benchmark(filein, fileout): ...@@ -50,20 +64,22 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout: with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False) yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device): def gen_yml(device):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
config["op"]["bert"]["local_service_conf"]["device_type"] = 1 config["op"]["bert"]["local_service_conf"]["device_type"] = 1
config["op"]["bert"]["local_service_conf"]["devices"] = "2" config["op"]["bert"]["local_service_conf"]["devices"] = "2"
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18082/bert/prediction" url = "http://127.0.0.1:18082/bert/prediction"
start = time.time() start = time.time()
with open("data-c.txt", 'r') as fin: with open("data-c.txt", 'r') as fin:
start = time.time() start = time.time()
...@@ -84,9 +100,11 @@ def run_http(idx, batch_size): ...@@ -84,9 +100,11 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start]] return [[end - start]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_http , thread, batch_size) result = multi_thread_runner.run(run_http, thread, batch_size)
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
...@@ -110,16 +128,17 @@ def run_rpc(thread, batch_size): ...@@ -110,16 +128,17 @@ def run_rpc(thread, batch_size):
def multithread_rpc(thraed, batch_size): def multithread_rpc(thraed, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
device = sys.argv[4] device = sys.argv[4]
gen_yml(device) gen_yml(device)
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
...@@ -130,4 +149,3 @@ if __name__ == "__main__": ...@@ -130,4 +149,3 @@ if __name__ == "__main__":
filein = sys.argv[2] filein = sys.argv[2]
fileout = sys.argv[3] fileout = sys.argv[3]
parse_benchmark(filein, fileout) parse_benchmark(filein, fileout)
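The bert benchmark drives `run_http`/`run_rpc` through `MultiThreadRunner`, which launches N workers as `worker(idx, arg)` and merges their returned lists position-wise (which is why `run_http` returns `[[elapsed], latency_list, [count]]`). A small sketch of that driver with a stand-in worker so it runs without a serving endpoint:

```python
# Sketch of the MultiThreadRunner driver used by these benchmarks.
# The worker here is a stand-in for run_http: it fakes 5 requests of
# ~10 ms each and returns the same [[elapsed], latencies, [count]] shape.
import time

from paddle_serving_client.utils import MultiThreadRunner, show_latency


def worker(idx, batch_size):
    latency_list = []
    start = time.time()
    for _ in range(5):
        t0 = time.time()
        time.sleep(0.01)  # pretend each request takes about 10 ms
        latency_list.append((time.time() - t0) * 1000)
    return [[time.time() - start], latency_list, [5 * batch_size]]


if __name__ == "__main__":
    runner = MultiThreadRunner()
    result = runner.run(worker, 2, 1)  # 2 workers, batch_size 1
    show_latency(result[1])            # merged per-request latencies
```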
...@@ -19,10 +19,8 @@ import yaml ...@@ -19,10 +19,8 @@ import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
...@@ -30,7 +28,7 @@ from paddle_serving_client.utils import benchmark_args, show_latency ...@@ -30,7 +28,7 @@ from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout): def parse_benchmark(filein, fileout):
with open(filein, "r") as fin: with open(filein, "r") as fin:
res = yaml.load(fin) res = yaml.load(fin, yaml.FullLoader)
del_list = [] del_list = []
for key in res["DAG"].keys(): for key in res["DAG"].keys():
if "call" in key: if "call" in key:
...@@ -43,7 +41,7 @@ def parse_benchmark(filein, fileout): ...@@ -43,7 +41,7 @@ def parse_benchmark(filein, fileout):
def gen_yml(device): def gen_yml(device):
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 10} config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu": if device == "gpu":
......
...@@ -10,7 +10,7 @@ sh get_data.sh ...@@ -10,7 +10,7 @@ sh get_data.sh
## Start server ## Start server
``` ```
python web_service.py &>log.txt & python3 web_service.py &>log.txt &
``` ```
## Http test ## Http test
......
...@@ -10,7 +10,7 @@ sh get_data.sh ...@@ -10,7 +10,7 @@ sh get_data.sh
## Start server ## Start server
``` ```
python web_service.py &>log.txt & python3 web_service.py &>log.txt &
``` ```
## Test ## Test
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys import sys
import os import os
import yaml import yaml
import requests import requests
import time import time
import json import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency from paddle_serving_client.utils import benchmark_args, show_latency
def gen_yml(): def gen_yml():
fin = open("config.yml", "r") fin = open("config.yml", "r")
config = yaml.load(fin) config = yaml.load(fin, yaml.FullLoader)
fin.close() fin.close()
config["dag"]["tracer"] = {"interval_s": 5} config["dag"]["tracer"] = {"interval_s": 5}
with open("config2.yml", "w") as fout: with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False) yaml.dump(config, fout, default_flow_style=False)
def run_http(idx, batch_size): def run_http(idx, batch_size):
print("start thread ({})".format(idx)) print("start thread ({})".format(idx))
url = "http://127.0.0.1:18082/uci/prediction" url = "http://127.0.0.1:18082/uci/prediction"
start = time.time() start = time.time()
value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332" value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"
all_value = ";".join([value for i in range(batch_size)]) all_value = ";".join([value for i in range(batch_size)])
...@@ -33,9 +47,11 @@ def run_http(idx, batch_size): ...@@ -33,9 +47,11 @@ def run_http(idx, batch_size):
end = time.time() end = time.time()
return [[end - start]] return [[end - start]]
def multithread_http(thread, batch_size): def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_http , thread, batch_size) result = multi_thread_runner.run(run_http, thread, batch_size)
def run_rpc(thread, batch_size): def run_rpc(thread, batch_size):
client = PipelineClient() client = PipelineClient()
...@@ -44,25 +60,26 @@ def run_rpc(thread, batch_size): ...@@ -44,25 +60,26 @@ def run_rpc(thread, batch_size):
all_value = ";".join([value for i in range(batch_size)]) all_value = ";".join([value for i in range(batch_size)])
data = {"key": "x", "value": all_value} data = {"key": "x", "value": all_value}
for i in range(1000): for i in range(1000):
ret = client.predict(feed_dict={data["key"]: data["value"]}, fetch=["res"]) ret = client.predict(
feed_dict={data["key"]: data["value"]}, fetch=["res"])
print(ret) print(ret)
def multithread_rpc(thread, batch_size): def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size) result = multi_thread_runner.run(run_rpc, thread, batch_size)
if __name__ == "__main__": if __name__ == "__main__":
if sys.argv[1] == "yaml": if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3]) thread = int(sys.argv[3])
gen_yml() gen_yml()
elif sys.argv[1] == "run": elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3]) thread = int(sys.argv[3])
batch_size = int(sys.argv[4]) batch_size = int(sys.argv[4])
if mode == "http": if mode == "http":
multithread_http(thread, batch_size) multithread_http(thread, batch_size)
elif mode == "rpc": elif mode == "rpc":
multithread_rpc(thread, batch_size) multithread_rpc(thread, batch_size)
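Assuming this benchmark script is saved as `benchmark.py` (the filename and the concrete argument values below are illustrative, not taken from this commit), a typical invocation sketch would be:

```
# Regenerate config2.yml from config.yml (mode and thread count are positional)
python3 benchmark.py yaml local_predictor 1
# Benchmark the HTTP endpoint with 4 threads and batch size 1
python3 benchmark.py run http 4 1
# Or benchmark the pipeline RPC endpoint
python3 benchmark.py run rpc 4 1
```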
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet
tar -xzvf resnet_v2_50_imagenet.tar.gz tar -xzvf resnet_v2_50_imagenet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
``` ```
### Client Prediction ### Client Prediction
``` ```
python resnet50_v2_tutorial.py python3 resnet50_v2_tutorial.py
``` ```
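A minimal sketch of what `resnet50_v2_tutorial.py` might contain, assuming the usual ImageNet preprocessing and that the feed/fetch variable names are `image`/`score` and the sample file is `daisy.jpg` (all assumptions; check `serving_client_conf.prototxt` for the real names):

```
# Hypothetical client sketch; feed/fetch names and the image file are assumptions.
from paddle_serving_client import Client
from paddle_serving_app.reader import (Sequential, File2Image, Resize, CenterCrop,
                                       RGB2BGR, Transpose, Div, Normalize)

client = Client()
client.load_client_config("resnet_v2_50_imagenet_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

# Standard ImageNet-style preprocessing chain; normalization is channel-first
# because it runs after Transpose.
seq = Sequential([
    File2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
    Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True)
])
img = seq("daisy.jpg")
fetch_map = client.predict(feed={"image": img}, fetch=["score"])
print(fetch_map["score"])
```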
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## 获取模型 ## 获取模型
``` ```
python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet
tar -xzvf resnet_v2_50_imagenet.tar.gz tar -xzvf resnet_v2_50_imagenet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### 启动服务端 ### 启动服务端
``` ```
python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
``` ```
### 客户端预测 ### 客户端预测
``` ```
python resnet50_v2_tutorial.py python3 resnet50_v2_tutorial.py
``` ```
...@@ -3,16 +3,16 @@ ...@@ -3,16 +3,16 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model senta_bilstm python3 -m paddle_serving_app.package --get_model senta_bilstm
python -m paddle_serving_app.package --get_model lac python3 -m paddle_serving_app.package --get_model lac
tar -xzvf senta_bilstm.tar.gz tar -xzvf senta_bilstm.tar.gz
tar -xzvf lac.tar.gz tar -xzvf lac.tar.gz
``` ```
## Start HTTP Service ## Start HTTP Service
``` ```
python -m paddle_serving_server.serve --model lac_model --port 9300 python3 -m paddle_serving_server.serve --model lac_model --port 9300
python senta_web_service.py python3 senta_web_service.py
``` ```
In the Chinese sentiment classification task, the Chinese word segmentation needs to be done through [LAC task](../lac). In the Chinese sentiment classification task, the Chinese word segmentation needs to be done through [LAC task](../lac).
In this demo, the LAC task is placed in the preprocessing part of the HTTP prediction service of the sentiment classification task. In this demo, the LAC task is placed in the preprocessing part of the HTTP prediction service of the sentiment classification task.
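Once both services are up, the sentiment web service could be queried roughly as below; the port 9292, the service name `senta`, and the fetch name `class_probs` are assumptions, so use whatever `senta_web_service.py` actually configures:

```
curl -H "Content-Type:application/json" -X POST \
    -d '{"feed":[{"words": "天气不错"}], "fetch":["class_probs"]}' \
    http://127.0.0.1:9292/senta/prediction
```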
......
...@@ -3,16 +3,16 @@ ...@@ -3,16 +3,16 @@
## 获取模型文件 ## 获取模型文件
``` ```
python -m paddle_serving_app.package --get_model senta_bilstm python3 -m paddle_serving_app.package --get_model senta_bilstm
python -m paddle_serving_app.package --get_model lac python3 -m paddle_serving_app.package --get_model lac
tar -xzvf lac.tar.gz tar -xzvf lac.tar.gz
tar -xzvf senta_bilstm.tar.gz tar -xzvf senta_bilstm.tar.gz
``` ```
## 启动HTTP服务 ## 启动HTTP服务
``` ```
python -m paddle_serving_server.serve --model lac_model --port 9300 python3 -m paddle_serving_server.serve --model lac_model --port 9300
python senta_web_service.py python3 senta_web_service.py
``` ```
中文情感分类任务中需要先通过[LAC任务](../lac)进行中文分词。 中文情感分类任务中需要先通过[LAC任务](../lac)进行中文分词。
示例中将LAC任务放在情感分类任务的HTTP预测服务的预处理部分。 示例中将LAC任务放在情感分类任务的HTTP预测服务的预处理部分。
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model unet python3 -m paddle_serving_app.package --get_model unet
tar -xzvf unet.tar.gz tar -xzvf unet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 python3 -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494
``` ```
### Client Prediction ### Client Prediction
``` ```
python seg_client.py python3 seg_client.py
``` ```
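A hypothetical sketch of what `seg_client.py` might look like; the feed/fetch names, the sample image, and the preprocessing chain are assumptions rather than values taken from this commit:

```
# Sketch only: feed name "image", fetch name "output", and the resize shape are assumptions.
from paddle_serving_client import Client
from paddle_serving_app.reader import Sequential, File2Image, Resize, SegPostprocess

client = Client()
client.load_client_config("unet_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9494"])

preprocess = Sequential([File2Image(), Resize((512, 512))])
postprocess = SegPostprocess(2)  # 2 classes: foreground / background

filename = "sample.jpg"
im = preprocess(filename)
fetch_map = client.predict(feed={"image": im}, fetch=["output"])
fetch_map["filename"] = filename  # SegPostprocess writes the mask next to the input file
postprocess(fetch_map)
```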
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## 获取模型 ## 获取模型
``` ```
python -m paddle_serving_app.package --get_model unet python3 -m paddle_serving_app.package --get_model unet
tar -xzvf unet.tar.gz tar -xzvf unet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz
### 启动服务端 ### 启动服务端
``` ```
python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 python3 -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494
``` ```
### 客户端预测 ### 客户端预测
``` ```
python seg_client.py python3 seg_client.py
``` ```
...@@ -13,14 +13,14 @@ In order to show the time consuming of each stage more intuitively, a script is ...@@ -13,14 +13,14 @@ In order to show the time consuming of each stage more intuitively, a script is
When using, first save the output of the client to a file, taking `profile` as an example. When using, first save the output of the client to a file, taking `profile` as an example.
``` ```
python show_profile.py profile ${thread_num} python3 show_profile.py profile ${thread_num}
``` ```
Here the `thread_num` parameter is the number of processes when the client is running, and the script will calculate the average time spent in each phase according to this parameter. Here the `thread_num` parameter is the number of processes when the client is running, and the script will calculate the average time spent in each phase according to this parameter.
The script calculates the time spent in each stage, divides by the number of threads to average, and prints to standard output. The script calculates the time spent in each stage, divides by the number of threads to average, and prints to standard output.
``` ```
python timeline_trace.py profile trace python3 timeline_trace.py profile trace
``` ```
The script converts the time-dot information in the log into a json format and saves it to a trace file. The trace file can be visualized through the tracing function of the Chrome browser. The script converts the time-dot information in the log into a json format and saves it to a trace file. The trace file can be visualized through the tracing function of the Chrome browser.
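Putting the pieces together, a typical profiling run might look like the following; the client script name is illustrative, while `FLAGS_profile_client`/`FLAGS_profile_server` are the switches referred to above:

```
# Enable per-stage timing on both client and server
export FLAGS_profile_client=1
export FLAGS_profile_server=1

# Run the client and keep its output (the script name here is an assumption)
python3 test_client.py > profile 2>&1

# Summarize per-stage latency, averaged over 10 client processes
python3 show_profile.py profile 10

# Convert the timing dots into a Chrome-tracing file
python3 timeline_trace.py profile trace
```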
......
...@@ -13,14 +13,14 @@ export FLAGS_profile_server=1 #开启server端各阶段时间打点 ...@@ -13,14 +13,14 @@ export FLAGS_profile_server=1 #开启server端各阶段时间打点
使用时先将client的输出保存到文件,以profile为例。 使用时先将client的输出保存到文件,以profile为例。
``` ```
python show_profile.py profile ${thread_num} python3 show_profile.py profile ${thread_num}
``` ```
这里thread_num参数为client运行时的进程数,脚本将按照这个参数来计算各阶段的平均耗时。 这里thread_num参数为client运行时的进程数,脚本将按照这个参数来计算各阶段的平均耗时。
脚本将计算各阶段的耗时,并除以线程数做平均,打印到标准输出。 脚本将计算各阶段的耗时,并除以线程数做平均,打印到标准输出。
``` ```
python timeline_trace.py profile trace python3 timeline_trace.py profile trace
``` ```
脚本将日志中的时间打点信息转换成json格式保存到trace文件,trace文件可以通过chrome浏览器的tracing功能进行可视化。 脚本将日志中的时间打点信息转换成json格式保存到trace文件,trace文件可以通过chrome浏览器的tracing功能进行可视化。
......
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader import ChineseBertReader
import sys
import os
import numpy as np
class BertService(WebService):
def load(self):
self.reader = ChineseBertReader({
"vocab_file": "vocab.txt",
"max_seq_len": 128
})
def preprocess(self, feed=[], fetch=[]):
feed_res = []
is_batch = False
for ins in feed:
feed_dict = self.reader.process(ins["words"].encode("utf-8"))
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape(
(len(feed_dict[key]), 1))
feed_res.append(feed_dict)
return feed_res, fetch, is_batch
bert_service = BertService(name="bert")
bert_service.load()
bert_service.load_model_config(sys.argv[1])
bert_service.prepare_server(
workdir="workdir", port=int(sys.argv[2]), use_lite=True, use_xpu=True, ir_optim=True)
bert_service.run_rpc_service()
bert_service.run_web_service()
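Once this service is running, it could be exercised roughly as follows; the script filename, the model directory, and the fetch name are assumptions, not values from this commit:

```
# Model directory and port are positional arguments of the script above
python3 bert_web_service.py bert_seq128_model 9292 &
# "pooled_output" is an assumed fetch name; check serving_server_conf.prototxt
curl -H "Content-Type:application/json" -X POST \
    -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' \
    http://127.0.0.1:9292/bert/prediction
```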
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader import ChineseBertReader
import sys
import os
import numpy as np
class BertService(WebService):
def load(self):
self.reader = ChineseBertReader({
"vocab_file": "vocab.txt",
"max_seq_len": 128
})
def preprocess(self, feed=[], fetch=[]):
feed_res = []
is_batch = False
for ins in feed:
feed_dict = self.reader.process(ins["words"].encode("utf-8"))
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape(
(len(feed_dict[key]), 1))
feed_res.append(feed_dict)
return feed_res, fetch, is_batch
bert_service = BertService(name="bert")
bert_service.load()
bert_service.load_model_config(sys.argv[1])
bert_service.prepare_server(
workdir="workdir", port=int(sys.argv[2]), use_lite=True, use_xpu=True, ir_optim=True)
bert_service.run_rpc_service()
bert_service.run_web_service()
...@@ -13,28 +13,13 @@ sh get_data.sh ...@@ -13,28 +13,13 @@ sh get_data.sh
### Start server ### Start server
You can use the following code to start the RPC service You can use the following code to start the RPC service
```shell ```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim
``` ```
### Client prediction ### Client prediction
The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`). The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip3 install paddlepaddle`).
``` shell ``` shell
python test_client.py uci_housing_client/serving_client_conf.prototxt python3 test_client.py uci_housing_client/serving_client_conf.prototxt
```
## HTTP service
### Start server
Start a web service with default web service hosting modules:
``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci
```
### Client prediction
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
``` ```
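A minimal sketch of what `test_client.py` might look like for this model; the feed name `x` and fetch name `price` match the curl example elsewhere in this changeset, while the sample values are the uci_housing ones used throughout:

```
# Sketch only: assumes the uci_housing feed/fetch names used elsewhere in this repo.
import sys
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config(sys.argv[1])  # uci_housing_client/serving_client_conf.prototxt
client.connect(["127.0.0.1:9393"])

x = np.array([[0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
               -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]], dtype="float32")
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=True)
print(fetch_map)
```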
...@@ -15,35 +15,19 @@ sh get_data.sh ...@@ -15,35 +15,19 @@ sh get_data.sh
### 开启服务端 ### 开启服务端
``` shell ``` shell
python test_server.py uci_housing_model/ python3 test_server.py uci_housing_model/
``` ```
也可以通过下面的一行代码开启默认RPC服务: 也可以通过下面的一行代码开启默认RPC服务:
```shell ```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim
``` ```
### 客户端预测 ### 客户端预测
`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 `test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip3 install paddlepaddle`)。
``` shell ``` shell
python test_client.py uci_housing_client/serving_client_conf.prototxt python3 test_client.py uci_housing_client/serving_client_conf.prototxt
```
## HTTP服务
### 开启服务端
通过下面的一行代码开启默认web服务:
``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci
```
### 客户端预测
``` shell
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction
``` ```
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet
tar -xzvf resnet_v2_50_imagenet.tar.gz tar -xzvf resnet_v2_50_imagenet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
``` ```
### Client Prediction ### Client Prediction
``` ```
python resnet50_client.py python3 resnet50_client.py
``` ```
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
## 获取模型 ## 获取模型
``` ```
python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet
tar -xzvf resnet_v2_50_imagenet.tar.gz tar -xzvf resnet_v2_50_imagenet.tar.gz
``` ```
...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### 启动服务端 ### 启动服务端
``` ```
python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
``` ```
### 客户端预测 ### 客户端预测
``` ```
python resnet50_client.py python3 resnet50_client.py
``` ```
...@@ -26,5 +26,5 @@ python3 -m paddle_serving_server.serve --model serving_server --port 7702 --use_ ...@@ -26,5 +26,5 @@ python3 -m paddle_serving_server.serve --model serving_server --port 7702 --use_
### Client Prediction ### Client Prediction
``` ```
python vgg19_client.py python3 vgg19_client.py
``` ```
...@@ -5,19 +5,19 @@ ...@@ -5,19 +5,19 @@
## Get Model ## Get Model
``` ```
python -m paddle_serving_app.package --get_model yolov4 python3 -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz tar -xzvf yolov4.tar.gz
``` ```
## Start RPC Service ## Start RPC Service
``` ```
python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 python3 -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0
``` ```
## Prediction ## Prediction
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
After the prediction is completed, a JSON file with the prediction results and an image annotated with the detected bounding boxes will be generated in the `./output` folder. After the prediction is completed, a JSON file with the prediction results and an image annotated with the detected bounding boxes will be generated in the `./output` folder.
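A hypothetical sketch of `test_client.py` for this demo; the feed/fetch names, the label file, and the resize shape are assumptions and should be checked against the client prototxt shipped with the model:

```
# Sketch only: "image"/"im_size" feed names, the fetch name placeholder, and
# label_list.txt are assumptions.
import sys
import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import (Sequential, File2Image, BGR2RGB, Resize,
                                       Div, Transpose, RCNNPostprocess)

preprocess = Sequential([
    File2Image(), BGR2RGB(), Resize((608, 608)), Div(255.0), Transpose((2, 0, 1))
])
postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608])

client = Client()
client.load_client_config("yolov4_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

im = preprocess(sys.argv[1])  # e.g. 000000570688.jpg
fetch_map = client.predict(
    feed={"image": im, "im_size": np.array(list(im.shape[1:]))},
    fetch=["multiclass_nms_0.tmp_0"])  # placeholder fetch name
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)  # writes the json and the annotated image under ./output
```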
...@@ -5,20 +5,20 @@ ...@@ -5,20 +5,20 @@
## 获取模型 ## 获取模型
``` ```
python -m paddle_serving_app.package --get_model yolov4 python3 -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz tar -xzvf yolov4.tar.gz
``` ```
## 启动RPC服务 ## 启动RPC服务
``` ```
python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 python3 -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0
``` ```
## 预测 ## 预测
``` ```
python test_client.py 000000570688.jpg python3 test_client.py 000000570688.jpg
``` ```
预测完成会在`./output`文件夹下生成保存预测结果的json文件以及标出检测结果框的图片。 预测完成会在`./output`文件夹下生成保存预测结果的json文件以及标出检测结果框的图片。
...@@ -22,6 +22,7 @@ import argparse ...@@ -22,6 +22,7 @@ import argparse
from .proto import general_model_config_pb2 as m_config from .proto import general_model_config_pb2 as m_config
import paddle.inference as paddle_infer import paddle.inference as paddle_infer
import logging import logging
import glob
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("LocalPredictor") logger = logging.getLogger("LocalPredictor")
...@@ -51,6 +52,23 @@ class LocalPredictor(object): ...@@ -51,6 +52,23 @@ class LocalPredictor(object):
self.fetch_names_to_idx_ = {} self.fetch_names_to_idx_ = {}
self.fetch_names_to_type_ = {} self.fetch_names_to_type_ = {}
def search_suffix_files(self, model_path, target_suffix):
"""
Find all files with the suffix xxx in the specified directory.
Args:
model_path: model directory, not None.
target_suffix: filenames with target suffix, not None. e.g: *.pdmodel
Returns:
            file_list: None if the inputs are invalid, otherwise a (possibly empty) list of matching file paths.
"""
if model_path is None or target_suffix is None:
return None
file_list = glob.glob(os.path.join(model_path, target_suffix))
return file_list
def load_model_config(self, def load_model_config(self,
model_path, model_path,
use_gpu=False, use_gpu=False,
...@@ -97,11 +115,30 @@ class LocalPredictor(object): ...@@ -97,11 +115,30 @@ class LocalPredictor(object):
f = open(client_config, 'r') f = open(client_config, 'r')
model_conf = google.protobuf.text_format.Merge( model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf) str(f.read()), model_conf)
# Init paddle_infer config
# Paddle's model files and parameter files have multiple naming rules:
# 1) __model__, __params__
# 2) *.pdmodel, *.pdiparams
# 3) __model__, conv2d_1.w_0, conv2d_2.w_0, fc_1.w_0, conv2d_1.b_0, ...
pdmodel_file_list = self.search_suffix_files(model_path, "*.pdmodel")
pdiparams_file_list = self.search_suffix_files(model_path,
"*.pdiparams")
if os.path.exists(os.path.join(model_path, "__params__")): if os.path.exists(os.path.join(model_path, "__params__")):
# case 1) initializing
config = paddle_infer.Config( config = paddle_infer.Config(
os.path.join(model_path, "__model__"), os.path.join(model_path, "__model__"),
os.path.join(model_path, "__params__")) os.path.join(model_path, "__params__"))
elif pdmodel_file_list and len(
pdmodel_file_list) > 0 and pdiparams_file_list and len(
pdiparams_file_list) > 0:
# case 2) initializing
logger.info("pdmodel_file_list:{}, pdiparams_file_list:{}".format(
pdmodel_file_list, pdiparams_file_list))
config = paddle_infer.Config(pdmodel_file_list[0],
pdiparams_file_list[0])
else: else:
# case 3) initializing.
config = paddle_infer.Config(model_path) config = paddle_infer.Config(model_path)
logger.info( logger.info(
...@@ -201,8 +238,9 @@ class LocalPredictor(object): ...@@ -201,8 +238,9 @@ class LocalPredictor(object):
Run model inference by Paddle Inference API. Run model inference by Paddle Inference API.
Args: Args:
feed: feed var feed: feed var list, None is not allowed.
fetch: fetch var fetch: fetch var list, None allowed. when it is None, all fetch
vars are returned. Otherwise, return fetch specified result.
batch: batch data or not, False default.If batch is False, a new batch: batch data or not, False default.If batch is False, a new
dimension is added to header of the shape[np.newaxis]. dimension is added to header of the shape[np.newaxis].
log_id: for logging log_id: for logging
...@@ -210,16 +248,8 @@ class LocalPredictor(object): ...@@ -210,16 +248,8 @@ class LocalPredictor(object):
Returns: Returns:
fetch_map: dict fetch_map: dict
""" """
if feed is None or fetch is None: if feed is None:
raise ValueError("You should specify feed and fetch for prediction.\ raise ValueError("You should specify feed vars for prediction.\
log_id:{}".format(log_id))
fetch_list = []
if isinstance(fetch, str):
fetch_list = [fetch]
elif isinstance(fetch, list):
fetch_list = fetch
else:
raise ValueError("Fetch only accepts string and list of string.\
log_id:{}".format(log_id)) log_id:{}".format(log_id))
feed_batch = [] feed_batch = []
...@@ -231,18 +261,20 @@ class LocalPredictor(object): ...@@ -231,18 +261,20 @@ class LocalPredictor(object):
raise ValueError("Feed only accepts dict and list of dict.\ raise ValueError("Feed only accepts dict and list of dict.\
log_id:{}".format(log_id)) log_id:{}".format(log_id))
fetch_names = [] fetch_list = []
if fetch is not None:
if isinstance(fetch, str):
fetch_list = [fetch]
elif isinstance(fetch, list):
fetch_list = fetch
# Filter invalid fetch names # Filter invalid fetch names
fetch_names = []
for key in fetch_list: for key in fetch_list:
if key in self.fetch_names_: if key in self.fetch_names_:
fetch_names.append(key) fetch_names.append(key)
if len(fetch_names) == 0: # Assemble the input data of paddle predictor, and filter invalid inputs.
raise ValueError(
"Fetch names should not be empty or out of saved fetch list.\
log_id:{}".format(log_id))
# Assemble the input data of paddle predictor
input_names = self.predictor.get_input_names() input_names = self.predictor.get_input_names()
for name in input_names: for name in input_names:
if isinstance(feed[name], list): if isinstance(feed[name], list):
...@@ -282,11 +314,15 @@ class LocalPredictor(object): ...@@ -282,11 +314,15 @@ class LocalPredictor(object):
input_tensor_handle.copy_from_cpu(feed[name][np.newaxis, :]) input_tensor_handle.copy_from_cpu(feed[name][np.newaxis, :])
else: else:
input_tensor_handle.copy_from_cpu(feed[name]) input_tensor_handle.copy_from_cpu(feed[name])
# set output tensor handlers
output_tensor_handles = [] output_tensor_handles = []
output_name_to_index_dict = {}
output_names = self.predictor.get_output_names() output_names = self.predictor.get_output_names()
for output_name in output_names: for i, output_name in enumerate(output_names):
output_tensor_handle = self.predictor.get_output_handle(output_name) output_tensor_handle = self.predictor.get_output_handle(output_name)
output_tensor_handles.append(output_tensor_handle) output_tensor_handles.append(output_tensor_handle)
output_name_to_index_dict[output_name] = i
# Run inference # Run inference
self.predictor.run() self.predictor.run()
...@@ -296,10 +332,43 @@ class LocalPredictor(object): ...@@ -296,10 +332,43 @@ class LocalPredictor(object):
for output_tensor_handle in output_tensor_handles: for output_tensor_handle in output_tensor_handles:
output = output_tensor_handle.copy_to_cpu() output = output_tensor_handle.copy_to_cpu()
outputs.append(output) outputs.append(output)
outputs_len = len(outputs)
# Copy fetch vars. If fetch is None, it will copy all results from output_tensor_handles.
# Otherwise, it will copy the fields specified from output_tensor_handles.
fetch_map = {} fetch_map = {}
for i, name in enumerate(fetch): if fetch is None:
fetch_map[name] = outputs[i] for i, name in enumerate(output_names):
if len(output_tensor_handles[i].lod()) > 0: fetch_map[name] = outputs[i]
fetch_map[name + ".lod"] = np.array(output_tensor_handles[i] if len(output_tensor_handles[i].lod()) > 0:
.lod()[0]).astype('int32') fetch_map[name + ".lod"] = np.array(output_tensor_handles[
i].lod()[0]).astype('int32')
else:
            # Because the save_inference_model interface may add a scale op to the
            # network, the fetch_var names can differ from those in the prototxt.
            # The code below therefore stays compatible with v0.6.x and earlier
            # model-saving formats and handles the case where the names do not match.
fetch_match_num = 0
for i, name in enumerate(fetch):
output_index = output_name_to_index_dict.get(name)
if output_index is None:
continue
fetch_map[name] = outputs[output_index]
fetch_match_num += 1
if len(output_tensor_handles[output_index].lod()) > 0:
fetch_map[name + ".lod"] = np.array(output_tensor_handles[
output_index].lod()[0]).astype('int32')
# Compatible with v0.6.x and lower versions model saving formats.
if fetch_match_num == 0:
logger.debug("fetch match num is 0. Retrain the model please!")
for i, name in enumerate(fetch):
if i >= outputs_len:
break
fetch_map[name] = outputs[i]
if len(output_tensor_handles[i].lod()) > 0:
fetch_map[name + ".lod"] = np.array(
output_tensor_handles[i].lod()[0]).astype('int32')
return fetch_map return fetch_map
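With this change, `fetch` becomes optional for the local predictor: passing `None` returns every output of the model, keyed by its real output name. A hypothetical usage sketch (the model directory and the feed name `x` are taken from the uci_housing examples in this changeset, but remain assumptions):

```
# Sketch only: model path and feed name "x" are assumptions.
import numpy as np
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config("uci_housing_model", use_gpu=False)

x = np.array([[0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
               -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]], dtype="float32")

# fetch=None now returns all outputs
fetch_map = predictor.predict(feed={"x": x}, fetch=None, batch=True)
print(fetch_map.keys())
```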
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from .chinese_bert_reader import ChineseBertReader from .chinese_bert_reader import ChineseBertReader
from .image_reader import ImageReader, File2Image, URL2Image, Sequential, Normalize, Base64ToImage from .image_reader import ImageReader, File2Image, URL2Image, Sequential, Normalize, Base64ToImage
from .image_reader import DetectionFile2Image, DetectionSequential, DetectionNormalize, DetectionTranspose, DetectionResize, DetectionBGR2RGB, DetectionPadStride
from .image_reader import CenterCrop, Resize, Transpose, Div, RGB2BGR, BGR2RGB, ResizeByFactor from .image_reader import CenterCrop, Resize, Transpose, Div, RGB2BGR, BGR2RGB, ResizeByFactor
from .image_reader import RCNNPostprocess, SegPostprocess, PadStride, BlazeFacePostprocess from .image_reader import RCNNPostprocess, SegPostprocess, PadStride, BlazeFacePostprocess
from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
......
...@@ -498,6 +498,42 @@ class Sequential(object): ...@@ -498,6 +498,42 @@ class Sequential(object):
return format_string_ return format_string_
class DetectionSequential(object):
"""
Args:
sequence (sequence of ``Transform`` objects): list of transforms to chain.
        This API references some of the design patterns of torchvision.
        Users can simply use this API in training as well.
        Example:
        >>> image_reader.DetectionSequential([
>>> transforms.CenterCrop(10),
>>> ])
"""
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, im):
im_info = {
'scale_factor': np.array(
[1., 1.], dtype=np.float32),
'im_shape': None,
}
for t in self.transforms:
im, im_info = t(im, im_info)
return im, im_info
def __repr__(self):
format_string_ = self.__class__.__name__ + '('
for t in self.transforms:
format_string_ += '\n'
format_string_ += ' {0}'.format(t)
format_string_ += '\n)'
return format_string_
class RGB2BGR(object): class RGB2BGR(object):
def __init__(self): def __init__(self):
pass pass
...@@ -520,6 +556,17 @@ class BGR2RGB(object): ...@@ -520,6 +556,17 @@ class BGR2RGB(object):
return self.__class__.__name__ + "()" return self.__class__.__name__ + "()"
class DetectionBGR2RGB(object):
def __init__(self):
pass
def __call__(self, img, img_info=None):
return img[:, :, ::-1], img_info
def __repr__(self):
return self.__class__.__name__ + "()"
class String2Image(object): class String2Image(object):
def __init__(self): def __init__(self):
pass pass
...@@ -556,6 +603,33 @@ class File2Image(object): ...@@ -556,6 +603,33 @@ class File2Image(object):
def __repr__(self): def __repr__(self):
return self.__class__.__name__ + "()" return self.__class__.__name__ + "()"
class DetectionFile2Image(object):
def __init__(self):
pass
def __call__(self, img_path, im_info=None):
if py_version == 2:
fin = open(img_path)
else:
fin = open(img_path, "rb")
sample = fin.read()
data = np.fromstring(sample, np.uint8)
img = cv2.imdecode(data, cv2.IMREAD_COLOR)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
'''
img = cv2.imread(img_path, -1)
channels = img.shape[2]
ori_h = img.shape[0]
ori_w = img.shape[1]
'''
if im_info is not None:
im_info['im_shape'] = np.array(img.shape[:2], dtype=np.float32)
im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return img, im_info
def __repr__(self):
return self.__class__.__name__ + "()"
class URL2Image(object): class URL2Image(object):
def __init__(self): def __init__(self):
...@@ -607,6 +681,27 @@ class Div(object): ...@@ -607,6 +681,27 @@ class Div(object):
def __repr__(self): def __repr__(self):
return self.__class__.__name__ + "({})".format(self.value) return self.__class__.__name__ + "({})".format(self.value)
class DetectionDiv(object):
""" divide by some float number """
def __init__(self, value):
self.value = value
def __call__(self, img, img_info=None):
"""
Args:
img (numpy array): (int8 numpy array)
Returns:
img (numpy array): (float32 numpy array)
"""
img = img.astype('float32') / self.value
return img, img_info
def __repr__(self):
return self.__class__.__name__ + "({})".format(self.value)
class Normalize(object): class Normalize(object):
"""Normalize a tensor image with mean and standard deviation. """Normalize a tensor image with mean and standard deviation.
...@@ -643,6 +738,51 @@ class Normalize(object): ...@@ -643,6 +738,51 @@ class Normalize(object):
self.std) self.std)
class DetectionNormalize(object):
"""Normalize a tensor image with mean and standard deviation.
Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform
will normalize each channel of the input ``torch.*Tensor`` i.e.
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
.. note::
This transform acts out of place, i.e., it does not mutate the input tensor.
Args:
mean (sequence): Sequence of means for each channel.
std (sequence): Sequence of standard deviations for each channel.
is_scale (bool): whether need im / 255
"""
def __init__(self, mean, std, is_scale=True):
self.mean = mean
self.std = std
self.is_scale = is_scale
def __call__(self, im, im_info=None):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.astype(np.float32, copy=False)
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
if self.is_scale:
im = im / 255.0
im -= mean
im /= std
return im, im_info
def __repr__(self):
return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean,
self.std)
class Lambda(object): class Lambda(object):
"""Apply a user-defined lambda as a transform. """Apply a user-defined lambda as a transform.
Very shame to just copy from Very shame to just copy from
...@@ -716,6 +856,124 @@ class Resize(object): ...@@ -716,6 +856,124 @@ class Resize(object):
self.size, self.max_size, self.size, self.max_size,
_cv2_interpolation_to_str[self.interpolation]) _cv2_interpolation_to_str[self.interpolation])
class DetectionResize(object):
"""resize image by target_size and max_size
Args:
        target_size (int|list): the target size of the image
        keep_ratio (bool): whether to keep the aspect ratio, default True
        interpolation (int): interpolation method used for resizing
"""
def __init__(self, target_size, keep_ratio=True, interpolation=cv2.INTER_LINEAR):
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
self.keep_ratio = keep_ratio
self.interpolation = interpolation
def __call__(self, im, im_info=None):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
im_channel = im.shape[2]
im_scale_y, im_scale_x = self.generate_scale(im)
im = cv2.resize(
im,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interpolation)
if im_info is not None:
im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
im_info['scale_factor'] = np.array(
[im_scale_y, im_scale_x]).astype('float32')
return im, im_info
def generate_scale(self, im):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
im_c = im.shape[2]
if self.keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = float(target_size_min) / float(im_size_min)
if np.round(im_scale * im_size_max) > target_size_max:
im_scale = float(target_size_max) / float(im_size_max)
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / float(origin_shape[0])
im_scale_x = resize_w / float(origin_shape[1])
return im_scale_y, im_scale_x
def __repr__(self):
        return self.__class__.__name__ + '(target_size={0}, keep_ratio={1}, interpolation={2})'.format(
            self.target_size, self.keep_ratio,
_cv2_interpolation_to_str[self.interpolation])
class PadStride(object):
def __init__(self, stride):
self.coarsest_stride = stride
def __call__(self, img):
coarsest_stride = self.coarsest_stride
if coarsest_stride == 0:
return img
im_c, im_h, im_w = img.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = img
im_info = {}
im_info['resize_shape'] = padding_im.shape[1:]
return padding_im
class DetectionPadStride(object):
""" padding image for model with FPN, instead PadBatch(pad_to_stride) in original config
Args:
stride (bool): model with FPN need image shape % stride == 0
"""
def __init__(self, stride=0):
self.coarsest_stride = stride
def __call__(self, im, im_info=None):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
coarsest_stride = self.coarsest_stride
if coarsest_stride <= 0:
return im
im_c, im_h, im_w = im.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
return padding_im, im_info
class ResizeByFactor(object): class ResizeByFactor(object):
"""Resize the input numpy array Image to a size multiple of factor which is usually required by a network """Resize the input numpy array Image to a size multiple of factor which is usually required by a network
...@@ -768,24 +1026,6 @@ class ResizeByFactor(object): ...@@ -768,24 +1026,6 @@ class ResizeByFactor(object):
self.factor, self.max_side_len) self.factor, self.max_side_len)
class PadStride(object):
def __init__(self, stride):
self.coarsest_stride = stride
def __call__(self, img):
coarsest_stride = self.coarsest_stride
if coarsest_stride == 0:
return img
im_c, im_h, im_w = img.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = img
im_info = {}
im_info['resize_shape'] = padding_im.shape[1:]
return padding_im
class Transpose(object): class Transpose(object):
def __init__(self, transpose_target): def __init__(self, transpose_target):
self.transpose_target = transpose_target self.transpose_target = transpose_target
...@@ -799,6 +1039,19 @@ class Transpose(object): ...@@ -799,6 +1039,19 @@ class Transpose(object):
"({})".format(self.transpose_target) "({})".format(self.transpose_target)
return format_string return format_string
class DetectionTranspose(object):
def __init__(self, transpose_target):
self.transpose_target = transpose_target
def __call__(self, im, im_info=None):
im = F.transpose(im, self.transpose_target)
return im, im_info
def __repr__(self):
format_string = self.__class__.__name__ + \
"({})".format(self.transpose_target)
return format_string
class SortedBoxes(object): class SortedBoxes(object):
""" """
......
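Taken together, the new `Detection*` ops are meant to be chained like the existing `Sequential` transforms, while threading an `im_info` dict through every step. A hypothetical pipeline sketch (the target size, the normalization constants, and the sample file are illustrative only, not values from this commit):

```
# Illustrative only: target size, mean/std, and the image file are assumptions.
from paddle_serving_app.reader import (DetectionSequential, DetectionFile2Image,
                                       DetectionResize, DetectionNormalize,
                                       DetectionTranspose)

preprocess = DetectionSequential([
    DetectionFile2Image(),
    DetectionResize((608, 608), keep_ratio=False),
    DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], is_scale=True),
    DetectionTranspose((2, 0, 1)),
])

im, im_info = preprocess("000000570688.jpg")
print(im.shape, im_info["im_shape"], im_info["scale_factor"])
```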
...@@ -31,15 +31,21 @@ sys.path.append( ...@@ -31,15 +31,21 @@ sys.path.append(
#param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64 #param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64
#param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32 #param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32
#param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32 #param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32
#param 'type'(which is in feed_var or fetch_var) = 3 means dataType is string(also called bytes in proto) #param 'type'(which is in feed_var or fetch_var) = 5 means dataType is float16
#param 'type'(which is in feed_var or fetch_var) = 7 means dataType is uint8
#param 'type'(which is in feed_var or fetch_var) = 8 means dataType is int8
#param 'type'(which is in feed_var or fetch_var) = 20 means dataType is string(also called bytes in proto)
int64_type = 0 int64_type = 0
float32_type = 1 float32_type = 1
int32_type = 2 int32_type = 2
bytes_type = 3 float16_type = 5
uint8_type = 7
int8_type = 8
bytes_type = 20
#int_type,float_type,string_type are the set of each subdivision classes. #int_type,float_type,string_type are the set of each subdivision classes.
int_type = set([int64_type, int32_type]) int_type = set([int64_type, int32_type])
float_type = set([float32_type]) float_type = set([float32_type])
string_type = set([bytes_type]) string_type = set([bytes_type, float16_type, uint8_type, int8_type])
class _NOPProfiler(object): class _NOPProfiler(object):
...@@ -289,31 +295,39 @@ class Client(object): ...@@ -289,31 +295,39 @@ class Client(object):
log_id=0): log_id=0):
self.profile_.record('py_prepro_0') self.profile_.record('py_prepro_0')
        if feed is None or fetch is None:         # fetch may be empty; in that case all outputs are returned
raise ValueError("You should specify feed and fetch for prediction") if feed is None:
raise ValueError("You should specify feed for prediction")
fetch_list = [] fetch_list = []
if isinstance(fetch, str): if isinstance(fetch, str):
fetch_list = [fetch] fetch_list = [fetch]
elif isinstance(fetch, list): elif isinstance(fetch, list):
fetch_list = fetch fetch_list = fetch
        # fetch may be empty; in that case all outputs are returned
elif fetch == None:
pass
else: else:
raise ValueError("Fetch only accepts string and list of string") raise ValueError("Fetch only accepts string or list of string")
feed_batch = [] feed_batch = []
if isinstance(feed, dict): if isinstance(feed, dict):
feed_batch.append(feed) feed_batch.append(feed)
elif isinstance(feed, list): elif isinstance(feed, list):
# if input is a list and the number of feed_var is 1. # feed = [dict]
# create a temp_dict { key = feed_var_name, value = list} if len(feed) == 1 and isinstance(feed[0], dict):
# put the temp_dict into the feed_batch. feed_batch = feed
if len(self.feed_names_) != 1: else:
raise ValueError( # if input is a list and the number of feed_var is 1.
"input is a list, but we got 0 or 2+ feed_var, don`t know how to divide the feed list" # create a temp_dict { key = feed_var_name, value = list}
) # put the temp_dict into the feed_batch.
temp_dict = {} if len(self.feed_names_) != 1:
temp_dict[self.feed_names_[0]] = feed raise ValueError(
feed_batch.append(temp_dict) "input is a list, but we got 0 or 2+ feed_var, don`t know how to divide the feed list"
)
temp_dict = {}
temp_dict[self.feed_names_[0]] = feed
feed_batch.append(temp_dict)
else: else:
raise ValueError("Feed only accepts dict and list of dict") raise ValueError("Feed only accepts dict and list of dict")
...@@ -321,10 +335,15 @@ class Client(object): ...@@ -321,10 +335,15 @@ class Client(object):
if len(feed_batch) != 1: if len(feed_batch) != 1:
raise ValueError("len of feed_batch can only be 1.") raise ValueError("len of feed_batch can only be 1.")
int_slot = [] int32_slot = []
int_feed_names = [] int32_feed_names = []
int_shape = [] int32_shape = []
int_lod_slot_batch = [] int32_lod_slot_batch = []
int64_slot = []
int64_feed_names = []
int64_shape = []
int64_lod_slot_batch = []
float_slot = [] float_slot = []
float_feed_names = [] float_feed_names = []
...@@ -341,10 +360,6 @@ class Client(object): ...@@ -341,10 +360,6 @@ class Client(object):
if key in self.fetch_names_: if key in self.fetch_names_:
fetch_names.append(key) fetch_names.append(key)
if len(fetch_names) == 0:
raise ValueError(
"Fetch names should not be empty or out of saved fetch list.")
feed_dict = feed_batch[0] feed_dict = feed_batch[0]
for key in feed_dict: for key in feed_dict:
if ".lod" not in key and key not in self.feed_names_: if ".lod" not in key and key not in self.feed_names_:
...@@ -354,27 +369,39 @@ class Client(object): ...@@ -354,27 +369,39 @@ class Client(object):
self.shape_check(feed_dict, key) self.shape_check(feed_dict, key)
if self.feed_types_[key] in int_type: if self.feed_types_[key] in int_type:
int_feed_names.append(key)
shape_lst = [] shape_lst = []
if batch == False: if batch == False:
feed_dict[key] = np.expand_dims(feed_dict[key], 0).repeat( feed_dict[key] = np.expand_dims(feed_dict[key], 0).repeat(
1, axis=0) 1, axis=0)
if isinstance(feed_dict[key], np.ndarray): # verify different input int_type
shape_lst.extend(list(feed_dict[key].shape)) if(self.feed_types_[key] == int64_type):
int_shape.append(shape_lst) int64_feed_names.append(key)
else: if isinstance(feed_dict[key], np.ndarray):
int_shape.append(self.feed_shapes_[key]) shape_lst.extend(list(feed_dict[key].shape))
if "{}.lod".format(key) in feed_dict: int64_shape.append(shape_lst)
int_lod_slot_batch.append(feed_dict["{}.lod".format(key)]) self.has_numpy_input = True
else: else:
int_lod_slot_batch.append([]) int64_shape.append(self.feed_shapes_[key])
self.all_numpy_input = False
if isinstance(feed_dict[key], np.ndarray): if "{}.lod".format(key) in feed_dict:
int_slot.append(np.ascontiguousarray(feed_dict[key])) int64_lod_slot_batch.append(feed_dict["{}.lod".format(key)])
self.has_numpy_input = True else:
int64_lod_slot_batch.append([])
int64_slot.append(np.ascontiguousarray(feed_dict[key]))
else: else:
int_slot.append(np.ascontiguousarray(feed_dict[key])) int32_feed_names.append(key)
self.all_numpy_input = False if isinstance(feed_dict[key], np.ndarray):
shape_lst.extend(list(feed_dict[key].shape))
int32_shape.append(shape_lst)
self.has_numpy_input = True
else:
int32_shape.append(self.feed_shapes_[key])
self.all_numpy_input = False
if "{}.lod".format(key) in feed_dict:
int32_lod_slot_batch.append(feed_dict["{}.lod".format(key)])
else:
int32_lod_slot_batch.append([])
int32_slot.append(np.ascontiguousarray(feed_dict[key]))
elif self.feed_types_[key] in float_type: elif self.feed_types_[key] in float_type:
float_feed_names.append(key) float_feed_names.append(key)
...@@ -407,7 +434,10 @@ class Client(object): ...@@ -407,7 +434,10 @@ class Client(object):
key)]) key)])
else: else:
string_lod_slot_batch.append([]) string_lod_slot_batch.append([])
string_slot.append(feed_dict[key]) if type(feed_dict[key]) is np.ndarray:
string_slot.append(feed_dict[key].tostring())
else:
string_slot.append(feed_dict[key])
self.has_numpy_input = True self.has_numpy_input = True
self.profile_.record('py_prepro_1') self.profile_.record('py_prepro_1')
...@@ -417,7 +447,8 @@ class Client(object): ...@@ -417,7 +447,8 @@ class Client(object):
if self.all_numpy_input: if self.all_numpy_input:
res = self.client_handle_.numpy_predict( res = self.client_handle_.numpy_predict(
float_slot, float_feed_names, float_shape, float_lod_slot_batch, float_slot, float_feed_names, float_shape, float_lod_slot_batch,
int_slot, int_feed_names, int_shape, int_lod_slot_batch, int32_slot, int32_feed_names, int32_shape, int32_lod_slot_batch,
int64_slot, int64_feed_names, int64_shape, int64_lod_slot_batch,
string_slot, string_feed_names, string_shape, string_slot, string_feed_names, string_shape,
string_lod_slot_batch, fetch_names, result_batch_handle, string_lod_slot_batch, fetch_names, result_batch_handle,
self.pid, log_id) self.pid, log_id)
...@@ -439,6 +470,9 @@ class Client(object): ...@@ -439,6 +470,9 @@ class Client(object):
model_engine_names = result_batch_handle.get_engine_names() model_engine_names = result_batch_handle.get_engine_names()
for mi, engine_name in enumerate(model_engine_names): for mi, engine_name in enumerate(model_engine_names):
result_map = {} result_map = {}
# fetch 为空,则会取所有的输出结果
if len(fetch_names) == 0:
fetch_names = result_batch_handle.get_tensor_alias_names(mi)
# result map needs to be a numpy array # result map needs to be a numpy array
for i, name in enumerate(fetch_names): for i, name in enumerate(fetch_names):
if self.fetch_names_to_type_[name] == int64_type: if self.fetch_names_to_type_[name] == int64_type:
...@@ -485,6 +519,54 @@ class Client(object): ...@@ -485,6 +519,54 @@ class Client(object):
tmp_lod = result_batch_handle.get_lod(mi, name) tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0: if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == uint8_type:
# result_map[name] will be py::array(numpy array)
tmp_str = result_batch_handle.get_string_by_name(
mi, name)
result_map[name] = np.fromstring(tmp_str, dtype = np.uint8)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == int8_type:
# result_map[name] will be py::array(numpy array)
tmp_str = result_batch_handle.get_string_by_name(
mi, name)
result_map[name] = np.fromstring(tmp_str, dtype = np.int8)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == float16_type:
# result_map[name] will be py::array(numpy array)
tmp_str = result_batch_handle.get_string_by_name(
mi, name)
result_map[name] = np.fromstring(tmp_str, dtype = np.float16)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
multi_result_map.append(result_map) multi_result_map.append(result_map)
ret = None ret = None
if len(model_engine_names) == 1: if len(model_engine_names) == 1:
......
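On the RPC client side the same relaxation applies: `fetch` may now be omitted, in which case the result map is keyed by all tensor alias names the server returns, while int32 and int64 feeds travel through separate slots. A hypothetical call, reusing the uci_housing feed name and endpoint from the examples above:

```
# Sketch of the optional-fetch behaviour; feed name "x" and the endpoint come
# from the uci_housing examples in this changeset.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

x = np.array([[0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
               -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]], dtype="float32")

# fetch=None: the result map contains every output of the model
fetch_map = client.predict(feed={"x": x}, fetch=None, batch=True)
print(fetch_map)
```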
...@@ -23,6 +23,12 @@ from .io import inference_model_to_serving ...@@ -23,6 +23,12 @@ from .io import inference_model_to_serving
def parse_args(): # pylint: disable=doc-string-missing def parse_args(): # pylint: disable=doc-string-missing
parser = argparse.ArgumentParser("convert") parser = argparse.ArgumentParser("convert")
parser.add_argument(
"--show_proto",
type=bool,
default=False,
        help='If set, print the proto so you can preview it and decide the feed var and fetch var alias names.'
)
parser.add_argument( parser.add_argument(
"--dirname", "--dirname",
type=str, type=str,
...@@ -53,6 +59,18 @@ def parse_args(): # pylint: disable=doc-string-missing ...@@ -53,6 +59,18 @@ def parse_args(): # pylint: disable=doc-string-missing
default=None, default=None,
help='The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.' help='The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.'
) )
parser.add_argument(
"--feed_alias_names",
type=str,
default=None,
        help='Set alias names for feed vars, separated by commas. Run --show_proto first to check the number of feed vars.'
)
parser.add_argument(
"--fetch_alias_names",
type=str,
default=None,
        help='Set alias names for fetch vars, separated by commas. Run --show_proto first to check the number of fetch vars.'
)
return parser.parse_args() return parser.parse_args()
...@@ -63,4 +81,7 @@ if __name__ == "__main__": ...@@ -63,4 +81,7 @@ if __name__ == "__main__":
serving_server=args.serving_server, serving_server=args.serving_server,
serving_client=args.serving_client, serving_client=args.serving_client,
model_filename=args.model_filename, model_filename=args.model_filename,
params_filename=args.params_filename) params_filename=args.params_filename,
show_proto=args.show_proto,
feed_alias_names=args.feed_alias_names,
fetch_alias_names=args.fetch_alias_names)
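Assuming the converter keeps its usual entry point (`python3 -m paddle_serving_client.convert`, as in the released packages), the new flags could be used roughly like this; the paths and alias values are placeholders:

```
# Inspect the feed/fetch vars of an inference model without converting it
python3 -m paddle_serving_client.convert --dirname ./inference_model --show_proto True

# Convert and rename the aliases (comma separated, one per var)
python3 -m paddle_serving_client.convert --dirname ./inference_model \
    --model_filename model.pdmodel --params_filename model.pdiparams \
    --serving_server serving_server --serving_client serving_client \
    --feed_alias_names image --fetch_alias_names score
```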
...@@ -22,6 +22,7 @@ import gzip ...@@ -22,6 +22,7 @@ import gzip
from collections import Iterable from collections import Iterable
import base64 import base64
import sys import sys
import re
import grpc import grpc
from .proto import general_model_service_pb2 from .proto import general_model_service_pb2
...@@ -31,13 +32,18 @@ from .proto import general_model_service_pb2_grpc ...@@ -31,13 +32,18 @@ from .proto import general_model_service_pb2_grpc
#param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64 #param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64
#param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32 #param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32
#param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32 #param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32
#param 'type'(which is in feed_var or fetch_var) = 3 means dataType is string(also called bytes in proto) #param 'type'(which is in feed_var or fetch_var) = 20 means dataType is string(also called bytes in proto)
int64_type = 0 int64_type = 0
float32_type = 1 float32_type = 1
int32_type = 2 int32_type = 2
bytes_type = 3 bytes_type = 20
# this is corresponding to the proto # this is corresponding to the proto
proto_data_key_list = ["int64_data", "float_data", "int_data", "data"] proto_data_key_list = {
0: "int64_data",
1: "float_data",
2: "int_data",
20: "data"
}
def list_flatten(items, ignore_types=(str, bytes)): def list_flatten(items, ignore_types=(str, bytes)):
...@@ -73,9 +79,9 @@ def data_bytes_number(datalist): ...@@ -73,9 +79,9 @@ def data_bytes_number(datalist):
# You can directly call the http_client_predict/grpc_client_predict you need # You can directly call the http_client_predict/grpc_client_predict you need
# For example, to use gRPC, call set_use_grpc_client(True) # For example, to use gRPC, call set_use_grpc_client(True)
# or call grpc_client_predict() directly # or call grpc_client_predict() directly
class GeneralClient(object): class HttpClient(object):
def __init__(self, def __init__(self,
ip="0.0.0.0", ip="127.0.0.1",
port="9393", port="9393",
service_name="/GeneralModelService/inference"): service_name="/GeneralModelService/inference"):
self.feed_names_ = [] self.feed_names_ = []
...@@ -84,7 +90,7 @@ class GeneralClient(object): ...@@ -84,7 +90,7 @@ class GeneralClient(object):
self.feed_shapes_ = {} self.feed_shapes_ = {}
self.feed_types_ = {} self.feed_types_ = {}
self.feed_names_to_idx_ = {} self.feed_names_to_idx_ = {}
self.timeout_ms = 200000 self.timeout_ms = 20000
self.ip = ip self.ip = ip
self.port = port self.port = port
self.server_port = port self.server_port = port
...@@ -93,9 +99,24 @@ class GeneralClient(object): ...@@ -93,9 +99,24 @@ class GeneralClient(object):
self.try_request_gzip = False self.try_request_gzip = False
self.try_response_gzip = False self.try_response_gzip = False
self.total_data_number = 0 self.total_data_number = 0
self.headers = {}
self.http_proto = True self.http_proto = True
self.headers["Content-Type"] = "application/proto"
self.max_body_size = 512 * 1024 * 1024 self.max_body_size = 512 * 1024 * 1024
self.use_grpc_client = False self.use_grpc_client = False
self.http_s = "http://"
        # Use a requests session (connection pool) so connections are not re-established for every request
self.requests_session = requests.session()
        # Initialize the gRPC stub
options = [('grpc.max_receive_message_length', self.max_body_size),
('grpc.max_send_message_length', self.max_body_size)]
endpoints = [self.ip + ":" + self.server_port]
g_endpoint = 'ipv4:{}'.format(','.join(endpoints))
self.channel_ = grpc.insecure_channel(g_endpoint, options=options)
self.stub_ = general_model_service_pb2_grpc.GeneralModelServiceStub(
self.channel_)
def load_client_config(self, model_config_path_list): def load_client_config(self, model_config_path_list):
if isinstance(model_config_path_list, str): if isinstance(model_config_path_list, str):
...@@ -162,14 +183,57 @@ class GeneralClient(object): ...@@ -162,14 +183,57 @@ class GeneralClient(object):
else: else:
self.timeout_ms = timeout_ms self.timeout_ms = timeout_ms
def set_ip(self, ip): def set_max_retries(self, retry_times=3):
self.ip = ip if not isinstance(retry_times, int):
raise ValueError("retry_times must be int type.")
else:
self.requests_session.mount(
self.http_s, HTTPAdapter(max_retries=retry_times))
def set_service_name(self, service_name): def set_service_name(self, service_name):
self.service_name = service_name self.service_name = service_name
def set_port(self, port): def connect(self, url=None, encryption=False):
self.port = port if isinstance(url, (list, tuple)):
if len(url) > 1:
raise ValueError("HttpClient only support 1 endpoint")
else:
url = url[0]
if isinstance(url, str):
if url.startswith("https://"):
url = url[8:]
self.http_s = "https://"
if url.startswith("http://"):
url = url[7:]
self.http_s = "http://"
url_parts = url.split(':')
if len(url_parts) != 2 or self.check_ip(url_parts[0]) == False:
raise ValueError(
"url not right, it should be like 127.0.0.1:9393 or http://127.0.0.1:9393"
)
else:
self.ip = url_parts[0]
self.port = url_parts[1]
self.server_port = url_parts[1]
if encryption:
self.get_serving_port()
if self.use_grpc_client:
self.init_grpc_stub()
def check_ip(self, ipAddr):
compile_ip = re.compile(
'^(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|[1-9])\.(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|\d)$'
)
if compile_ip.match(ipAddr):
return True
else:
return False
def add_http_headers(self, headers):
if isinstance(headers, dict):
self.headers.update(headers)
else:
print("headers must be a dict")
def set_request_compress(self, try_request_gzip): def set_request_compress(self, try_request_gzip):
self.try_request_gzip = try_request_gzip self.try_request_gzip = try_request_gzip
...@@ -179,6 +243,10 @@ class GeneralClient(object): ...@@ -179,6 +243,10 @@ class GeneralClient(object):
def set_http_proto(self, http_proto): def set_http_proto(self, http_proto):
self.http_proto = http_proto self.http_proto = http_proto
if self.http_proto:
self.headers["Content-Type"] = "application/proto"
else:
self.headers["Content-Type"] = "application/json"
def set_use_grpc_client(self, use_grpc_client): def set_use_grpc_client(self, use_grpc_client):
self.use_grpc_client = use_grpc_client self.use_grpc_client = use_grpc_client
...@@ -187,21 +255,21 @@ class GeneralClient(object): ...@@ -187,21 +255,21 @@ class GeneralClient(object):
def use_key(self, key_filename): def use_key(self, key_filename):
with open(key_filename, "rb") as f: with open(key_filename, "rb") as f:
self.key = f.read() self.key = f.read()
self.get_serving_port()
def get_serving_port(self): def get_serving_port(self):
encrypt_url = "http://" + str(self.ip) + ":" + str(self.port) encrypt_url = self.http_s + str(self.ip) + ":" + str(self.port)
if self.key is not None: if self.key is not None:
req = json.dumps({"key": base64.b64encode(self.key).decode()}) req = json.dumps({"key": base64.b64encode(self.key).decode()})
else: else:
req = json.dumps({}) req = json.dumps({})
r = requests.post(encrypt_url, req) with requests.post(
result = r.json() encrypt_url, data=req, timeout=self.timeout_ms / 1000) as r:
if "endpoint_list" not in result: result = r.json()
raise ValueError("server not ready") if "endpoint_list" not in result:
else: raise ValueError("server not ready")
self.server_port = str(result["endpoint_list"][0]) else:
print("rpc port is ", self.server_port) self.server_port = str(result["endpoint_list"][0])
print("rpc port is ", self.server_port)
def get_feed_names(self): def get_feed_names(self):
return self.feed_names_ return self.feed_names_
...@@ -210,35 +278,34 @@ class GeneralClient(object): ...@@ -210,35 +278,34 @@ class GeneralClient(object):
return self.fetch_names_ return self.fetch_names_
def get_legal_fetch(self, fetch): def get_legal_fetch(self, fetch):
if fetch is None:
raise ValueError("You should specify feed and fetch for prediction")
fetch_list = [] fetch_list = []
if isinstance(fetch, str): if isinstance(fetch, str):
fetch_list = [fetch] fetch_list = [fetch]
elif isinstance(fetch, (list, tuple)): elif isinstance(fetch, (list, tuple)):
fetch_list = fetch fetch_list = fetch
elif fetch == None:
pass
else: else:
raise ValueError("Fetch only accepts string and list of string") raise ValueError("Fetch only accepts string/list/tuple of string")
fetch_names = [] fetch_names = []
for key in fetch_list: for key in fetch_list:
if key in self.fetch_names_: if key in self.fetch_names_:
fetch_names.append(key) fetch_names.append(key)
if len(fetch_names) == 0:
raise ValueError(
"Fetch names should not be empty or out of saved fetch list.")
return {}
return fetch_names return fetch_names
def get_feedvar_dict(self, feed): def get_feedvar_dict(self, feed):
if feed is None: if feed is None:
raise ValueError("You should specify feed and fetch for prediction") raise ValueError("You should specify feed for prediction")
feed_dict = {} feed_dict = {}
if isinstance(feed, dict): if isinstance(feed, dict):
feed_dict = feed feed_dict = feed
elif isinstance(feed, (list, str, tuple)): elif isinstance(feed, (list, str, tuple)):
# feed = [dict]
if len(feed) == 1 and isinstance(feed[0], dict):
feed_dict = feed[0]
return feed_dict
# if input is a list or str or tuple, and the number of feed_var is 1. # if input is a list or str or tuple, and the number of feed_var is 1.
# create a feed_dict { key = feed_var_name, value = list} # create a feed_dict { key = feed_var_name, value = list}
if len(self.feed_names_) == 1: if len(self.feed_names_) == 1:
...@@ -376,17 +443,19 @@ class GeneralClient(object): ...@@ -376,17 +443,19 @@ class GeneralClient(object):
# In this case, first normalize the input into a list # In this case, first normalize the input into a list
# Since the input is special, keep the shape unchanged from the original feedvar # Since the input is special, keep the shape unchanged from the original feedvar
data_value = [] data_value = []
data_value.append(feed_dict[key]) if isinstance(feed_dict[key], (str, bytes)):
if isinstance(feed_dict[key], str):
if self.feed_types_[key] != bytes_type: if self.feed_types_[key] != bytes_type:
raise ValueError( raise ValueError(
"feedvar is not string-type,feed can`t be a single string." "feedvar is not string-type,feed can`t be a single string."
) )
if isinstance(feed_dict[key], bytes):
feed_dict[key] = feed_dict[key].decode()
else: else:
if self.feed_types_[key] == bytes_type: if self.feed_types_[key] == bytes_type:
raise ValueError( raise ValueError(
"feedvar is string-type,feed, feed can`t be a single int or others." "feedvar is string-type,feed can`t be a single int or others."
) )
data_value.append(feed_dict[key])
# If compression is not used, there is no need to count the data size. # If compression is not used, there is no need to count the data size.
if self.try_request_gzip: if self.try_request_gzip:
self.total_data_number = self.total_data_number + data_bytes_number( self.total_data_number = self.total_data_number + data_bytes_number(
...@@ -427,36 +496,42 @@ class GeneralClient(object): ...@@ -427,36 +496,42 @@ class GeneralClient(object):
feed_dict = self.get_feedvar_dict(feed) feed_dict = self.get_feedvar_dict(feed)
fetch_list = self.get_legal_fetch(fetch) fetch_list = self.get_legal_fetch(fetch)
headers = {}
postData = '' postData = ''
if self.http_proto == True: if self.http_proto == True:
postData = self.process_proto_data(feed_dict, fetch_list, batch, postData = self.process_proto_data(feed_dict, fetch_list, batch,
log_id).SerializeToString() log_id).SerializeToString()
headers["Content-Type"] = "application/proto"
else: else:
postData = self.process_json_data(feed_dict, fetch_list, batch, postData = self.process_json_data(feed_dict, fetch_list, batch,
log_id) log_id)
headers["Content-Type"] = "application/json"
web_url = "http://" + self.ip + ":" + self.server_port + self.service_name web_url = self.http_s + self.ip + ":" + self.server_port + self.service_name
# Compress only when the data section is larger than 512 bytes. # Compress only when the data section is larger than 512 bytes.
self.headers.pop("Content-Encoding", "nokey")
try: try:
if self.try_request_gzip and self.total_data_number > 512: if self.try_request_gzip and self.total_data_number > 512:
origin_data = postData
postData = gzip.compress(bytes(postData, 'utf-8')) if self.http_proto:
headers["Content-Encoding"] = "gzip" postData = gzip.compress(postData)
else:
postData = gzip.compress(bytes(postData, 'utf-8'))
self.headers["Content-Encoding"] = "gzip"
if self.try_response_gzip: if self.try_response_gzip:
headers["Accept-encoding"] = "gzip" self.headers["Accept-encoding"] = "gzip"
# On compression failure, fall back to the original data # On compression failure, fall back to the original data
except: except:
print("compress error, we will use the no-compress data") print("compress error, we will use the no-compress data")
headers.pop("Content-Encoding", "nokey") self.headers.pop("Content-Encoding", "nokey")
postData = origin_data
# requests automatically detects and decompresses the response # requests automatically detects and decompresses the response
try: try:
result = requests.post(url=web_url, headers=headers, data=postData) result = self.requests_session.post(
url=web_url,
headers=self.headers,
data=postData,
timeout=self.timeout_ms / 1000,
verify=False)
result.raise_for_status()
except: except:
print("http post error") print("http post error")
return None return None
...@@ -484,6 +559,16 @@ class GeneralClient(object): ...@@ -484,6 +559,16 @@ class GeneralClient(object):
postData = self.process_proto_data(feed_dict, fetch_list, batch, log_id) postData = self.process_proto_data(feed_dict, fetch_list, batch, log_id)
try:
resp = self.stub_.inference(
postData, timeout=self.timeout_ms / 1000)
except:
print("Grpc inference error occur")
return None
else:
return resp
def init_grpc_stub(self):
# https://github.com/tensorflow/serving/issues/1382 # https://github.com/tensorflow/serving/issues/1382
options = [('grpc.max_receive_message_length', self.max_body_size), options = [('grpc.max_receive_message_length', self.max_body_size),
('grpc.max_send_message_length', self.max_body_size)] ('grpc.max_send_message_length', self.max_body_size)]
...@@ -493,10 +578,7 @@ class GeneralClient(object): ...@@ -493,10 +578,7 @@ class GeneralClient(object):
self.channel_ = grpc.insecure_channel(g_endpoint, options=options) self.channel_ = grpc.insecure_channel(g_endpoint, options=options)
self.stub_ = general_model_service_pb2_grpc.GeneralModelServiceStub( self.stub_ = general_model_service_pb2_grpc.GeneralModelServiceStub(
self.channel_) self.channel_)
try:
resp = self.stub_.inference(postData, timeout=self.timeout_ms) def __del__(self):
except: self.requests_session.close()
print("Grpc inference error occur") self.channel_.close()
return None
else:
return resp
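A minimal usage sketch of the renamed HttpClient, based only on the methods visible in this diff (load_client_config, connect, set_max_retries, add_http_headers, set_http_proto, set_use_grpc_client). The import path, the http_client_predict entry point, and the uci_housing config path and feed/fetch names are assumptions for illustration, not taken from this commit.

    from paddle_serving_client.httpclient import HttpClient  # import path assumed

    client = HttpClient()
    client.load_client_config("uci_housing_client/serving_client_conf.prototxt")  # path assumed
    client.connect(["127.0.0.1:9393"])       # exactly one endpoint is supported
    client.set_max_retries(3)                # mounts an HTTPAdapter with retries on the session
    client.add_http_headers({"X-Request-Id": "demo-001"})
    client.set_http_proto(True)              # proto body; False switches to application/json
    client.set_use_grpc_client(False)        # True routes requests through the gRPC stub instead
    # assumed entry point; feed/fetch names are placeholders from the client config
    result = client.http_client_predict(feed={"x": [0.0] * 13}, fetch=["price"])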
...@@ -67,7 +67,6 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model): ...@@ -67,7 +67,6 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model):
} }
config = model_conf.GeneralModelConfig() config = model_conf.GeneralModelConfig()
#int64 = 0; float32 = 1; int32 = 2;
for key in feed_var_dict: for key in feed_var_dict:
feed_var = model_conf.FeedVar() feed_var = model_conf.FeedVar()
feed_var.alias_name = key feed_var.alias_name = key
...@@ -127,7 +126,6 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model): ...@@ -127,7 +126,6 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model):
def var_type_conversion(dtype): def var_type_conversion(dtype):
""" """
Variable type conversion Variable type conversion
Args: Args:
dtype: type of core.VarDesc.VarType.xxxxx dtype: type of core.VarDesc.VarType.xxxxx
(https://github.com/PaddlePaddle/Paddle/blob/release/2.1/python/paddle/framework/dtype.py) (https://github.com/PaddlePaddle/Paddle/blob/release/2.1/python/paddle/framework/dtype.py)
...@@ -184,7 +182,12 @@ def save_model(server_model_folder, ...@@ -184,7 +182,12 @@ def save_model(server_model_folder,
main_program=None, main_program=None,
encryption=False, encryption=False,
key_len=128, key_len=128,
encrypt_conf=None): encrypt_conf=None,
model_filename=None,
params_filename=None,
show_proto=False,
feed_alias_names=None,
fetch_alias_names=None):
executor = Executor(place=CPUPlace()) executor = Executor(place=CPUPlace())
feed_var_names = [feed_var_dict[x].name for x in feed_var_dict] feed_var_names = [feed_var_dict[x].name for x in feed_var_dict]
...@@ -194,16 +197,30 @@ def save_model(server_model_folder, ...@@ -194,16 +197,30 @@ def save_model(server_model_folder,
target_vars.append(fetch_var_dict[key]) target_vars.append(fetch_var_dict[key])
target_var_names.append(key) target_var_names.append(key)
if not encryption: if not encryption and not show_proto:
save_inference_model( if not os.path.exists(server_model_folder):
server_model_folder, os.makedirs(server_model_folder)
feed_var_names, if not model_filename:
target_vars, model_filename = "model.pdmodel"
executor, if not params_filename:
model_filename="__model__", params_filename = "params.pdiparams"
params_filename="__params__",
main_program=main_program) new_model_path = os.path.join(server_model_folder, model_filename)
else: new_params_path = os.path.join(server_model_folder, params_filename)
with open(new_model_path, "wb") as new_model_file:
new_model_file.write(main_program.desc.serialize_to_string())
paddle.static.save_vars(
executor=executor,
dirname=server_model_folder,
main_program=main_program,
vars=None,
predicate=paddle.static.io.is_persistable,
filename=params_filename)
elif not show_proto:
if not os.path.exists(server_model_folder):
os.makedirs(server_model_folder)
if encrypt_conf == None: if encrypt_conf == None:
aes_cipher = CipherFactory.create_cipher() aes_cipher = CipherFactory.create_cipher()
else: else:
...@@ -221,10 +238,19 @@ def save_model(server_model_folder, ...@@ -221,10 +238,19 @@ def save_model(server_model_folder,
os.chdir("..") os.chdir("..")
config = model_conf.GeneralModelConfig() config = model_conf.GeneralModelConfig()
if feed_alias_names is None:
for key in feed_var_dict: feed_alias = list(feed_var_dict.keys())
else:
feed_alias = feed_alias_names.split(',')
if fetch_alias_names is None:
fetch_alias = target_var_names
else:
fetch_alias = fetch_alias_names.split(',')
if len(feed_alias) != len(feed_var_dict.keys()) or len(fetch_alias) != len(target_var_names):
raise ValueError("please check the input --feed_alias_names and --fetch_alias_names, should be same size with feed_vars and fetch_vars")
for i, key in enumerate(feed_var_dict):
feed_var = model_conf.FeedVar() feed_var = model_conf.FeedVar()
feed_var.alias_name = key feed_var.alias_name = feed_alias[i]
feed_var.name = feed_var_dict[key].name feed_var.name = feed_var_dict[key].name
feed_var.feed_type = var_type_conversion(feed_var_dict[key].dtype) feed_var.feed_type = var_type_conversion(feed_var_dict[key].dtype)
...@@ -239,9 +265,9 @@ def save_model(server_model_folder, ...@@ -239,9 +265,9 @@ def save_model(server_model_folder,
feed_var.shape.extend(tmp_shape) feed_var.shape.extend(tmp_shape)
config.feed_var.extend([feed_var]) config.feed_var.extend([feed_var])
for key in target_var_names: for i, key in enumerate(target_var_names):
fetch_var = model_conf.FetchVar() fetch_var = model_conf.FetchVar()
fetch_var.alias_name = key fetch_var.alias_name = fetch_alias[i]
fetch_var.name = fetch_var_dict[key].name fetch_var.name = fetch_var_dict[key].name
fetch_var.fetch_type = var_type_conversion(fetch_var_dict[key].dtype) fetch_var.fetch_type = var_type_conversion(fetch_var_dict[key].dtype)
...@@ -257,6 +283,9 @@ def save_model(server_model_folder, ...@@ -257,6 +283,9 @@ def save_model(server_model_folder,
fetch_var.shape.extend(tmp_shape) fetch_var.shape.extend(tmp_shape)
config.fetch_var.extend([fetch_var]) config.fetch_var.extend([fetch_var])
if show_proto:
print(str(config))
return
try: try:
save_dirname = os.path.normpath(client_config_folder) save_dirname = os.path.normpath(client_config_folder)
os.makedirs(save_dirname) os.makedirs(save_dirname)
...@@ -284,7 +313,10 @@ def inference_model_to_serving(dirname, ...@@ -284,7 +313,10 @@ def inference_model_to_serving(dirname,
params_filename=None, params_filename=None,
encryption=False, encryption=False,
key_len=128, key_len=128,
encrypt_conf=None): encrypt_conf=None,
show_proto=False,
feed_alias_names=None,
fetch_alias_names=None):
paddle.enable_static() paddle.enable_static()
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -296,7 +328,8 @@ def inference_model_to_serving(dirname, ...@@ -296,7 +328,8 @@ def inference_model_to_serving(dirname,
} }
fetch_dict = {x.name: x for x in fetch_targets} fetch_dict = {x.name: x for x in fetch_targets}
save_model(serving_server, serving_client, feed_dict, fetch_dict, save_model(serving_server, serving_client, feed_dict, fetch_dict,
inference_program, encryption, key_len, encrypt_conf) inference_program, encryption, key_len, encrypt_conf,
model_filename, params_filename, show_proto, feed_alias_names, fetch_alias_names)
feed_names = feed_dict.keys() feed_names = feed_dict.keys()
fetch_names = fetch_dict.keys() fetch_names = fetch_dict.keys()
return feed_names, fetch_names return feed_names, fetch_names
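A hedged sketch of calling the extended inference_model_to_serving shown above with the new model_filename/params_filename/show_proto/feed_alias_names/fetch_alias_names arguments. The module import path, the serving_server/serving_client keyword names, the model directory, and the alias strings are placeholders or assumptions rather than values from this commit.

    import paddle_serving_client.io as serving_io  # import path assumed

    feed_names, fetch_names = serving_io.inference_model_to_serving(
        dirname="./inference_model",           # placeholder inference model directory
        serving_server="serving_server",       # keyword names assumed, not shown in this hunk
        serving_client="serving_client",
        model_filename="model.pdmodel",
        params_filename="params.pdiparams",
        show_proto=False,                      # True makes save_model print the config proto and skip saving
        feed_alias_names="image",              # comma-separated, must match the number of feed vars
        fetch_alias_names="score,label")       # comma-separated, must match the number of fetch vars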
...@@ -96,7 +96,7 @@ if __name__ == "__main__": ...@@ -96,7 +96,7 @@ if __name__ == "__main__":
args = parse_args() args = parse_args()
benchmark_cfg_filename = args.benchmark_cfg benchmark_cfg_filename = args.benchmark_cfg
f = open(benchmark_cfg_filename, 'r') f = open(benchmark_cfg_filename, 'r')
benchmark_config = yaml.load(f) benchmark_config = yaml.load(f, yaml.FullLoader)
f.close() f.close()
benchmark_log_filename = args.benchmark_log benchmark_log_filename = args.benchmark_log
f = open(benchmark_log_filename, 'r') f = open(benchmark_log_filename, 'r')
......
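For context on the yaml.load change above: PyYAML 5.1+ warns when yaml.load is called without an explicit Loader, because the legacy default could construct arbitrary Python objects. A minimal sketch, with a placeholder file name:

    import yaml

    with open("benchmark_config.yaml") as f:   # placeholder file name
        cfg = yaml.load(f, Loader=yaml.FullLoader)
    # a stricter alternative for plain data is: cfg = yaml.safe_load(f)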
...@@ -37,7 +37,7 @@ import socket ...@@ -37,7 +37,7 @@ import socket
def port_is_available(port): def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port)) result = sock.connect_ex(('127.0.0.1', port))
if result != 0: if result != 0:
return True return True
else: else:
......
...@@ -228,7 +228,8 @@ class Server(object): ...@@ -228,7 +228,8 @@ class Server(object):
engine.batch_infer_size = self.op_max_batch[index % engine.batch_infer_size = self.op_max_batch[index %
len(self.op_max_batch)] len(self.op_max_batch)]
engine.enable_batch_align = 1 engine.enable_overrun = False
engine.allow_split_request = True
engine.model_dir = model_config_path engine.model_dir = model_config_path
engine.enable_memory_optimization = self.memory_optimization engine.enable_memory_optimization = self.memory_optimization
engine.enable_ir_optimization = self.ir_optimization engine.enable_ir_optimization = self.ir_optimization
...@@ -537,7 +538,7 @@ class Server(object): ...@@ -537,7 +538,7 @@ class Server(object):
def port_is_available(self, port): def port_is_available(self, port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port)) result = sock.connect_ex(('127.0.0.1', port))
if result != 0: if result != 0:
return True return True
else: else:
...@@ -563,7 +564,7 @@ class Server(object): ...@@ -563,7 +564,7 @@ class Server(object):
"-num_threads {} " \ "-num_threads {} " \
"-port {} " \ "-port {} " \
"-precision {} " \ "-precision {} " \
"-use_calib {} " \ "-use_calib={} " \
"-reload_interval_s {} " \ "-reload_interval_s {} " \
"-resource_path {} " \ "-resource_path {} " \
"-resource_file {} " \ "-resource_file {} " \
......
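The change from "-use_calib {}" to "-use_calib={}" above matters because gflags-style boolean flags are only recognized in the single-token -flag=value form; with a space, the flag is parsed on its own and the value becomes a stray argument. A small sketch of the resulting fragment (variable name is illustrative):

    use_calib = False
    flag = "-use_calib={} ".format(use_calib)   # -> "-use_calib=False "
    # the old "-use_calib {} ".format(use_calib) form would emit "-use_calib False ",
    # which a gflags parser does not treat as assigning False to the flag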
...@@ -33,7 +33,7 @@ from paddle_serving_server.serve import format_gpu_to_strlist ...@@ -33,7 +33,7 @@ from paddle_serving_server.serve import format_gpu_to_strlist
def port_is_available(port): def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port)) result = sock.connect_ex(('127.0.0.1', port))
if result != 0: if result != 0:
return True return True
else: else:
......
...@@ -274,7 +274,7 @@ class OpAnalyst(object): ...@@ -274,7 +274,7 @@ class OpAnalyst(object):
""" """
import yaml import yaml
with open(op_config_yaml) as f: with open(op_config_yaml) as f:
op_config = yaml.load(f) op_config = yaml.load(f, yaml.FullLoader)
# check that each model is deployed on a different card # check that each model is deployed on a different card
card_set = set() card_set = set()
......
...@@ -28,6 +28,7 @@ import logging ...@@ -28,6 +28,7 @@ import logging
import enum import enum
import os import os
import copy import copy
import time
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
...@@ -45,7 +46,9 @@ class ChannelDataErrcode(enum.Enum): ...@@ -45,7 +46,9 @@ class ChannelDataErrcode(enum.Enum):
CLOSED_ERROR = 6 CLOSED_ERROR = 6
NO_SERVICE = 7 NO_SERVICE = 7
UNKNOW = 8 UNKNOW = 8
PRODUCT_ERROR = 9 INPUT_PARAMS_ERROR = 9
PRODUCT_ERROR = 100
class ProductErrCode(enum.Enum): class ProductErrCode(enum.Enum):
...@@ -124,7 +127,6 @@ class ChannelData(object): ...@@ -124,7 +127,6 @@ class ChannelData(object):
def get_size(self): def get_size(self):
size = 0 size = 0
dict_data = None
if isinstance(self.dictdata, dict): if isinstance(self.dictdata, dict):
for k in self.dictdata: for k in self.dictdata:
size += sys.getsizeof(self.dictdata[k]) + sys.getsizeof(k) size += sys.getsizeof(self.dictdata[k]) + sys.getsizeof(k)
...@@ -259,7 +261,11 @@ class ProcessChannel(object): ...@@ -259,7 +261,11 @@ class ProcessChannel(object):
maintains the data obtained from queue. maintains the data obtained from queue.
""" """
def __init__(self, manager, name=None, maxsize=0): def __init__(self,
manager,
name=None,
maxsize=0,
channel_recv_frist_arrive=False):
# For queue multiprocess: after putting an object on # For queue multiprocess: after putting an object on
# an empty queue there may be an infinitessimal delay # an empty queue there may be an infinitessimal delay
# before the queue's :meth:`~Queue.empty` # before the queue's :meth:`~Queue.empty`
...@@ -283,6 +289,9 @@ class ProcessChannel(object): ...@@ -283,6 +289,9 @@ class ProcessChannel(object):
self._base_cursor = manager.Value('i', 0) self._base_cursor = manager.Value('i', 0)
self._output_buf = manager.list() self._output_buf = manager.list()
self._cur_max_dataid = manager.Value('i', -1)
self._channel_recv_frist_arrive = channel_recv_frist_arrive
def get_maxsize(self): def get_maxsize(self):
return self._maxsize return self._maxsize
...@@ -325,9 +334,10 @@ class ProcessChannel(object): ...@@ -325,9 +334,10 @@ class ProcessChannel(object):
def push(self, channeldata, op_name=None): def push(self, channeldata, op_name=None):
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(data_id={} log_id={}) Op({}) Enter channel::push producers:{}". "(data_id={} log_id={}) Op({}) Enter channel::push producers:{}, time:{}".
format(channeldata.id, channeldata.log_id, op_name, format(channeldata.id, channeldata.log_id, op_name,
len(self._producers)))) len(self._producers), time.time())))
if len(self._producers) == 0: if len(self._producers) == 0:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
...@@ -355,16 +365,55 @@ class ProcessChannel(object): ...@@ -355,16 +365,55 @@ class ProcessChannel(object):
self._cv.notify_all() self._cv.notify_all()
notify_all_time = _time() notify_all_time = _time()
_LOGGER.debug( _LOGGER.debug(
"(data_id={}) Op({}) channel push cost! enter_cv:{} ms, push_que:{} ms, notify:{} ms, data_size:{}". "(data_id={}) Op({}) channel push cost! enter_cv:{} ms, push_que:{} ms, notify:{} ms, data_size:{}, time:{}".
format(channeldata.id, op_name, (enter_cv_time - start_time) format(channeldata.id, op_name, (enter_cv_time - start_time)
* 1000, (push_que_time - enter_cv_time) * 1000, ( * 1000, (push_que_time - enter_cv_time) * 1000, (
notify_all_time - push_que_time) * 1000, notify_all_time - push_que_time) * 1000,
channeldata.get_size())) channeldata.get_size(), time.time()))
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(data_id={} log_id={}) Op({}) Pushed data into internal queue.". "(data_id={} log_id={}) Op({}) Pushed data into internal queue.".
format(channeldata.id, channeldata.log_id, op_name))) format(channeldata.id, channeldata.log_id, op_name)))
return True return True
elif self._channel_recv_frist_arrive == True:
start_time = _time()
with self._cv:
_LOGGER.debug(
"(data_id={}) Op({}) Channel({}) enter channel_recv_first_arrive. _cur_max_dataid:{}".
format(channeldata.id, op_name, self.name,
self._cur_max_dataid.value))
if channeldata.id > self._cur_max_dataid.value:
enter_cv_time = _time()
push_que_time = enter_cv_time
while self._stop.value == 0:
try:
self._que.put((channeldata.id, {
op_name: channeldata
}),
timeout=0)
push_que_time = _time()
self._cur_max_dataid.value = channeldata.id
break
except Queue.Full:
self._cv.wait()
if self._stop.value == 1:
raise ChannelStopError()
self._cv.notify_all()
notify_all_time = _time()
_LOGGER.debug(
"(data_id={}) Op({}) channel push cost! enter_cv:{} ms, push_que:{} ms, notify:{} ms, data_size:{}, time:{}".
format(channeldata.id, op_name, (
enter_cv_time - start_time) * 1000, (
push_que_time - enter_cv_time) * 1000, (
notify_all_time - push_que_time) * 1000,
channeldata.get_size(), time.time()))
else:
# log and drop it
_LOGGER.debug(
"(data_id={}) Op({}) send data is dropped! cur_max_dataid:{}".
format(channeldata.id, op_name,
self._cur_max_dataid.value))
return True
elif op_name is None: elif op_name is None:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
...@@ -414,8 +463,8 @@ class ProcessChannel(object): ...@@ -414,8 +463,8 @@ class ProcessChannel(object):
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(data_id={} log_id={}) Op({}) Pushed data into internal_queue.". "(data_id={} log_id={}) Op({}) Pushed data into internal_queue. time:{}".
format(data_id, log_id, op_name))) format(data_id, log_id, op_name, time.time())))
self._cv.notify_all() self._cv.notify_all()
return True return True
...@@ -464,9 +513,9 @@ class ProcessChannel(object): ...@@ -464,9 +513,9 @@ class ProcessChannel(object):
key = list(resp.keys())[0] key = list(resp.keys())[0]
data_id = resp[key].id data_id = resp[key].id
_LOGGER.debug( _LOGGER.debug(
"(data_id={}) op({}) front cost enter_cv:{} ms, queue_get:{} ms". "(data_id={}) op({}) front cost enter_cv:{} ms, queue_get:{} ms, time:{}".
format(data_id, op_name, (time_2 - time_1) / 1000.0, ( format(data_id, op_name, (time_2 - time_1) / 1000.0, (
time_3 - time_2) / 1000.0)) time_3 - time_2) / 1000.0, time.time()))
if resp is not None: if resp is not None:
list_values = list(resp.values()) list_values = list(resp.values())
_LOGGER.debug( _LOGGER.debug(
...@@ -501,9 +550,9 @@ class ProcessChannel(object): ...@@ -501,9 +550,9 @@ class ProcessChannel(object):
list_values = list(channeldata.values()) list_values = list(channeldata.values())
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(data_id={} log_id={}) Op({}) Pop ready item into output_buffer". "(data_id={} log_id={}) Op({}) Pop ready item into output_buffer, time:{}".
format(list_values[0].id, list_values[0].log_id, format(list_values[0].id, list_values[0].log_id,
op_name))) op_name, time.time())))
break break
except Queue.Empty: except Queue.Empty:
if timeout is not None: if timeout is not None:
...@@ -561,8 +610,9 @@ class ProcessChannel(object): ...@@ -561,8 +610,9 @@ class ProcessChannel(object):
list_values = list(resp.values()) list_values = list(resp.values())
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(data_id={} log_id={}) Op({}) Got data from output_buffer". "(data_id={} log_id={}) Op({}) Got data from output_buffer, time:{}".
format(list_values[0].id, list_values[0].log_id, op_name))) format(list_values[0].id, list_values[0].log_id, op_name,
time.time())))
return resp return resp
def stop(self): def stop(self):
...@@ -601,7 +651,7 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -601,7 +651,7 @@ class ThreadChannel(Queue.PriorityQueue):
maintains the data obtained from queue. maintains the data obtained from queue.
""" """
def __init__(self, name=None, maxsize=-1): def __init__(self, name=None, maxsize=-1, channel_recv_frist_arrive=False):
Queue.Queue.__init__(self, maxsize=maxsize) Queue.Queue.__init__(self, maxsize=maxsize)
self._maxsize = maxsize self._maxsize = maxsize
self.name = name self.name = name
...@@ -619,6 +669,9 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -619,6 +669,9 @@ class ThreadChannel(Queue.PriorityQueue):
self._base_cursor = 0 self._base_cursor = 0
self._output_buf = [] self._output_buf = []
self._channel_recv_frist_arrive = channel_recv_frist_arrive
self._cur_max_dataid = -1
def get_maxsize(self): def get_maxsize(self):
return self._maxsize return self._maxsize
...@@ -662,6 +715,7 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -662,6 +715,7 @@ class ThreadChannel(Queue.PriorityQueue):
_LOGGER.debug( _LOGGER.debug(
self._log("(data_id={} log_id={}) Op({}) Pushing data".format( self._log("(data_id={} log_id={}) Op({}) Pushing data".format(
channeldata.id, channeldata.log_id, op_name))) channeldata.id, channeldata.log_id, op_name)))
if len(self._producers) == 0: if len(self._producers) == 0:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
...@@ -688,6 +742,29 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -688,6 +742,29 @@ class ThreadChannel(Queue.PriorityQueue):
"(data_id={} log_id={}) Op({}) Pushed data into internal_queue.". "(data_id={} log_id={}) Op({}) Pushed data into internal_queue.".
format(channeldata.id, channeldata.log_id, op_name))) format(channeldata.id, channeldata.log_id, op_name)))
return True return True
elif self._channel_recv_frist_arrive is True:
with self._cv:
if channeldata.id > self._cur_max_dataid:
while self._stop is False:
try:
self.put((channeldata.id, {
op_name: channeldata
}),
timeout=0)
self._cur_max_dataid = channeldata.id
break
except Queue.Full:
self._cv.wait()
if self._stop:
raise ChannelStopError()
self._cv.notify_all()
else:
# log and drop it
_LOGGER.debug(
"(data_id={}) Op({}) send data is dropped! cur_max_dataid:{}".
format(channeldata.id, op_name, self._cur_max_dataid))
return True
elif op_name is None: elif op_name is None:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
......
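A rough illustration of the channel_recv_frist_arrive option added to ProcessChannel/ThreadChannel above: when enabled, a push only enqueues data whose id is strictly greater than the largest id seen so far, and anything older is logged and dropped while still reporting success. The class below is a simplified stand-in for the real channels, not code from this commit.

    class FirstArriveOnlyChannel:
        """Toy model of the drop-if-not-newer rule used by the pipeline channels."""

        def __init__(self):
            self.cur_max_dataid = -1
            self.items = []

        def push(self, data_id, payload):
            if data_id > self.cur_max_dataid:
                self.items.append((data_id, payload))
                self.cur_max_dataid = data_id
            # older or duplicate data: log-and-drop, but still report success
            return True

    chan = FirstArriveOnlyChannel()
    chan.push(3, "result from the fastest upstream op")
    chan.push(2, "late result from a slower upstream op")   # dropped: 2 <= 3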
...@@ -63,6 +63,7 @@ class DAGExecutor(object): ...@@ -63,6 +63,7 @@ class DAGExecutor(object):
self._retry = dag_conf["retry"] self._retry = dag_conf["retry"]
self._server_use_profile = dag_conf["use_profile"] self._server_use_profile = dag_conf["use_profile"]
channel_size = dag_conf["channel_size"] channel_size = dag_conf["channel_size"]
channel_recv_frist_arrive = dag_conf["channel_recv_frist_arrive"]
self._is_thread_op = dag_conf["is_thread_op"] self._is_thread_op = dag_conf["is_thread_op"]
tracer_conf = dag_conf["tracer"] tracer_conf = dag_conf["tracer"]
...@@ -79,7 +80,7 @@ class DAGExecutor(object): ...@@ -79,7 +80,7 @@ class DAGExecutor(object):
self._dag = DAG(self.name, response_op, self._server_use_profile, self._dag = DAG(self.name, response_op, self._server_use_profile,
self._is_thread_op, channel_size, build_dag_each_worker, self._is_thread_op, channel_size, build_dag_each_worker,
self._tracer) self._tracer, channel_recv_frist_arrive)
(in_channel, out_channel, pack_rpc_func, (in_channel, out_channel, pack_rpc_func,
unpack_rpc_func) = self._dag.build() unpack_rpc_func) = self._dag.build()
self._dag.start() self._dag.start()
...@@ -480,7 +481,8 @@ class DAG(object): ...@@ -480,7 +481,8 @@ class DAG(object):
""" """
def __init__(self, request_name, response_op, use_profile, is_thread_op, def __init__(self, request_name, response_op, use_profile, is_thread_op,
channel_size, build_dag_each_worker, tracer): channel_size, build_dag_each_worker, tracer,
channel_recv_frist_arrive):
self._request_name = request_name self._request_name = request_name
self._response_op = response_op self._response_op = response_op
self._use_profile = use_profile self._use_profile = use_profile
...@@ -488,6 +490,7 @@ class DAG(object): ...@@ -488,6 +490,7 @@ class DAG(object):
self._channel_size = channel_size self._channel_size = channel_size
self._build_dag_each_worker = build_dag_each_worker self._build_dag_each_worker = build_dag_each_worker
self._tracer = tracer self._tracer = tracer
self._channel_recv_frist_arrive = channel_recv_frist_arrive
if not self._is_thread_op: if not self._is_thread_op:
self._manager = PipelineProcSyncManager() self._manager = PipelineProcSyncManager()
_LOGGER.info("[DAG] Succ init") _LOGGER.info("[DAG] Succ init")
...@@ -543,10 +546,15 @@ class DAG(object): ...@@ -543,10 +546,15 @@ class DAG(object):
channel = None channel = None
if self._is_thread_op: if self._is_thread_op:
channel = ThreadChannel( channel = ThreadChannel(
name=name_gen.next(), maxsize=self._channel_size) name=name_gen.next(),
maxsize=self._channel_size,
channel_recv_frist_arrive=self._channel_recv_frist_arrive)
else: else:
channel = ProcessChannel( channel = ProcessChannel(
self._manager, name=name_gen.next(), maxsize=self._channel_size) self._manager,
name=name_gen.next(),
maxsize=self._channel_size,
channel_recv_frist_arrive=self._channel_recv_frist_arrive)
_LOGGER.debug("[DAG] Generate channel: {}".format(channel.name)) _LOGGER.debug("[DAG] Generate channel: {}".format(channel.name))
return channel return channel
......
...@@ -18,22 +18,117 @@ option go_package = "./;pipeline_serving"; ...@@ -18,22 +18,117 @@ option go_package = "./;pipeline_serving";
import "google/api/annotations.proto"; import "google/api/annotations.proto";
// Tensor structure, consistent with PADDLE variable types.
// Descriptions of input and output data.
message Tensor {
// VarType: INT64
repeated int64 int64_data = 1;
// VarType: FP32, FP16
repeated float float_data = 2;
// VarType: INT32, INT16, INT8
repeated int32 int_data = 3;
// VarType: FP64
repeated double float64_data = 4;
// VarType: BF16, UINT8
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (Not supported) VarType: COMPLEX64, 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (Not supported) VarType: COMPLEX128, 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string str_data = 9;
// VarType: BYTES, is suitable for big data. No need to save data types and
// dimensions
// pack method: pack by BytesIO, saved by np.save
// unpack method: load by np.load, unpack by BytesIO.
bytes byte_data = 10;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 12 => STRING
// 13 => BYTES
int32 elem_type = 20;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 21;
// Level of data (LoD), supports variable-length data, only for fetch tensors
// currently.
repeated int32 lod = 22;
// Correspond to the variable 'name' in the model description prototxt.
string name = 23;
};
// The structure of the service request. The input data can be repeated string
// pairs or tensors.
message Request {
// The input data are repeated string pairs.
// For example, key is "words" and value is the string of words.
repeated string key = 1;
repeated string value = 2;
// The input data are repeated tensors for complex data structures.
// Because tensors can carry more data information and reduce the amount of data
// transferred.
repeated Tensor tensors = 3;
// The name field in the RESTful API
string name = 4;
// The method field in the RESTful API
string method = 5;
// For tracing requests and logs
int64 logid = 6;
// For tracking sources
string clientip = 7;
};
// The structure of the service response. The output data can be repeated string
// pairs or tensors.
message Response { message Response {
// Error code
int32 err_no = 1; int32 err_no = 1;
// Error messages
string err_msg = 2; string err_msg = 2;
// The results of string pairs
repeated string key = 3; repeated string key = 3;
repeated string value = 4; repeated string value = 4;
};
message Request { // The results of tensors
repeated string key = 1; repeated Tensor tensors = 5;
repeated string value = 2;
string name = 3;
string method = 4;
int64 logid = 5;
string clientip = 6;
}; };
// Python pipeline service
service PipelineService { service PipelineService {
rpc inference(Request) returns (Response) { rpc inference(Request) returns (Response) {
option (google.api.http) = { option (google.api.http) = {
......
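A hedged sketch of filling the Tensor/Request messages defined above from a float32 numpy array, following the elem_type table (1 => FP32). The generated-module import path and the tensor name "image" are assumptions; only the field names come from the proto in this diff.

    import numpy as np
    from paddle_serving_server.pipeline.proto import pipeline_service_pb2  # import path assumed

    value = np.random.rand(1, 3, 224, 224).astype("float32")

    req = pipeline_service_pb2.Request()
    req.name = "demo"                       # placeholder RESTful name field
    tensor = req.tensors.add()
    tensor.name = "image"                   # placeholder; must match the model's feed var name
    tensor.elem_type = 1                    # 1 => FP32 per the comment table above
    tensor.shape.extend(value.shape)        # shape includes the batch dimension
    tensor.float_data.extend(value.flatten().tolist())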
...@@ -26,6 +26,7 @@ import collections ...@@ -26,6 +26,7 @@ import collections
import numpy as np import numpy as np
import json import json
from numpy import * from numpy import *
from io import BytesIO
if sys.version_info.major == 2: if sys.version_info.major == 2:
import Queue import Queue
elif sys.version_info.major == 3: elif sys.version_info.major == 3:
...@@ -40,10 +41,29 @@ from .channel import (ThreadChannel, ProcessChannel, ChannelDataErrcode, ...@@ -40,10 +41,29 @@ from .channel import (ThreadChannel, ProcessChannel, ChannelDataErrcode,
from .util import NameGenerator from .util import NameGenerator
from .profiler import UnsafeTimeProfiler as TimeProfiler from .profiler import UnsafeTimeProfiler as TimeProfiler
from . import local_service_handler from . import local_service_handler
from .pipeline_client import PipelineClient as PPClient
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
_op_name_gen = NameGenerator("Op") _op_name_gen = NameGenerator("Op")
# data type of tensor to numpy_data
_TENSOR_DTYPE_2_NUMPY_DATA_DTYPE = {
0: "int64", # VarType.INT64
1: "float32", # VarType.FP32
2: "int32", # VarType.INT32
3: "float64", # VarType.FP64
4: "int16", # VarType.int16
5: "float16", # VarType.FP32
6: "uint16", # VarType.BF16
7: "uint8", # VarType.UINT8
8: "int8", # VarType.INT8
9: "bool", # VarType.BOOL
10: "complex64", # VarType.COMPLEX64
11: "complex128", # VarType.COMPLEX128
12: "string", # load by numpy
13: "bytes", # load by numpy
}
class Op(object): class Op(object):
def __init__(self, def __init__(self,
...@@ -84,6 +104,9 @@ class Op(object): ...@@ -84,6 +104,9 @@ class Op(object):
self._server_use_profile = False self._server_use_profile = False
self._tracer = None self._tracer = None
# for grpc_pipeline predict mode. False, string key/val; True, tensor format.
self._pack_tensor_format = False
# only for thread op # only for thread op
self._for_init_op_lock = threading.Lock() self._for_init_op_lock = threading.Lock()
self._for_close_op_lock = threading.Lock() self._for_close_op_lock = threading.Lock()
...@@ -330,9 +353,8 @@ class Op(object): ...@@ -330,9 +353,8 @@ class Op(object):
if self.client_type == 'brpc': if self.client_type == 'brpc':
client = Client() client = Client()
client.load_client_config(client_config) client.load_client_config(client_config)
# After testing is complete, replace with brpc-http. elif self.client_type == 'pipeline_grpc':
# elif self.client_type == 'grpc': client = PPClient()
# client = MultiLangClient()
elif self.client_type == 'local_predictor': elif self.client_type == 'local_predictor':
if self.local_predictor is None: if self.local_predictor is None:
raise ValueError("local predictor not yet created") raise ValueError("local predictor not yet created")
...@@ -372,6 +394,9 @@ class Op(object): ...@@ -372,6 +394,9 @@ class Op(object):
os._exit(-1) os._exit(-1)
self._input_ops.append(op) self._input_ops.append(op)
def set_pack_tensor_format(self, is_tensor_format=False):
self._pack_tensor_format = is_tensor_format
def get_jump_to_ops(self): def get_jump_to_ops(self):
return self._jump_to_ops return self._jump_to_ops
...@@ -483,7 +508,7 @@ class Op(object): ...@@ -483,7 +508,7 @@ class Op(object):
os._exit(-1) os._exit(-1)
channel.add_producer(self.name) channel.add_producer(self.name)
self._outputs.append(channel) self._outputs.append(channel)
_LOGGER.info("op:{} add output_channel {}".format(self.name, channel)) _LOGGER.debug("op:{} add output_channel {}".format(self.name, channel))
def clean_output_channels(self): def clean_output_channels(self):
self._outputs = [] self._outputs = []
...@@ -531,32 +556,73 @@ class Op(object): ...@@ -531,32 +556,73 @@ class Op(object):
Returns: Returns:
call_result: predict result call_result: predict result
""" """
err, err_info = ChannelData.check_batch_npdata(feed_batch)
if err != 0: call_result = None
_LOGGER.critical( err_code = ChannelDataErrcode.OK.value
self._log("Failed to run process: {}. Please override " err_info = ""
"preprocess func.".format(err_info)))
os._exit(-1)
if self.client_type == "local_predictor": if self.client_type == "local_predictor":
err, err_info = ChannelData.check_batch_npdata(feed_batch)
if err != 0:
_LOGGER.error(
self._log("Failed to run process: {}. feed_batch must be \
npdata in process for local_predictor mode."
.format(err_info)))
return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be npdata"
call_result = self.client.predict( call_result = self.client.predict(
feed=feed_batch[0], feed=feed_batch[0],
fetch=self._fetch_names, fetch=self._fetch_names,
batch=True, batch=True,
log_id=typical_logid) log_id=typical_logid)
else:
elif self.client_type == "brpc":
err, err_info = ChannelData.check_batch_npdata(feed_batch)
if err != 0:
_LOGGER.error(
self._log("Failed to run process: {}. feed_batch must be \
npdata in process for brpc mode.".format(err_info)))
return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be npdata"
call_result = self.client.predict( call_result = self.client.predict(
feed=feed_batch, feed=feed_batch[0],
fetch=self._fetch_names, fetch=self._fetch_names,
batch=True, batch=True,
log_id=typical_logid) log_id=typical_logid)
# Replace with HttpClient later
''' elif self.client_type == "pipeline_grpc":
if isinstance(self.client, MultiLangClient): err, err_info = ChannelData.check_dictdata(feed_batch)
if call_result is None or call_result["serving_status_code"] != 0: if err != 0:
return None _LOGGER.error(
call_result.pop("serving_status_code") self._log("Failed to run process: {}. feed_batch must be \
''' npdata in process for pipeline_grpc mode."
return call_result .format(err_info)))
return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be dict"
call_result = self.client.predict(
feed_dict=feed_batch[0],
fetch=self._fetch_names,
asyn=False,
pack_tensor_format=self._pack_tensor_format,
profile=False)
if call_result is None:
_LOGGER.error(
self._log("Failed in pipeline_grpc. call_result is None."))
return call_result, ChannelDataErrcode.UNKNOW.value, "pipeline_grpc error"
if call_result.err_no != 0:
_LOGGER.error(
self._log("Failed in pipeline_grpc. err_no:{}, err_info:{}".
format(call_result.err_no, call_result.err_msg)))
return call_result, ChannelDataErrcode(
call_result.err_no).value, call_result.err_msg
new_dict = {}
err_code = ChannelDataErrcode(call_result.err_no).value
err_info = call_result.err_msg
for idx, key in enumerate(call_result.key):
new_dict[key] = [call_result.value[idx]]
call_result = new_dict
return call_result, err_code, err_info
def postprocess(self, input_data, fetch_data, data_id=0, log_id=0): def postprocess(self, input_data, fetch_data, data_id=0, log_id=0):
""" """
...@@ -891,16 +957,20 @@ class Op(object): ...@@ -891,16 +957,20 @@ class Op(object):
midped_batch = None midped_batch = None
error_code = ChannelDataErrcode.OK.value error_code = ChannelDataErrcode.OK.value
error_info = ""
if self._timeout <= 0: if self._timeout <= 0:
# No retry # No retry
try: try:
if batch_input is False: if batch_input is False:
midped_batch = self.process(feed_batch, typical_logid) midped_batch, error_code, error_info = self.process(
feed_batch, typical_logid)
else: else:
midped_batch = [] midped_batch = []
for idx in range(len(feed_batch)): for idx in range(len(feed_batch)):
predict_res = self.process([feed_batch[idx]], predict_res, error_code, error_info = self.process(
typical_logid) [feed_batch[idx]], typical_logid)
if error_code != ChannelDataErrcode.OK.value:
break
midped_batch.append(predict_res) midped_batch.append(predict_res)
except Exception as e: except Exception as e:
error_code = ChannelDataErrcode.UNKNOW.value error_code = ChannelDataErrcode.UNKNOW.value
...@@ -913,14 +983,14 @@ class Op(object): ...@@ -913,14 +983,14 @@ class Op(object):
try: try:
# time out for each process # time out for each process
if batch_input is False: if batch_input is False:
midped_batch = func_timeout.func_timeout( midped_batch, error_code, error_info = func_timeout.func_timeout(
self._timeout, self._timeout,
self.process, self.process,
args=(feed_batch, typical_logid)) args=(feed_batch, typical_logid))
else: else:
midped_batch = [] midped_batch = []
for idx in range(len(feed_batch)): for idx in range(len(feed_batch)):
predict_res = func_timeout.func_timeout( predict_res, error_code, error_info = func_timeout.func_timeout(
self._timeout, self._timeout,
self.process, self.process,
args=([feed_batch[idx]], typical_logid)) args=([feed_batch[idx]], typical_logid))
...@@ -1265,6 +1335,8 @@ class Op(object): ...@@ -1265,6 +1335,8 @@ class Op(object):
break break
end = int(round(_time() * 1000000)) end = int(round(_time() * 1000000))
in_time = end - start in_time = end - start
_LOGGER.debug("op:{} in_time_end:{}".format(op_info_prefix,
time.time()))
# parse channeldata batch # parse channeldata batch
try: try:
...@@ -1278,6 +1350,8 @@ class Op(object): ...@@ -1278,6 +1350,8 @@ class Op(object):
if len(parsed_data_dict) == 0: if len(parsed_data_dict) == 0:
# data in the whole batch is all error data # data in the whole batch is all error data
continue continue
_LOGGER.debug("op:{} parse_end:{}".format(op_info_prefix,
time.time()))
# print # print
front_cost = int(round(_time() * 1000000)) - start front_cost = int(round(_time() * 1000000)) - start
...@@ -1292,6 +1366,8 @@ class Op(object): ...@@ -1292,6 +1366,8 @@ class Op(object):
= self._run_preprocess(parsed_data_dict, op_info_prefix, logid_dict) = self._run_preprocess(parsed_data_dict, op_info_prefix, logid_dict)
end = profiler.record("prep#{}_1".format(op_info_prefix)) end = profiler.record("prep#{}_1".format(op_info_prefix))
prep_time = end - start prep_time = end - start
_LOGGER.debug("op:{} preprocess_end:{}, cost:{}".format(
op_info_prefix, time.time(), prep_time))
try: try:
# put error requests into output channel, skip process and postprocess stage # put error requests into output channel, skip process and postprocess stage
for data_id, err_channeldata in err_channeldata_dict.items(): for data_id, err_channeldata in err_channeldata_dict.items():
...@@ -1313,6 +1389,8 @@ class Op(object): ...@@ -1313,6 +1389,8 @@ class Op(object):
= self._run_process(preped_data_dict, op_info_prefix, skip_process_dict, logid_dict) = self._run_process(preped_data_dict, op_info_prefix, skip_process_dict, logid_dict)
end = profiler.record("midp#{}_1".format(op_info_prefix)) end = profiler.record("midp#{}_1".format(op_info_prefix))
midp_time = end - start midp_time = end - start
_LOGGER.debug("op:{} process_end:{}, cost:{}".format(
op_info_prefix, time.time(), midp_time))
try: try:
for data_id, err_channeldata in err_channeldata_dict.items(): for data_id, err_channeldata in err_channeldata_dict.items():
self._push_to_output_channels( self._push_to_output_channels(
...@@ -1334,6 +1412,8 @@ class Op(object): ...@@ -1334,6 +1412,8 @@ class Op(object):
end = profiler.record("postp#{}_1".format(op_info_prefix)) end = profiler.record("postp#{}_1".format(op_info_prefix))
postp_time = end - start postp_time = end - start
after_postp_time = _time() after_postp_time = _time()
_LOGGER.debug("op:{} postprocess_end:{}, cost:{}".format(
op_info_prefix, time.time(), postp_time))
try: try:
for data_id, err_channeldata in err_channeldata_dict.items(): for data_id, err_channeldata in err_channeldata_dict.items():
self._push_to_output_channels( self._push_to_output_channels(
...@@ -1486,6 +1566,90 @@ class RequestOp(Op): ...@@ -1486,6 +1566,90 @@ class RequestOp(Op):
_LOGGER.critical("Op(Request) Failed to init: {}".format(e)) _LOGGER.critical("Op(Request) Failed to init: {}".format(e))
os._exit(-1) os._exit(-1)
def proto_tensor_2_numpy(self, tensor):
"""
Convert proto tensor to numpy array. The supported types are as follows:
INT64
FP32
INT32
FP64
INT16
FP16
BF16
UINT8
INT8
BOOL
BYTES
Unsupported type:
STRING
COMPLEX64
COMPLEX128
Args:
tensor: one tensor in request.tensors.
Returns:
np.ndarray
"""
if tensor is None or tensor.elem_type is None or tensor.name is None:
_LOGGER.error("input params of tensor is wrong. tensor: {}".format(
tensor))
return None
dims = []
if tensor.shape is None:
dims.append(1)
else:
for one_dim in tensor.shape:
dims.append(one_dim)
np_data = None
_LOGGER.info("proto_to_numpy, name:{}, type:{}, dims:{}".format(
tensor.name, tensor.elem_type, dims))
if tensor.elem_type == 0:
# VarType: INT64
np_data = np.array(tensor.int64_data).astype(int64).reshape(dims)
elif tensor.elem_type == 1:
# VarType: FP32
np_data = np.array(tensor.float_data).astype(float32).reshape(dims)
elif tensor.elem_type == 2:
# VarType: INT32
np_data = np.array(tensor.int_data).astype(int32).reshape(dims)
elif tensor.elem_type == 3:
# VarType: FP64
np_data = np.array(tensor.float64_data).astype(float64).reshape(
dims)
elif tensor.elem_type == 4:
# VarType: INT16
np_data = np.array(tensor.int_data).astype(int16).reshape(dims)
elif tensor.elem_type == 5:
# VarType: FP16
np_data = np.array(tensor.float_data).astype(float16).reshape(dims)
elif tensor.elem_type == 6:
# VarType: BF16
np_data = np.array(tensor.uint32_data).astype(uint16).reshape(dims)
elif tensor.elem_type == 7:
# VarType: UINT8
np_data = np.array(tensor.uint32_data).astype(uint8).reshape(dims)
elif tensor.elem_type == 8:
# VarType: INT8
np_data = np.array(tensor.int_data).astype(int8).reshape(dims)
elif tensor.elem_type == 9:
# VarType: BOOL
np_data = np.array(tensor.bool_data).astype(bool).reshape(dims)
elif tensor.elem_type == 13:
# VarType: BYTES
byte_data = BytesIO(tensor.byte_data)
np_data = np.load(byte_data, allow_pickle=True)
else:
_LOGGER.error("Sorry, the type {} of tensor {} is not supported.".
format(tensor.elem_type, tensor.name))
raise ValueError(
"Sorry, the type {} of tensor {} is not supported.".format(
tensor.elem_type, tensor.name))
return np_data
def unpack_request_package(self, request): def unpack_request_package(self, request):
""" """
Unpack request package by gateway.proto Unpack request package by gateway.proto
...@@ -1506,12 +1670,47 @@ class RequestOp(Op): ...@@ -1506,12 +1670,47 @@ class RequestOp(Op):
_LOGGER.critical("request is None") _LOGGER.critical("request is None")
raise ValueError("request is None") raise ValueError("request is None")
# unpack key/value string list
for idx, key in enumerate(request.key): for idx, key in enumerate(request.key):
dict_data[key] = request.value[idx] dict_data[key] = request.value[idx]
log_id = request.logid log_id = request.logid
_LOGGER.debug("RequestOp unpack one request. log_id:{}, clientip:{} \
name:{}, method:{}".format(log_id, request.clientip, request.name, # unpack proto.tensors data.
request.method)) for one_tensor in request.tensors:
name = one_tensor.name
elem_type = one_tensor.elem_type
if one_tensor.name is None:
_LOGGER.error("Tensor name is None.")
raise ValueError("Tensor name is None.")
numpy_dtype = _TENSOR_DTYPE_2_NUMPY_DATA_DTYPE.get(elem_type)
if numpy_dtype is None:
_LOGGER.error(
"elem_type:{} is dismatch in unpack_request_package.",
format(elem_type))
raise ValueError("elem_type:{} error".format(elem_type))
if numpy_dtype == "string":
new_string = ""
if one_tensor.str_data is None:
_LOGGER.error(
"str_data of tensor:{} is None, elem_type is {}.".
format(name, elem_type))
raise ValueError(
"str_data of tensor:{} is None, elem_type is {}.".
format(name, elem_type))
for one_str in one_tensor.str_data:
new_string += one_str
dict_data[name] = new_string
else:
dict_data[name] = self.proto_tensor_2_numpy(one_tensor)
_LOGGER.info("RequestOp unpack one request. log_id:{}, clientip:{} \
name:{}, method:{}, time:{}"
.format(log_id, request.clientip, request.name,
request.method, time.time()))
return dict_data, log_id, None, "" return dict_data, log_id, None, ""
...@@ -1530,6 +1729,7 @@ class ResponseOp(Op): ...@@ -1530,6 +1729,7 @@ class ResponseOp(Op):
""" """
super(ResponseOp, self).__init__( super(ResponseOp, self).__init__(
name="@DAGExecutor", input_ops=input_ops) name="@DAGExecutor", input_ops=input_ops)
# init op # init op
try: try:
self.init_op() self.init_op()
...@@ -1538,6 +1738,12 @@ class ResponseOp(Op): ...@@ -1538,6 +1738,12 @@ class ResponseOp(Op):
e, exc_info=True)) e, exc_info=True))
os._exit(-1) os._exit(-1)
# init ResponseOp
self.is_pack_tensor = False
def set_pack_format(self, isTensor=False):
self.is_pack_tensor = isTensor
def pack_response_package(self, channeldata): def pack_response_package(self, channeldata):
""" """
Getting channeldata from the last channel, packing the response Getting channeldata from the last channel, packing the response
......
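The elem_type 13 (BYTES) branch of proto_tensor_2_numpy above depends on a numpy save/load round trip through BytesIO, matching the packing comment in the proto. A standalone sketch of that contract, independent of the serving classes:

    import numpy as np
    from io import BytesIO

    # client side: serialize an arbitrary ndarray into the bytes that go into Tensor.byte_data
    value = np.arange(6, dtype=np.float32).reshape(2, 3)
    buf = BytesIO()
    np.save(buf, value, allow_pickle=True)
    byte_data = buf.getvalue()

    # server side: recover the ndarray, as proto_tensor_2_numpy does for elem_type == 13
    restored = np.load(BytesIO(byte_data), allow_pickle=True)
    assert (restored == value).all()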
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
import grpc import grpc
import sys import sys
import time
import numpy as np import numpy as np
from numpy import * from numpy import *
import logging import logging
...@@ -24,6 +25,7 @@ from .channel import ChannelDataErrcode ...@@ -24,6 +25,7 @@ from .channel import ChannelDataErrcode
from .proto import pipeline_service_pb2 from .proto import pipeline_service_pb2
from .proto import pipeline_service_pb2_grpc from .proto import pipeline_service_pb2_grpc
import six import six
from io import BytesIO
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
...@@ -46,7 +48,8 @@ class PipelineClient(object): ...@@ -46,7 +48,8 @@ class PipelineClient(object):
self._stub = pipeline_service_pb2_grpc.PipelineServiceStub( self._stub = pipeline_service_pb2_grpc.PipelineServiceStub(
self._channel) self._channel)
def _pack_request_package(self, feed_dict, profile): def _pack_request_package(self, feed_dict, pack_tensor_format,
use_tensor_bytes, profile):
req = pipeline_service_pb2.Request() req = pipeline_service_pb2.Request()
logid = feed_dict.get("logid") logid = feed_dict.get("logid")
...@@ -69,38 +72,120 @@ class PipelineClient(object): ...@@ -69,38 +72,120 @@ class PipelineClient(object):
feed_dict.pop("clientip") feed_dict.pop("clientip")
np.set_printoptions(threshold=sys.maxsize) np.set_printoptions(threshold=sys.maxsize)
for key, value in feed_dict.items(): if pack_tensor_format is False:
req.key.append(key) # pack string key/val format
for key, value in feed_dict.items():
if (sys.version_info.major == 2 and isinstance(value, req.key.append(key)
(str, unicode)) or
((sys.version_info.major == 3) and isinstance(value, str))): if (sys.version_info.major == 2 and
req.value.append(value) isinstance(value, (str, unicode)) or
continue ((sys.version_info.major == 3) and isinstance(value, str))):
req.value.append(value)
if isinstance(value, np.ndarray): continue
req.value.append(value.__repr__())
elif isinstance(value, list): if isinstance(value, np.ndarray):
req.value.append(np.array(value).__repr__()) req.value.append(value.__repr__())
else: elif isinstance(value, list):
raise TypeError("only str and np.ndarray type is supported: {}". req.value.append(np.array(value).__repr__())
format(type(value))) else:
if profile: raise TypeError(
req.key.append(self._profile_key) "only str and np.ndarray type is supported: {}".format(
req.value.append(self._profile_value) type(value)))
if profile:
req.key.append(self._profile_key)
req.value.append(self._profile_value)
else:
# pack tensor format
for key, value in feed_dict.items():
one_tensor = req.tensors.add()
one_tensor.name = key
if isinstance(value, str):
one_tensor.str_data.append(value)
one_tensor.elem_type = 12 #12 => string in proto
continue
if isinstance(value, np.ndarray):
# copy shape
_LOGGER.debug(
"key:{}, use_tensor_bytes:{}, value.shape:{}, value.dtype:{}".
format(key, use_tensor_bytes, value.shape, value.dtype))
for one_dim in value.shape:
one_tensor.shape.append(one_dim)
# packed into bytes
if use_tensor_bytes is True:
np_bytes = BytesIO()
np.save(np_bytes, value, allow_pickle=True)
one_tensor.byte_data = np_bytes.getvalue()
one_tensor.elem_type = 13 #13 => bytes in proto
continue
flat_value = value.flatten().tolist()
# copy data
if value.dtype == "int64":
one_tensor.int64_data.extend(flat_value)
one_tensor.elem_type = 0
elif value.dtype == "float32":
one_tensor.float_data.extend(flat_value)
one_tensor.elem_type = 1
elif value.dtype == "int32":
one_tensor.int_data.extend(flat_value)
one_tensor.elem_type = 2
elif value.dtype == "float64":
one_tensor.float64_data.extend(flat_value)
one_tensor.elem_type = 3
elif value.dtype == "int16":
one_tensor.int_data.extend(flat_value)
one_tensor.elem_type = 4
elif value.dtype == "float16":
one_tensor.float_data.extend(flat_value)
one_tensor.elem_type = 5
elif value.dtype == "uint16":
one_tensor.uint32_data.extend(flat_value)
one_tensor.elem_type = 6
elif value.dtype == "uint8":
one_tensor.uint32_data.extend(flat_value)
one_tensor.elem_type = 7
elif value.dtype == "int8":
one_tensor.int_data.extend(flat_value)
one_tensor.elem_type = 8
elif value.dtype == "bool":
one_tensor.bool_data.extend(flat_value)
one_tensor.elem_type = 9
else:
_LOGGER.error(
"value type {} of tensor {} is not supported.".
format(value.dtype, key))
else:
raise TypeError(
"only str and np.ndarray type is supported: {}".format(
type(value)))
return req return req
def _unpack_response_package(self, resp, fetch): def _unpack_response_package(self, resp, fetch):
return resp return resp
def predict(self, feed_dict, fetch=None, asyn=False, profile=False): def predict(self,
feed_dict,
fetch=None,
asyn=False,
pack_tensor_format=False,
use_tensor_bytes=False,
profile=False,
log_id=0):
if not isinstance(feed_dict, dict): if not isinstance(feed_dict, dict):
raise TypeError( raise TypeError(
"feed must be dict type with format: {name: value}.") "feed must be dict type with format: {name: value}.")
if fetch is not None and not isinstance(fetch, list): if fetch is not None and not isinstance(fetch, list):
raise TypeError("fetch must be list type with format: [name].") raise TypeError("fetch must be list type with format: [name].")
req = self._pack_request_package(feed_dict, profile) print("PipelineClient::predict pack_data time:{}".format(time.time()))
req = self._pack_request_package(feed_dict, pack_tensor_format,
use_tensor_bytes, profile)
req.logid = log_id
if not asyn: if not asyn:
print("PipelineClient::predict before time:{}".format(time.time()))
resp = self._stub.inference(req) resp = self._stub.inference(req)
return self._unpack_response_package(resp, fetch) return self._unpack_response_package(resp, fetch)
else: else:
......
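For context, the new predict() keywords in the hunk above change how feed data leaves the client: with pack_tensor_format=False (the default) values are still repr()-ed into string key/value pairs, while pack_tensor_format=True copies them into Request.tensors, and use_tensor_bytes=True additionally serializes each ndarray through BytesIO + np.save into byte_data. A minimal, hedged usage sketch; the endpoint 127.0.0.1:18090 and the feed key "image" are made-up examples, not part of this change:

# Sketch only: endpoint, port and feed key are hypothetical.
import numpy as np
from paddle_serving_server.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18090'])

img = np.random.rand(1, 3, 224, 224).astype('float32')

# Legacy packing: values are repr()-ed into string key/value pairs.
ret = client.predict(feed_dict={"image": img})

# New packing: values go into Request.tensors; bytes path uses BytesIO + np.save.
ret = client.predict(feed_dict={"image": img},
                     pack_tensor_format=True,
                     use_tensor_bytes=True,
                     log_id=1)
print(ret.err_no, ret.err_msg)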
...@@ -22,6 +22,7 @@ from contextlib import closing ...@@ -22,6 +22,7 @@ from contextlib import closing
import multiprocessing import multiprocessing
import yaml import yaml
import io import io
import time
from .proto import pipeline_service_pb2_grpc, pipeline_service_pb2 from .proto import pipeline_service_pb2_grpc, pipeline_service_pb2
from . import operator from . import operator
...@@ -47,8 +48,9 @@ class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer): ...@@ -47,8 +48,9 @@ class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer):
_LOGGER.info("[PipelineServicer] succ init") _LOGGER.info("[PipelineServicer] succ init")
def inference(self, request, context): def inference(self, request, context):
_LOGGER.info("(log_id={}) inference request name:{} self.name:{}". _LOGGER.info(
format(request.logid, request.name, self._name)) "(log_id={}) inference request name:{} self.name:{} time:{}".format(
request.logid, request.name, self._name, time.time()))
if request.name != "" and request.name != self._name: if request.name != "" and request.name != self._name:
_LOGGER.error("(log_id={}) name dismatch error. request.name:{}," _LOGGER.error("(log_id={}) name dismatch error. request.name:{},"
"server.name={}".format(request.logid, request.name, "server.name={}".format(request.logid, request.name,
...@@ -339,7 +341,7 @@ class ServerYamlConfChecker(object): ...@@ -339,7 +341,7 @@ class ServerYamlConfChecker(object):
" or yml_dict can be selected as the parameter.") " or yml_dict can be selected as the parameter.")
if yml_file is not None: if yml_file is not None:
with io.open(yml_file, encoding='utf-8') as f: with io.open(yml_file, encoding='utf-8') as f:
conf = yaml.load(f.read()) conf = yaml.load(f.read(), yaml.FullLoader)
elif yml_dict is not None: elif yml_dict is not None:
conf = yml_dict conf = yml_dict
else: else:
...@@ -469,6 +471,7 @@ class ServerYamlConfChecker(object): ...@@ -469,6 +471,7 @@ class ServerYamlConfChecker(object):
"channel_size": 0, "channel_size": 0,
"is_thread_op": True, "is_thread_op": True,
"tracer": {}, "tracer": {},
"channel_recv_frist_arrive": False,
} }
conf_type = { conf_type = {
...@@ -477,6 +480,7 @@ class ServerYamlConfChecker(object): ...@@ -477,6 +480,7 @@ class ServerYamlConfChecker(object):
"use_profile": bool, "use_profile": bool,
"channel_size": int, "channel_size": int,
"is_thread_op": bool, "is_thread_op": bool,
"channel_recv_frist_arrive": bool,
} }
conf_qualification = { conf_qualification = {
......
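Two behavioural notes from the hunks above: the server config is now parsed with an explicit FullLoader (needed once pyyaml is bumped to >= 5.1 further down), and the dag section gains the channel_recv_frist_arrive switch (key spelled exactly as in the code). A hedged sketch of what the checker now accepts; the yaml fragment is illustrative, not taken from this commit:

# Sketch: parsing a pipeline config the way ServerYamlConfChecker now does.
import yaml

yml_text = """
dag:
    is_thread_op: false
    channel_size: 0
    channel_recv_frist_arrive: true   # new option introduced in this change
"""
conf = yaml.load(yml_text, yaml.FullLoader)       # explicit loader, PyYAML >= 5.1
print(conf["dag"]["channel_recv_frist_arrive"])   # -> True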
...@@ -12,25 +12,120 @@ ...@@ -12,25 +12,120 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
syntax = "proto2"; syntax = "proto3";
package baidu.paddle_serving.pipeline_serving; package baidu.paddle_serving.pipeline_serving;
// Tensor structure, consistent with PADDLE variable types.
// Descriptions of input and output data.
message Tensor {
// VarType: INT64
repeated int64 int64_data = 1;
// VarType: FP32, FP16
repeated float float_data = 2;
// VarType: INT32, INT16, INT8
repeated int32 int_data = 3;
// VarType: FP64
repeated double float64_data = 4;
// VarType: BF16, UINT8
repeated uint32 uint32_data = 5;
// VarType: BOOL
repeated bool bool_data = 6;
// (No support) VarType: COMPLEX64, 2x represents the real part, 2x+1
// represents the imaginary part
repeated float complex64_data = 7;
// (No support) VarType: COMPLEX128, 2x represents the real part, 2x+1
// represents the imaginary part
repeated double complex128_data = 8;
// VarType: STRING
repeated string str_data = 9;
// VarType: BYTES, suitable for big data. No need to save data types and
// dimensions
// pack method: pack by BytesIO, saved by np.save
// unpack method: load by np.load, unpack by BytesIO.
bytes byte_data = 10;
// Element types:
// 0 => INT64
// 1 => FP32
// 2 => INT32
// 3 => FP64
// 4 => INT16
// 5 => FP16
// 6 => BF16
// 7 => UINT8
// 8 => INT8
// 9 => BOOL
// 10 => COMPLEX64
// 11 => COMPLEX128
// 12 => STRING
// 13 => BYTES
int32 elem_type = 20;
// Shape of the tensor, including batch dimensions.
repeated int32 shape = 21;
// Level of data(LOD), support variable length data, only for fetch tensor
// currently.
repeated int32 lod = 22;
// Correspond to the variable 'name' in the model description prototxt.
string name = 23;
};
// The structure of the service request. The input data can be repeated string
// pairs or tensors.
message Request { message Request {
// The input data are repeated string pairs.
// for examples. key is "words", value is the string of words.
repeated string key = 1; repeated string key = 1;
repeated string value = 2; repeated string value = 2;
optional string name = 3;
optional string method = 4; // The input data are repeated tensors for complex data structures.
optional int64 logid = 5; // Because tensors can save more data information and reduce the amount of data
optional string clientip = 6; // transferred.
repeated Tensor tensors = 3;
// The name field in the RESTful API
string name = 4;
// The method field in the RESTful API
string method = 5;
// For tracing requests and logs
int64 logid = 6;
// For tracking sources
string clientip = 7;
}; };
// The structure of the service response. The output data can be repeated string
// pairs or tensors.
message Response { message Response {
optional int32 err_no = 1; // Error code
optional string err_msg = 2; int32 err_no = 1;
// Error messages
string err_msg = 2;
// The results of string pairs
repeated string key = 3; repeated string key = 3;
repeated string value = 4; repeated string value = 4;
// The results of tensors
repeated Tensor tensors = 5;
}; };
// Python pipeline service
service PipelineService { service PipelineService {
rpc inference(Request) returns (Response) {} rpc inference(Request) returns (Response) {}
}; };
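The BYTES variant (elem_type 13) described in the Tensor comments can be exercised in isolation. A minimal sketch of the pack/unpack round trip using only numpy, matching "pack by BytesIO, saved by np.save / load by np.load":

# Round trip for Tensor.byte_data (elem_type 13): ndarray -> bytes -> ndarray.
import numpy as np
from io import BytesIO

arr = np.arange(6, dtype=np.float32).reshape(2, 3)

# Pack: what the client does before filling Tensor.byte_data.
buf = BytesIO()
np.save(buf, arr, allow_pickle=True)
byte_data = buf.getvalue()

# Unpack: what the receiving side is expected to do with Tensor.byte_data.
restored = np.load(BytesIO(byte_data), allow_pickle=True)
assert restored.dtype == arr.dtype and (restored == arr).all()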
...@@ -39,7 +39,7 @@ class AvailablePortGenerator(object): ...@@ -39,7 +39,7 @@ class AvailablePortGenerator(object):
def port_is_available(port): def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port)) result = sock.connect_ex(('127.0.0.1', port))
if result != 0: if result != 0:
return True return True
else: else:
......
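The one-line change above matters because connect_ex against 0.0.0.0 is not reliably meaningful on every platform; probing the loopback address is the portable way to ask whether something is already listening. A self-contained sketch of the probe as it now reads (the port number is an arbitrary example):

# A port is treated as available when nothing on localhost accepts the connection.
import socket
from contextlib import closing

def port_is_available(port):
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(2)
        return sock.connect_ex(('127.0.0.1', port)) != 0

print(port_is_available(12000))  # True if the (arbitrary) port 12000 is free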
...@@ -7,7 +7,7 @@ protobuf>=3.12.2 ...@@ -7,7 +7,7 @@ protobuf>=3.12.2
grpcio-tools>=1.28.1 grpcio-tools>=1.28.1
grpcio>=1.28.1 grpcio>=1.28.1
func-timeout>=4.3.5 func-timeout>=4.3.5
pyyaml>=1.3.0 pyyaml>=5.1
flask>=1.1.2 flask>=1.1.2
click==7.1.2 click==7.1.2
itsdangerous==1.1.0 itsdangerous==1.1.0
......
...@@ -6,7 +6,7 @@ google>=2.0.3 ...@@ -6,7 +6,7 @@ google>=2.0.3
opencv-python==4.2.0.32 opencv-python==4.2.0.32
protobuf>=3.12.2 protobuf>=3.12.2
func-timeout>=4.3.5 func-timeout>=4.3.5
pyyaml>=1.3.0 pyyaml>=5.1
flask>=1.1.2 flask>=1.1.2
click==7.1.2 click==7.1.2
itsdangerous==1.1.0 itsdangerous==1.1.0
......
...@@ -33,7 +33,7 @@ util.gen_pipeline_code("paddle_serving_server") ...@@ -33,7 +33,7 @@ util.gen_pipeline_code("paddle_serving_server")
REQUIRED_PACKAGES = [ REQUIRED_PACKAGES = [
'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2', 'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
'flask >= 1.1.1', 'click==7.1.2', 'itsdangerous==1.1.0', 'Jinja2==2.11.3', 'flask >= 1.1.1,<2.0.0', 'click==7.1.2', 'itsdangerous==1.1.0', 'Jinja2==2.11.3',
'MarkupSafe==1.1.1', 'Werkzeug==1.0.1', 'func_timeout', 'pyyaml' 'MarkupSafe==1.1.1', 'Werkzeug==1.0.1', 'func_timeout', 'pyyaml'
] ]
......
...@@ -40,10 +40,10 @@ go env -w GO111MODULE=auto ...@@ -40,10 +40,10 @@ go env -w GO111MODULE=auto
build_whl_list=(build_cpu_server build_gpu_server build_client build_app) build_whl_list=(build_cpu_server build_gpu_server build_client build_app)
rpc_model_list=(grpc_fit_a_line grpc_yolov4 pipeline_imagenet bert_rpc_gpu bert_rpc_cpu ResNet50_rpc \ rpc_model_list=(grpc_fit_a_line grpc_yolov4 pipeline_imagenet bert_rpc_gpu bert_rpc_cpu ResNet50_rpc \
lac_rpc cnn_rpc bow_rpc lstm_rpc fit_a_line_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \ lac_rpc_asyn cnn_rpc_asyn bow_rpc lstm_rpc fit_a_line_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \
criteo_ctr_rpc_cpu criteo_ctr_rpc_gpu ocr_rpc yolov4_rpc_gpu faster_rcnn_hrnetv2p_w18_1x_encrypt \ criteo_ctr_rpc_cpu criteo_ctr_rpc_gpu ocr_rpc yolov4_rpc_gpu faster_rcnn_hrnetv2p_w18_1x_encrypt \
faster_rcnn_model_rpc low_precision_resnet50_int8 ocr_c++_service) faster_rcnn_model_rpc low_precision_resnet50_int8 ocr_c++_service ocr_c++_service_asyn)
http_model_list=(fit_a_line_http lac_http cnn_http bow_http lstm_http ResNet50_http bert_http \ http_model_list=(fit_a_line_http lac_http imdb_http_proto imdb_http_json imdb_grpc ResNet50_http bert_http \
pipeline_ocr_cpu_http) pipeline_ocr_cpu_http)
function setproxy() { function setproxy() {
...@@ -492,7 +492,7 @@ function ResNet101_rpc() { ...@@ -492,7 +492,7 @@ function ResNet101_rpc() {
kill_server_process kill_server_process
} }
function cnn_rpc() { function cnn_rpc_asyn() {
dir=${log_dir}rpc_model/cnn_rpc/ dir=${log_dir}rpc_model/cnn_rpc/
check_dir ${dir} check_dir ${dir}
unsetproxy unsetproxy
...@@ -500,8 +500,9 @@ function cnn_rpc() { ...@@ -500,8 +500,9 @@ function cnn_rpc() {
data_dir=${data}imdb/ data_dir=${data}imdb/
link_data ${data_dir} link_data ${data_dir}
sed -i 's/9292/8865/g' test_client.py sed -i 's/9292/8865/g' test_client.py
${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 8865 > ${dir}server_log.txt 2>&1 & ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 8865 --op_num 4 --thread 10 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
check_result server 5 check_result server 8
check_gpu_memory 0
head test_data/part-0 | ${py_version} test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1 head test_data/part-0 | ${py_version} test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
check_result client "cnn_CPU_RPC server test completed" check_result client "cnn_CPU_RPC server test completed"
kill_server_process kill_server_process
...@@ -537,7 +538,7 @@ function lstm_rpc() { ...@@ -537,7 +538,7 @@ function lstm_rpc() {
kill_server_process kill_server_process
} }
function lac_rpc() { function lac_rpc_asyn() {
dir=${log_dir}rpc_model/lac_rpc/ dir=${log_dir}rpc_model/lac_rpc/
check_dir ${dir} check_dir ${dir}
unsetproxy unsetproxy
...@@ -545,8 +546,9 @@ function lac_rpc() { ...@@ -545,8 +546,9 @@ function lac_rpc() {
data_dir=${data}lac/ data_dir=${data}lac/
link_data ${data_dir} link_data ${data_dir}
sed -i 's/9292/8868/g' lac_client.py sed -i 's/9292/8868/g' lac_client.py
${py_version} -m paddle_serving_server.serve --model lac_model/ --port 8868 > ${dir}server_log.txt 2>&1 & ${py_version} -m paddle_serving_server.serve --model lac_model/ --port 8868 --gpu_ids 0 --op_num 2 > ${dir}server_log.txt 2>&1 &
check_result server 5 check_result server 8
check_gpu_memory 0
echo "我爱北京天安门" | ${py_version} lac_client.py lac_client/serving_client_conf.prototxt lac_dict/ > ${dir}client_log.txt 2>&1 echo "我爱北京天安门" | ${py_version} lac_client.py lac_client/serving_client_conf.prototxt lac_dict/ > ${dir}client_log.txt 2>&1
check_result client "lac_CPU_RPC server test completed" check_result client "lac_CPU_RPC server test completed"
kill_server_process kill_server_process
...@@ -768,10 +770,9 @@ function fit_a_line_http() { ...@@ -768,10 +770,9 @@ function fit_a_line_http() {
check_dir ${dir} check_dir ${dir}
unsetproxy unsetproxy
cd ${build_path}/python/examples/fit_a_line cd ${build_path}/python/examples/fit_a_line
sed -i "s/9393/8871/g" test_server.py ${py_version} -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 > ${dir}server_log.txt 2>&1 &
${py_version} test_server.py > ${dir}server_log.txt 2>&1 &
check_result server 10 check_result server 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:8871/uci/prediction > ${dir}client_log.txt 2>&1 ${py_version} test_httpclient.py uci_housing_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
check_result client "fit_a_line_CPU_HTTP server test completed" check_result client "fit_a_line_CPU_HTTP server test completed"
kill_server_process kill_server_process
} }
...@@ -781,61 +782,64 @@ function lac_http() { ...@@ -781,61 +782,64 @@ function lac_http() {
check_dir ${dir} check_dir ${dir}
unsetproxy unsetproxy
cd ${build_path}/python/examples/lac cd ${build_path}/python/examples/lac
${py_version} lac_web_service.py lac_model/ lac_workdir 8872 > ${dir}server_log.txt 2>&1 & ${py_version} -m paddle_serving_server.serve --model lac_model/ --port 9292 > ${dir}server_log.txt 2>&1 &
check_result server 10 check_result server 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:8872/lac/prediction > ${dir}client_log.txt 2>&1 echo "我爱北京天安门" | ${py_version} lac_http_client.py lac_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
check_result client "lac_CPU_HTTP server test completed" check_result client "lac_CPU_HTTP server test completed"
kill_server_process kill_server_process
} }
function cnn_http() { function imdb_http_proto() {
dir=${log_dir}http_model/cnn_http/ dir=${log_dir}http_model/imdb_http_proto/
check_dir ${dir} check_dir ${dir}
unsetproxy unsetproxy
cd ${build_path}/python/examples/imdb cd ${build_path}/python/examples/imdb
${py_version} text_classify_service.py imdb_cnn_model/ workdir/ 8873 imdb.vocab > ${dir}server_log.txt 2>&1 & ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 > ${dir}server_log.txt 2>&1 &
check_result server 10 check_result server 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8873/imdb/prediction > ${dir}client_log.txt 2>&1 head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
check_result client "cnn_CPU_HTTP server test completed" check_result client "imdb_CPU_HTTP-proto server test completed"
kill_server_process kill_server_process
} }
function bow_http() { function imdb_http_json() {
dir=${log_dir}http_model/bow_http/ dir=${log_dir}http_model/imdb_http_json/
check_dir ${dir} check_dir ${dir}
unsetproxy unsetproxy
cd ${build_path}/python/examples/imdb cd ${build_path}/python/examples/imdb
${py_version} text_classify_service.py imdb_bow_model/ workdir/ 8874 imdb.vocab > ${dir}server_log.txt 2>&1 & ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 > ${dir}server_log.txt 2>&1 &
check_result server 10 check_result server 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8874/imdb/prediction > ${dir}client_log.txt 2>&1 sed -i "s/#client.set_http_proto(True)/client.set_http_proto(False)/g" test_http_client.py
check_result client "bow_CPU_HTTP server test completed" head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
check_result client "imdb_CPU_HTTP-json server test completed"
kill_server_process kill_server_process
} }
function lstm_http() { function imdb_grpc() {
dir=${log_dir}http_model/lstm_http/ dir=${log_dir}http_model/imdb_grpc/
check_dir ${dir} check_dir ${dir}
unsetproxy unsetproxy
cd ${build_path}/python/examples/imdb cd ${build_path}/python/examples/imdb
${py_version} text_classify_service.py imdb_bow_model/ workdir/ 8875 imdb.vocab > ${dir}server_log.txt 2>&1 & ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 --gpu_ids 1 > ${dir}server_log.txt 2>&1 &
check_result server 10 check_result server 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8875/imdb/prediction > ${dir}client_log.txt 2>&1 check_gpu_memory 1
check_result client "lstm_CPU_HTTP server test completed" sed -i "s/client.set_http_proto(False)/#client.set_http_proto(False)/g" test_http_client.py
sed -i "s/#client.set_use_grpc_client(True)/client.set_use_grpc_client(True)/g" test_http_client.py
head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
check_result client "imdb_GPU_GRPC server test completed"
kill_server_process kill_server_process
} }
function ResNet50_http() { function ResNet50_http() {
echo "pass" dir=${log_dir}http_model/ResNet50_http/
# dir=${log_dir}http_model/ResNet50_http/ check_dir ${dir}
# check_dir ${dir} unsetproxy
# unsetproxy cd ${build_path}/python/examples/imagenet
# cd ${build_path}/python/examples/imagenet ${py_version} -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
# ${py_version} resnet50_web_service.py ResNet50_vd_model gpu 8876 > ${dir}server_log.txt 2>&1 & check_result server 10
# check_result server 10 check_gpu_memory 0
# check_gpu_memory 0 ${py_version} resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
# curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:8876/image/prediction > ${dir}client_log.txt 2>&1 check_result client "ResNet50_GPU_HTTP server test completed"
# check_result client "ResNet50_GPU_HTTP server test completed" kill_server_process
# kill_server_process
} }
function bert_http() { function bert_http() {
...@@ -845,10 +849,11 @@ function bert_http() { ...@@ -845,10 +849,11 @@ function bert_http() {
cd ${build_path}/python/examples/bert cd ${build_path}/python/examples/bert
cp data-c.txt.1 data-c.txt cp data-c.txt.1 data-c.txt
cp vocab.txt.1 vocab.txt cp vocab.txt.1 vocab.txt
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0,1
${py_version} bert_web_service.py bert_seq128_model/ 8878 > ${dir}server_log.txt 2>&1 & ${py_version} -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
check_result server 8 check_result server 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:8878/bert/prediction > ${dir}client_log.txt 2>&1 check_gpu_memory 0
head data-c.txt | ${py_version} bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
check_result client "bert_GPU_HTTP server test completed" check_result client "bert_GPU_HTTP server test completed"
kill_server_process kill_server_process
} }
...@@ -904,7 +909,7 @@ function ocr_c++_service() { ...@@ -904,7 +909,7 @@ function ocr_c++_service() {
cp -r ocr_det_client/ ./ocr_det_client_cp cp -r ocr_det_client/ ./ocr_det_client_cp
rm -rf ocr_det_client rm -rf ocr_det_client
mv ocr_det_client_cp ocr_det_client mv ocr_det_client_cp ocr_det_client
sed -i "s/feed_type: 1/feed_type: 3/g" ocr_det_client/serving_client_conf.prototxt sed -i "s/feed_type: 1/feed_type: 20/g" ocr_det_client/serving_client_conf.prototxt
sed -i "s/shape: 3/shape: 1/g" ocr_det_client/serving_client_conf.prototxt sed -i "s/shape: 3/shape: 1/g" ocr_det_client/serving_client_conf.prototxt
sed -i '7,8d' ocr_det_client/serving_client_conf.prototxt sed -i '7,8d' ocr_det_client/serving_client_conf.prototxt
echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC server started${RES}" echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC server started${RES}"
...@@ -920,6 +925,23 @@ function ocr_c++_service() { ...@@ -920,6 +925,23 @@ function ocr_c++_service() {
kill_server_process kill_server_process
} }
function ocr_c++_service_asyn() {
dir=${log_dir}rpc_model/ocr_c++_serving/
cd ${build_path}/python/examples/ocr
check_dir ${dir}
echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC asyn_server started${RES}"
$py_version -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 --op_num 4 > ${dir}server_log.txt 2>&1 &
check_result server 8
check_gpu_memory 0
echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC client started${RES}"
echo "------------------first:"
$py_version ocr_cpp_client.py ocr_det_client ocr_rec_client
echo "------------------second:"
$py_version ocr_cpp_client.py ocr_det_client ocr_rec_client > ${dir}client_log.txt 2>&1
check_result client "OCR_C++_Service_GPU_RPC server test completed"
kill_server_process
}
function build_all_whl() { function build_all_whl() {
for whl in ${build_whl_list[@]} for whl in ${build_whl_list[@]}
do do
......
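Outside the CI harness, the asynchronous launch exercised by cnn_rpc_asyn/lac_rpc_asyn above comes down to passing --op_num (plus --thread/--gpu_ids where relevant) to paddle_serving_server.serve. A hedged Python sketch that mirrors those shell lines; the model directory, port, and wait time are examples copied from the script, not requirements:

# Sketch: start the server with the async op pool the way the updated CI does,
# wait for it to come up, then shut it down.
import signal
import subprocess
import time

cmd = ["python3", "-m", "paddle_serving_server.serve",
       "--model", "imdb_cnn_model/",
       "--port", "8865",
       "--op_num", "4",          # async worker ops, as in cnn_rpc_asyn
       "--thread", "10",
       "--gpu_ids", "0"]
server = subprocess.Popen(cmd)
time.sleep(8)                    # roughly mirrors `check_result server 8`
# ... run a client against 127.0.0.1:8865 here ...
server.send_signal(signal.SIGINT)
server.wait()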