diff --git a/README.md b/README.md
index 6c6d0924bf44137dc463fb68599713835d4cb0f2..34bc154956ec62cf66de97071986d174d97d863b 100644
--- a/README.md
+++ b/README.md
@@ -188,7 +188,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
 | `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
 | `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
 | `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
-| `use_calib` | bool | False | Only for deployment with TensorRT |
+| `use_calib` | bool | False | Use TRT int8 calibration |
 | `gpu_multi_stream` | bool | False | EnableGpuMultiStream to get larger QPS |
 
 #### Description of asynchronous model
diff --git a/README_CN.md b/README_CN.md
index a1bb9f9e7c513a3d772cce2d56d0bcd76e3548f9..d4bcc1a205bb787971373293f697b23b55435979 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -187,7 +187,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
 | `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
 | `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
 | `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
-| `use_calib` | bool | False | Only for deployment with TensorRT |
+| `use_calib` | bool | False | Use TRT int8 calibration |
 | `gpu_multi_stream` | bool | False | EnableGpuMultiStream to get larger QPS |
 
 #### 异步模型的说明
diff --git a/cmake/external/cudnn.cmake b/cmake/external/cudnn.cmake
index 98466d44fc0dd91ef0cc8e8eac2660c42a19267c..f560d8723523264881d692a88a2d276035a7ac77 100644
--- a/cmake/external/cudnn.cmake
+++ b/cmake/external/cudnn.cmake
@@ -61,8 +61,11 @@ else()
 endif()
 
 if(CUDNN_FOUND)
-  file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
-
+  if(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn_version.h")
+    file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
+  elseif(EXISTS "${CUDNN_INCLUDE_DIR}/cudnn.h")
+    file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+  endif()
   get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
 
   string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake
index 78c398010153d9aa482388bfad34cb71ffd01c6e..ef1f0bb9d05b0b9c23c4e98dabfdb335df2a1e4e 100644
--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -27,52 +27,54 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/
 message( "WITH_GPU = ${WITH_GPU}")
-
 # Paddle Version should be one of:
 # latest: latest develop build
 # version number like 1.5.2
-SET(PADDLE_VERSION "2.1.0")
+SET(PADDLE_VERSION "2.2.0-rc0")
 
 if (WITH_GPU)
-  if(CUDA_VERSION EQUAL 11.0)
-    set(CUDA_SUFFIX "cuda11.0-cudnn8-mkl-gcc8.2")
+  message("CUDA: ${CUDA_VERSION}, CUDNN_MAJOR_VERSION: ${CUDNN_MAJOR_VERSION}")
+  # cuda 11.0 is not supported, 11.2 would be added.
+  if(CUDA_VERSION EQUAL 10.1)
+    set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5")
     set(WITH_TRT ON)
   elseif(CUDA_VERSION EQUAL 10.2)
-    set(CUDA_SUFFIX "cuda10.2-cudnn8-mkl-gcc8.2")
-    set(WITH_TRT ON)
-  elseif(CUDA_VERSION EQUAL 10.1)
-    set(CUDA_SUFFIX "cuda10.1-cudnn7-mkl-gcc8.2")
+    if(CUDNN_MAJOR_VERSION EQUAL 7)
+      set(CUDA_SUFFIX "x86-64_gcc5.4_avx_mkl_cuda10.2_cudnn7.6.5_trt6.0.1.5")
+      set(WITH_TRT ON)
+    elseif(CUDNN_MAJOR_VERSION EQUAL 8)
+      set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda10.2_cudnn8.1.1_trt7.2.3.4")
+      set(WITH_TRT ON)
+    endif()
+  elseif(CUDA_VERSION EQUAL 11.2)
+    set(CUDA_SUFFIX "x86-64_gcc8.2_avx_mkl_cuda11.2_cudnn8.2.1_trt8.0.3.4")
     set(WITH_TRT ON)
-  elseif(CUDA_VERSION EQUAL 10.0)
-    set(CUDA_SUFFIX "cuda10-cudnn7-avx-mkl")
-  elseif(CUDA_VERSION EQUAL 9.0)
-    set(CUDA_SUFFIX "cuda9-cudnn7-avx-mkl")
   endif()
 else()
   set(WITH_TRT OFF)
 endif()
 
 if (WITH_GPU)
-  SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-${CUDA_SUFFIX}")
+  SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/GPU/${CUDA_SUFFIX}")
 elseif (WITH_LITE)
   if (WITH_XPU)
-    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}-xpu")
+    SET(PADDLE_LIB_VERSION "arm64_gcc7.3_openblas")
   else()
     SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}")
   endif()
 else()
   if (WITH_AVX)
     if (WITH_MKLML)
-      SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-avx-mkl")
+      SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_avx_mkl")
     else()
-      SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-avx-openblas")
+      SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_avx_openblas")
     endif()
   else()
-    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-noavx-openblas")
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}/cxx_c/Linux/CPU/gcc8.2_openblas")
   endif()
 endif()
 
 if(WITH_LITE)
-  SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
+  SET(PADDLE_LIB_PATH "https://paddle-inference-lib.bj.bcebos.com/2.2.0-rc0/cxx_c/Linux/XPU/${PADDLE_LIB_VERSION}/paddle_inference_install_dir.tar.gz ")
 else()
   SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
 endif()
diff --git a/core/configure/proto/general_model_service.proto b/core/configure/proto/general_model_service.proto
index 89ac489f8ae3b90b74c94a3f9f3c82711086cd64..c2deab2f69ea6f6ca5e77354ec955bf679f9a3d6 100644
--- a/core/configure/proto/general_model_service.proto
+++ b/core/configure/proto/general_model_service.proto
@@ -12,41 +12,97 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-syntax = "proto2";
+syntax = "proto3";
 package baidu.paddle_serving.predictor.general_model;
 option java_multiple_files = true;
+option cc_generic_services = true;
 
 message Tensor {
-  repeated string data = 1;
-  repeated int32 int_data = 2;
-  repeated int64 int64_data = 3;
-  repeated float float_data = 4;
-  optional int32 elem_type =
-      5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
-  repeated int32 shape = 6; // shape should include batch
-  repeated int32 lod = 7; // only for fetch tensor currently
-  optional string name = 8; // get from the Model prototxt
-  optional string alias_name = 9; // get from the Model prototxt
+  // VarType: INT64
+  repeated int64 int64_data = 1;
+
+  // VarType: FP32
+  repeated float float_data = 2;
+
+  // VarType: INT32
+  repeated int32 int_data = 3;
+
+  // VarType: FP64
+  repeated double float64_data = 4;
+
+  // VarType: UINT32
+  repeated uint32 uint32_data = 5;
+
+  // VarType: BOOL
+  repeated bool bool_data = 6;
+
+  // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
+  // represents the imaginary part
+  repeated float complex64_data = 7;
+
+  // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
+  // represents the imaginary part
+  repeated double complex128_data = 8;
+
+  // VarType: STRING
+  repeated string data = 9;
+
+  // Element types:
+  // 0 => INT64
+  // 1 => FP32
+  // 2 => INT32
+  // 3 => FP64
+  // 4 => INT16
+  // 5 => FP16
+  // 6 => BF16
+  // 7 => UINT8
+  // 8 => INT8
+  // 9 => BOOL
+  // 10 => COMPLEX64
+  // 11 => COMPLEX128
+  // 20 => STRING
+  int32 elem_type = 10;
+
+  // Shape of the tensor, including batch dimensions.
+  repeated int32 shape = 11;
+
+  // Level of data(LOD), support variable length data, only for fetch tensor
+  // currently.
+  repeated int32 lod = 12;
+
+  // Correspond to the variable 'name' in the model description prototxt.
+  string name = 13;
+
+  // Correspond to the variable 'alias_name' in the model description prototxt.
+  string alias_name = 14; // get from the Model prototxt
+
+  // VarType: FP16, INT16, INT8, BF16, UINT8
+  bytes tensor_content = 15;
 };
 
 message Request {
   repeated Tensor tensor = 1;
   repeated string fetch_var_names = 2;
-  optional bool profile_server = 3 [ default = false ];
-  required uint64 log_id = 4 [ default = 0 ];
+  bool profile_server = 3;
+  uint64 log_id = 4;
 };
 
 message Response {
   repeated ModelOutput outputs = 1;
   repeated int64 profile_time = 2;
+  // Error code
+  int32 err_no = 3;
+
+  // Error messages
+  string err_msg = 4;
 };
 
 message ModelOutput {
   repeated Tensor tensor = 1;
-  optional string engine_name = 2;
+  string engine_name = 2;
 }
 
 service GeneralModelService {
-  rpc inference(Request) returns (Response) {}
-  rpc debug(Request) returns (Response) {}
+  rpc inference(Request) returns (Response);
+  rpc debug(Request) returns (Response);
 };
diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto
old mode 100644
new mode 100755
index 5cace06420e29e1590218f63777c85bbcf504b29..13b9d39553b9219f0ab7f494f58ab0b7cfd3b7e8
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -22,11 +22,8 @@ message EngineDesc {
   required string reloadable_type = 4;
   required string model_dir = 5;
   repeated int32 gpu_ids = 6;
-  required int32 runtime_thread_num = 7;
-  required int32 batch_infer_size = 8;
-  required int32 enable_batch_align = 9;
-  optional string version_file = 10;
-  optional string version_type = 11;
+  optional string version_file = 7;
+  optional string version_type = 8;
 
   /*
    * Sparse Parameter Service type. Valid types are:
@@ -39,17 +36,34 @@ message EngineDesc {
     LOCAL = 1;
     REMOTE = 2;
   }
-  optional SparseParamServiceType sparse_param_service_type = 12;
-  optional string sparse_param_service_table_name = 13;
-  optional bool enable_memory_optimization = 14;
-  optional bool enable_ir_optimization = 15;
-  optional bool use_trt = 16;
-  optional bool use_lite = 17;
-  optional bool use_xpu = 18;
-  optional bool use_gpu = 19;
-  optional bool combined_model = 20;
-  optional bool encrypted_model = 21;
-  optional bool gpu_multi_stream = 22;
+  optional SparseParamServiceType sparse_param_service_type = 10;
+  optional string sparse_param_service_table_name = 11;
+  optional bool enable_memory_optimization = 12;
+  optional bool enable_ir_optimization = 13;
+  optional bool use_trt = 14;
+  optional bool use_lite = 15;
+  optional bool use_xpu = 16;
+  optional bool use_gpu = 17;
+  optional bool combined_model = 18;
+  optional bool encrypted_model = 19;
+  optional bool gpu_multi_stream = 20;
+
+  /*
+   * "runtime_thread_num": n == 0 means don`t use Asynchronous task scheduling
+   * mode.
+   * n > 0 means how many Predictor for this engine in Asynchronous task
+   * scheduling mode.
+   * "batch_infer_size": the max batch for this engine in Asynchronous task
+   * scheduling mode.
+   * "enable_overrun": always put a whole task into the TaskQueue even if the
+   * total batch is bigger than "batch_infer_size".
+   * "allow_split_request": allow to split task(which is corresponding to
+   * request).
+ */ + optional int32 runtime_thread_num = 30 [ default = 0 ]; + optional int32 batch_infer_size = 31 [ default = 32 ]; + optional bool enable_overrun = 32 [ default = false ]; + optional bool allow_split_request = 33 [ default = true ]; }; // model_toolkit conf @@ -61,11 +75,14 @@ message ResourceConf { repeated string model_toolkit_file = 2; repeated string general_model_path = 3; repeated string general_model_file = 4; - optional string cube_config_path = 5; - optional string cube_config_file = 6; - optional int32 cube_quant_bits = 7; // set 0 if no quant. - optional string auth_product_name = 8; - optional string auth_container_id = 9; + + optional string cube_config_path = 10; + optional string cube_config_file = 11; + optional int32 cube_quant_bits = 12; + optional string cube_cache_path = 13; + + optional string auth_product_name = 20; + optional string auth_container_id = 21; }; // DAG node depency info diff --git a/core/cube/cube-api/go-api/conf/cube.conf b/core/cube/cube-api/go-api/conf/cube.conf new file mode 100644 index 0000000000000000000000000000000000000000..0a21e83926c722b92a1b6efd9202b5d2c9c29418 --- /dev/null +++ b/core/cube/cube-api/go-api/conf/cube.conf @@ -0,0 +1,11 @@ +[{ + "dict_name": "test", + "shard": 2, + "nodes": [{ + "ip": "127.0.0.1", + "port": 8731 + },{ + "ip": "127.0.0.1", + "port": 8730 + }] +}] diff --git a/core/cube/cube-api/go-api/demo.go b/core/cube/cube-api/go-api/demo.go new file mode 100644 index 0000000000000000000000000000000000000000..bd82040db74890f2b4dd1c24780d1a8bff9e91b8 --- /dev/null +++ b/core/cube/cube-api/go-api/demo.go @@ -0,0 +1,33 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" +) + +func main() { + dict_name := flag.String("n", "test", "cube name") + conf_path := flag.String("c", "./conf/cube.conf", "cube conf path") + input_path := flag.String("i", "./input.json", "keys to seek") + output_path := flag.String("o", "./output.json", "result to save") + flag.Parse() + bytes, err := ioutil.ReadFile(*conf_path) + if err != nil { + fmt.Println("读取配置文件失败", err) + return + } + var meta Meta + err = json.Unmarshal(bytes, &meta.Servers) + if err != nil { + fmt.Println("解析数据失败", err) + return + } + + err = meta.Seek(*dict_name, *input_path, *output_path) + if err != nil { + fmt.Println(err) + } + return +} diff --git a/core/cube/cube-api/go-api/input.json b/core/cube/cube-api/go-api/input.json new file mode 100644 index 0000000000000000000000000000000000000000..1eb1f62b7d1cbf0ee9a2a9421683ae84c9f3cdce --- /dev/null +++ b/core/cube/cube-api/go-api/input.json @@ -0,0 +1,2 @@ +{"keys": [0,1,2,3,4,5,6,7]} +{"keys": [1]} diff --git a/core/cube/cube-api/go-api/meta.go b/core/cube/cube-api/go-api/meta.go new file mode 100644 index 0000000000000000000000000000000000000000..a7757b4d91f76de23632a3abaed69e4665ca7619 --- /dev/null +++ b/core/cube/cube-api/go-api/meta.go @@ -0,0 +1,24 @@ +package main + +import "fmt" + +type Meta struct { + Servers []CubeServer `json:"servers,omitempty"` +} + +func (meta *Meta) Seek(dict_name string, input string, output string) (err error) { + var server CubeServer + + for _, s := range meta.Servers { + if s.Name == dict_name { + server = s + break + } + } + if server.Name != dict_name { + err = fmt.Errorf("%s server not exist", dict_name) + return err + } + err = server.Seek(input, output) + return err +} diff --git a/core/cube/cube-api/go-api/server.go b/core/cube/cube-api/go-api/server.go new file mode 100644 index 
0000000000000000000000000000000000000000..2c6c81d3c0009de6fcfcd57c73feff134d7b3b73 --- /dev/null +++ b/core/cube/cube-api/go-api/server.go @@ -0,0 +1,117 @@ +package main + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "log" + "net/http" + "os" +) + +type Input struct { + Keys []uint64 `json:"keys"` +} + +type SingleValue struct { + Status uint32 `json:"status"` + Value string `json:"value"` +} +type Output struct { + Values []SingleValue `json:"values"` +} + +type ServerNode struct { + Ip string `json:"ip"` + Port uint64 `json:"port"` +} + +type CubeServer struct { + Name string `json:"dict_name"` + Shard uint64 `json:"shard"` + Nodes []ServerNode `json:"nodes"` +} + +func (server *CubeServer) SplitKeys(keys []uint64) (splited_keys map[uint64]Input, offset map[uint64][]uint64) { + splited_keys = make(map[uint64]Input) + + offset = make(map[uint64][]uint64) + for i, key := range keys { + shard_id := key % server.Shard + temp_split, _ := splited_keys[shard_id] + temp_split.Keys = append(temp_split.Keys, key) + splited_keys[shard_id] = temp_split + + temp_offset, _ := offset[shard_id] + temp_offset = append(temp_offset, uint64(i)) + offset[shard_id] = temp_offset + } + + return splited_keys, offset +} + +func (server *CubeServer) Seek(input string, output_path string) (err error) { + file, err := os.Open(input) + if err != nil { + return err + } + defer file.Close() + + buf := bufio.NewReader(file) + + for { + line, err := buf.ReadBytes('\n') + //line = strings.TrimSpace(line) + if err != nil || io.EOF == err { + break + } + var temp_input Input + json.Unmarshal(line, &temp_input) + key_nums := len(temp_input.Keys) + var output Output + output.Values = make([]SingleValue, key_nums+1) + splited_keys, offset := server.SplitKeys(temp_input.Keys) + for shard_id, keys := range splited_keys { + cur_output, _ := server.Post(shard_id, keys) + for index, single_value := range cur_output.Values { + output.Values[offset[shard_id][index]] = single_value + } + } + json_str, _ := json.Marshal(output) + fp, err := os.OpenFile(output_path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0755) + if err != nil { + log.Fatal(err) + } + defer fp.Close() + _, err = fp.Write(json_str) + } + return err +} + +func (server *CubeServer) Post(shard_id uint64, input Input) (output Output, err error) { + if shard_id >= uint64(len(server.Nodes)) { + err = fmt.Errorf("have no shard:%v", shard_id) + return output, err + } + json_str, _ := json.Marshal(input) + URL := fmt.Sprintf("http://%s:%v/DictService/seek", server.Nodes[shard_id].Ip, server.Nodes[shard_id].Port) + req, err := http.NewRequest("POST", URL, bytes.NewBuffer(json_str)) + if err != nil { + return output, err + } + req.Header.Set("Content-Type", "application/json") + client := &http.Client{} + resp, err := client.Do(req) + if err != nil { + return output, err + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return output, err + } + err = json.Unmarshal(body, &output) + return output, err +} diff --git a/core/cube/cube-api/python-api/conf/cube.conf b/core/cube/cube-api/python-api/conf/cube.conf new file mode 100644 index 0000000000000000000000000000000000000000..0a21e83926c722b92a1b6efd9202b5d2c9c29418 --- /dev/null +++ b/core/cube/cube-api/python-api/conf/cube.conf @@ -0,0 +1,11 @@ +[{ + "dict_name": "test", + "shard": 2, + "nodes": [{ + "ip": "127.0.0.1", + "port": 8731 + },{ + "ip": "127.0.0.1", + "port": 8730 + }] +}] diff --git a/core/cube/cube-api/python-api/demo.py b/core/cube/cube-api/python-api/demo.py new file mode 
100644 index 0000000000000000000000000000000000000000..c0d63e6bce58e04c9395d4448645203fd138d23c --- /dev/null +++ b/core/cube/cube-api/python-api/demo.py @@ -0,0 +1,76 @@ +#coding=utf-8 + +import requests +import sys +import json + +class Meta(object): + """记录cube分片server路由""" + def __init__(self, conf_path): + """根据配置文件初始化路由""" + self.server_api = "/DictService/seek" + self.server_meta = {} + with open(conf_path, "r", encoding="utf8") as fp: + cube_servcers = json.load(fp) + for server in cube_servcers: + self.server_meta[server["dict_name"]] = server + fp.close() + + def seek(self, dict_name, keys_path, save_path): + """查询""" + save_file = open(save_path, 'w') + with open(keys_path, "r", encoding="utf8") as fp: + lines = fp.readlines() + for line in lines: + json_line = json.loads(line) + values = [{} for i in range(len(json_line["keys"]))] + splited_keys, offset = self.split_keys(json_line) + for shard_id, keys in splited_keys.items(): + results = self.post(dict_name, shard_id, keys) + for i, result in enumerate(results["values"]): + values[offset[shard_id][i]] = result + cur_line_results = {} + cur_line_results["values"] = values + + json.dump(cur_line_results, save_file) + save_file.write("\n") + + fp.close() + save_file.close() + + def split_keys(self, json_line): + """根据key值及分片数判断去哪一个分片上查询""" + keys_split = {} + offset = {} + i = 0 + for key in json_line["keys"]: + shard_id = key % self.server_meta[dict_name]["shard"] + if shard_id not in keys_split: + keys_split[shard_id] = [] + keys_split[shard_id].append(key) + if shard_id not in offset: + offset[shard_id] = [] + offset[shard_id].append(i) + i += 1 + return keys_split, offset + + def post(self, dict_name, shard_id, keys): + """向分片server发送post请求""" + api = "http://%s:%s%s" % (self.server_meta[dict_name]["nodes"][shard_id]["ip"], + self.server_meta[dict_name]["nodes"][shard_id]["port"], + self.server_api) + data = {"keys": keys} + response = requests.post(api, json.dumps(data)) + return response.json() + + +if __name__ == '__main__': + if len(sys.argv) != 5: + print('please usage: python demo.py conf_path dict_name keys_path save_path') + exit(0) + conf_path = sys.argv[1] + dict_name = sys.argv[2] + keys_path = sys.argv[3] + save_path = sys.argv[4] + meta = Meta(conf_path) + meta.seek(dict_name, keys_path, save_path) diff --git a/core/cube/cube-api/python-api/input.json b/core/cube/cube-api/python-api/input.json new file mode 100644 index 0000000000000000000000000000000000000000..74db7d86a8f64047cdf3c3b9b119574b6a96d286 --- /dev/null +++ b/core/cube/cube-api/python-api/input.json @@ -0,0 +1,2 @@ +{"keys": [0,1,2,3,4,5,6,7]} +{"keys": [1]} \ No newline at end of file diff --git a/core/cube/cube-api/python-api/ptyhon_api.md b/core/cube/cube-api/python-api/ptyhon_api.md new file mode 100644 index 0000000000000000000000000000000000000000..1b321d9153f938e0ad94ddb321e3aae19da755b1 --- /dev/null +++ b/core/cube/cube-api/python-api/ptyhon_api.md @@ -0,0 +1,32 @@ +# cube python api说明文档 +参考[大规模稀疏参数服务Cube的部署和使用](https://github.com/PaddlePaddle/Serving/blob/master/doc/DEPLOY.md#2-大规模稀疏参数服务cube的部署和使用)文档进行cube的部署。 +使用python api,可替代上述文档中第3节预测服务的部署、使用 + +## 配置说明 +conf/cube.conf 以json格式,设置各个分片cube server的ip以及port,shard与分片数一致,示例: +```bash +[{ + "dict_name": "test", + "shard": 2, + "nodes": [{ + "ip": "127.0.0.1", + "port": 8731 + },{ + "ip": "127.0.0.1", + "port": 8730 + }] +}] +``` + +## 数据格式 +```bash +{"keys": [0,1,2,3,4,5,6,7]} +{"keys": [1]} +``` +支持批量查询,每次查询一行 + +## 使用 +```bash +cd ./python-api +python3 demo.py conf/cube.conf test input.json 
result.json +``` \ No newline at end of file diff --git a/core/cube/cube-api/python-api/result.json b/core/cube/cube-api/python-api/result.json new file mode 100644 index 0000000000000000000000000000000000000000..dbe282debae1f906d99f25b4e7ea0169c874b7ae --- /dev/null +++ b/core/cube/cube-api/python-api/result.json @@ -0,0 +1,2 @@ +{"values": [{"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}, {"status": 4294967295, "value": ""}]} +{"values": [{"status": 4294967295, "value": ""}]} diff --git a/core/general-client/CMakeLists.txt b/core/general-client/CMakeLists.txt index d6079317a75d3f45b61920836e6695bd6b31d951..0a7f2ee4b2899a1e6c6b4557dc26f767efe842e1 100644 --- a/core/general-client/CMakeLists.txt +++ b/core/general-client/CMakeLists.txt @@ -3,3 +3,24 @@ add_subdirectory(pybind11) pybind11_add_module(serving_client src/general_model.cpp src/pybind_general_model.cpp) target_link_libraries(serving_client PRIVATE -Wl,--whole-archive utils sdk-cpp pybind python -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib) endif() + +if(CLIENT) +FILE(GLOB client_srcs include/*.h src/client.cpp src/brpc_client.cpp) +add_library(client ${client_srcs}) +add_dependencies(client utils sdk-cpp) +target_link_libraries(client utils sdk-cpp) +endif() + +if(CLIENT) +include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../../) +add_executable(simple_client example/simple_client.cpp) + +add_dependencies(simple_client utils sdk-cpp client) + +target_link_libraries(simple_client -Wl,--whole-archive + -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib) + +target_link_libraries(simple_client utils) +target_link_libraries(simple_client sdk-cpp) +target_link_libraries(simple_client client) +endif() \ No newline at end of file diff --git a/core/general-client/README_CN.md b/core/general-client/README_CN.md new file mode 100755 index 0000000000000000000000000000000000000000..d391ed8612b5296843b7b0dfadf951a699c9dfa5 --- /dev/null +++ b/core/general-client/README_CN.md @@ -0,0 +1,33 @@ +# 用于Paddle Serving的C++客户端 + +(简体中文|[English](./README.md)) + +## 请求BRPC-Server + +### 服务端启动 + +以fit_a_line模型为例,服务端启动与常规BRPC-Server端启动命令一样。 + +``` +cd ../../python/examples/fit_a_line +sh get_data.sh +python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 +``` + +### 客户端预测 + +客户端目前支持BRPC +目前已经实现了BRPC的封装函数,详见[brpc_client.cpp](./src/brpc_client.cpp) + +``` +./simple_client --client_conf="uci_housing_client/serving_client_conf.prototxt" --server_port="127.0.0.1:9393" --test_type="brpc" --sample_type="fit_a_line" +``` + +更多示例详见[simple_client.cpp](./example/simple_client.cpp) + +| Argument | Type | Default | Description | +| ---------------------------------------------- | ---- | ------------------------------------ | ----------------------------------------------------- | +| `client_conf` | str | `"serving_client_conf.prototxt"` | Path of client conf | +| `server_port` | str | `"127.0.0.1:9393"` | Exposed ip:port of server | +| `test_type` | str | `"brpc"` | Mode of request "brpc" | +| `sample_type` | str | `"fit_a_line"` | Type of sample include "fit_a_line,bert" | diff --git a/core/general-client/example/simple_client.cpp b/core/general-client/example/simple_client.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..e1052c346f66569d36a4e7cddbe73ca4f70cbd9e --- /dev/null +++ b/core/general-client/example/simple_client.cpp @@ -0,0 +1,129 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "core/general-client/include/brpc_client.h" + +using baidu::paddle_serving::client::ServingClient; +using baidu::paddle_serving::client::ServingBrpcClient; +using baidu::paddle_serving::client::PredictorInputs; +using baidu::paddle_serving::client::PredictorOutputs; + +DEFINE_string(server_port, "127.0.0.1:9292", "ip:port"); +DEFINE_string(client_conf, "serving_client_conf.prototxt", "Path of client conf"); +DEFINE_string(test_type, "brpc", "brpc"); +// fit_a_line, bert +DEFINE_string(sample_type, "fit_a_line", "List: fit_a_line, bert"); + +namespace { +int prepare_fit_a_line(PredictorInputs& input, std::vector& fetch_name) { + std::vector float_feed = {0.0137f, -0.1136f, 0.2553f, -0.0692f, + 0.0582f, -0.0727f, -0.1583f, -0.0584f, + 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; + std::vector float_shape = {1, 13}; + std::string feed_name = "x"; + fetch_name = {"price"}; + std::vector lod; + input.add_float_data(float_feed, feed_name, float_shape, lod); + return 0; +} + +int prepare_bert(PredictorInputs& input, std::vector& fetch_name) { + { + std::vector float_feed(128, 0.0f); + float_feed[0] = 1.0f; + std::vector float_shape = {1, 128, 1}; + std::string feed_name = "input_mask"; + std::vector lod; + input.add_float_data(float_feed, feed_name, float_shape, lod); + } + { + std::vector feed(128, 0); + std::vector shape = {1, 128, 1}; + std::string feed_name = "position_ids"; + std::vector lod; + input.add_int64_data(feed, feed_name, shape, lod); + } + { + std::vector feed(128, 0); + feed[0] = 101; + std::vector shape = {1, 128, 1}; + std::string feed_name = "input_ids"; + std::vector lod; + input.add_int64_data(feed, feed_name, shape, lod); + } + { + std::vector feed(128, 0); + std::vector shape = {1, 128, 1}; + std::string feed_name = "segment_ids"; + std::vector lod; + input.add_int64_data(feed, feed_name, shape, lod); + } + + fetch_name = {"pooled_output"}; + return 0; +} +} // namespace + +int main(int argc, char* argv[]) { + + google::ParseCommandLineFlags(&argc, &argv, true); + std::string url = FLAGS_server_port; + std::string conf = FLAGS_client_conf; + std::string test_type = FLAGS_test_type; + std::string sample_type = FLAGS_sample_type; + LOG(INFO) << "url = " << url << ";" + << "client_conf = " << conf << ";" + << "test_type = " << test_type + << "sample_type = " << sample_type; + std::unique_ptr client; + // default type is brpc + // will add grpc&http in the future + if (test_type == "brpc") { + client.reset(new ServingBrpcClient()); + } else { + client.reset(new ServingBrpcClient()); + } + std::vector confs; + confs.push_back(conf); + if (client->init(confs, url) != 0) { + LOG(ERROR) << "Failed to init client!"; + return 0; + } + + PredictorInputs input; + 
PredictorOutputs output; + std::vector fetch_name; + + if (sample_type == "fit_a_line") { + prepare_fit_a_line(input, fetch_name); + } + else if (sample_type == "bert") { + prepare_bert(input, fetch_name); + } + else { + prepare_fit_a_line(input, fetch_name); + } + + if (client->predict(input, output, fetch_name, 0) != 0) { + LOG(ERROR) << "Failed to predict!"; + } + else { + LOG(INFO) << output.print(); + } + + return 0; +} diff --git a/core/general-client/include/brpc_client.h b/core/general-client/include/brpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..05fc23e89950f92307fa4c887be82c5023ebc368 --- /dev/null +++ b/core/general-client/include/brpc_client.h @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "core/general-client/include/client.h" +#include "core/sdk-cpp/include/predictor_sdk.h" +using baidu::paddle_serving::sdk_cpp::Predictor; +using baidu::paddle_serving::sdk_cpp::PredictorApi; + +namespace baidu { +namespace paddle_serving { +namespace client { + +class ServingBrpcClient : public ServingClient { + public: + + ServingBrpcClient() {}; + + ~ServingBrpcClient() {}; + + virtual int connect(const std::string server_port); + + int predict(const PredictorInputs& inputs, + PredictorOutputs& outputs, + const std::vector& fetch_name, + const uint64_t log_id); + + private: + // generate default SDKConf + std::string gen_desc(const std::string server_port); + + private: + PredictorApi _api; + Predictor* _predictor; +}; + +} // namespace client +} // namespace paddle_serving +} // namespace baidu \ No newline at end of file diff --git a/core/general-client/include/client.h b/core/general-client/include/client.h new file mode 100644 index 0000000000000000000000000000000000000000..11c6a2b7aa324cd09d9895f7ba1c2f8b990aad29 --- /dev/null +++ b/core/general-client/include/client.h @@ -0,0 +1,262 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include +#include +#include + +namespace baidu { +namespace paddle_serving { +namespace predictor { + namespace general_model { + class Request; + class Response; + } +} +namespace client { + +class PredictorInputs; +class PredictorOutputs; + +class ServingClient { + public: + ServingClient() {}; + + virtual ~ServingClient() = default; + + int init(const std::vector& client_conf, + const std::string server_port); + + int load_client_config(const std::vector& client_conf); + + virtual int connect(const std::string server_port) = 0; + + virtual int predict(const PredictorInputs& inputs, + PredictorOutputs& outputs, + const std::vector& fetch_name, + const uint64_t log_id) = 0; + + protected: + std::map _feed_name_to_idx; + std::vector _feed_name; + std::map _fetch_name_to_idx; + std::map _fetch_name_to_var_name; + std::map _fetch_name_to_type; + std::vector> _shape; + std::vector _type; + std::vector _last_request_ts; +}; + +class PredictorData { + public: + PredictorData() {}; + virtual ~PredictorData() {}; + + void add_float_data(const std::vector& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype = 1); + + void add_int64_data(const std::vector& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype = 0); + + void add_int32_data(const std::vector& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype = 2); + + void add_string_data(const std::string& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype = 20); + + const std::map>& float_data_map() const { + return _float_data_map; + }; + + std::map>* mutable_float_data_map() { + return &_float_data_map; + }; + + const std::map>& int64_data_map() const { + return _int64_data_map; + }; + + std::map>* mutable_int64_data_map() { + return &_int64_data_map; + }; + + const std::map>& int_data_map() const { + return _int32_data_map; + }; + + std::map>* mutable_int_data_map() { + return &_int32_data_map; + }; + + const std::map& string_data_map() const { + return _string_data_map; + }; + + std::map* mutable_string_data_map() { + return &_string_data_map; + }; + + const std::map>& shape_map() const { + return _shape_map; + }; + + std::map>* mutable_shape_map() { + return &_shape_map; + }; + + const std::map>& lod_map() const { + return _lod_map; + }; + + std::map>* mutable_lod_map() { + return &_lod_map; + }; + + int get_datatype(std::string name) const; + + void set_datatype(std::string name, int type); + + std::string print(); + + private: + // used to print vector data map e.g. _float_data_map + template + std::string map2string(const std::map>& map) { + std::ostringstream oss; + oss.str(""); + oss.precision(6); + oss.setf(std::ios::fixed); + std::string key_seg = ":"; + std::string val_seg = ","; + std::string end_seg = "\n"; + typename std::map>::const_iterator it = map.begin(); + typename std::map>::const_iterator itEnd = map.end(); + for (; it != itEnd; it++) { + oss << "{"; + oss << it->first << key_seg; + const std::vector& v = it->second; + oss << v.size() << key_seg; + for (size_t i = 0; i < v.size(); ++i) { + if (i != v.size() - 1) { + oss << v[i] << val_seg; + } + else { + oss << v[i]; + } + } + oss << "}"; + } + return oss.str(); + }; + + // used to print data map without vector e.g. 
_string_data_map + template + std::string map2string(const std::map& map) { + std::ostringstream oss; + oss.str(""); + std::string key_seg = ":"; + std::string val_seg = ","; + std::string end_seg = "\n"; + typename std::map::const_iterator it = map.begin(); + typename std::map::const_iterator itEnd = map.end(); + for (; it != itEnd; it++) { + oss << "{"; + oss << it->first << key_seg + << "size=" << it->second.size() << key_seg + << "type=" << this->get_datatype(it->first); + oss << "}"; + } + return oss.str(); + }; + + protected: + std::map> _float_data_map; + std::map> _int64_data_map; + std::map> _int32_data_map; + std::map _string_data_map; + std::map> _shape_map; + std::map> _lod_map; + std::map _datatype_map; +}; + +class PredictorInputs : public PredictorData { + public: + PredictorInputs() {}; + virtual ~PredictorInputs() {}; + + // generate proto from inputs + // feed_name_to_idx: mapping alias name to idx + // feed_name: mapping idx to name + static int GenProto(const PredictorInputs& inputs, + const std::map& feed_name_to_idx, + const std::vector& feed_name, + predictor::general_model::Request& req); +}; + +class PredictorOutputs { + public: + struct PredictorOutput { + std::string engine_name; + PredictorData data; + }; + + PredictorOutputs() {}; + virtual ~PredictorOutputs() {}; + + const std::vector>& datas() { + return _datas; + }; + + std::vector>* mutable_datas() { + return &_datas; + }; + + void add_data(const std::shared_ptr& data) { + _datas.push_back(data); + }; + + std::string print(); + + void clear(); + + // Parse proto to outputs + // fetch_name: name of data to be output + // fetch_name_to_type: mapping of fetch_name to datatype + static int ParseProto(const predictor::general_model::Response& res, + const std::vector& fetch_name, + std::map& fetch_name_to_type, + PredictorOutputs& outputs); + + protected: + std::vector> _datas; +}; + +} // namespace client +} // namespace paddle_serving +} // namespace baidu \ No newline at end of file diff --git a/core/general-client/include/general_model.h b/core/general-client/include/general_model.h index 7c80500d03b482c8bbaa0515b0484d72d518434e..788cb8fe3422c9252b5bec9faa96d6cb834bf84e 100644 --- a/core/general-client/include/general_model.h +++ b/core/general-client/include/general_model.h @@ -51,8 +51,13 @@ class ModelRes { res._float_value_map.end()); _int32_value_map.insert(res._int32_value_map.begin(), res._int32_value_map.end()); + _string_value_map.insert(res._string_value_map.begin(), + res._string_value_map.end()); _shape_map.insert(res._shape_map.begin(), res._shape_map.end()); _lod_map.insert(res._lod_map.begin(), res._lod_map.end()); + _tensor_alias_names.insert(_tensor_alias_names.end(), + res._tensor_alias_names.begin(), + res._tensor_alias_names.end()); } ModelRes(ModelRes&& res) { _engine_name = std::move(res._engine_name); @@ -65,10 +70,17 @@ class ModelRes { _int32_value_map.insert( std::make_move_iterator(std::begin(res._int32_value_map)), std::make_move_iterator(std::end(res._int32_value_map))); + _string_value_map.insert( + std::make_move_iterator(std::begin(res._string_value_map)), + std::make_move_iterator(std::end(res._string_value_map))); _shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)), std::make_move_iterator(std::end(res._shape_map))); _lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)), std::make_move_iterator(std::end(res._lod_map))); + _tensor_alias_names.insert( + _tensor_alias_names.end(), + std::make_move_iterator(std::begin(res._tensor_alias_names)), + 
std::make_move_iterator(std::end(res._tensor_alias_names))); } ~ModelRes() {} const std::vector& get_int64_by_name(const std::string& name) { @@ -89,6 +101,12 @@ class ModelRes { std::vector&& get_int32_by_name_with_rv(const std::string& name) { return std::move(_int32_value_map[name]); } + const std::string& get_string_by_name(const std::string& name) { + return _string_value_map[name]; + } + std::string&& get_string_by_name_with_rv(const std::string& name) { + return std::move(_string_value_map[name]); + } const std::vector& get_shape_by_name(const std::string& name) { return _shape_map[name]; } @@ -105,6 +123,10 @@ class ModelRes { _engine_name = engine_name; } const std::string& engine_name() { return _engine_name; } + + const std::vector& tensor_alias_names() { + return _tensor_alias_names; + } ModelRes& operator=(ModelRes&& res) { if (this != &res) { _engine_name = std::move(res._engine_name); @@ -117,10 +139,17 @@ class ModelRes { _int32_value_map.insert( std::make_move_iterator(std::begin(res._int32_value_map)), std::make_move_iterator(std::end(res._int32_value_map))); + _string_value_map.insert( + std::make_move_iterator(std::begin(res._string_value_map)), + std::make_move_iterator(std::end(res._string_value_map))); _shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)), std::make_move_iterator(std::end(res._shape_map))); _lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)), std::make_move_iterator(std::end(res._lod_map))); + _tensor_alias_names.insert( + _tensor_alias_names.end(), + std::make_move_iterator(std::begin(res._tensor_alias_names)), + std::make_move_iterator(std::end(res._tensor_alias_names))); } return *this; } @@ -130,8 +159,10 @@ class ModelRes { std::map> _int64_value_map; std::map> _float_value_map; std::map> _int32_value_map; + std::map _string_value_map; std::map> _shape_map; std::map> _lod_map; + std::vector _tensor_alias_names; }; class PredictorRes { @@ -168,6 +199,14 @@ class PredictorRes { const std::string& name) { return std::move(_models[model_idx].get_int32_by_name_with_rv(name)); } + const std::string& get_string_by_name(const int model_idx, + const std::string& name) { + return _models[model_idx].get_string_by_name(name); + } + std::string&& get_string_by_name_with_rv(const int model_idx, + const std::string& name) { + return std::move(_models[model_idx].get_string_by_name_with_rv(name)); + } const std::vector& get_shape_by_name(const int model_idx, const std::string& name) { return _models[model_idx].get_shape_by_name(name); @@ -193,11 +232,16 @@ class PredictorRes { } const std::string& variant_tag() { return _variant_tag; } const std::vector& get_engine_names() { return _engine_names; } + const std::vector& get_tensor_alias_names(const int model_idx) { + _tensor_alias_names = _models[model_idx].tensor_alias_names(); + return _tensor_alias_names; + } private: std::vector _models; std::string _variant_tag; std::vector _engine_names; + std::vector _tensor_alias_names; }; class PredictorClient { @@ -222,10 +266,14 @@ class PredictorClient { const std::vector& float_feed_name, const std::vector>& float_shape, const std::vector>& float_lod_slot_batch, - const std::vector>& int_feed, - const std::vector& int_feed_name, - const std::vector>& int_shape, - const std::vector>& int_lod_slot_batch, + const std::vector> &int32_feed, + const std::vector &int32_feed_name, + const std::vector> &int32_shape, + const std::vector> &int32_lod_slot_batch, + const std::vector> &int64_feed, + const std::vector &int64_feed_name, + const 
std::vector> &int64_shape, + const std::vector> &int64_lod_slot_batch, const std::vector& string_feed, const std::vector& string_feed_name, const std::vector>& string_shape, diff --git a/core/general-client/src/brpc_client.cpp b/core/general-client/src/brpc_client.cpp new file mode 100644 index 0000000000000000000000000000000000000000..74f99e2b6d4e096ba0739b07d87e4bde4cdcc03f --- /dev/null +++ b/core/general-client/src/brpc_client.cpp @@ -0,0 +1,200 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "core/general-client/include/brpc_client.h" +#include "core/sdk-cpp/include/common.h" +#include "core/util/include/timer.h" +#include "core/sdk-cpp/builtin_format.pb.h" +#include "core/sdk-cpp/general_model_service.pb.h" +DEFINE_bool(profile_client, false, ""); +DEFINE_bool(profile_server, false, ""); +#define BRPC_MAX_BODY_SIZE 512 * 1024 * 1024 + +namespace baidu { +namespace paddle_serving { +namespace client { + +using baidu::paddle_serving::Timer; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; + +using configure::SDKConf; +using configure::VariantConf; +using configure::Predictor; +using configure::VariantConf; + +int ServingBrpcClient::connect(const std::string server_port) { + brpc::fLU64::FLAGS_max_body_size = BRPC_MAX_BODY_SIZE; + if (_api.create(gen_desc(server_port)) != 0) { + LOG(ERROR) << "Predictor Creation Failed"; + return -1; + } + // _api.thrd_initialize(); + return 0; +} + +std::string ServingBrpcClient::gen_desc(const std::string server_port) { + // default config for brpc + SDKConf sdk_conf; + + Predictor* predictor = sdk_conf.add_predictors(); + predictor->set_name("general_model"); + predictor->set_service_name("baidu.paddle_serving.predictor.general_model.GeneralModelService"); + predictor->set_endpoint_router("WeightedRandomRender"); + predictor->mutable_weighted_random_render_conf()->set_variant_weight_list("100"); + VariantConf* predictor_var = predictor->add_variants(); + predictor_var->set_tag("default_tag_1"); + std::string cluster = "list://" + server_port; + predictor_var->mutable_naming_conf()->set_cluster(cluster); + + VariantConf* var = sdk_conf.mutable_default_variant_conf(); + var->set_tag("default"); + var->mutable_connection_conf()->set_connect_timeout_ms(2000); + var->mutable_connection_conf()->set_rpc_timeout_ms(200000); + var->mutable_connection_conf()->set_connect_retry_count(2); + var->mutable_connection_conf()->set_max_connection_per_host(100); + var->mutable_connection_conf()->set_hedge_request_timeout_ms(-1); + var->mutable_connection_conf()->set_hedge_fetch_retry_count(2); + var->mutable_connection_conf()->set_connection_type("pooled"); + var->mutable_connection_conf()->set_connect_timeout_ms(2000); + + var->mutable_naming_conf()->set_cluster_filter_strategy("Default"); + 
var->mutable_naming_conf()->set_load_balance_strategy("la"); + + var->mutable_rpc_parameter()->set_compress_type(0); + var->mutable_rpc_parameter()->set_package_size(20); + var->mutable_rpc_parameter()->set_protocol("baidu_std"); + var->mutable_rpc_parameter()->set_max_channel_per_request(3); + + return sdk_conf.SerializePartialAsString(); +} + +int ServingBrpcClient::predict(const PredictorInputs& inputs, + PredictorOutputs& outputs, + const std::vector& fetch_name, + const uint64_t log_id) { + Timer timeline; + int64_t preprocess_start = timeline.TimeStampUS(); + // thread initialize for StubTLS + _api.thrd_initialize(); + std::string variant_tag; + // predictor is bound to request with brpc::Controller + _predictor = _api.fetch_predictor("general_model", &variant_tag); + if (_predictor == NULL) { + LOG(ERROR) << "Failed fetch predictor so predict error!"; + return -1; + } + // predict_res_batch.set_variant_tag(variant_tag); + VLOG(2) << "fetch general model predictor done."; + VLOG(2) << "variant_tag:" << variant_tag; + VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size; + Request req; + req.set_log_id(log_id); + for (auto &name : fetch_name) { + req.add_fetch_var_names(name); + } + + if (PredictorInputs::GenProto(inputs, _feed_name_to_idx, _feed_name, req) != 0) { + LOG(ERROR) << "Failed to preprocess req!"; + return -1; + } + + int64_t preprocess_end = timeline.TimeStampUS(); + int64_t client_infer_start = timeline.TimeStampUS(); + Response res; + + int64_t client_infer_end = 0; + int64_t postprocess_start = 0; + int64_t postprocess_end = 0; + + if (FLAGS_profile_server) { + req.set_profile_server(true); + } + + res.Clear(); + if (_predictor->inference(&req, &res) != 0) { + LOG(ERROR) << "failed call predictor with req: " << req.ShortDebugString(); + return -1; + } + + client_infer_end = timeline.TimeStampUS(); + postprocess_start = client_infer_end; + if (PredictorOutputs::ParseProto(res, fetch_name, _fetch_name_to_type, outputs) != 0) { + LOG(ERROR) << "Failed to post_process res!"; + return -1; + } + postprocess_end = timeline.TimeStampUS(); + + if (FLAGS_profile_client) { + std::ostringstream oss; + oss << "PROFILE\t" + << "pid:" << getpid() << "\t" + << "prepro_0:" << preprocess_start << " " + << "prepro_1:" << preprocess_end << " " + << "client_infer_0:" << client_infer_start << " " + << "client_infer_1:" << client_infer_end << " "; + if (FLAGS_profile_server) { + int op_num = res.profile_time_size() / 2; + for (int i = 0; i < op_num; ++i) { + oss << "op" << i << "_0:" << res.profile_time(i * 2) << " "; + oss << "op" << i << "_1:" << res.profile_time(i * 2 + 1) << " "; + } + } + + oss << "postpro_0:" << postprocess_start << " "; + oss << "postpro_1:" << postprocess_end; + + fprintf(stderr, "%s\n", oss.str().c_str()); + } + + // release predictor + _api.thrd_clear(); + + std::ostringstream oss; + oss << "[client]" + << "logid=" << log_id <<","; + if (FLAGS_profile_client) { + double pre_cost = (preprocess_end - preprocess_start) / 1000.0; + double infer_cost = (client_infer_end - client_infer_start) / 1000.0; + double post_cost = (postprocess_end - postprocess_start) / 1000.0; + oss << "client_pre_cost=" << pre_cost << "ms," + << "client_infer_cost=" << infer_cost << "ms," + << "client_post_cost=" << post_cost << "ms,"; + } + double client_cost = (postprocess_end - preprocess_start) / 1000.0; + oss << "client_cost=" << client_cost << "ms,"; + + int op_num = res.profile_time_size() / 2; + if (FLAGS_profile_server) { + for (int i = 0; i < op_num - 1; ++i) { + double 
t = (res.profile_time(i * 2 + 1) + - res.profile_time(i * 2)) / 1000.0; + oss << "op" << i << "=" << t << "ms,"; + } + } + if (op_num > 0) { + int i = op_num - 1; + double server_cost = (res.profile_time(i * 2 + 1) + - res.profile_time(i * 2)) / 1000.0; + oss << "server_cost=" << server_cost << "ms."; + } + LOG(INFO) << oss.str(); + + return 0; +} + +} // namespace general_model +} // namespace paddle_serving +} // namespace baidu diff --git a/core/general-client/src/client.cpp b/core/general-client/src/client.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cc55dd30a5649afac98810fb83f98a837932a523 --- /dev/null +++ b/core/general-client/src/client.cpp @@ -0,0 +1,449 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "core/general-client/include/client.h" +#include "core/sdk-cpp/include/common.h" +#include "core/sdk-cpp/general_model_service.pb.h" + +namespace baidu { +namespace paddle_serving { +namespace client { +using configure::GeneralModelConfig; +using baidu::paddle_serving::predictor::general_model::Request; +using baidu::paddle_serving::predictor::general_model::Response; +using baidu::paddle_serving::predictor::general_model::Tensor; +// support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16 +enum ProtoDataType { + P_INT64 = 0, + P_FLOAT32, + P_INT32, + P_FP64, + P_INT16, + P_FP16, + P_BF16, + P_UINT8, + P_INT8, + P_BOOL, + P_COMPLEX64, + P_COMPLEX128, + P_STRING = 20, +}; + +int ServingClient::init(const std::vector& client_conf, + const std::string server_port) { + if (load_client_config(client_conf) != 0) { + LOG(ERROR) << "Failed to load client config"; + return -1; + } + + // pure virtual func, subclass implementation + if (connect(server_port) != 0) { + LOG(ERROR) << "Failed to connect"; + return -1; + } + + return 0; +} + +int ServingClient::load_client_config(const std::vector &conf_file) { + try { + GeneralModelConfig model_config; + if (configure::read_proto_conf(conf_file[0].c_str(), &model_config) != 0) { + LOG(ERROR) << "Failed to load general model config" + << ", file path: " << conf_file[0]; + return -1; + } + + _feed_name_to_idx.clear(); + _fetch_name_to_idx.clear(); + _shape.clear(); + int feed_var_num = model_config.feed_var_size(); + _feed_name.clear(); + VLOG(2) << "feed var num: " << feed_var_num; + for (int i = 0; i < feed_var_num; ++i) { + _feed_name_to_idx[model_config.feed_var(i).alias_name()] = i; + VLOG(2) << "feed [" << i << "]" + << " name: " << model_config.feed_var(i).name(); + _feed_name.push_back(model_config.feed_var(i).name()); + VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name() + << " index: " << i; + std::vector tmp_feed_shape; + VLOG(2) << "feed" + << "[" << i << "] shape:"; + for (int j = 0; j < model_config.feed_var(i).shape_size(); ++j) { + tmp_feed_shape.push_back(model_config.feed_var(i).shape(j)); + VLOG(2) << "shape[" << j << "]: " << model_config.feed_var(i).shape(j); + } + 
_type.push_back(model_config.feed_var(i).feed_type()); + VLOG(2) << "feed" + << "[" << i + << "] feed type: " << model_config.feed_var(i).feed_type(); + _shape.push_back(tmp_feed_shape); + } + + if (conf_file.size() > 1) { + model_config.Clear(); + if (configure::read_proto_conf(conf_file[conf_file.size() - 1].c_str(), + &model_config) != 0) { + LOG(ERROR) << "Failed to load general model config" + << ", file path: " << conf_file[conf_file.size() - 1]; + return -1; + } + } + int fetch_var_num = model_config.fetch_var_size(); + VLOG(2) << "fetch_var_num: " << fetch_var_num; + for (int i = 0; i < fetch_var_num; ++i) { + _fetch_name_to_idx[model_config.fetch_var(i).alias_name()] = i; + VLOG(2) << "fetch [" << i << "]" + << " alias name: " << model_config.fetch_var(i).alias_name(); + _fetch_name_to_var_name[model_config.fetch_var(i).alias_name()] = + model_config.fetch_var(i).name(); + _fetch_name_to_type[model_config.fetch_var(i).alias_name()] = + model_config.fetch_var(i).fetch_type(); + } + } catch (std::exception &e) { + LOG(ERROR) << "Failed load general model config" << e.what(); + return -1; + } + return 0; +} + +void PredictorData::add_float_data(const std::vector& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype) { + _float_data_map[name] = data; + _shape_map[name] = shape; + _lod_map[name] = lod; + _datatype_map[name] = datatype; +} + +void PredictorData::add_int64_data(const std::vector& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype) { + _int64_data_map[name] = data; + _shape_map[name] = shape; + _lod_map[name] = lod; + _datatype_map[name] = datatype; +} + +void PredictorData::add_int32_data(const std::vector& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype) { + _int32_data_map[name] = data; + _shape_map[name] = shape; + _lod_map[name] = lod; + _datatype_map[name] = datatype; +} + +void PredictorData::add_string_data(const std::string& data, + const std::string& name, + const std::vector& shape, + const std::vector& lod, + const int datatype) { + _string_data_map[name] = data; + _shape_map[name] = shape; + _lod_map[name] = lod; + _datatype_map[name] = datatype; +} + +int PredictorData::get_datatype(std::string name) const { + std::map::const_iterator it = _datatype_map.find(name); + if (it != _datatype_map.end()) { + return it->second; + } + return 0; +} + +void PredictorData::set_datatype(std::string name, int type) { + _datatype_map[name] = type; +} + +std::string PredictorData::print() { + std::string res; + res.append(map2string(_float_data_map)); + res.append(map2string(_int64_data_map)); + res.append(map2string(_int32_data_map)); + res.append(map2string(_string_data_map)); + return res; +} + +int PredictorInputs::GenProto(const PredictorInputs& inputs, + const std::map& feed_name_to_idx, + const std::vector& feed_name, + Request& req) { + const std::map>& float_feed_map = inputs.float_data_map(); + const std::map>& int64_feed_map = inputs.int64_data_map(); + const std::map>& int32_feed_map = inputs.int_data_map(); + const std::map& string_feed_map = inputs.string_data_map(); + const std::map>& shape_map = inputs.shape_map(); + const std::map>& lod_map = inputs.lod_map(); + + VLOG(2) << "float feed name size: " << float_feed_map.size(); + VLOG(2) << "int feed name size: " << int64_feed_map.size(); + VLOG(2) << "string feed name size: " << string_feed_map.size(); + + // batch is already in Tensor. 
+ + for (std::map>::const_iterator iter = float_feed_map.begin(); + iter != float_feed_map.end(); + ++iter) { + std::string name = iter->first; + const std::vector& float_data = iter->second; + const std::vector& float_shape = shape_map.at(name); + const std::vector& float_lod = lod_map.at(name); + // default datatype = P_FLOAT32 + int datatype = inputs.get_datatype(name); + std::map::const_iterator feed_name_it = feed_name_to_idx.find(name); + if (feed_name_it == feed_name_to_idx.end()) { + LOG(ERROR) << "Do not find [" << name << "] in feed_map!"; + return -1; + } + int idx = feed_name_to_idx.at(name); + VLOG(2) << "prepare float feed " << name << " idx " << idx; + int total_number = float_data.size(); + Tensor *tensor = req.add_tensor(); + + VLOG(2) << "prepare float feed " << name << " shape size " + << float_shape.size(); + for (uint32_t j = 0; j < float_shape.size(); ++j) { + tensor->add_shape(float_shape[j]); + } + for (uint32_t j = 0; j < float_lod.size(); ++j) { + tensor->add_lod(float_lod[j]); + } + tensor->set_elem_type(datatype); + + tensor->set_name(feed_name[idx]); + tensor->set_alias_name(name); + + tensor->mutable_float_data()->Resize(total_number, 0); + memcpy(tensor->mutable_float_data()->mutable_data(), float_data.data(), total_number * sizeof(float)); + } + + for (std::map>::const_iterator iter = int64_feed_map.begin(); + iter != int64_feed_map.end(); + ++iter) { + std::string name = iter->first; + const std::vector& int64_data = iter->second; + const std::vector& int64_shape = shape_map.at(name); + const std::vector& int64_lod = lod_map.at(name); + // default datatype = P_INT64 + int datatype = inputs.get_datatype(name); + std::map::const_iterator feed_name_it = feed_name_to_idx.find(name); + if (feed_name_it == feed_name_to_idx.end()) { + LOG(ERROR) << "Do not find [" << name << "] in feed_map!"; + return -1; + } + int idx = feed_name_to_idx.at(name); + Tensor *tensor = req.add_tensor(); + int total_number = int64_data.size(); + + for (uint32_t j = 0; j < int64_shape.size(); ++j) { + tensor->add_shape(int64_shape[j]); + } + for (uint32_t j = 0; j < int64_lod.size(); ++j) { + tensor->add_lod(int64_lod[j]); + } + tensor->set_elem_type(datatype); + tensor->set_name(feed_name[idx]); + tensor->set_alias_name(name); + + tensor->mutable_int64_data()->Resize(total_number, 0); + memcpy(tensor->mutable_int64_data()->mutable_data(), int64_data.data(), total_number * sizeof(int64_t)); + } + + for (std::map>::const_iterator iter = int32_feed_map.begin(); + iter != int32_feed_map.end(); + ++iter) { + std::string name = iter->first; + const std::vector& int32_data = iter->second; + const std::vector& int32_shape = shape_map.at(name); + const std::vector& int32_lod = lod_map.at(name); + // default datatype = P_INT32 + int datatype = inputs.get_datatype(name); + std::map::const_iterator feed_name_it = feed_name_to_idx.find(name); + if (feed_name_it == feed_name_to_idx.end()) { + LOG(ERROR) << "Do not find [" << name << "] in feed_map!"; + return -1; + } + int idx = feed_name_to_idx.at(name); + Tensor *tensor = req.add_tensor(); + int total_number = int32_data.size(); + + for (uint32_t j = 0; j < int32_shape.size(); ++j) { + tensor->add_shape(int32_shape[j]); + } + for (uint32_t j = 0; j < int32_lod.size(); ++j) { + tensor->add_lod(int32_lod[j]); + } + tensor->set_elem_type(datatype); + tensor->set_name(feed_name[idx]); + tensor->set_alias_name(name); + + tensor->mutable_int_data()->Resize(total_number, 0); + memcpy(tensor->mutable_int_data()->mutable_data(), int32_data.data(), 
total_number * sizeof(int32_t)); + } + + for (std::map::const_iterator iter = string_feed_map.begin(); + iter != string_feed_map.end(); + ++iter) { + std::string name = iter->first; + const std::string& string_data = iter->second; + const std::vector& string_shape = shape_map.at(name); + const std::vector& string_lod = lod_map.at(name); + // default datatype = P_STRING + int datatype = inputs.get_datatype(name); + std::map::const_iterator feed_name_it = feed_name_to_idx.find(name); + if (feed_name_it == feed_name_to_idx.end()) { + LOG(ERROR) << "Do not find [" << name << "] in feed_map!"; + return -1; + } + int idx = feed_name_to_idx.at(name); + Tensor *tensor = req.add_tensor(); + + for (uint32_t j = 0; j < string_shape.size(); ++j) { + tensor->add_shape(string_shape[j]); + } + for (uint32_t j = 0; j < string_lod.size(); ++j) { + tensor->add_lod(string_lod[j]); + } + tensor->set_elem_type(datatype); + tensor->set_name(feed_name[idx]); + tensor->set_alias_name(name); + + if (datatype == P_STRING) { + const int string_shape_size = string_shape.size(); + // string_shape[vec_idx] = [1];cause numpy has no datatype of string. + // we pass string via vector >. + if (string_shape_size != 1) { + LOG(ERROR) << "string_shape_size should be 1-D, but received is : " + << string_shape_size; + return -1; + } + switch (string_shape_size) { + case 1: { + tensor->add_data(string_data); + break; + } + } + } else { + tensor->set_tensor_content(string_data); + } + + } + return 0; +} + +std::string PredictorOutputs::print() { + std::string res = ""; + for (size_t i = 0; i < _datas.size(); ++i) { + res.append(_datas[i]->engine_name); + res.append(":"); + res.append(_datas[i]->data.print()); + res.append("\n"); + } + return res; +} + +void PredictorOutputs::clear() { + _datas.clear(); +} + +int PredictorOutputs::ParseProto(const Response& res, + const std::vector& fetch_name, + std::map& fetch_name_to_type, + PredictorOutputs& outputs) { + VLOG(2) << "get model output num"; + uint32_t model_num = res.outputs_size(); + VLOG(2) << "model num: " << model_num; + for (uint32_t m_idx = 0; m_idx < model_num; ++m_idx) { + VLOG(2) << "process model output index: " << m_idx; + auto& output = res.outputs(m_idx); + std::shared_ptr predictor_output = + std::make_shared(); + predictor_output->engine_name = output.engine_name(); + + PredictorData& predictor_data = predictor_output->data; + std::map>& float_data_map = *predictor_output->data.mutable_float_data_map(); + std::map>& int64_data_map = *predictor_output->data.mutable_int64_data_map(); + std::map>& int32_data_map = *predictor_output->data.mutable_int_data_map(); + std::map& string_data_map = *predictor_output->data.mutable_string_data_map(); + std::map>& shape_map = *predictor_output->data.mutable_shape_map(); + std::map>& lod_map = *predictor_output->data.mutable_lod_map(); + + int idx = 0; + for (auto &name : fetch_name) { + // int idx = _fetch_name_to_idx[name]; + int shape_size = output.tensor(idx).shape_size(); + VLOG(2) << "fetch var " << name << " index " << idx << " shape size " + << shape_size; + shape_map[name].resize(shape_size); + for (int i = 0; i < shape_size; ++i) { + shape_map[name][i] = output.tensor(idx).shape(i); + } + int lod_size = output.tensor(idx).lod_size(); + if (lod_size > 0) { + lod_map[name].resize(lod_size); + for (int i = 0; i < lod_size; ++i) { + lod_map[name][i] = output.tensor(idx).lod(i); + } + } + idx += 1; + } + idx = 0; + + for (auto &name : fetch_name) { + // int idx = _fetch_name_to_idx[name]; + if (fetch_name_to_type[name] == 
P_INT64) { + VLOG(2) << "fetch var " << name << "type int64"; + int size = output.tensor(idx).int64_data_size(); + int64_data_map[name] = std::vector( + output.tensor(idx).int64_data().begin(), + output.tensor(idx).int64_data().begin() + size); + } else if (fetch_name_to_type[name] == P_FLOAT32) { + VLOG(2) << "fetch var " << name << "type float"; + int size = output.tensor(idx).float_data_size(); + float_data_map[name] = std::vector( + output.tensor(idx).float_data().begin(), + output.tensor(idx).float_data().begin() + size); + } else if (fetch_name_to_type[name] == P_INT32) { + VLOG(2) << "fetch var " << name << "type int32"; + int size = output.tensor(idx).int_data_size(); + int32_data_map[name] = std::vector( + output.tensor(idx).int_data().begin(), + output.tensor(idx).int_data().begin() + size); + } else if (fetch_name_to_type[name] == P_UINT8 + || fetch_name_to_type[name] == P_INT8 + || fetch_name_to_type[name] == P_FP16) { + VLOG(2) << "fetch var [" << name << "]type=" + << fetch_name_to_type[name]; + string_data_map[name] = output.tensor(idx).tensor_content(); + } + predictor_data.set_datatype(name, output.tensor(idx).elem_type()); + idx += 1; + } + outputs.add_data(predictor_output); + } + return 0; +} + +} // namespace client +} // namespace paddle_serving +} // namespace baidu diff --git a/core/general-client/src/general_model.cpp b/core/general-client/src/general_model.cpp index cf85048b0f4a43659801f58df963a8597e0c2aba..403119594c759a35d5dfd6251174627f367d9c65 100644 --- a/core/general-client/src/general_model.cpp +++ b/core/general-client/src/general_model.cpp @@ -25,7 +25,22 @@ using baidu::paddle_serving::Timer; using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Tensor; -enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; +// support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16 +enum ProtoDataType { + P_INT64 = 0, + P_FLOAT32, + P_INT32, + P_FP64, + P_INT16, + P_FP16, + P_BF16, + P_UINT8, + P_INT8, + P_BOOL, + P_COMPLEX64, + P_COMPLEX128, + P_STRING = 20, +}; std::once_flag gflags_init_flag; namespace py = pybind11; @@ -152,10 +167,14 @@ int PredictorClient::numpy_predict( const std::vector &float_feed_name, const std::vector> &float_shape, const std::vector> &float_lod_slot_batch, - const std::vector> &int_feed, - const std::vector &int_feed_name, - const std::vector> &int_shape, - const std::vector> &int_lod_slot_batch, + const std::vector> &int32_feed, + const std::vector &int32_feed_name, + const std::vector> &int32_shape, + const std::vector> &int32_lod_slot_batch, + const std::vector> &int64_feed, + const std::vector &int64_feed_name, + const std::vector> &int64_shape, + const std::vector> &int64_lod_slot_batch, const std::vector &string_feed, const std::vector &string_feed_name, const std::vector> &string_shape, @@ -168,15 +187,14 @@ int PredictorClient::numpy_predict( Timer timeline; int64_t preprocess_start = timeline.TimeStampUS(); - int fetch_name_num = fetch_name.size(); - _api.thrd_initialize(); std::string variant_tag; _predictor = _api.fetch_predictor("general_model", &variant_tag); predict_res_batch.set_variant_tag(variant_tag); VLOG(2) << "fetch general model predictor done."; VLOG(2) << "float feed name size: " << float_feed_name.size(); - VLOG(2) << "int feed name size: " << int_feed_name.size(); + VLOG(2) << "int feed name size: " << int32_feed_name.size(); + VLOG(2) << "int feed name size: " << 
int64_feed_name.size(); VLOG(2) << "string feed name size: " << string_feed_name.size(); VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size; Request req; @@ -193,7 +211,11 @@ int PredictorClient::numpy_predict( tensor_vec.push_back(req.add_tensor()); } - for (auto &name : int_feed_name) { + for (auto &name : int32_feed_name) { + tensor_vec.push_back(req.add_tensor()); + } + + for (auto &name : int64_feed_name) { tensor_vec.push_back(req.add_tensor()); } @@ -233,37 +255,63 @@ int PredictorClient::numpy_predict( } vec_idx = 0; - for (auto &name : int_feed_name) { + for (auto &name : int32_feed_name) { int idx = _feed_name_to_idx[name]; if (idx >= tensor_vec.size()) { LOG(ERROR) << "idx > tensor_vec.size()"; return -1; } Tensor *tensor = tensor_vec[idx]; - int nbytes = int_feed[vec_idx].nbytes(); - void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0)); - int total_number = int_feed[vec_idx].size(); + int nbytes = int32_feed[vec_idx].nbytes(); + void *rawdata_ptr = (void *)(int32_feed[vec_idx].data(0)); + int total_number = int32_feed[vec_idx].size(); - for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) { - tensor->add_shape(int_shape[vec_idx][j]); + for (uint32_t j = 0; j < int32_shape[vec_idx].size(); ++j) { + tensor->add_shape(int32_shape[vec_idx][j]); } - for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) { - tensor->add_lod(int_lod_slot_batch[vec_idx][j]); + for (uint32_t j = 0; j < int32_lod_slot_batch[vec_idx].size(); ++j) { + tensor->add_lod(int32_lod_slot_batch[vec_idx][j]); } tensor->set_elem_type(_type[idx]); tensor->set_name(_feed_name[idx]); tensor->set_alias_name(name); - if (_type[idx] == P_INT64) { - tensor->mutable_int64_data()->Resize(total_number, 0); - memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes); - } else { - tensor->mutable_int_data()->Resize(total_number, 0); - memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes); + tensor->mutable_int_data()->Resize(total_number, 0); + memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes); + vec_idx++; + } + + + // Individual INT_64 feed data of int_input to tensor_content + vec_idx = 0; + for (auto &name : int64_feed_name) { + int idx = _feed_name_to_idx[name]; + if (idx >= tensor_vec.size()) { + LOG(ERROR) << "idx > tensor_vec.size()"; + return -1; + } + Tensor *tensor = tensor_vec[idx]; + int nbytes = int64_feed[vec_idx].nbytes(); + void *rawdata_ptr = (void *)(int64_feed[vec_idx].data(0)); + int total_number = int64_feed[vec_idx].size(); + + for (uint32_t j = 0; j < int64_shape[vec_idx].size(); ++j) { + tensor->add_shape(int64_shape[vec_idx][j]); + } + for (uint32_t j = 0; j < int64_lod_slot_batch[vec_idx].size(); ++j) { + tensor->add_lod(int64_lod_slot_batch[vec_idx][j]); } + tensor->set_elem_type(_type[idx]); + tensor->set_name(_feed_name[idx]); + tensor->set_alias_name(name); + + tensor->mutable_int64_data()->Resize(total_number, 0); + memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes); vec_idx++; } + // Add !P_STRING feed data of string_input to tensor_content + // UINT8 INT8 FLOAT16 vec_idx = 0; for (auto &name : string_feed_name) { int idx = _feed_name_to_idx[name]; @@ -279,22 +327,27 @@ int PredictorClient::numpy_predict( for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) { tensor->add_lod(string_lod_slot_batch[vec_idx][j]); } - tensor->set_elem_type(P_STRING); tensor->set_name(_feed_name[idx]); tensor->set_alias_name(name); - const int string_shape_size = 
string_shape[vec_idx].size(); - // string_shape[vec_idx] = [1];cause numpy has no datatype of string. - // we pass string via vector >. - if (string_shape_size != 1) { - LOG(ERROR) << "string_shape_size should be 1-D, but received is : " - << string_shape_size; - return -1; - } - switch (string_shape_size) { - case 1: { - tensor->add_data(string_feed[vec_idx]); - break; + if (_type[idx] != P_STRING) { + tensor->set_elem_type(_type[idx]); + tensor->set_tensor_content(string_feed[vec_idx]); + } else { + tensor->set_elem_type(P_STRING); + const int string_shape_size = string_shape[vec_idx].size(); + // string_shape[vec_idx] = [1];cause numpy has no datatype of string. + // we pass string via vector >. + if (string_shape_size != 1) { + LOG(ERROR) << "string_shape_size should be 1-D, but received is : " + << string_shape_size; + return -1; + } + switch (string_shape_size) { + case 1: { + tensor->add_data(string_feed[vec_idx]); + break; + } } } vec_idx++; @@ -308,10 +361,8 @@ int PredictorClient::numpy_predict( int64_t postprocess_start = 0; int64_t postprocess_end = 0; - if (FLAGS_profile_client) { - if (FLAGS_profile_server) { - req.set_profile_server(true); - } + if (FLAGS_profile_server) { + req.set_profile_server(true); } res.Clear(); @@ -329,10 +380,12 @@ int PredictorClient::numpy_predict( auto output = res.outputs(m_idx); ModelRes model; model.set_engine_name(output.engine_name()); - - int idx = 0; - for (auto &name : fetch_name) { + // 在ResponseOp处,已经按照fetch_name对输出数据进行了处理 + // 所以,输出的数据与fetch_name是严格对应的,按顺序处理即可。 + for (int idx = 0; idx < output.tensor_size(); ++idx) { // int idx = _fetch_name_to_idx[name]; + const std::string name = output.tensor(idx).alias_name(); + model._tensor_alias_names.push_back(name); int shape_size = output.tensor(idx).shape_size(); VLOG(2) << "fetch var " << name << " index " << idx << " shape size " << shape_size; @@ -347,13 +400,7 @@ int PredictorClient::numpy_predict( model._lod_map[name][i] = output.tensor(idx).lod(i); } } - idx += 1; - } - - idx = 0; - for (auto &name : fetch_name) { - // int idx = _fetch_name_to_idx[name]; if (_fetch_name_to_type[name] == P_INT64) { VLOG(2) << "ferch var " << name << "type int64"; int size = output.tensor(idx).int64_data_size(); @@ -372,8 +419,16 @@ int PredictorClient::numpy_predict( model._int32_value_map[name] = std::vector( output.tensor(idx).int_data().begin(), output.tensor(idx).int_data().begin() + size); + } else if (_fetch_name_to_type[name] == P_UINT8) { + VLOG(2) << "fetch var " << name << "type uint8"; + model._string_value_map[name] = output.tensor(idx).tensor_content(); + } else if (_fetch_name_to_type[name] == P_INT8) { + VLOG(2) << "fetch var " << name << "type int8"; + model._string_value_map[name] = output.tensor(idx).tensor_content(); + } else if (_fetch_name_to_type[name] == P_FP16) { + VLOG(2) << "fetch var " << name << "type float16"; + model._string_value_map[name] = output.tensor(idx).tensor_content(); } - idx += 1; } predict_res_batch.add_model_res(std::move(model)); } @@ -403,6 +458,36 @@ int PredictorClient::numpy_predict( } _api.thrd_clear(); + + std::ostringstream oss; + oss << "[client]" + << "logid=" << log_id <<","; + if (FLAGS_profile_client) { + double pre_cost = (preprocess_end - preprocess_start) / 1000.0; + double infer_cost = (client_infer_end - client_infer_start) / 1000.0; + double post_cost = (postprocess_end - postprocess_start) / 1000.0; + oss << "client_pre_cost=" << pre_cost << "ms," + << "client_infer_cost=" << infer_cost << "ms," + << "client_post_cost=" << post_cost << 
"ms,"; + } + double client_cost = (postprocess_end - preprocess_start) / 1000.0; + oss << "client_cost=" << client_cost << "ms,"; + + int op_num = res.profile_time_size() / 2; + if (FLAGS_profile_server) { + for (int i = 0; i < op_num - 1; ++i) { + double t = (res.profile_time(i * 2 + 1) + - res.profile_time(i * 2)) / 1000.0; + oss << "op" << i << "=" << t << "ms,"; + } + } + if (op_num > 0) { + int i = op_num - 1; + double server_cost = (res.profile_time(i * 2 + 1) + - res.profile_time(i * 2)) / 1000.0; + oss << "server_cost=" << server_cost << "ms."; + } + LOG(INFO) << oss.str(); return 0; } } // namespace general_model diff --git a/core/general-client/src/pybind_general_model.cpp b/core/general-client/src/pybind_general_model.cpp index d5c95d1af55e962db40e347823c5c491216851bb..b9a338df57d1db988448fc2b6e52c48d43f36dd7 100644 --- a/core/general-client/src/pybind_general_model.cpp +++ b/core/general-client/src/pybind_general_model.cpp @@ -49,6 +49,19 @@ PYBIND11_MODULE(serving_client, m) { }); return py::array(ptr->size(), ptr->data(), capsule); }) + .def("get_int32_by_name", + [](PredictorRes &self, int model_idx, std::string &name) { + std::vector *ptr = new std::vector( + std::move(self.get_int32_by_name_with_rv(model_idx, name))); + auto capsule = py::capsule(ptr, [](void *p) { + delete reinterpret_cast *>(p); + }); + return py::array(ptr->size(), ptr->data(), capsule); + }) + .def("get_string_by_name", + [](PredictorRes &self, int model_idx, std::string &name) { + return self.get_string_by_name_with_rv(model_idx, name); + }) .def("get_shape", [](PredictorRes &self, int model_idx, std::string &name) { std::vector *ptr = new std::vector( @@ -69,7 +82,10 @@ PYBIND11_MODULE(serving_client, m) { }) .def("variant_tag", [](PredictorRes &self) { return self.variant_tag(); }) .def("get_engine_names", - [](PredictorRes &self) { return self.get_engine_names(); }); + [](PredictorRes &self) { return self.get_engine_names(); }) + .def("get_tensor_alias_names", [](PredictorRes &self, int model_idx) { + return self.get_tensor_alias_names(model_idx); + }); py::class_(m, "PredictorClient", py::buffer_protocol()) .def(py::init()) @@ -101,10 +117,14 @@ PYBIND11_MODULE(serving_client, m) { const std::vector &float_feed_name, const std::vector> &float_shape, const std::vector> &float_lod_slot_batch, - const std::vector> &int_feed, - const std::vector &int_feed_name, - const std::vector> &int_shape, - const std::vector> &int_lod_slot_batch, + const std::vector> &int32_feed, + const std::vector &int32_feed_name, + const std::vector> &int32_shape, + const std::vector> &int32_lod_slot_batch, + const std::vector> &int64_feed, + const std::vector &int64_feed_name, + const std::vector> &int64_shape, + const std::vector> &int64_lod_slot_batch, const std::vector &string_feed, const std::vector &string_feed_name, const std::vector> &string_shape, @@ -117,10 +137,14 @@ PYBIND11_MODULE(serving_client, m) { float_feed_name, float_shape, float_lod_slot_batch, - int_feed, - int_feed_name, - int_shape, - int_lod_slot_batch, + int32_feed, + int32_feed_name, + int32_shape, + int32_lod_slot_batch, + int64_feed, + int64_feed_name, + int64_shape, + int64_lod_slot_batch, string_feed, string_feed_name, string_shape, diff --git a/core/general-server/op/general_detection_op.cpp b/core/general-server/op/general_detection_op.cpp index 46f5ddf1b508681661b69c60a25b6d7d000e6d4e..b62a2d2544e12d493033cf1bb8e6606d72f614d3 100644 --- a/core/general-server/op/general_detection_op.cpp +++ b/core/general-server/op/general_detection_op.cpp @@ 
-191,42 +191,64 @@ int GeneralDetectionOp::inference() { boxes = post_processor_.FilterTagDetRes(boxes, ratio_h, ratio_w, srcimg); - for (int i = boxes.size() - 1; i >= 0; i--) { - crop_img = GetRotateCropImage(img, boxes[i]); - - float wh_ratio = float(crop_img.cols) / float(crop_img.rows); + float max_wh_ratio = 0.0f; + std::vector crop_imgs; + std::vector resize_imgs; + int max_resize_w = 0; + int max_resize_h = 0; + int box_num = boxes.size(); + std::vector> output_rec; + for (int i = 0; i < box_num; ++i) { + cv::Mat line_img = GetRotateCropImage(img, boxes[i]); + float wh_ratio = float(line_img.cols) / float(line_img.rows); + max_wh_ratio = max_wh_ratio > wh_ratio ? max_wh_ratio : wh_ratio; + crop_imgs.push_back(line_img); + } + for (int i = 0; i < box_num; ++i) { + cv::Mat resize_img; + crop_img = crop_imgs[i]; this->resize_op_rec.Run( - crop_img, resize_img_rec, wh_ratio, this->use_tensorrt_); + crop_img, resize_img, max_wh_ratio, this->use_tensorrt_); this->normalize_op_.Run( - &resize_img_rec, this->mean_rec, this->scale_rec, this->is_scale_); - - std::vector output_rec( - 1 * 3 * resize_img_rec.rows * resize_img_rec.cols, 0.0f); - - this->permute_op_.Run(&resize_img_rec, output_rec.data()); - - // Inference. - output_shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols}; - out_num = std::accumulate( - output_shape.begin(), output_shape.end(), 1, std::multiplies()); - databuf_size_out = out_num * sizeof(float); - databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out); - if (!databuf_data_out) { - LOG(ERROR) << "Malloc failed, size: " << databuf_size_out; - return -1; - } - memcpy(databuf_data_out, output_rec.data(), databuf_size_out); - databuf_char_out = reinterpret_cast(databuf_data_out); - paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out); - paddle::PaddleTensor tensor_out; - tensor_out.name = "image"; - tensor_out.dtype = paddle::PaddleDType::FLOAT32; - tensor_out.shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols}; - tensor_out.data = paddleBuf; - out->push_back(tensor_out); + &resize_img, this->mean_rec, this->scale_rec, this->is_scale_); + + max_resize_w = std::max(max_resize_w, resize_img.cols); + max_resize_h = std::max(max_resize_h, resize_img.rows); + resize_imgs.push_back(resize_img); + } + int buf_size = 3 * max_resize_h * max_resize_w; + output_rec = std::vector>(box_num, + std::vector(buf_size, 0.0f)); + for (int i = 0; i < box_num; ++i) { + resize_img_rec = resize_imgs[i]; + + this->permute_op_.Run(&resize_img_rec, output_rec[i].data()); + } + + // Inference. 
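// Layout sketch for the batched recognition input assembled below: it is one
// contiguous NCHW float buffer of shape {box_num, 3, max_resize_h, max_resize_w},
// so element (n, c, h, w) sits at flat index
//   ((n * 3 + c) * max_resize_h + h) * max_resize_w + w
// and each cropped box occupies buf_size = 3 * max_resize_h * max_resize_w floats,
// which is why the copy loop advances by buf_size * sizeof(float) bytes per box.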
+ output_shape = {box_num, 3, max_resize_h, max_resize_w}; + out_num = std::accumulate( + output_shape.begin(), output_shape.end(), 1, std::multiplies()); + databuf_size_out = out_num * sizeof(float); + databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out); + if (!databuf_data_out) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size_out; + return -1; + } + int offset = buf_size * sizeof(float); + for (int i = 0; i < box_num; ++i) { + memcpy(databuf_data_out + i * offset, output_rec[i].data(), offset); } + databuf_char_out = reinterpret_cast(databuf_data_out); + paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out); + paddle::PaddleTensor tensor_out; + tensor_out.name = "image"; + tensor_out.dtype = paddle::PaddleDType::FLOAT32; + tensor_out.shape = output_shape; + tensor_out.data = paddleBuf; + out->push_back(tensor_out); } out->erase(out->begin(), out->begin() + infer_outnum); diff --git a/core/general-server/op/general_detection_op.h b/core/general-server/op/general_detection_op.h index 272ed5ff40575d42ac3058ad1824285925fc252c..2cc027f5ed761f2d040c0c1858e81cb70a93fcb0 100644 --- a/core/general-server/op/general_detection_op.h +++ b/core/general-server/op/general_detection_op.h @@ -63,7 +63,7 @@ class GeneralDetectionOp double det_db_thresh_ = 0.3; double det_db_box_thresh_ = 0.5; - double det_db_unclip_ratio_ = 2.0; + double det_db_unclip_ratio_ = 1.5; std::vector mean_det = {0.485f, 0.456f, 0.406f}; std::vector scale_det = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f}; diff --git a/core/general-server/op/general_dist_kv_infer_op.cpp b/core/general-server/op/general_dist_kv_infer_op.cpp index 2228ccb952b1a91a5e34f990ae4c186570b91f5d..238d4cac3a085ef188f427c8cc3669b7617443d7 100644 --- a/core/general-server/op/general_dist_kv_infer_op.cpp +++ b/core/general-server/op/general_dist_kv_infer_op.cpp @@ -20,6 +20,7 @@ #include #include #include "core/cube/cube-api/include/cube_api.h" +#include "core/predictor/framework/cache.h" #include "core/predictor/framework/infer.h" #include "core/predictor/framework/memory.h" #include "core/predictor/framework/resource.h" @@ -36,10 +37,11 @@ using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +using baidu::paddle_serving::predictor::CubeCache; // DistKV Infer Op: seek cube and then call paddle inference // op seq: general_reader-> dist_kv_infer -> general_response -int GeneralDistKVInferOp::inference() { +int GeneralDistKVInferOp::inference() { VLOG(2) << "Going to run inference"; const std::vector pre_node_names = pre_names(); if (pre_node_names.size() != 1) { @@ -60,8 +62,8 @@ int GeneralDistKVInferOp::inference() { GeneralBlob *output_blob = mutable_data(); if (!output_blob) { - LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error"; - return -1; + LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error"; + return -1; } output_blob->SetLogId(log_id); @@ -70,21 +72,30 @@ int GeneralDistKVInferOp::inference() { << ") Failed mutable depended argument, op:" << pre_name; return -1; } - + Timer timeline; + timeline.Start(); const TensorVector *in = &input_blob->tensor_vector; TensorVector *out = &output_blob->tensor_vector; std::vector keys; + std::vector unique_keys; + std::unordered_map key_map; std::vector values; - int sparse_count = 0; // sparse inputs counts, sparse would seek cube - int dense_count = 0; // 
dense inputs counts, dense would directly call paddle infer + // sparse inputs counts, sparse would seek cube + int sparse_count = 0; + // dense inputs counts, dense would directly call paddle infer + int dense_count = 0; std::vector> dataptr_size_pairs; size_t key_len = 0; + for (size_t i = 0; i < in->size(); ++i) { if (in->at(i).dtype != paddle::PaddleDType::INT64) { + // dense input type is not int64 ++dense_count; continue; } + // sparse input type is int64 ++sparse_count; + size_t elem_num = 1; for (size_t s = 0; s < in->at(i).shape.size(); ++s) { elem_num *= in->at(i).shape[s]; @@ -94,7 +105,8 @@ int GeneralDistKVInferOp::inference() { dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num)); } keys.resize(key_len); - VLOG(3) << "(logid=" << log_id << ") cube number of keys to look up: " << key_len; + unique_keys.resize(key_len); + int key_idx = 0; for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) { std::copy(dataptr_size_pairs[i].first, @@ -102,20 +114,81 @@ int GeneralDistKVInferOp::inference() { keys.begin() + key_idx); key_idx += dataptr_size_pairs[i].second; } + + // filter dumplicate keys + int unique_keys_count = 0; + for (size_t i = 0; i < keys.size(); ++i) { + if (key_map.find(keys[i]) == key_map.end()) { + key_map[keys[i]] = nullptr; + unique_keys[unique_keys_count++] = keys[i]; + } + } + unique_keys.resize(unique_keys_count); + VLOG(1) << "(logid=" << log_id + << ") cube number of keys to look up: " << key_len + << " uniq keys: " << unique_keys_count; + + // fitler cache keys + size_t hit_counts = 0; + int64_t seek_cache_start = timeline.TimeStampUS(); + CubeCache *p_cube_cache = + InferManager::instance().get_cube_cache(engine_name().c_str()); + if (p_cube_cache != nullptr) { + for (size_t i = 0; i < unique_keys_count; ++i) { + rec::mcube::CubeValue *hit_val = p_cube_cache->get_data(unique_keys[i]); + if (hit_val) { + // LOG(WARNING) << "Hit one cache. key:" << unique_keys[i]; + key_map[unique_keys[i]] = hit_val; + if (hit_counts % 100 == 0) { + LOG(WARNING) << "hit cache! key:" << unique_keys[i] + << " value:" << hit_val->buff; + } + unique_keys[i] = 0; + ++hit_counts; + } + } + } else { + LOG(WARNING) << "get cube cache fail. 
model: " << engine_name(); + } + // clear unique keys which hit caches + if (hit_counts > 0) { + for (auto it = unique_keys.begin(); it < unique_keys.end();) { + if (*it == 0) { + it = unique_keys.erase(it); + --unique_keys_count; + } else { + ++it; + } + } + } + int64_t seek_cache_end = timeline.TimeStampUS(); + VLOG(2) << "cache hit " << hit_counts + << " keys in cube cache, last unique_keys:" << unique_keys.size() + << " , seek_time:" << seek_cache_end - seek_cache_start; + + // seek sparse params rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance(); std::vector table_names = cube->get_table_names(); if (table_names.size() == 0) { LOG(ERROR) << "cube init error or cube config not given."; return -1; } - // gather keys and seek cube servers, put results in values - int ret = cube->seek(table_names[0], keys, &values); - VLOG(3) << "(logid=" << log_id << ") cube seek status: " << ret; + int64_t seek_start = timeline.TimeStampUS(); + int ret = cube->seek(table_names[0], unique_keys, &values); + int64_t seek_end = timeline.TimeStampUS(); + VLOG(2) << "(logid=" << log_id << ") cube seek status: " << ret + << " , unique_key: " << unique_keys.size() + << " , seek_time: " << seek_end - seek_start; + + for (size_t i = 0; i < unique_keys.size(); ++i) { + key_map[unique_keys[i]] = &values[i]; + } if (values.size() != keys.size() || values[0].buff.size() == 0) { LOG(ERROR) << "cube value return null"; } - // EMBEDDING_SIZE means the length of sparse vector, user can define length here. - size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float); + size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float); + // size_t EMBEDDING_SIZE = (values[0].buff.size() - 10) / sizeof(float); + //size_t EMBEDDING_SIZE = 9; TensorVector sparse_out; sparse_out.resize(sparse_count); TensorVector dense_out; @@ -126,8 +199,10 @@ int GeneralDistKVInferOp::inference() { std::unordered_map in_out_map; baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource::instance(); - std::shared_ptr model_config = resource.get_general_model_config().front(); - //copy data to tnsor + std::shared_ptr model_config = + resource.get_general_model_config().front(); + int cube_key_found = 0; + int cube_key_miss = 0; for (size_t i = 0; i < in->size(); ++i) { if (in->at(i).dtype != paddle::PaddleDType::INT64) { dense_out[dense_idx] = in->at(i); @@ -142,43 +217,75 @@ int GeneralDistKVInferOp::inference() { sparse_out[sparse_idx].lod[x].begin()); } sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32; - sparse_out[sparse_idx].shape.push_back(sparse_out[sparse_idx].lod[0].back()); + sparse_out[sparse_idx].shape.push_back( + sparse_out[sparse_idx].lod[0].back()); sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE); sparse_out[sparse_idx].name = model_config->_feed_name[i]; sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() * EMBEDDING_SIZE * sizeof(float)); float *dst_ptr = static_cast(sparse_out[sparse_idx].data.data()); + if (!dst_ptr) { + VLOG(2) << "dst_ptr is null. 
sparse_idx:" << sparse_idx; + continue; + } for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) { float *data_ptr = dst_ptr + x * EMBEDDING_SIZE; - memcpy(data_ptr, - values[cube_val_idx].buff.data(), - values[cube_val_idx].buff.size()); - cube_val_idx++; + uint64_t cur_key = keys[cube_val_idx]; + rec::mcube::CubeValue *cur_val = key_map[cur_key]; + if (cur_val->buff.size() == 0) { + memset(data_ptr, (float)0.0, sizeof(float) * EMBEDDING_SIZE); + ++cube_key_miss; + ++cube_val_idx; + continue; + } + + // The data generated by pslib has 10 bytes of information to be filtered + // out + memcpy(data_ptr, cur_val->buff.data(), cur_val->buff.size() ); + // VLOG(3) << keys[cube_val_idx] << ":" << data_ptr[0] << ", " << + // data_ptr[1] << ", " <_batch_size; output_blob->_batch_size = batch_size; - Timer timeline; int64_t start = timeline.TimeStampUS(); timeline.Start(); // call paddle inference here if (InferManager::instance().infer( engine_name().c_str(), &infer_in, out, batch_size)) { - LOG(ERROR) << "(logid=" << log_id << ") Failed do infer in fluid model: " << engine_name(); + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name(); return -1; } int64_t end = timeline.TimeStampUS(); - + if (cube_fail) { + float *out_ptr = static_cast(out->at(0).data.data()); + out_ptr[0] = 0.0; + } + timeline.Pause(); + VLOG(2) << "dist kv, pure paddle infer time: " << timeline.ElapsedUS(); CopyBlobInfo(input_blob, output_blob); AddBlobInfo(output_blob, start); AddBlobInfo(output_blob, end); - return 0; - + return 0; } DEFINE_OP(GeneralDistKVInferOp); diff --git a/core/general-server/op/general_reader_op.cpp b/core/general-server/op/general_reader_op.cpp index af77df553837c594789b0e9943790fc37fc01c95..2ad3e4cab6b77b305494c3833f0e3781ed0fd0b7 100644 --- a/core/general-server/op/general_reader_op.cpp +++ b/core/general-server/op/general_reader_op.cpp @@ -31,7 +31,22 @@ using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; -enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; +// support: FLOAT32, INT64, INT32, UINT8, INT8, FLOAT16 +enum ProtoDataType { + P_INT64 = 0, + P_FLOAT32, + P_INT32, + P_FP64, + P_INT16, + P_FP16, + P_BF16, + P_UINT8, + P_INT8, + P_BOOL, + P_COMPLEX64, + P_COMPLEX128, + P_STRING = 20, +}; int GeneralReaderOp::inference() { // read request from client @@ -78,6 +93,7 @@ int GeneralReaderOp::inference() { int64_t elem_type = 0; int64_t elem_size = 0; int64_t databuf_size = 0; + const void* src_ptr = nullptr; for (int i = 0; i < var_num; ++i) { paddle::PaddleTensor paddleTensor; const Tensor &tensor = req->tensor(i); @@ -86,19 +102,38 @@ int GeneralReaderOp::inference() { elem_size = 0; databuf_size = 0; elem_type = tensor.elem_type(); - VLOG(2) << "var[" << i << "] has elem type: " << elem_type; + src_ptr = nullptr ; if (elem_type == P_INT64) { // int64 elem_size = sizeof(int64_t); paddleTensor.dtype = paddle::PaddleDType::INT64; data_len = tensor.int64_data_size(); + src_ptr = tensor.int64_data().data(); } else if (elem_type == P_FLOAT32) { elem_size = sizeof(float); paddleTensor.dtype = paddle::PaddleDType::FLOAT32; data_len = tensor.float_data_size(); + src_ptr = tensor.float_data().data(); } else if (elem_type == P_INT32) { elem_size = sizeof(int32_t); paddleTensor.dtype = paddle::PaddleDType::INT32; data_len = tensor.int_data_size(); + 
src_ptr = tensor.int_data().data(); + } else if (elem_type == P_UINT8) { + elem_size = sizeof(uint8_t); + paddleTensor.dtype = paddle::PaddleDType::UINT8; + data_len = tensor.tensor_content().size(); + src_ptr = tensor.tensor_content().data(); + } else if (elem_type == P_INT8) { + elem_size = sizeof(int8_t); + paddleTensor.dtype = paddle::PaddleDType::INT8; + data_len = tensor.tensor_content().size(); + src_ptr = tensor.tensor_content().data(); + } else if (elem_type == P_FP16) { + // copy bytes from tensor content to TensorVector + elem_size = 1; + paddleTensor.dtype = paddle::PaddleDType::FLOAT16; + data_len = tensor.tensor_content().size(); + src_ptr = tensor.tensor_content().data(); } else if (elem_type == P_STRING) { // use paddle::PaddleDType::UINT8 as for String. elem_size = sizeof(char); @@ -109,8 +144,18 @@ int GeneralReaderOp::inference() { // now only support single string for (int idx = 0; idx < tensor.data_size(); idx++) { data_len += tensor.data()[idx].length() + 1; + src_ptr = tensor.data()[idx].data(); } } + VLOG(2) << "var[" << i << "] has elem type: " << elem_type << ";" + << "elem_size=" << elem_size << ";" + << "dtype=" << paddleTensor.dtype << ";" + << "data_len=" << data_len; + if (src_ptr == nullptr) { + LOG(ERROR) << "Not support var[" << i << "] with elem_type[" + << elem_type << "]"; + continue; + } // implement lod tensor here // only support 1-D lod // TODO(HexToString): support 2-D lod @@ -141,44 +186,17 @@ int GeneralReaderOp::inference() { VLOG(2) << "(logid=" << log_id << ") var[" << i << "] has lod_tensor and len=" << out->at(i).lod[0].back(); } - if (elem_type == P_INT64) { - int64_t *dst_ptr = static_cast(out->at(i).data.data()); - VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i - << "] is " << tensor.int64_data(0); - if (!dst_ptr) { - LOG(ERROR) << "dst_ptr is nullptr"; - return -1; - } - memcpy(dst_ptr, tensor.int64_data().data(), databuf_size); - /* - int elem_num = tensor.int64_data_size(); - for (int k = 0; k < elem_num; ++k) { - dst_ptr[k] = tensor.int64_data(k); - } - */ - } else if (elem_type == P_FLOAT32) { - float *dst_ptr = static_cast(out->at(i).data.data()); - VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i - << "] is " << tensor.float_data(0); - if (!dst_ptr) { - LOG(ERROR) << "dst_ptr is nullptr"; - return -1; - } - memcpy(dst_ptr, tensor.float_data().data(), databuf_size); - /*int elem_num = tensor.float_data_size(); - for (int k = 0; k < elem_num; ++k) { - dst_ptr[k] = tensor.float_data(k); - }*/ - } else if (elem_type == P_INT32) { - int32_t *dst_ptr = static_cast(out->at(i).data.data()); - VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i - << "] is " << tensor.int_data(0); - if (!dst_ptr) { - LOG(ERROR) << "dst_ptr is nullptr"; - return -1; - } - memcpy(dst_ptr, tensor.int_data().data(), databuf_size); - } else if (elem_type == P_STRING) { + void* dst_ptr = out->at(i).data.data(); + if (!dst_ptr) { + LOG(ERROR) << "dst_ptr is nullptr"; + return -1; + } + + // For common data, we just copy from src to dst + // For string data, we need to iterate through all str + if (elem_type != P_STRING) { + memcpy(dst_ptr, src_ptr, databuf_size); + } else { char *dst_ptr = static_cast(out->at(i).data.data()); VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i << "] is " << tensor.data(0); diff --git a/core/general-server/op/general_response_op.cpp b/core/general-server/op/general_response_op.cpp index 
161e291117b8893703844ab07ec93a891fc46f27..07d3473ec6ce12373114bfc50a67890ac2757634 100644 --- a/core/general-server/op/general_response_op.cpp +++ b/core/general-server/op/general_response_op.cpp @@ -74,10 +74,19 @@ int GeneralResponseOp::inference() { // and the order of Output is the same as the prototxt FetchVar. // otherwise, you can only get the Output by the corresponding of // Name -- Alias_name. - fetch_index.resize(req->fetch_var_names_size()); - for (int i = 0; i < req->fetch_var_names_size(); ++i) { - fetch_index[i] = - model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; + if (req->fetch_var_names_size() > 0) { + fetch_index.resize(req->fetch_var_names_size()); + for (int i = 0; i < req->fetch_var_names_size(); ++i) { + fetch_index[i] = + model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; + } + } else { + fetch_index.resize(model_config->_fetch_alias_name.size()); + for (int i = 0; i < model_config->_fetch_alias_name.size(); ++i) { + fetch_index[i] = + model_config + ->_fetch_alias_name_to_index[model_config->_fetch_alias_name[i]]; + } } for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { @@ -105,7 +114,7 @@ int GeneralResponseOp::inference() { // fetch_index is the real index in FetchVar of Fetchlist // for example, FetchVar = {0:A, 1:B, 2:C} // FetchList = {0:C,1:A}, at this situation. - // fetch_index = [2,0], C`index = 2 and A`index = 0 + // fetch_index = [2,0], C`index = 2 and A`index = 0 for (auto &idx : fetch_index) { Tensor *tensor = output->add_tensor(); tensor->set_name(in->at(idx).name); @@ -159,6 +168,21 @@ int GeneralResponseOp::inference() { google::protobuf::RepeatedField tmp_data(data_ptr, data_ptr + cap); output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data); + } else if (dtype == paddle::PaddleDType::UINT8) { + tensor->set_elem_type(7); + VLOG(2) << "(logid=" << log_id << ")Prepare uint8 var [" + << model_config->_fetch_name[idx] << "]."; + tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length()); + } else if (dtype == paddle::PaddleDType::INT8) { + tensor->set_elem_type(8); + VLOG(2) << "(logid=" << log_id << ")Prepare int8 var [" + << model_config->_fetch_name[idx] << "]."; + tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length()); + } else if (dtype == paddle::PaddleDType::FLOAT16) { + tensor->set_elem_type(5); + VLOG(2) << "(logid=" << log_id << ")Prepare float16 var [" + << model_config->_fetch_name[idx] << "]."; + tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length()); } VLOG(2) << "(logid=" << log_id << ") fetch var [" diff --git a/core/general-server/proto/general_model_service.proto b/core/general-server/proto/general_model_service.proto index 8fedb60e97ec5b81263687b47ff0794880da8671..4b6282637ca6ea0617096a18bbbc3268067906bc 100755 --- a/core/general-server/proto/general_model_service.proto +++ b/core/general-server/proto/general_model_service.proto @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
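// Note on the proto2 -> proto3 switch below: proto3 has no required/optional
// labels and no explicit field defaults, so scalar fields such as profile_server
// and log_id always carry a value (false/0 when unset) and no has_xxx() accessor
// is generated for them; the pdcodegen change later in this patch drops the
// fd->is_optional() / has_$field_name$() handling for the same reason.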
-syntax = "proto2"; +syntax = "proto3"; import "pds_option.proto"; import "builtin_format.proto"; package baidu.paddle_serving.predictor.general_model; @@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model; option cc_generic_services = true; message Tensor { - repeated string data = 1; - repeated int32 int_data = 2; - repeated int64 int64_data = 3; - repeated float float_data = 4; - optional int32 elem_type = - 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string - repeated int32 shape = 6; // shape should include batch - repeated int32 lod = 7; // only for fetch tensor currently - optional string name = 8; // get from the Model prototxt - optional string alias_name = 9; // get from the Model prototxt + // VarType: INT64 + repeated int64 int64_data = 1; + + // VarType: FP32 + repeated float float_data = 2; + + // VarType: INT32 + repeated int32 int_data = 3; + + // VarType: FP64 + repeated double float64_data = 4; + + // VarType: UINT32 + repeated uint32 uint32_data = 5; + + // VarType: BOOL + repeated bool bool_data = 6; + + // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated float complex64_data = 7; + + // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated double complex128_data = 8; + + // VarType: STRING + repeated string data = 9; + + // Element types: + // 0 => INT64 + // 1 => FP32 + // 2 => INT32 + // 3 => FP64 + // 4 => INT16 + // 5 => FP16 + // 6 => BF16 + // 7 => UINT8 + // 8 => INT8 + // 9 => BOOL + // 10 => COMPLEX64 + // 11 => COMPLEX128 + // 20 => STRING + int32 elem_type = 10; + + // Shape of the tensor, including batch dimensions. + repeated int32 shape = 11; + + // Level of data(LOD), support variable length data, only for fetch tensor + // currently. + repeated int32 lod = 12; + + // Correspond to the variable 'name' in the model description prototxt. + string name = 13; + + // Correspond to the variable 'alias_name' in the model description prototxt. 
+ string alias_name = 14; // get from the Model prototxt + + // VarType: FP16, INT16, INT8, BF16, UINT8 + bytes tensor_content = 15; }; message Request { repeated Tensor tensor = 1; repeated string fetch_var_names = 2; - optional bool profile_server = 3 [ default = false ]; - required uint64 log_id = 4 [ default = 0 ]; + bool profile_server = 3; + uint64 log_id = 4; }; message Response { repeated ModelOutput outputs = 1; repeated int64 profile_time = 2; + // Error code + int32 err_no = 3; + + // Error messages + string err_msg = 4; }; message ModelOutput { repeated Tensor tensor = 1; - optional string engine_name = 2; + string engine_name = 2; } service GeneralModelService { diff --git a/core/pdcodegen/src/pdcodegen.cpp b/core/pdcodegen/src/pdcodegen.cpp index a99828ee3466a32d45dcabb61a2700f9362539d4..b4ab1afc2f34c947745a9b33a941d317083ee372 100644 --- a/core/pdcodegen/src/pdcodegen.cpp +++ b/core/pdcodegen/src/pdcodegen.cpp @@ -276,43 +276,65 @@ class PdsCodeGenerator : public CodeGenerator { "output_name", google::protobuf::dots_to_colons(m->output_type()->full_name())); if (m->name() == "inference") { + std::string inference_body = ""; + inference_body += " brpc::ClosureGuard done_guard(done);\n"; + inference_body += " brpc::Controller* cntl = \n"; + inference_body += " static_cast(cntl_base);\n"; + inference_body += " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"; + inference_body += " uint64_t log_id = request->log_id();\n"; + inference_body += " cntl->set_log_id(log_id);\n"; + inference_body += " ::baidu::paddle_serving::predictor::InferService* svr = \n"; + inference_body += " "; + inference_body += "::baidu::paddle_serving::predictor::InferServiceManager::instance("; + inference_body += ").item(\"$service$\");\n"; + inference_body += " if (svr == NULL) {\n"; + inference_body += " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: "; + inference_body += "$service$\";\n"; + inference_body += " cntl->SetFailed(404, \"Not found service: $service$\");\n"; + inference_body += " return ;\n"; + inference_body += " }\n"; + inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") "; + inference_body += "remote_side=\[\" << cntl->remote_side() << "; // NOLINT + inference_body += "\"\]\";\n"; + inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") "; + inference_body += "local_side=\[\" << cntl->local_side() << "; // NOLINT + inference_body += "\"\]\";\n"; + inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") "; + inference_body += "service_name=\[\" << \"$name$\" << \"\]\";\n"; // NOLINT + inference_body += " int err_code = svr->inference(request, response, log_id);\n"; + inference_body += " if (err_code != 0) {\n"; + inference_body += " LOG(WARNING)\n"; + inference_body += " << \"(logid=\" << log_id << \") Failed call "; + inference_body += "inferservice[$name$], name[$service$]\"\n"; + inference_body += " << \", error_code: \" << err_code;\n"; + inference_body += " cntl->SetFailed(err_code, \"InferService inference "; + inference_body += "failed!\");\n"; + inference_body += " }\n"; + inference_body += " gettimeofday(&tv, NULL);\n"; + inference_body += " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n"; + if (service_name == "GeneralModelService") { + inference_body += " std::ostringstream oss;\n"; + inference_body += " oss << \"[serving]\"\n"; + inference_body += " << \"logid=\" << log_id << \",\";\n"; + inference_body += " int op_num = response->profile_time_size() / 2;\n"; + inference_body += " for (int i = 0; i < op_num; ++i) {\n"; + 
inference_body += " double t = (response->profile_time(i * 2 + 1)\n"; + inference_body += " - response->profile_time(i * 2)) / 1000.0;\n"; + inference_body += " oss << \"op\" << i << \"=\" << t << \"ms,\";\n"; + inference_body += " }\n"; + inference_body += " double total_time = (end - start) / 1000.0;\n"; + inference_body += " oss << \"cost=\" << total_time << \"ms.\";\n"; + inference_body += " // flush notice log\n"; + inference_body += " LOG(INFO) << oss.str();\n"; + inference_body += " response->add_profile_time(start);\n"; + inference_body += " response->add_profile_time(end);\n"; + } else { + inference_body += " // flush notice log\n"; + inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - "; // NOLINT + inference_body += "start) << \"\]\";\n"; + } printer->Print( - " baidu::rpc::ClosureGuard done_guard(done);\n" - " baidu::rpc::Controller* cntl = \n" - " static_cast(cntl_base);\n" - " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n" - " uint64_t log_id = request->log_id();\n" - " cntl->set_log_id(log_id);\n" - " ::baidu::paddle_serving::predictor::InferService* svr = \n" - " " - "::baidu::paddle_serving::predictor::InferServiceManager::instance(" - ").item(\"$service$\");\n" - " if (svr == NULL) {\n" - " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: " - "$service$\";\n" - " cntl->SetFailed(404, \"Not found service: $service$\");\n" - " return ;\n" - " }\n" - " LOG(INFO) << \"(logid=\" << log_id << \") remote_side=\[\" " // NOLINT - "<< cntl->remote_side() << \"\]\";\n" - " LOG(INFO) << \"(logid=\" << log_id << \") local_side=\[\" " // NOLINT - "<< cntl->local_side() << \"\]\";\n" - " LOG(INFO) << \"(logid=\" << log_id << \") service_name=\[\" " // NOLINT - "<< \"$name$\" << \"\]\";\n" - " int err_code = svr->inference(request, response, log_id);\n" - " if (err_code != 0) {\n" - " LOG(WARNING)\n" - " << \"(logid=\" << log_id << \") Failed call " - "inferservice[$name$], name[$service$]\"\n" - " << \", error_code: \" << err_code;\n" - " cntl->SetFailed(err_code, \"InferService inference " - "failed!\");\n" - " }\n" - " gettimeofday(&tv, NULL);\n" - " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n" - " // flush notice log\n" - " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT - "start) << \"\]\";\n", // NOLINT + inference_body.c_str(), "name", class_name, "service", @@ -1021,45 +1043,65 @@ class PdsCodeGenerator : public CodeGenerator { "output_name", google::protobuf::dots_to_colons(m->output_type()->full_name())); if (m->name() == "inference") { + std::string inference_body = ""; + inference_body += " brpc::ClosureGuard done_guard(done);\n"; + inference_body += " brpc::Controller* cntl = \n"; + inference_body += " static_cast(cntl_base);\n"; + inference_body += " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n"; + inference_body += " uint64_t log_id = request->log_id();\n"; + inference_body += " cntl->set_log_id(log_id);\n"; + inference_body += " ::baidu::paddle_serving::predictor::InferService* svr = \n"; + inference_body += " "; + inference_body += "::baidu::paddle_serving::predictor::InferServiceManager::instance("; + inference_body += ").item(\"$service$\");\n"; + inference_body += " if (svr == NULL) {\n"; + inference_body += " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: "; + inference_body += "$service$\";\n"; + inference_body += " cntl->SetFailed(404, \"Not found service: $service$\");\n"; + inference_body += " return ;\n"; + inference_body += " }\n"; + 
inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") "; + inference_body += "remote_side=\[\" << cntl->remote_side() << "; // NOLINT + inference_body += "\"\]\";\n"; + inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") "; + inference_body += "local_side=\[\" << cntl->local_side() << "; // NOLINT + inference_body += "\"\]\";\n"; + inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") "; + inference_body += "service_name=\[\" << \"$name$\" << \"\]\";\n"; // NOLINT + inference_body += " int err_code = svr->inference(request, response, log_id);\n"; + inference_body += " if (err_code != 0) {\n"; + inference_body += " LOG(WARNING)\n"; + inference_body += " << \"(logid=\" << log_id << \") Failed call "; + inference_body += "inferservice[$name$], name[$service$]\"\n"; + inference_body += " << \", error_code: \" << err_code;\n"; + inference_body += " cntl->SetFailed(err_code, \"InferService inference "; + inference_body += "failed!\");\n"; + inference_body += " }\n"; + inference_body += " gettimeofday(&tv, NULL);\n"; + inference_body += " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n"; + if (service_name == "GeneralModelService") { + inference_body += " std::ostringstream oss;\n"; + inference_body += " oss << \"[serving]\"\n"; + inference_body += " << \"logid=\" << log_id << \",\";\n"; + inference_body += " int op_num = response->profile_time_size() / 2;\n"; + inference_body += " for (int i = 0; i < op_num; ++i) {\n"; + inference_body += " double t = (response->profile_time(i * 2 + 1)\n"; + inference_body += " - response->profile_time(i * 2)) / 1000.0;\n"; + inference_body += " oss << \"op\" << i << \"=\" << t << \"ms,\";\n"; + inference_body += " }\n"; + inference_body += " double total_time = (end - start) / 1000.0;\n"; + inference_body += " oss << \"cost=\" << total_time << \"ms.\";\n"; + inference_body += " // flush notice log\n"; + inference_body += " LOG(INFO) << oss.str();\n"; + inference_body += " response->add_profile_time(start);\n"; + inference_body += " response->add_profile_time(end);\n"; + } else { + inference_body += " // flush notice log\n"; + inference_body += " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - "; // NOLINT + inference_body += "start) << \"\]\";\n"; + } printer->Print( - " brpc::ClosureGuard done_guard(done);\n" - " brpc::Controller* cntl = \n" - " static_cast(cntl_base);\n" - " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n" - " uint64_t log_id = request->log_id();\n" - " cntl->set_log_id(log_id);\n" - " ::baidu::paddle_serving::predictor::InferService* svr = \n" - " " - "::baidu::paddle_serving::predictor::InferServiceManager::instance(" - ").item(\"$service$\");\n" - " if (svr == NULL) {\n" - " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: " - "$service$\";\n" - " cntl->SetFailed(404, \"Not found service: $service$\");\n" - " return ;\n" - " }\n" - " LOG(INFO) << \"(logid=\" << log_id << \") " - "remote_side=\[\" << cntl->remote_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \"(logid=\" << log_id << \") " - "local_side=\[\" << cntl->local_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \"(logid=\" << log_id << \") " - "service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT - " int err_code = svr->inference(request, response, log_id);\n" - " if (err_code != 0) {\n" - " LOG(WARNING)\n" - " << \"(logid=\" << log_id << \") Failed call " - "inferservice[$name$], name[$service$]\"\n" - " << \", error_code: \" << err_code;\n" - " cntl->SetFailed(err_code, \"InferService inference " - 
"failed!\");\n" - " }\n" - " gettimeofday(&tv, NULL);\n" - " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n" - " // flush notice log\n" - " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT - "start) << \"\]\";\n", // NOLINT + inference_body.c_str(), "name", class_name, "service", @@ -1492,11 +1534,6 @@ class PdsCodeGenerator : public CodeGenerator { const FieldDescriptor* fd = in_shared_fields[si]; std::string field_name = fd->name(); printer->Print("\n/////$field_name$\n", "field_name", field_name); - if (fd->is_optional()) { - printer->Print( - "if (req->has_$field_name$()) {\n", "field_name", field_name); - printer->Indent(); - } if (fd->cpp_type() == google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE || fd->is_repeated()) { @@ -1509,10 +1546,6 @@ class PdsCodeGenerator : public CodeGenerator { "field_name", field_name); } - if (fd->is_optional()) { - printer->Outdent(); - printer->Print("}\n"); - } } printer->Print( diff --git a/core/predictor/common/constant.cpp b/core/predictor/common/constant.cpp index 5fa1277de1a4b0d33d14a9c33d3cb4b280bc3b5c..70f0096ba002ebb8f185cd73f8fe4f8d4d06b83f 100644 --- a/core/predictor/common/constant.cpp +++ b/core/predictor/common/constant.cpp @@ -25,7 +25,7 @@ DEFINE_int32(port, 8010, ""); DEFINE_string(workflow_path, "./conf", ""); DEFINE_string(workflow_file, "workflow.prototxt", ""); DEFINE_string(inferservice_path, "./conf", ""); -DEFINE_string(inferservice_file, "service.prototxt", ""); +DEFINE_string(inferservice_file, "infer_service.prototxt", ""); DEFINE_string(logger_path, "./conf", ""); DEFINE_string(logger_file, "log.conf", ""); DEFINE_string(resource_path, "./conf", ""); diff --git a/core/predictor/framework/CMakeLists.txt b/core/predictor/framework/CMakeLists.txt index 641ba7efbad9c97497cd2ef9372fa08391f6769c..2b33dfed0ae1130b7044aa835164e591385fd5b5 100644 --- a/core/predictor/framework/CMakeLists.txt +++ b/core/predictor/framework/CMakeLists.txt @@ -1,3 +1,3 @@ -FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp) +FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../cube/cube-builder/src/seqfile_reader.cpp) LIST(APPEND pdserving_srcs ${framework_srcs}) LIST(APPEND pclient_srcs ${framework_srcs}) diff --git a/core/predictor/framework/bsf-inl.h b/core/predictor/framework/bsf-inl.h old mode 100644 new mode 100755 index 1f5d272d2875ee878f09ac2882364afe9fd899fb..401c51091ad858f48ad154d083c841a0e6a20010 --- a/core/predictor/framework/bsf-inl.h +++ b/core/predictor/framework/bsf-inl.h @@ -26,9 +26,90 @@ #include "core/predictor/common/inner_common.h" #include "core/predictor/framework/memory.h" +// this file is included by bsf.h namespace im { namespace bsf { +template +bool Task::task_fetch_init(BatchTasks& batchTask) { + // 双检锁,减少加锁的粒度 + if (!fetch_init) { + if (taskmeta_num > 1) { + // 对于task被拆分为多个taskmeta,需要加锁。 + AutoMutex lock(task_mut); + task_fetch_create(batchTask); + } else { + // 对于task只有1个taskmeta,不需要加锁。 + task_fetch_create(batchTask); + } + } + return true; +} + +template +bool Task::task_fetch_create(BatchTasks& batchTask) { + if (!fetch_init) { + vector_fetch_lod_index = batchTask.vector_fetch_lod_index; + set_fetch_nobatch_index = batchTask.set_fetch_nobatch_index; + OutVectorT taskMetaOutLodTensor; + size_t fetchvar_num = batchTask._batch_out.size(); + for (size_t fetchvar_index = 0; fetchvar_index < fetchvar_num; + ++fetchvar_index) { + size_t fetchvar_bytesize_index = + batchTask.fetchvar_bytesize(fetchvar_index); + size_t fetchvar_batch = 0; + // 1. 
nobatch fetchvar情况 + if (set_fetch_nobatch_index.size() > 0 && + set_fetch_nobatch_index.find(fetchvar_index) != + set_fetch_nobatch_index.end()) { + fetchvar_batch = 1; + } else if (vector_fetch_lod_index.size() > 0 && + std::find(vector_fetch_lod_index.begin(), + vector_fetch_lod_index.end(), + fetchvar_index) != vector_fetch_lod_index.end()) { + // lod fetchvar情况,此时无法确定总的shape[0] + // 根据task中的task_num总数开辟task_num个临时空间 + // 每个lod型的fetchvar拷贝到对应的临时空间中 + // 最后再计算临时空间的总量,合并fetchvar和lod + fetchvar_batch = 0; + + } else { + // 普通fetchvar情况,此时该Task总的fetchvar_batch = + // 输入的总的batch_size() + fetchvar_batch = batch_size(); + } + paddle::PaddleTensor tensor_out; + tensor_out.name = batchTask._batch_out[fetchvar_index].name; + tensor_out.dtype = + paddle::PaddleDType(batchTask._batch_out[fetchvar_index].dtype); + tensor_out.shape = batchTask._batch_out[fetchvar_index].shape; + tensor_out.shape[0] = fetchvar_batch; + if (fetchvar_batch != 0) { + // 此时 lod 为空。 + tensor_out.lod = batchTask._batch_out[fetchvar_index].lod; + // resize all batch memory at one time + size_t databuf_size = fetchvar_batch * fetchvar_bytesize_index; + tensor_out.data.Resize(databuf_size); + } else { + // 当taskmeta_num = 1时,由于同时只有一个taskMeta操作task + // 不涉及线程安全问题,所以此时可以直接由taskMeta->task->resize->copy + + // 当task被分为多个taskMeta时,需要临时对象记录 + // 收齐后再一起合并 + if (taskmeta_num > 1) { + taskMetaOutLodTensor.push_back(tensor_out); + } + } + outVectorT_ptr->push_back(tensor_out); + } + // outLodTensorVector实际是一个双层vector + // shape为taskmeta_num * vector_fetch_lod_index.size(); + outLodTensorVector.resize(taskmeta_num, taskMetaOutLodTensor); + fetch_init = true; + } + return true; +} + template void* TaskExecutor::thread_entry(void* args) { ThreadContext* context = static_cast*>(args); @@ -134,9 +215,10 @@ TaskHandler TaskExecutor::schedule( LOG(ERROR) << "Failed get TaskT from object pool"; return TaskHandler::valid_handle(); } + task->clear(); /* - if (!BatchTasks::check_valid(in, out, _batch_align)) { + if (!BatchTasks::check_valid(in, out, _overrun)) { LOG(ERROR) << "Invalid input & output"; return TaskHandler::valid_handle(); } @@ -156,9 +238,11 @@ TaskHandler TaskExecutor::schedule( task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr; task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr; + if (!task->task_init()) { + LOG(ERROR) << "task->init() failed"; + } task->rem = task->batch_size(); task->index.store(0, butil::memory_order_relaxed); - AutoMutex lock(_mut); _task_queue.push_back(task); THREAD_COND_SIGNAL(&_cond); @@ -168,11 +252,12 @@ TaskHandler TaskExecutor::schedule( // this function is accessed by multi thread. // so AutoMutex at first. -// so batch.append_task is thread safe. +// so batchTask.append_task is thread safe. // you dont need to add extra lock in append_task() +// task is already init. template bool TaskExecutor::move_task_to_batch( - BatchTasks& batch) { // NOLINT + BatchTasks& batchTask) { // NOLINT AutoMutex lock(_mut); while (_task_queue.empty()) { THREAD_COND_WAIT(&_cond, &_mut); @@ -183,15 +268,65 @@ bool TaskExecutor::move_task_to_batch( return false; } + TaskT* previous_task = nullptr; while (!_task_queue.empty()) { TaskT* task = _task_queue.front(); - size_t rem = batch.append_task(task); + + // 由于无法确定fetchVar是否为lod(即使输入是非lod,输出也可能是lod) + // 简单的处理方法是:task不能被拆分,即用户的请求可以合并一起预测,但不能拆分两个小部分去预测。 + // 只需要设置engine的属性allow_split_request = false即可。 + + // 复杂的处理方法是允许拆分Task,无论是否包含lod. 
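// In brief: _overrun == true lets a BatchTasks exceed _batch_size once by
// admitting a whole Task that does not fully fit, while _allow_split_request ==
// true fills the remaining capacity by splitting the next Task instead; with
// both false, a Task larger than the remaining room waits for the next
// BatchTasks. Sketch of the resulting admission check (rem_size being the
// remaining capacity):
//   if (!allow_split_request && task->batch_size() > rem_size && !overrun) break;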
+ // 难点:预测前,能够知道被拆成了几个taskmeta,但只有预测后,才知道有多少个fetchvar,多少个lod的fetchvar + // 所以,task中先要创建taskmeta_num* fetchvar + // num(lod类型的)个临时PaddleTensor(存储data及Lod) + // 由于多线程调度的单位是taskmeta,故只能在notify_task中,用taskmeta->task去创建 + // 此时由于多个taskmeta对应一个task,存在多线程竞争,所以需要在task中加锁。 + // 原子操作不可行,因为多个线程必须等待创建好上述的PaddleTensor后才能继续。 + // 对于普通的fetch,也需要加锁去创建PaddleTensor,后续才能往里拷贝。 + + // _overrun表示,异步BatchTasks是否允许单次临时超过限制。 + // _overrun为true时,即使BatchTasks剩下1-batch,也会全放入一个完整的Task,允许临时超限。 + // _overrun为false时,不允许。 + // 对于模型本身有最大Batch限制的情况,应将该值设为false,默认为false。 + // 对于模型本身无最大Batch限制,但自己设置了BatchTasks的最大Batch,可以考虑设置为True。 + + // _allow_split_request == + // true,则允许拆分task.BatchTasks剩下1-batch,则会从下一个Task中拆出1-Batch + // _allow_split_request == + // false,则每个task不会被拆分。BatchTasks剩下1-batch会被浪费 + // 默认为true,允许拆分task从而使得空间利用率最大。 + if (!batchTask.get_allow_split_request()) { + if (task->batch_size() > batchTask.get_rem_size() && + !batchTask.get_overrun()) { + break; + } + } + + // combine_task_valid负责判断是否能够合并 + // 除最外层的shape外,内层shape应一致才能合并。 + // 否则跳出循环,放入下一个batchTask中。 + // 以此保证batch.append_task(task)中的task的内层shape相同。 + + // 对于Shape[0] = 1 而!=batch的情况,因为合并时,取其中一个的值 + // 所以要求该feedvar必须相等,才能合并。 + // 否则跳出循环,放入下一个batchTask中。 + // 目前没有PaddleTensor和PaddleBuff没有重载==,所以只能比较内存. + // TODO(HexToString): 可以考虑后期支持AutoPadding. + if (previous_task != nullptr) { + if (!task->combine_task_valid(previous_task)) { + break; + } + } + size_t rem = batchTask.append_task(task); + previous_task = task; if (task->rem <= 0) { _task_queue.pop_front(); } if (rem <= 0) break; } - + LOG(INFO) << "Number of tasks remaining in _task_queue is" + << _task_queue.size(); return true; } @@ -201,11 +336,12 @@ bool TaskExecutor::move_task_to_batch( // TaskT is from the SingleTon TaskExecutor`s _task_queue // although TaskMeta is a local variable, but several TaskMeta may points to // the same TaskT which is get from the SingleTon TaskExecutor`s _task_queue. -// put TaskMeta to the local variable BatchTasks batch. +// put TaskMeta to the local variable BatchTasks batchTask. -// batch.merge_tasks() and batch.notify_tasks() has no lock. -// BatchTasks batch itself is a local variable, it`s thread safe. -// If batch.merge_tasks() and batch.notify_tasks() do something to TaskMeta +// batchTask.merge_tasks() and batchTask.notify_tasks() has no lock. +// BatchTasks batchTask itself is a local variable, it`s thread safe. +// If batchTask.merge_tasks() and batchTask.notify_tasks() do something to +// TaskMeta // you need to pay attention to that. // Multi-Thread deal with different TaskMeta(cause it`s created as local // variable) @@ -242,11 +378,23 @@ int TaskExecutor::work(ThreadContext* context) { return -1; } - BatchTasks batch(_batch_size, _batch_align); - if (move_task_to_batch(batch)) { - batch.merge_tasks(); - _fn(&batch.in(), &batch.out()); - batch.notify_tasks(); + // move_task_to_batch() take the original task from the `_task_queue` + // put the original task into its own Vector + // the capacity of its own Vector is decided by `_batch_size` or + // `_overrun` + + // merge_tasks() move the imput-data into `_batch_in` from its own + // Vector. + // because the predictor`s input is the `_batch_in` + + // notify_tasks() move the output-data into every single taskmeta from + // `_batch_out`. 
+ // because the predictor`s output is the `_batch_out` + BatchTasks batchTask(_batch_size, _overrun, _allow_split_request); + if (move_task_to_batch(batchTask)) { + batchTask.merge_tasks(); + _fn(&batchTask.in(), &batchTask.out()); + batchTask.notify_tasks(); } } diff --git a/core/predictor/framework/bsf.h b/core/predictor/framework/bsf.h old mode 100644 new mode 100755 index 7a8629e75b87aec889a1cce98b6392dddad32ce0..17f0c3d2ace16d50c223692b91f5dd30b3764cd0 --- a/core/predictor/framework/bsf.h +++ b/core/predictor/framework/bsf.h @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #ifdef BCLOUD @@ -46,7 +48,8 @@ static const size_t DEFAULT_BATCH_SIZE = 100; // `rem` don`t need to be atomic, cause the operation `put` is synchronous. // actually, the reason is that lock have been added outside the operation // `put`. - +template +class BatchTasks; // size_t `index` records how many batch have been processing completed. // `index` need to be atomic, cause the operation 'notify' is asynchronous. template @@ -56,7 +59,7 @@ struct Task { typedef InItemT InType; typedef OutItemT OutType; typedef Task TaskT; - typedef std::vector ShapeVector; + typedef std::vector ShapeVector; typedef std::vector VectorOfShapeVector; int read_fd; @@ -65,7 +68,17 @@ struct Task { const InVectorT* inVectorT_ptr; OutVectorT* outVectorT_ptr; size_t rem; + size_t total_feed_batch; + std::set set_feed_lod_index; + std::set set_feed_nobatch_index; + std::vector vector_fetch_lod_index; + std::set set_fetch_nobatch_index; butil::atomic index; + size_t taskmeta_num; + THREAD_MUTEX_T task_mut; + bool fetch_init; + // taskmeta_num * set_feed_lod_index.size() + std::vector outLodTensorVector; Task() { read_fd = -1; @@ -73,11 +86,57 @@ struct Task { owner_tid = -1; inVectorT_ptr = NULL; outVectorT_ptr = NULL; + set_feed_lod_index.clear(); + set_feed_nobatch_index.clear(); + vector_fetch_lod_index.clear(); + set_fetch_nobatch_index.clear(); rem = -1; + total_feed_batch = 0; + taskmeta_num = 0; index.store(0, butil::memory_order_relaxed); + THREAD_MUTEX_INIT(&task_mut, NULL); + fetch_init = false; + outLodTensorVector.clear(); + } + ~Task() { + read_fd = -1; + write_fd = -1; + owner_tid = -1; + inVectorT_ptr = NULL; + outVectorT_ptr = NULL; + set_feed_lod_index.clear(); + set_feed_nobatch_index.clear(); + vector_fetch_lod_index.clear(); + set_fetch_nobatch_index.clear(); + rem = -1; + total_feed_batch = 0; + taskmeta_num = 0; + index.store(0, butil::memory_order_relaxed); + THREAD_MUTEX_DESTROY(&task_mut); + fetch_init = false; + outLodTensorVector.clear(); } - bool check_feedvar_valid(int feedvar_index) { + void clear(){ + read_fd = -1; + write_fd = -1; + owner_tid = -1; + inVectorT_ptr = NULL; + outVectorT_ptr = NULL; + set_feed_lod_index.clear(); + set_feed_nobatch_index.clear(); + vector_fetch_lod_index.clear(); + set_fetch_nobatch_index.clear(); + rem = -1; + total_feed_batch = 0; + taskmeta_num = 0; + index.store(0, butil::memory_order_relaxed); + THREAD_MUTEX_INIT(&task_mut, NULL); + fetch_init = false; + outLodTensorVector.clear(); + } + + bool check_feedvar_valid(size_t feedvar_index) { if (feedvar_index < 0 || inVectorT_ptr->size() <= feedvar_index) { LOG(ERROR) << "feedvar doesnt exsit or feedvar_index error"; return 0; @@ -91,20 +150,47 @@ struct Task { return 1; } - // Now, it simply assume that the first dimension of data is batch. 
- // so the batch is PaddleTensor.shape[0] + bool combine_task_valid(Task* other_task) { + // TODO(HexToString): auto-padding + // 除最外层的shape外,内层shape应一致才能合并。 + // 否则跳出循环,放入下一个batchTask中。 + // 以此保证batch.append_task(task)中的task的内层shape相同。 + if (other_task->feedvar_shape_nobatch() != feedvar_shape_nobatch()) { + return false; + } + + // 对于Shape[0] = 1 而!=batch的情况,因为合并时,取其中一个的值 + // 所以要求该feedvar必须相等,才能合并。 + // 目前没有PaddleTensor和PaddleBuff没有重载==,所以只能比较内存. + for (size_t feedvar_index = 0; + feedvar_index < set_feed_nobatch_index.size(); + ++feedvar_index) { + int result = + std::memcmp((*inVectorT_ptr)[feedvar_index].data.data(), + (*(other_task->inVectorT_ptr))[feedvar_index].data.data(), + (*inVectorT_ptr)[feedvar_index].data.length()); + if (result != 0) return false; + } + return true; + } - // If batch information is added into feedvar.prototxt. - // we can get the information from the feedvar.prototxt instead of assume. - size_t feedvar_batch_size(int feedvar_index) { + size_t feedvar_batch_size(size_t feedvar_index) { if (!check_feedvar_valid(feedvar_index)) { return 0; } - + // if lod, 'lod[0].size()-1' is batch. + // for PaddleTensor lod is vector>, so lod[0] is real lod. + // for example, lod = [0,3,4,6], shape = [6,340,340], batch is 3 actually. + // for lod, the batch < shape[0]. + if ((*inVectorT_ptr)[feedvar_index].lod.size() > 0 && + (*inVectorT_ptr)[feedvar_index].lod[0].size() > 0) { + return (*inVectorT_ptr)[feedvar_index].lod[0].size() - 1; + } + // if not lod, the first dimension of data `PaddleTensor.shape[0]` is batch. return (*inVectorT_ptr)[feedvar_index].shape[0]; } - size_t feedvar_element_bytesize(int feedvar_index) { + size_t feedvar_element_bytesize(size_t feedvar_index) { if (!check_feedvar_valid(feedvar_index)) { return 0; } @@ -126,7 +212,7 @@ struct Task { // Now, the implementation of this function is based on assumption // that shape [0] = batch_size. - size_t feedvar_element_num(int feedvar_index) { + size_t feedvar_element_num(size_t feedvar_index) { if (!check_feedvar_valid(feedvar_index)) { return 0; } @@ -138,18 +224,18 @@ struct Task { return 1; } // start from shape[1], cause shape[0] = batch_size. - for (int i = 1; i < (*inVectorT_ptr)[feedvar_index].shape.size(); ++i) { + for (size_t i = 1; i < (*inVectorT_ptr)[feedvar_index].shape.size(); ++i) { element_num *= (*inVectorT_ptr)[feedvar_index].shape[i]; } return element_num; } - size_t feedvar_bytesize(int feedvar_index) { + size_t feedvar_bytesize(size_t feedvar_index) { return feedvar_element_num(feedvar_index) * feedvar_element_bytesize(feedvar_index); } - ShapeVector feedvar_shape_nobatch(int feedvar_index) { + ShapeVector feedvar_shape_nobatch(size_t feedvar_index) { if (!check_feedvar_valid(feedvar_index)) { return ShapeVector(); } @@ -158,40 +244,167 @@ struct Task { } VectorOfShapeVector feedvar_shape_nobatch() { - VectorOfShapeVector vector_of_feedvar_shape_nobatch(inVectorT_ptr->size()); - for (int index = 0; index < inVectorT_ptr->size(); ++index) { - vector_of_feedvar_shape_nobatch.push_back(feedvar_shape_nobatch(index)); + VectorOfShapeVector vector_of_feedvar_shape_nobatch; + for (size_t feedvar_index = 0; feedvar_index < inVectorT_ptr->size(); + ++feedvar_index) { + vector_of_feedvar_shape_nobatch.push_back( + feedvar_shape_nobatch(feedvar_index)); } return vector_of_feedvar_shape_nobatch; } - // At present, it is considered that the batch of all feedvar is consistent. - // so for each feedvar, PaddleTensor.shape[0] should be the same. 
- bool check_batch_align() { - int batch_size_align = feedvar_batch_size(0); - for (int feedvar_index = 0; feedvar_index < inVectorT_ptr->size(); + // For each feedvar, batch should be 1 or batch_size. + // if feedvar-1: batch_size = 1 (always not batch). + // feedvar-2: batch_size = n, batch = n. + // this function is not thread safe. only called when task is creating. + bool task_init() { + total_feed_batch = feedvar_batch_size(0); + // which means error. + if (total_feed_batch <= 0) return false; + + for (size_t feedvar_index = 0; feedvar_index < inVectorT_ptr->size(); ++feedvar_index) { - if (feedvar_batch_size(feedvar_index) != batch_size_align) { - return 0; + // TODO(HexToString): Distinguish between nobatch and batch = + // 1(By:HexToString) + // 当数据中feedvar-1: 带batch,且batch =1,shape[0] = 1 + // feedvar-2:不带batch,由于不带batch导致shape[0] =1 + // 此时,无法分辨是否是天然nobatch,此时set_feed_nobatch_index会漏掉 + // 后续希望在其他地方能够区分两者。 + if (feedvar_batch_size(feedvar_index) != total_feed_batch) { + // which means error. + if (feedvar_batch_size(feedvar_index) != 1 && total_feed_batch != 1) { + return false; + } else { + // which means feedvar shape[0] = 1. + // shape[0] does not change with batch + set_feed_nobatch_index.insert(feedvar_index); + total_feed_batch = + std::max(feedvar_batch_size(feedvar_index), total_feed_batch); + } + } + // 将lod feedvar index加入到vector中。 + if ((*inVectorT_ptr)[feedvar_index].lod.size() > 0 && + (*inVectorT_ptr)[feedvar_index].lod[0].size() > 0) { + set_feed_lod_index.insert(feedvar_index); } } - /* - for(int fetchvar_index = 0; fetchvar_index < outVectorT_ptr->size(); - ++fetchvar_index) { - if(fetchvar_batch_size(fetchvar_index) != batch_size_align) { - return 0; + return true; + } + + size_t batch_size() { return total_feed_batch; } + + // start_batch range is 0~batch_size, end_batch range is 1~batch_size + // start_batch should not be included, end_batch > start_batch + // return is (start_batch, end_batch] = [start_batch+1,end_batch] + // for not lod, shape0_index = [(start_batch+1)-1,end_batch-1] = + // [start_batch,end_batch-1] = [start_batch,end_batch) + // for lod, shape0_index = [lod[start_batch],lod[end_batch]-1] = + // [lod[start_batch],lod[end_batch]) + // for nobatch, shape0_index = [0,1) + // 对于调用者,拿到shape0_index后,for(size_t myindex =shape0_index[0]; + // myindex > get_feature_by_batch(size_t feedvar_index, + size_t start_batch, + size_t end_batch) { + std::vector> feature_vector; + // feature_vector是双层vector,这么设计是由于一个遍历即可处理所有的特征。 + // feature_vector[0]是由shape0_index的范围值组成的vector,包含两个元素最小和最大值。 + // feature_vector[1]是由lod组成的vector,包含指定batch的lod信息. 
+ // feature_vector[2]是由单个元素的组成的vector,元素值为1表示是nobatch的feedvar。 + + // if 为 nobatch feedvar情况。 + // else if 为带lod的feedvar情况。 + // else为不带lod 普通feedvar情况。 + if (set_feed_nobatch_index.size() > 0 && + set_feed_nobatch_index.find(feedvar_index) != + set_feed_nobatch_index.end()) { + feature_vector = {{0, 1}, {}, {1}}; + } else if (set_feed_lod_index.size() > 0 && + set_feed_lod_index.find(feedvar_index) != + set_feed_lod_index.end()) { + std::vector feed_lod_vector(end_batch - start_batch); + for (size_t lod_index = start_batch + 1, vector_index = 0; + lod_index < end_batch + 1; + ++lod_index, ++vector_index) { + feed_lod_vector[vector_index] = + (*inVectorT_ptr)[feedvar_index].lod[0][lod_index] - + (*inVectorT_ptr)[feedvar_index].lod[0][start_batch]; } + size_t shape0_start = (*inVectorT_ptr)[feedvar_index].lod[0][start_batch]; + size_t shape0_end = (*inVectorT_ptr)[feedvar_index].lod[0][end_batch]; + feature_vector = {{shape0_start, shape0_end}, feed_lod_vector}; + // feature_vector.push_back(feed_lod_vector); + } else { + feature_vector = {{start_batch, end_batch}}; } - */ - return 1; + return feature_vector; } - size_t batch_size() { - if (check_batch_align()) { - return feedvar_batch_size(0); + bool combine_taskmeta() { + // 只有含有lod类型的fetch输出,且task被拆分为多个taskmeta的情况 + // 才需要将数据从outLodTensorVector搬运到outVectorT_ptr + if (vector_fetch_lod_index.size() > 0 && taskmeta_num > 1) { + for (size_t index = 0; index < vector_fetch_lod_index.size(); ++index) { + size_t data_length = 0; + size_t lod_length = 0; + size_t total_shape0 = 0; + size_t feedvar_index = vector_fetch_lod_index[index]; + // 由于PaddleTensor的resize实现,是每次都会清空,所以必须先统计总长度。 + for (size_t taskmeta_index = 0; taskmeta_index < taskmeta_num; + ++taskmeta_index) { + data_length += + outLodTensorVector[taskmeta_index][index].data.length(); + lod_length += outLodTensorVector[taskmeta_index][index].lod[0].size(); + total_shape0 += outLodTensorVector[taskmeta_index][index].shape[0]; + } + // 一次性扩容PaddleTensor中的data和lod + paddle::PaddleTensor& fetchVarTensor = (*outVectorT_ptr)[feedvar_index]; + fetchVarTensor.data.Resize(data_length); + // task中的lod补0 + if (fetchVarTensor.lod.size() <= 0) { + fetchVarTensor.lod.push_back({0}); + } else if (fetchVarTensor.lod[0].size() <= 0) { + fetchVarTensor.lod[0].push_back(0); + } + fetchVarTensor.lod[0].resize(lod_length + 1, 0); + + // + size_t data_length_offset = 0; + size_t lod_length_offset = 0; + size_t once_data_length = 0; + size_t once_lod_length = 0; + size_t last_lod_value = fetchVarTensor.lod[0][lod_length_offset]; + for (size_t taskmeta_index = 0; taskmeta_index < taskmeta_num; + ++taskmeta_index) { + void* dst_ptr = fetchVarTensor.data.data() + data_length_offset; + void* source_ptr = + outLodTensorVector[taskmeta_index][index].data.data(); + once_data_length = + outLodTensorVector[taskmeta_index][index].data.length(); + memcpy(dst_ptr, source_ptr, once_data_length); + once_lod_length = + outLodTensorVector[taskmeta_index][index].lod[0].size(); + for (size_t once_index = 0; once_index < once_lod_length; + ++once_index) { + fetchVarTensor.lod[0][lod_length_offset + 1] = + last_lod_value + + outLodTensorVector[taskmeta_index][index].lod[0][once_index]; + } + data_length_offset += once_data_length; + lod_length_offset += once_lod_length; + } + } } - return 0; + return true; } + + bool task_fetch_init(BatchTasks& batchTask); + bool task_fetch_create(BatchTasks& batchTask); }; // `Several Task` or `part of batch in Task` can be a TaskMeta. 
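// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original patch) of the two rules the
// Task struct above relies on. `Range` and the function names are
// hypothetical; paddle::PaddleTensor is the real type in the code.
//  1. batch size: for a lod tensor the batch is lod[0].size() - 1, otherwise
//     it is shape[0]  (e.g. lod = [0,3,4,6], shape = [6,...] -> batch 3).
//  2. row range : the batch slice (start, end] maps to rows
//     [lod[start], lod[end]) for a lod tensor, [start, end) for a plain
//     batched tensor, and [0, 1) for a nobatch tensor, which is what
//     get_feature_by_batch() encodes in feature_vector.
#include <cstddef>
#include <vector>

struct Range { size_t begin; size_t end; };  // [begin, end) over shape[0]

size_t batch_of(const std::vector<size_t>& lod0, size_t shape0) {
  return lod0.empty() ? shape0 : lod0.size() - 1;
}

Range rows_for_batch_slice(const std::vector<size_t>& lod0,
                           size_t start_batch, size_t end_batch,
                           bool nobatch) {
  if (nobatch) return {0, 1};
  if (!lod0.empty()) return {lod0[start_batch], lod0[end_batch]};
  return {start_batch, end_batch};
}

// Example: lod0 = {0,3,4,6}; batch_of(lod0, 6) == 3, and the batch slice
// (1, 3] covers rows [3, 6), i.e. the last two samples.
// ---------------------------------------------------------------------------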
@@ -206,61 +419,164 @@ struct Task { // TaskMeta is necessary. // cause we need know the the corresponding relationship between -// `batch_out`(which is in BatchTasks) and `outVectorT_ptr`(which is in Task). +// `_batch_out`(which is in BatchTasks) and `outVectorT_ptr`(which is in Task). // especially when 1 Task be divided into several TaskMeta and be put into // several different BatchTasks. + +// begin、add、end means batch, not shape[0]. +// if not lod, batch == shape[0]. if lod, batch != shape[0] +// for example, lod = [0,3,4,6], shape = [6,340,340] +// there is 3 batch actually, add = 3, but shape[0] = 6. template struct TaskMeta { - TaskMeta(TaskT* ptr, size_t start, size_t add) - : task(ptr), begin(start), end(start + add) {} + TaskMeta(TaskT* ptr, size_t start, size_t add, size_t taskmeta_index) + : task(ptr), + begin(start), + end(start + add), + taskmeta_index(taskmeta_index) { + feedvar_num = ptr->inVectorT_ptr->size(); + for (size_t feedvar_index = 0; feedvar_index < feedvar_num; + ++feedvar_index) { + std::vector> feature = + ptr->get_feature_by_batch(feedvar_index, start, start + add); + feed_shape0_range.push_back(feature[0]); + feedvar_type.push_back(feature.size()); + if (feature.size() == 1) { + feed_lod_vector.push_back({}); + } else if (feature.size() == 2) { + feed_lod_vector.push_back(feature[1]); + } else { + feed_lod_vector.push_back({}); + } + } + } TaskT* task; size_t begin; size_t end; + size_t feedvar_num; + size_t taskmeta_index; + std::vector> feed_shape0_range; + std::vector> feed_lod_vector; + std::vector feedvar_type; }; // each TaskT is already include batch in itself // BatchTasks need to combine several `small TaskMeta` into a new `big TaskT`. // The only difference between the `big TaskT` and `small TaskT` is that -// the TaskT.inVectorT_ptr->[feedvar_index].shape[0] -// which is actually batch_size is different. +// the TaskT.inVectorT_ptr->[feedvar_index].shape[0] is different +// `big TaskT`.inVectorT_ptr->[feedvar_index].shape[0] is actually batch_size . template class BatchTasks { public: typedef typename TaskT::InType InType; typedef typename TaskT::OutType OutType; typedef TaskMeta TaskMetaT; + typedef std::vector ShapeVector; + typedef std::vector VectorOfShapeVector; + typedef std::vector LodVector; + typedef std::vector PaddleTensorLod; + friend TaskT; - explicit BatchTasks(size_t batch_size, bool batch_align = true) + explicit BatchTasks(size_t batch_size, + bool overrun = false, + bool allow_split_request = true) : _batch_size(batch_size), _rem_size(batch_size), - _batch_align(batch_align) { + _overrun(overrun), + _allow_split_request(allow_split_request) { _batch_in.clear(); _batch_in_offset.clear(); + _total_shape0_batch_in.clear(); + _total_feed_batch = 0; + _batch_in_lod.clear(); + _batch_out.clear(); _batch_out_offset.clear(); + _total_fetch_batch = 0; _taskmeta_vector.clear(); + set_fetch_nobatch_index.clear(); + vector_fetch_lod_index.clear(); } ~BatchTasks() { _batch_in.clear(); _batch_in_offset.clear(); + _total_shape0_batch_in.clear(); + _total_feed_batch = 0; + _batch_in_lod.clear(); + _batch_out.clear(); _batch_out_offset.clear(); + _total_fetch_batch = 0; _taskmeta_vector.clear(); + set_fetch_nobatch_index.clear(); + vector_fetch_lod_index.clear(); } // synchronized operation // because Upper level callers of this function have already locked. 
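// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original patch): how many samples of a
// task append_task() below takes into this BatchTasks. `append_amount` is a
// hypothetical name; `task_rem` is the task's unscheduled remainder,
// `rem_size` the batch's remaining capacity, and `overrun` allows taking the
// whole remainder even if it exceeds that capacity.
#include <algorithm>
#include <cstddef>

size_t append_amount(size_t task_rem, size_t rem_size, bool overrun) {
  // overrun == true: always take the whole task as one TaskMeta, temporarily
  // breaking through the _batch_size / _rem_size limit.
  if (overrun) return task_rem;
  // Otherwise take at most what still fits into this batch.
  return std::min(task_rem, rem_size);
}

// e.g. task_rem = 5, rem_size = 2: append_amount(...) == 2 without overrun,
// so the remaining 3 samples go into the next BatchTasks (a split task).
// ---------------------------------------------------------------------------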
+ // 能进到此函数的task都是同类task,在该函数之前已保证了这点。 size_t append_task(TaskT* task) { size_t add = std::min(task->rem, _rem_size); - if (!_batch_align) { + // when _overrun == true, it means always take a whole task as TaskMeta + // we can temporary breakthrough the limit of BatchTask`s capacity + // BatchTask`s capacity is _batch_size or _rem_size + if (_overrun) { add = task->rem; } int start_index = task->batch_size() - task->rem; - TaskMetaT tm(task, start_index, add); + TaskMetaT tm(task, start_index, add, task->taskmeta_num); + task->taskmeta_num += 1; _taskmeta_vector.push_back(tm); - + if (_batch_in_offset.size() == 0) { + _batch_in_offset.resize(tm.feedvar_num, 0); + } + if (_total_shape0_batch_in.size() == 0) { + _total_shape0_batch_in.resize(tm.feedvar_num, 0); + } + if (_batch_in_lod.size() == 0) { + PaddleTensorLod null_lod; + _batch_in_lod.resize(tm.feedvar_num, null_lod); + } + _total_feed_batch += add; + for (size_t feedvar_index = 0; feedvar_index < tm.feedvar_num; + ++feedvar_index) { + if (tm.feedvar_type[feedvar_index] == 1) { + // 普通的非lod feedvar + // 累计计算shape0的累加值,为后面初始化PaddleTensor做准备。 + _total_shape0_batch_in[feedvar_index] += + tm.feed_shape0_range[feedvar_index][1] - + tm.feed_shape0_range[feedvar_index][0]; + } else if (tm.feedvar_type[feedvar_index] == 2) { + // lod类型的feedvar + // 累计计算shape0的累加值,为后面初始化PaddleTensor做准备。 + _total_shape0_batch_in[feedvar_index] += + tm.feed_shape0_range[feedvar_index][1] - + tm.feed_shape0_range[feedvar_index][0]; + // 在Lod最前面加0 + if (_batch_in_lod[feedvar_index].size() <= 0) { + _batch_in_lod[feedvar_index].push_back({0}); + } else if (_batch_in_lod[feedvar_index][0].size() <= 0) { + _batch_in_lod[feedvar_index][0].push_back(0); + } + // 将lod加上前一组lod的结尾最大值,组合Lod + size_t last_lod_value = _batch_in_lod[feedvar_index][0].back(); + for (size_t lod_index = 0; + lod_index < tm.feed_lod_vector[feedvar_index].size(); + ++lod_index) { + _batch_in_lod[feedvar_index][0].push_back( + last_lod_value + tm.feed_lod_vector[feedvar_index][lod_index]); + } + } else { + // tm.feedvar_type[feedvar_index] == 3 + // nobatch类型的feedvar. + // 此时不累加,且值应为1 + _total_shape0_batch_in[feedvar_index] = + tm.feed_shape0_range[feedvar_index][1] - + tm.feed_shape0_range[feedvar_index][0]; + } + } task->rem -= add; _rem_size -= add; return _rem_size; @@ -281,72 +597,56 @@ class BatchTasks { // cause maybe next time we don`t need to do the extra copy. // directly copy the every Task into the Predictor. - // lod is not supported. - // if lod is set, we should not allow to use the bsf task. - // batch.merge_tasks() is thread-safe function // cause batch is a local variable and Task is just read, not written. + void merge_tasks() { if (_taskmeta_vector.size() <= 0) { return; } - // Temporarily, the batch of each feedvar is consistent - // If not consistent, use feedvar_batch_size instead of task->batch_size(). 
- int temp_batch = 0; - for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) { - TaskMetaT& tm = _taskmeta_vector[ti]; - temp_batch += tm.task->batch_size(); - } - if (temp_batch > _batch_size) { - LOG(ERROR) << "_realNumber_batch_in >_batch_size, error."; - return; - } - - int feedvar_num = _taskmeta_vector[0].task->inVectorT_ptr->size(); - if (_batch_in_offset.size() == 0) { - _batch_in_offset.resize(feedvar_num, 0); - _realNumber_batch_in.resize(feedvar_num, temp_batch); - } - for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) { TaskMetaT& tm = _taskmeta_vector[ti]; - for (int index = 0; index < feedvar_num; ++index) { + for (size_t feedvar_index = 0; feedvar_index < tm.feedvar_num; + ++feedvar_index) { const paddle::PaddleTensor& feedVarTensor = - (*tm.task->inVectorT_ptr)[index]; - size_t feedvar_bytesize = tm.task->feedvar_bytesize(index); + (*tm.task->inVectorT_ptr)[feedvar_index]; + size_t feedvar_bytesize = tm.task->feedvar_bytesize(feedvar_index); if (ti == 0) { - if (feedVarTensor.lod.size() > 0 && feedVarTensor.lod[0].size() > 0) { - LOG(ERROR) << "lod Tensor is not supported now."; - return; - } + // Create the entire tensor at once // for now, we assume that every task feedvar_bytesize is the same. // which means we dont support auto embedding. // but for different feedvar, it is different. paddle::PaddleTensor paddleTensor; paddleTensor.dtype = feedVarTensor.dtype; paddleTensor.name = feedVarTensor.name; - paddleTensor.lod = feedVarTensor.lod; + paddleTensor.lod = _batch_in_lod[feedvar_index]; paddleTensor.shape = feedVarTensor.shape; - paddleTensor.shape[0] = _realNumber_batch_in[index]; + paddleTensor.shape[0] = _total_shape0_batch_in[feedvar_index]; paddleTensor.data.Resize(feedvar_bytesize * - _realNumber_batch_in[index]); + _total_shape0_batch_in[feedvar_index]); _batch_in.push_back(paddleTensor); } - void* dst_ptr = _batch_in[index].data.data() + _batch_in_offset[index]; + void* dst_ptr = _batch_in[feedvar_index].data.data() + + _batch_in_offset[feedvar_index]; void* source_ptr = - feedVarTensor.data.data() + feedvar_bytesize * tm.begin; - size_t length = feedvar_bytesize * (tm.end - tm.begin); + feedVarTensor.data.data() + + feedvar_bytesize * tm.feed_shape0_range[feedvar_index][0]; + size_t length = + feedvar_bytesize * (tm.feed_shape0_range[feedvar_index][1] - + tm.feed_shape0_range[feedvar_index][0]); memcpy(dst_ptr, source_ptr, length); - _batch_in_offset[index] += length; + // nobatch类型的feedvar,不叠加. + if (tm.feedvar_type[feedvar_index] != 3) + _batch_in_offset[feedvar_index] += length; } } } - bool check_fetchvar_valid(int fetchvar_index) { + bool check_fetchvar_valid(size_t fetchvar_index) { if (fetchvar_index < 0 || _batch_out.size() <= fetchvar_index) { LOG(ERROR) << "fetchvar doesnt exsit or fetchvar_index error"; return 0; @@ -360,19 +660,11 @@ class BatchTasks { return 1; } - size_t fetchvar_batch_size(int fetchvar_index) { - if (!check_fetchvar_valid(fetchvar_index)) { - return 0; - } - - return _batch_out[fetchvar_index].shape[0]; - } - - size_t fetchvar_element_bytesize(int fetchvar_index) { + size_t fetchvar_element_bytesize(size_t fetchvar_index) { if (!check_fetchvar_valid(fetchvar_index)) { return 0; } - int dtype = _batch_out[fetchvar_index].dtype; + size_t dtype = _batch_out[fetchvar_index].dtype; if (dtype == paddle::PaddleDType::INT64) { return sizeof(int64_t); } @@ -390,7 +682,7 @@ class BatchTasks { // Now, the implementation of this function is based on assumption // that shape [0] = batch_size. 
- size_t fetchvar_element_num(int fetchvar_index) { + size_t fetchvar_element_num(size_t fetchvar_index) { if (!check_fetchvar_valid(fetchvar_index)) { return 0; } @@ -400,35 +692,66 @@ class BatchTasks { return 1; } // start from shape[1], cause shape[0] = batch_size. - for (int i = 1; i < _batch_out[fetchvar_index].shape.size(); ++i) { + for (size_t i = 1; i < _batch_out[fetchvar_index].shape.size(); ++i) { element_num *= _batch_out[fetchvar_index].shape[i]; } return element_num; } - size_t fetchvar_bytesize(int fetchvar_index) { + size_t fetchvar_bytesize(size_t fetchvar_index) { return fetchvar_element_num(fetchvar_index) * fetchvar_element_bytesize(fetchvar_index); } - bool check_fetchvar_batch_align() { - int batch_size_align = fetchvar_batch_size(0); - - for (int fetchvar_index = 0; fetchvar_index < _batch_out.size(); - ++fetchvar_index) { - if (fetchvar_batch_size(fetchvar_index) != batch_size_align) { - return 0; - } + size_t fetchvar_batch_size(size_t fetchvar_index) { + if (!check_fetchvar_valid(fetchvar_index)) { + return 0; } - - return 1; + // if lod, 'lod[0].size()-1' is batch. + // for PaddleTensor lod is vector>, so lod[0] is real lod. + // for example, lod = [0,3,4,6], shape = [6,340,340], batch is 3 actually. + // for lod, the batch < shape[0]. + if (_batch_out[fetchvar_index].lod.size() > 0 && + _batch_out[fetchvar_index].lod[0].size() > 0) { + return _batch_out[fetchvar_index].lod[0].size() - 1; + } + // if not lod, the first dimension of data `PaddleTensor.shape[0]` is batch. + return _batch_out[fetchvar_index].shape[0]; } - size_t fetchvar_batch_size() { - if (check_fetchvar_batch_align()) { - return fetchvar_batch_size(0); + size_t fetchvar_batch_size() { return _total_fetch_batch; } + + bool deal_batch_out() { + _total_fetch_batch = fetchvar_batch_size(0); + if (_total_fetch_batch <= 0) return false; + for (size_t fetchvar_index = 0; fetchvar_index < _batch_out.size(); + ++fetchvar_index) { + // TODO(HexToString): Distinguish between nobatch and batch = + // 1(By:HexToString) + // 当数据中fetchvar-1: 带batch,且batch =1,shape[0] = 1 + // fetchvar-2:不带batch,由于不带batch导致shape[0] =1 + // 此时,无法分辨是否是天然nobatch,此时set_fetch_nobatch_index会漏掉 + // 后续希望在其他地方能够区分两者。 + if (fetchvar_batch_size(fetchvar_index) != _total_fetch_batch) { + // which means error. + if (fetchvar_batch_size(fetchvar_index) != 1 && + _total_fetch_batch != 1) { + return false; + } else { + // which means fetchvar shape[0] = 1. 
+ // shape[0] does not change with batch + set_fetch_nobatch_index.insert(fetchvar_index); + _total_fetch_batch = + std::max(fetchvar_batch_size(fetchvar_index), _total_fetch_batch); + } + } + // 将lod fetchvar index加入到vector中。 + if (_batch_out[fetchvar_index].lod.size() > 0 && + _batch_out[fetchvar_index].lod[0].size() > 0) { + vector_fetch_lod_index.push_back(fetchvar_index); + } } - return 0; + return true; } void notify_tasks() { @@ -436,12 +759,16 @@ class BatchTasks { LOG(ERROR) << "_taskmeta_vector.size() <=0, error."; return; } - if (_realNumber_batch_in[0] != fetchvar_batch_size()) { + // 根据_batch_out,求出输出的整体batch + // 并将lod类型和nobatch类型的fetchvar的index记录到set中,方便后续查看。 + deal_batch_out(); + // 若输出的batch不是1,且不与输入batch对应,则错误 + if (_total_feed_batch != _total_fetch_batch && _total_fetch_batch != 1) { LOG(ERROR) << "_batch_out`s batch != _batch_in`s batch, error."; return; } - int fetchvar_num = _batch_out.size(); + size_t fetchvar_num = _batch_out.size(); if (_batch_out_offset.size() == 0) { _batch_out_offset.resize(fetchvar_num, 0); } @@ -451,44 +778,132 @@ class BatchTasks { size_t begin = _taskmeta_vector[ti].begin; size_t end = _taskmeta_vector[ti].end; size_t add = end - begin; - - for (int index = 0; index < fetchvar_num; ++index) { - // the task->outVectorT_ptr is null before core->run(). - // first time we should copy from _batch_out - // so we need init. - size_t fetchvar_bytesize_index = fetchvar_bytesize(index); - if (task->outVectorT_ptr->size() <= index) { - paddle::PaddleTensor tensor_out; - tensor_out.name = _batch_out[index].name; - tensor_out.dtype = paddle::PaddleDType(_batch_out[index].dtype); - tensor_out.shape = _batch_out[index].shape; - tensor_out.shape[0] = task->batch_size(); - tensor_out.lod = _batch_out[index].lod; - // resize all batch memory at one time - size_t databuf_size = task->batch_size() * fetchvar_bytesize_index; - tensor_out.data.Resize(databuf_size); - task->outVectorT_ptr->push_back(tensor_out); - } - - paddle::PaddleTensor& fetchVarTensor = (*task->outVectorT_ptr)[index]; - - void* dst_ptr = - fetchVarTensor.data.data() + fetchvar_bytesize_index * begin; - size_t length = fetchvar_bytesize_index * add; - if (_batch_out_offset[index] + length > - fetchvar_batch_size() * fetchvar_bytesize(index)) { - LOG(ERROR) << "_batch_out is less than taskmeta, error."; - return; + size_t taskmeta_index = _taskmeta_vector[ti].taskmeta_index; + // 对task中的outVectorT_ptr进行初始化 + // 如果是lod输出+多个taskmeta,此时对outLodTensorVector也需要初始化 + if (!task->task_fetch_init(*this)) { + LOG(ERROR) << " task_fetch_init error."; + return; + } + size_t fetch_lod_index = 0; + + for (size_t fetchvar_index = 0; fetchvar_index < fetchvar_num; + ++fetchvar_index) { + size_t fetchvar_bytesize_index = fetchvar_bytesize(fetchvar_index); + + if (set_fetch_nobatch_index.size() > 0 && + set_fetch_nobatch_index.find(fetchvar_index) != + set_fetch_nobatch_index.end()) { + // nobatch fetchvar情况 + // 无论输入是多少batch,该index的fetchvar始终就shape[0] = 1 + paddle::PaddleTensor& fetchVarTensor = + (*task->outVectorT_ptr)[fetchvar_index]; + void* dst_ptr = fetchVarTensor.data.data(); + size_t length = fetchvar_bytesize_index * 1; + void* source_ptr = _batch_out[fetchvar_index].data.data(); + memcpy(dst_ptr, source_ptr, length); + } else if (vector_fetch_lod_index.size() > 0 && + std::find(vector_fetch_lod_index.begin(), + vector_fetch_lod_index.end(), + fetchvar_index) != vector_fetch_lod_index.end()) { + // lod fetchvar情况,此时无法确定总的shape[0] + // 根据task中的task_num总数开辟task_num个临时空间 + // 每个lod型的fetchvar拷贝到对应的临时空间中 + 
// 最后再计算临时空间的总量,合并fetchvar和lod + size_t last_batch = _batch_out_offset[fetchvar_index]; + size_t shape0_index_start = + _batch_out[fetchvar_index].lod[0][last_batch]; + size_t shape0_index_end = + _batch_out[fetchvar_index].lod[0][last_batch + add]; + size_t shape0_length = shape0_index_end - shape0_index_start; + // task被拆分为多个taskmeta时,不能直接拷入task->outVectorT_ptr + // 此时,先拷入task->outLodTensorVector[taskmeta_index] + // 当task所有的taskmeta都完成时,再按照顺序进行拷贝回task->outVectorT_ptr。 + if (task->taskmeta_num > 1) { + paddle::PaddleTensor& fetchVarTensor = + task->outLodTensorVector[taskmeta_index][fetch_lod_index]; + size_t length = fetchvar_bytesize_index * shape0_length; + fetchVarTensor.shape[0] = shape0_length; + fetchVarTensor.data.Resize(length); + void* dst_ptr = fetchVarTensor.data.data(); + void* source_ptr = _batch_out[fetchvar_index].data.data() + + shape0_index_start * fetchvar_bytesize_index; + memcpy(dst_ptr, source_ptr, length); + // 由于是拆分的各个lod,不要补0,在最后合并给Task中的outVectorT_ptr时再补。 + if (fetchVarTensor.lod.size() <= 0) { + fetchVarTensor.lod.push_back({}); + } + fetchVarTensor.lod[0].resize(add, 0); + size_t last_lod_value = + _batch_out[fetchvar_index].lod[0][last_batch]; + for (size_t lod_index = last_batch + 1, my_index = 0; + lod_index < last_batch + add + 1; + ++lod_index, ++my_index) { + fetchVarTensor.lod[0][my_index] = + (_batch_out[fetchvar_index].lod[0][lod_index] - + last_lod_value); + } + } else { + // task未被拆分为多个taskmeta,故只有某个线程中的taskmeta会操作task不存在多线程竞争 + // 此时resize后,直接写入task->outVectorT_ptr中即可。 + paddle::PaddleTensor& fetchVarTensor = + (*task->outVectorT_ptr)[fetchvar_index]; + size_t length = fetchvar_bytesize_index * shape0_length; + fetchVarTensor.shape[0] = shape0_length; + fetchVarTensor.data.Resize(length); + void* dst_ptr = fetchVarTensor.data.data(); + void* source_ptr = _batch_out[fetchvar_index].data.data() + + shape0_index_start * fetchvar_bytesize_index; + memcpy(dst_ptr, source_ptr, length); + + // task中的lod补0 + if (fetchVarTensor.lod.size() <= 0) { + fetchVarTensor.lod.push_back({0}); + } else if (fetchVarTensor.lod[0].size() <= 0) { + fetchVarTensor.lod[0].push_back(0); + } + // 将合并的lod信息对应的batch,拆分到task中。 + // 注意,此时需要去掉前面lod导致的前置积累。 + // 例如: 合lod = [0,2,5;7,10],是由两组batch=2的task合并后预测的。 + // 此时拆分,第一组时,都减去0,得到[2,5]+(由于前面已经补了0了) = + // [0,2,5] + // 第二组,都需要减5,得到[2,5],这样处理才对。 + fetchVarTensor.lod[0].resize(add + 1, 0); + size_t last_lod_value = + _batch_out[fetchvar_index].lod[0][last_batch]; + for (size_t lod_index = last_batch + 1, my_index = 1; + lod_index < last_batch + add + 1; + ++lod_index, ++my_index) { + fetchVarTensor.lod[0][my_index] = + (_batch_out[fetchvar_index].lod[0][lod_index] - + last_lod_value); + } + } + fetch_lod_index++; + } else { + // 普通fetchvar情况,此时该Task总的fetchvar_batch = + // 输入的总的batch_size() + // 输出的batch应与输入的batch对应相等。 + paddle::PaddleTensor& fetchVarTensor = + (*task->outVectorT_ptr)[fetchvar_index]; + void* dst_ptr = + fetchVarTensor.data.data() + fetchvar_bytesize_index * begin; + size_t length = fetchvar_bytesize_index * add; + void* source_ptr = + _batch_out[fetchvar_index].data.data() + + _batch_out_offset[fetchvar_index] * fetchvar_bytesize_index; + + memcpy(dst_ptr, source_ptr, length); } - void* source_ptr = - _batch_out[index].data.data() + _batch_out_offset[index]; - - memcpy(dst_ptr, source_ptr, length); - _batch_out_offset[index] += length; + _batch_out_offset[fetchvar_index] += add; } + // index是局部变量,fetch_add是原子操作,成功则返回原值。 + // 只有最后一个taskmeta都完成后,该线程的index+add才能>task->batch_size() + // 故只有一个线程能进入if{}内.不会造成多线程竞争的问题。 
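// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original patch) of the completion
// counter used right below: each worker adds the number of samples it just
// finished, and only the worker whose addition reaches the task's batch size
// crosses the threshold, so exactly one thread performs the final merge and
// notification. `CompletionCounter` is a hypothetical name; the real code
// uses task->index (a butil::atomic) and task->batch_size().
#include <atomic>
#include <cstddef>

struct CompletionCounter {
  std::atomic<size_t> finished{0};

  // Returns true only for the caller whose contribution makes the counter
  // reach `total`; all other callers see a value still below the threshold.
  bool finish(size_t just_done, size_t total) {
    size_t before = finished.fetch_add(just_done);
    return before + just_done >= total;
  }
};

// Usage: if (counter.finish(add, task_batch_size)) { /* combine & notify */ }
// ---------------------------------------------------------------------------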
size_t index = task->index.fetch_add(add); if ((index + add) >= task->batch_size()) { + task->combine_taskmeta(); char c = 0; while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) { } @@ -503,17 +918,32 @@ class BatchTasks { size_t task_size() { return _taskmeta_vector.size(); } + const size_t get_rem_size() { return _rem_size; } + + bool get_overrun() { return _overrun; } + + bool get_allow_split_request() { return _allow_split_request; } + private: std::vector _taskmeta_vector; typename TaskT::InVectorT _batch_in; std::vector _batch_in_offset; - std::vector _realNumber_batch_in; + std::vector _total_shape0_batch_in; + size_t _total_feed_batch; + std::vector _batch_in_lod; + typename TaskT::OutVectorT _batch_out; std::vector _batch_out_offset; - std::vector _realNumber_batch_out; + // std::vector _total_shape0_batch_out; + size_t _total_fetch_batch; + // std::vector _batch_out_lod; + std::set set_fetch_nobatch_index; + std::vector vector_fetch_lod_index; + size_t _rem_size; size_t _batch_size; - bool _batch_align; + bool _overrun; + bool _allow_split_request; }; // BSF task handle @@ -589,6 +1019,8 @@ class TaskExecutor { typedef typename TaskT::OutVectorT OutVectorT; typedef std::vector TaskArrayT; typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper; + typedef std::vector ShapeVector; + typedef std::vector VectorOfShapeVector; TaskExecutor() : _stop(false), @@ -596,7 +1028,7 @@ class TaskExecutor { _thread_reset_fn(NULL), _user_thread_contexts(NULL), _batch_size(DEFAULT_BATCH_SIZE), - _batch_align(false), + _overrun(false), _fn(NULL) { THREAD_MUTEX_INIT(&_mut, NULL); THREAD_COND_INIT(&_cond, NULL); @@ -617,7 +1049,11 @@ class TaskExecutor { void set_batch_size(size_t batch_size) { _batch_size = batch_size; } - void set_batch_align(size_t batch_align) { _batch_align = batch_align; } + void set_overrun(bool overrun) { _overrun = overrun; } + + void set_allow_split_request(bool allow_split_request) { + _allow_split_request = allow_split_request; + } void set_thread_init_fn(boost::function init_fn, void** contexts = NULL) { @@ -642,7 +1078,7 @@ class TaskExecutor { TaskHandler schedule(const void*, void*); - bool move_task_to_batch(BatchTasks& batch); // NOLINT + bool move_task_to_batch(BatchTasks& batchTask); // NOLINT private: TaskExecutor(TaskExecutor const& other) = delete; @@ -669,7 +1105,8 @@ class TaskExecutor { std::vector*> _thread_contexts; size_t _batch_size; - bool _batch_align; + bool _overrun; + bool _allow_split_request; boost::function _fn; }; @@ -687,12 +1124,12 @@ class TaskExecutorVector { void resize(int size) { _vector_executor.resize(size); } - TaskExecutor& operator[](int index) { - if (_vector_executor.size() <= index || index <= -1) { - LOG(ERROR) << "_vector_executor.size() <= index or <= -1"; - throw "_vector_executor.size() <= index or <= -1"; + TaskExecutor& operator[](int task_index) { + if (_vector_executor.size() <= task_index || task_index <= -1) { + LOG(ERROR) << "_vector_executor.size() <= task_index or <= -1"; + throw "_vector_executor.size() <= task_index or <= -1"; } - return _vector_executor[index]; + return _vector_executor[task_index]; } private: @@ -717,8 +1154,8 @@ class TaskManager { typedef typename TaskT::InVectorT InVectorT; typedef typename TaskT::OutVectorT OutVectorT; - explicit TaskManager(uint32_t index) // NOLINT - : _model_index(index) {} + explicit TaskManager(uint32_t model_index) // NOLINT + : _model_index(model_index) {} ~TaskManager() { wait(); } diff --git a/core/predictor/framework/cache.cpp 
b/core/predictor/framework/cache.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8715b85a66eccb71469bca294de8d8488cb59288 --- /dev/null +++ b/core/predictor/framework/cache.cpp @@ -0,0 +1,115 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License + +#include "core/predictor/framework/cache.h" +#include +#include +#include +#include +#include +#include "core/cube/cube-builder/include/cube-builder/seqfile_reader.h" + +namespace baidu { +namespace paddle_serving { +namespace predictor { + +int CubeCache::clear() { + for (auto it = _map_cache.begin(); it != _map_cache.end(); ++it) { + if (it->second) { + delete (it->second); + it->second = nullptr; + } + } + _map_cache.clear(); + + return 0; +} + +rec::mcube::CubeValue* CubeCache::get_data(uint64_t key) { + auto it = _map_cache.find(key); + if (it != _map_cache.end()) { + return it->second; + } + return nullptr; +} + +int CubeCache::reload_data(const std::string& cache_path) { + LOG(INFO) << "cube cache is loading data, path: " << cache_path; + DIR* dp = nullptr; + struct dirent* dirp = nullptr; + struct stat st; + + // clear cache data + clear(); + + // loading data from cache files + if (stat(cache_path.c_str(), &st) < 0 || !S_ISDIR(st.st_mode)) { + LOG(ERROR) << "invalid cache path " << cache_path; + return -1; + } + if ((dp = opendir(cache_path.c_str())) == nullptr) { + LOG(ERROR) << "opendir " << cache_path << " fail."; + return -1; + } + while ((dirp = readdir(dp)) != nullptr) { + // filtering by file type. + if (dirp->d_type != DT_REG) { + continue; + } + // Filter upper-level directories and hidden files + if ((!strncmp(dirp->d_name, ".", 1)) || (!strncmp(dirp->d_name, "..", 2))) { + continue; + } + // Match the file whose name prefix is ​​'part-' + if (std::string(dirp->d_name).find("part-") != std::string::npos) { + SequenceFileRecordReader reader(cache_path + "/" + dirp->d_name); + + if (reader.open() != 0) { + LOG(ERROR) << "open file failed! " << dirp->d_name; + continue; + } + if (reader.read_header() != 0) { + LOG(ERROR) << "read header error! 
" << dirp->d_name; + reader.close(); + continue; + } + + Record record(reader.get_header()); + while (reader.next(&record) == 0) { + uint64_t key = + *reinterpret_cast(const_cast(record.key.data())); + + auto it_find = _map_cache.find(key); + if (it_find != _map_cache.end()) { + // load dumplicate key + LOG(WARNING) << "Load dumplicate key:" << key + << " from file:" << dirp->d_name; + continue; + } + rec::mcube::CubeValue* new_value = new rec::mcube::CubeValue(); + new_value->error = 0; + new_value->buff.swap(record.value); + _map_cache.insert(std::make_pair(key, new_value)); + } + + LOG(WARNING) << "Load cube cache file " << dirp->d_name << " done."; + } + LOG(WARNING) << "Load all cube cache files done"; + } + return 0; +} + +} // namespace predictor +} // namespace paddle_serving +} // namespace baidu diff --git a/core/predictor/framework/cache.h b/core/predictor/framework/cache.h new file mode 100644 index 0000000000000000000000000000000000000000..1acc46e0539a33df530b77673809385711f6ea63 --- /dev/null +++ b/core/predictor/framework/cache.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "core/cube/cube-api/include/cube_api.h" + +namespace baidu { +namespace paddle_serving { +namespace predictor { + +// Large models that use sparse parameters may use cube cache. +// When the cube cache exists, the model is required to be +// consistent with the version of the cube cache. Therefore, +// when the model is updated, the model and the cube cache are +// required to be reloaded at the same time. +// Load all cached data at once without updating, it's lock free +// switching two cube cache. +class CubeCache { + public: + CubeCache() {} + + ~CubeCache() { clear(); } + + // clear cache data. 
+ int clear(); + + // get cache data by key + rec::mcube::CubeValue* get_data(uint64_t key); + + // reload all cache files from cache_path + int reload_data(const std::string& cache_path); + + private: + // switching free lock, key type is uint64_t, value type is CubeValue* + std::unordered_map _map_cache; +}; + +} // namespace predictor +} // namespace paddle_serving +} // namespace baidu diff --git a/core/predictor/framework/dag_view.cpp b/core/predictor/framework/dag_view.cpp index 29a4e97378c20d6f9caae8a97de7dc5f714960e9..64383514f604688a085097a7ec0043c91f25a9b4 100644 --- a/core/predictor/framework/dag_view.cpp +++ b/core/predictor/framework/dag_view.cpp @@ -21,6 +21,15 @@ #include #include "core/predictor/common/inner_common.h" #include "core/predictor/framework/op_repository.h" +#ifdef BCLOUD +#include +#else +#include +#endif +#include + +#include "core/predictor/framework/resource.h" +using baidu::paddle_serving::predictor::Resource; namespace baidu { namespace paddle_serving { @@ -238,6 +247,77 @@ const Channel* DagView::get_response_channel(const uint64_t log_id) const { return last_op->mutable_channel(); } +void* call_back(void* ori_args) { + Resource::instance().thread_initialize(); + Args* args = (Args*)ori_args; + Op* op = static_cast(args->_op); + uint64_t log_id = static_cast(args->_log_id); + bool debug = static_cast(args->_debug); + args->errcode = op->process(log_id, debug); + return nullptr; +} + +int ParallelDagView::execute_one_stage(ViewStage* vstage, + const uint64_t log_id, + butil::IOBufBuilder* debug_os) { + butil::Timer stage_time(butil::Timer::STARTED); + uint32_t node_size = vstage->nodes.size(); + std::vector tids(node_size); + Args* args = new Args[node_size]; + VLOG(2) << "(logid=" << log_id << ") vstage->nodes.size(): " << node_size; + for (uint32_t ni = 0; ni < node_size; ni++) { + ViewNode* vnode = vstage->nodes[ni]; + DagNode* conf = vnode->conf; + Op* op = vnode->op; + TRACEPRINTF( + "(logid=%" PRIu64 ") start to execute op[%s]", log_id, op->name()); + + args[ni]._op = op; + args[ni]._log_id = log_id; + args[ni]._debug = (debug_os != NULL); + int rc = THREAD_CREATE(&tids[ni], NULL, call_back, (void*)(args + ni)); + if (rc != 0) { + LOG(ERROR) << "failed to create ParallelDagView worker thread: index=" + << ni << ", rc=" << rc << ", errno=" << errno << ":" + << strerror(errno); + delete[] args; + return -1; + } + } + for (uint32_t ni = 0; ni < node_size; ni++) { + THREAD_JOIN(tids[ni], NULL); + int errcode = args[ni].errcode; + Op* op = args[ni]._op; + TRACEPRINTF( + "(logid=%" PRIu64 ") finish to execute op[%s]", log_id, op->name()); + if (errcode < 0) { + LOG(ERROR) << "(logid=" << log_id + << ") Execute failed, Op:" << op->debug_string(); + delete[] args; + return errcode; + } + + if (errcode > 0) { + LOG(INFO) << "(logid=" << log_id + << ") Execute ignore, Op:" << op->debug_string(); + continue; + } + + if (debug_os) { + (*debug_os) << "(logid=" << log_id << ") {\"op_name\": \"" << op->name() + << "\", \"debug_str:\": \"" << op->debug_string() + << "\", \"time_info\": \"" << op->time_info() << "\"}"; + } + + // LOG(DEBUG) << "Execute succ, Op:" << op->debug_string(); + } + stage_time.stop(); + PredictorMetric::GetInstance()->update_latency_metric( + STAGE_METRIC_PREFIX + vstage->full_name, stage_time.u_elapsed()); + delete[] args; + return ERR_OK; +} + } // namespace predictor } // namespace paddle_serving } // namespace baidu diff --git a/core/predictor/framework/dag_view.h b/core/predictor/framework/dag_view.h index 
8ba9d224c577b475d0a52b79e92f72bd1abaa187..5b12238396e359069fff2ce288eba3724ca0a764 100644 --- a/core/predictor/framework/dag_view.h +++ b/core/predictor/framework/dag_view.h @@ -24,7 +24,7 @@ namespace baidu { namespace paddle_serving { namespace predictor { -class Op; +// class Op; struct ViewNode { Op* op; // op->full_name == service_workflow_stageindex_opname @@ -75,11 +75,20 @@ class DagView { Bus* _bus; }; +struct Args { + Op* _op; + uint64_t _log_id; + bool _debug; + int errcode; +}; + // The derived DagView supports parallel execution // strategy, by implments the execute_one_stage(). class ParallelDagView : public DagView { public: - int execute_one_stage(ViewStage* vstage, butil::IOBufBuilder*) { return 0; } + virtual int execute_one_stage(ViewStage* vstage, + const uint64_t log_id, + butil::IOBufBuilder* debug_os); }; } // namespace predictor diff --git a/core/predictor/framework/infer.cpp b/core/predictor/framework/infer.cpp index 5149a4852570298d16183709f6c2d457e1cc524f..0290612287de7c5d63626fb28ebc092f03dd4d15 100644 --- a/core/predictor/framework/infer.cpp +++ b/core/predictor/framework/infer.cpp @@ -25,7 +25,8 @@ int ReloadableInferEngine::proc_initialize_impl( _model_dir = conf.model_dir(); _infer_thread_num = conf.runtime_thread_num(); _infer_batch_size = conf.batch_infer_size(); - _infer_batch_align = conf.enable_batch_align(); + _infer_overrun = conf.enable_overrun(); + _allow_split_request = conf.allow_split_request(); _conf = conf; @@ -56,9 +57,6 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, } // init bsf framework - im::bsf::TaskExecutorVector::instance()[_model_index] - .set_thread_init_fn( - boost::bind(&InferEngine::thrd_initialize_impl, this)); im::bsf::TaskExecutorVector::instance()[_model_index] .set_thread_init_fn( boost::bind(&InferEngine::thrd_initialize_impl, this)); @@ -69,8 +67,10 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, boost::bind(&InferEngine::task_infer_impl, this, _1, _2)); im::bsf::TaskExecutorVector::instance()[_model_index].set_batch_size( _infer_batch_size); - im::bsf::TaskExecutorVector::instance()[_model_index].set_batch_align( - _infer_batch_align); + im::bsf::TaskExecutorVector::instance()[_model_index].set_overrun( + _infer_overrun); + im::bsf::TaskExecutorVector::instance()[_model_index] + .set_allow_split_request(_allow_split_request); if (im::bsf::TaskExecutorVector::instance()[_model_index].start( _infer_thread_num) != 0) { LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num; @@ -79,7 +79,8 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, LOG(WARNING) << "Enable batch schedule framework, thread_num:" << _infer_thread_num << ", batch_size:" << _infer_batch_size - << ", enable_batch_align:" << _infer_batch_align; + << ", enable_overrun:" << _infer_overrun + << ", allow_split_request:" << _allow_split_request; return 0; } @@ -348,7 +349,7 @@ T* VersionedInferEngine::get_core() { } template -T* VersionedInferEngine::get_core(uint64_t version) { +T* VersionedInferEngine::get_core(const uint64_t version) { auto iter = _versions.find(version); if (iter == _versions.end()) { LOG(ERROR) << "Not found version engine: " << version; @@ -363,6 +364,15 @@ T* VersionedInferEngine::get_core(uint64_t version) { return NULL; } +CubeCache* VersionedInferEngine::get_cube_cache() { + InferEngine* engine = default_engine(); + if (!engine) { + LOG(WARNING) << "fail to get default engine"; + return nullptr; + } + return 
engine->get_cube_cache(); +} + int VersionedInferEngine::proc_initialize_impl( const configure::EngineDesc& conf, bool) { return -1; @@ -382,6 +392,11 @@ int VersionedInferEngine::task_infer_impl(const void* in, return -1; } +int InferManager::set_taskexecutor_num(size_t total_engine_num) { + im::bsf::TaskExecutorVector::instance().resize(total_engine_num); + return 0; +} + int InferManager::proc_initialize(const char* path, const char* file, std::shared_ptr engine_index_ptr) { @@ -391,8 +406,6 @@ int InferManager::proc_initialize(const char* path, return -1; } uint32_t engine_num = model_toolkit_conf.engines_size(); - im::bsf::TaskExecutorVector::instance().resize(*engine_index_ptr + - engine_num); for (uint32_t ei = 0; ei < engine_num; ++ei) { LOG(INFO) << "model_toolkit_conf.engines(" << ei << ").name: " << model_toolkit_conf.engines(ei).name(); @@ -502,6 +515,15 @@ T* InferManager::get_core(const char* model_name) { return NULL; } +CubeCache* InferManager::get_cube_cache(const char* model_name) { + auto it = _map.find(model_name); + if (it == _map.end()) { + LOG(WARNING) << "Cannot find engine in map, model name:" << model_name; + return nullptr; + } + return it->second->get_cube_cache(); +} + // Versioned inference interface int InferManager::infer(const char* model_name, const void* in, @@ -517,7 +539,7 @@ int InferManager::infer(const char* model_name, } template -T* InferManager::get_core(const char* model_name, uint64_t version) { +T* InferManager::get_core(const char* model_name, const uint64_t version) { auto it = _map.find(model_name); if (it == _map.end()) { LOG(WARNING) << "Cannot find engine in map, model name:" << model_name; diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h index 93be13c684874b8b5a6686f3aeddd2942037d84c..a6815d4939edfb2a0d6dcebaa602b545b770d52f 100644 --- a/core/predictor/framework/infer.h +++ b/core/predictor/framework/infer.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include #include #include #include @@ -25,16 +26,19 @@ #include #include "core/predictor/common/inner_common.h" #include "core/predictor/framework/bsf.h" +#include "core/predictor/framework/cache.h" #include "core/predictor/framework/factory.h" #include "core/predictor/framework/infer_data.h" #include "core/predictor/framework/memory.h" #include "paddle_inference_api.h" // NOLINT +#include "experimental/float16.h" namespace baidu { namespace paddle_serving { namespace predictor { using configure::ModelToolkitConf; +// Auto mutex lock class AutoLock { public: explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) { @@ -46,6 +50,7 @@ class AutoLock { pthread_mutex_t& _mut; }; +// Gloabl singleton mutex lock class GlobalCreateMutex { public: pthread_mutex_t& mutex() { return _mut; } @@ -60,6 +65,7 @@ class GlobalCreateMutex { pthread_mutex_t _mut; }; +// InferEngine class InferEngine { public: virtual ~InferEngine() {} @@ -90,11 +96,13 @@ class InferEngine { void* out, uint32_t batch_size = -1) = 0; virtual int task_infer_impl(const void* in, void* out) = 0; // NOLINT + virtual CubeCache* get_cube_cache() = 0; protected: uint32_t _model_index; // end: framework inner call }; + typedef im::bsf::Task TaskT; class ReloadableInferEngine : public InferEngine { public: @@ -163,28 +171,37 @@ class ReloadableInferEngine : public InferEngine { uint32_t _infer_batch_size; // Need to align batch_size in inferring - bool _infer_batch_align; + bool _infer_overrun; + // allow to split request in inferring + bool _allow_split_request; // model version uint64_t 
_version; }; -// Lock free switching two models +// Lock free switching two models and cube caches template struct ModelData { ModelData() : current_idx(1) { - cores[0] = NULL; - cores[1] = NULL; + cores[0] = nullptr; + cores[1] = nullptr; + caches[0] = nullptr; + caches[1] = nullptr; } ~ModelData() { delete cores[0]; delete cores[1]; + delete caches[0]; + delete caches[1]; } - void* get() { return cores[current_idx]->get(); } + void* get_core() { return cores[current_idx]->get(); } + + CubeCache* get_cache() { return caches[current_idx]; } EngineCore* cores[2]; + CubeCache* caches[2]; uint32_t current_idx; }; @@ -196,7 +213,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine { int proc_initialize(const configure::EngineDesc& conf, bool version) { THREAD_KEY_CREATE(&_skey, NULL); THREAD_MUTEX_INIT(&_mutex, NULL); - gpu_index = 0; + _gpu_index = 0; return ReloadableInferEngine::proc_initialize(conf, version); } @@ -209,7 +226,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine { if (_reload_vec.empty()) { return 0; } - gpu_index = 0; + _gpu_index = 0; for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) { if (load_data(_reload_vec[ti], conf) != 0) { LOG(ERROR) << "Failed reload engine model: " << ti; @@ -224,26 +241,56 @@ class DBReloadableInferEngine : public ReloadableInferEngine { virtual int load_data(ModelData* md, const configure::EngineDesc& conf) { uint32_t next_idx = (md->current_idx + 1) % 2; + + // reload engine core if (md->cores[next_idx]) { delete md->cores[next_idx]; } - md->cores[next_idx] = new (std::nothrow) EngineCore; - - // params.dump(); + if (nullptr == md->cores[next_idx]) { + LOG(ERROR) << "Allocating memory failed. "; + return -1; + } size_t gpu_ids_num = conf.gpu_ids_size(); im::bsf::AutoMutex lock(_mutex); int gpu_id = -1; if (gpu_ids_num > 0) { - gpu_id = conf.gpu_ids(gpu_index % gpu_ids_num); + gpu_id = conf.gpu_ids(_gpu_index % gpu_ids_num); } + LOG(WARNING) << "Loading EngineCore[" << next_idx << "] ..."; if (!md->cores[next_idx] || md->cores[next_idx]->create(conf, gpu_id) != 0) { LOG(ERROR) << "Failed create model, path: " << conf.model_dir(); return -1; } - gpu_index++; + _gpu_index++; + LOG(WARNING) << "Loading EngineCore[" << next_idx << "] done."; + + // reload cube cache + if (nullptr == md->caches[next_idx]) { + md->caches[next_idx] = new (std::nothrow) CubeCache; + } + + if (nullptr == md->caches[next_idx]) { + LOG(ERROR) << "Allocating memory failed."; + return -1; + } + LOG(WARNING) << "Loading cube cache[" << next_idx << "] ..."; + std::string model_path = conf.model_dir(); + if (access(model_path.c_str(), F_OK) == 0) { + std::string cube_cache_path = model_path + "/cube_cache"; + int reload_cache_ret = md->caches[next_idx]->reload_data(cube_cache_path); + LOG(WARNING) << "Loading cube cache[" << next_idx << "] done."; + } else { + LOG(ERROR) << "model_path " << model_path + << " is not exits. Ignore cube cache!"; + } + + // switch current_idx md->current_idx = next_idx; + LOG(WARNING) + << "Reload model and cube cache done. 
switching to current_idx[" + << next_idx << "]"; return 0; } @@ -309,11 +356,25 @@ class DBReloadableInferEngine : public ReloadableInferEngine { return md->cores[md->current_idx]; } + CubeCache* get_cube_cache() { + ModelData* md = + (ModelData*)THREAD_GETSPECIFIC(_skey); + if (!md) { + LOG(ERROR) << "Failed get thread specific data"; + return NULL; + } + return md->get_cache(); + } + protected: THREAD_KEY_T _skey; THREAD_MUTEX_T _mutex; + + // vector of all model engines std::vector*> _reload_vec; - int gpu_index = 0; + + // gpu card id + int _gpu_index = 0; }; // 多个EngineCore共用同一份模型数据 @@ -331,12 +392,20 @@ class CloneDBReloadableInferEngine virtual int load_data(ModelData* md, const configure::EngineDesc& conf) { + int tid = syscall(SYS_gettid); uint32_t next_idx = (md->current_idx + 1) % 2; if (md->cores[next_idx]) { delete md->cores[next_idx]; } md->cores[next_idx] = new (std::nothrow) EngineCore; + if (nullptr == md->caches[next_idx]) { + md->caches[next_idx] = new (std::nothrow) CubeCache; + } + if (nullptr == md->cores[next_idx] || nullptr == md->caches[next_idx]) { + LOG(ERROR) << "Allocating memory fail."; + return -1; + } // params.dump(); // gpu_ids_num > 0 is always true. // if use CPU, gpu_ids = [-1]. @@ -347,46 +416,70 @@ class CloneDBReloadableInferEngine im::bsf::AutoMutex lock(DBReloadableInferEngine::_mutex); int gpu_id = -1; if (gpu_ids_num > 0) { - gpu_id = conf.gpu_ids(DBReloadableInferEngine::gpu_index % + gpu_id = conf.gpu_ids(DBReloadableInferEngine::_gpu_index % gpu_ids_num); } else { gpu_ids_num = 1; } - // gpu_index will be set to be 0, when load() or proc_initial() is called. - // gpu_index < gpu_ids_num, means there are predictors still not create + + // _gpu_index will be set to be 0, when load() or proc_initial() is called. + // _gpu_index < gpu_ids_num, means there are predictors still not create // on some GPU card. // so we need to create the predictor. - // gpu_index >= gpu_ids_num, means each GPU card has already create one. + // _gpu_index >= gpu_ids_num, means each GPU card has already create one. // so we need to clone the predictor. - if (DBReloadableInferEngine::gpu_index < gpu_ids_num) { - if (!md->cores[next_idx] || - md->cores[next_idx]->create(conf, gpu_id) != 0) { + LOG(WARNING) << "tid:" << tid << " Loading clone model ..."; + if (DBReloadableInferEngine::_gpu_index < gpu_ids_num) { + // create cores + if (md->cores[next_idx]->create(conf, gpu_id) != 0) { LOG(ERROR) << "Failed create model, path: " << conf.model_dir(); return -1; } - DBReloadableInferEngine::gpu_index++; - md->current_idx = next_idx; + // create caches + std::string model_path = conf.model_dir(); + if (access(model_path.c_str(), F_OK) == 0) { + std::string cube_cache_path = model_path + "/cube_cache"; + int reload_cache_ret = + md->caches[next_idx]->reload_data(cube_cache_path); + LOG(WARNING) << "create cube cache[" << next_idx << "] done."; + } else { + LOG(WARNING) << "model_path " << model_path + << " is not exits. 
Ignore cube cache!"; + } + + DBReloadableInferEngine::_gpu_index++; + // md->current_idx = next_idx; if (_cloneTemplate.size() < - DBReloadableInferEngine::gpu_index) { + DBReloadableInferEngine::_gpu_index) { _cloneTemplate.push_back(md); } else { - _cloneTemplate[DBReloadableInferEngine::gpu_index - 1] = md; + _cloneTemplate[DBReloadableInferEngine::_gpu_index - 1] = + md; } } else { - int template_index = DBReloadableInferEngine::gpu_index % + int template_index = DBReloadableInferEngine::_gpu_index % _cloneTemplate.size(); - if (!md->cores[next_idx] || - md->cores[next_idx]->clone(_cloneTemplate[template_index]->get()) != - 0) { + + // clone cores + if (md->cores[next_idx]->clone( + _cloneTemplate[template_index]->get_core()) != 0) { LOG(ERROR) << "Failed clone model from core"; return -1; } - DBReloadableInferEngine::gpu_index++; - md->current_idx = next_idx; - LOG(WARNING) << "core clone model succ, cur_idx[" << md->current_idx - << "]."; + // clone caches + md->caches[next_idx] = _cloneTemplate[template_index]->get_cache(); + LOG(WARNING) << "tid:" << tid << " clone caches done"; + + DBReloadableInferEngine::_gpu_index++; } + // switch current_idx + md->current_idx = next_idx; + LOG(WARNING) + << "[" << tid + << "] Reload clone model and cube cache done. switching to current_idx[" + << next_idx << "]"; + return 0; } @@ -441,7 +534,28 @@ class FluidInferEngine : public CloneDBReloadableInferEngine { paddle::PaddleDType::INT32) { int32_t* data = static_cast(origin_data); lod_tensor_in->CopyFromCpu(data); + } else if ((*tensorVector_in_pointer)[i].dtype == + paddle::PaddleDType::UINT8) { + uint8_t* data = static_cast(origin_data); + lod_tensor_in->CopyFromCpu(data); + } else if ((*tensorVector_in_pointer)[i].dtype == + paddle::PaddleDType::INT8) { + int8_t* data = static_cast(origin_data); + lod_tensor_in->CopyFromCpu(data); + } else if ((*tensorVector_in_pointer)[i].dtype == + paddle::PaddleDType::FLOAT16) { + paddle::platform::float16* data = + static_cast(origin_data); + lod_tensor_in->CopyFromCpu(data); + } else { + LOG(ERROR) << "Inference not support type[" + << (*tensorVector_in_pointer)[i].dtype << "],name[" + << (*tensorVector_in_pointer)[i].name << "]" + << " copy into core failed!"; } + VLOG(2) << "Tensor:name=" << (*tensorVector_in_pointer)[i].name + << ";in_dtype=" << (*tensorVector_in_pointer)[i].dtype + << ";tensor_dtype=" << lod_tensor_in->type(); } // After the input data is passed in, // call 'core->Run()' perform the prediction process. 
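Note on the two dtype hunks here: the hunk above extends `infer_impl` so that UINT8, INT8 and FLOAT16 feed tensors are copied into the predictor via `CopyFromCpu`, and the hunk below mirrors that dispatch on the fetch side, sizing the output buffer with the matching `sizeof` before `CopyToCpu`. As a rough, hedged illustration only (not part of this patch), the sketch below shows how a FLOAT16 input tensor could be packed before it reaches this code; it assumes `TensorVector` is `std::vector<paddle::PaddleTensor>` from `paddle_inference_api.h`, and the feed name `"x"` is hypothetical.

```
// Sketch under the assumptions stated above; not part of the patch.
#include <vector>

#include "experimental/float16.h"    // paddle::platform::float16 (included above)
#include "paddle_inference_api.h"    // paddle::PaddleTensor, PaddleBuf, PaddleDType

// Pack fp32 values into a FLOAT16 tensor so that the new FLOAT16 branch of
// infer_impl() can hand the buffer to CopyFromCpu().
static paddle::PaddleTensor MakeFp16Tensor(const std::vector<float>& src) {
  paddle::PaddleTensor t;
  t.name = "x";  // hypothetical feed name; must match the model's feed var
  t.shape = {1, static_cast<int>(src.size())};
  t.dtype = paddle::PaddleDType::FLOAT16;
  // PaddleBuf allocates and owns the memory after Resize().
  t.data.Resize(src.size() * sizeof(paddle::platform::float16));
  auto* dst = static_cast<paddle::platform::float16*>(t.data.data());
  for (size_t i = 0; i < src.size(); ++i) {
    dst[i] = static_cast<paddle::platform::float16>(src[i]);  // fp32 -> fp16
  }
  return t;
}
```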
@@ -506,7 +620,39 @@ class FluidInferEngine : public CloneDBReloadableInferEngine { int32_t* data_out = reinterpret_cast(databuf_data); lod_tensor_out->CopyToCpu(data_out); databuf_char = reinterpret_cast(data_out); + } else if (dataType == paddle::PaddleDType::UINT8) { + databuf_size = out_num * sizeof(uint8_t); + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + uint8_t* data_out = reinterpret_cast(databuf_data); + lod_tensor_out->CopyToCpu(data_out); + databuf_char = reinterpret_cast(data_out); + } else if (dataType == paddle::PaddleDType::INT8) { + databuf_size = out_num * sizeof(int8_t); + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + int8_t* data_out = reinterpret_cast(databuf_data); + lod_tensor_out->CopyToCpu(data_out); + databuf_char = reinterpret_cast(data_out); + } else if (dataType == paddle::PaddleDType::FLOAT16) { + databuf_size = out_num * sizeof(paddle::platform::float16); + databuf_data = MempoolWrapper::instance().malloc(databuf_size); + if (!databuf_data) { + LOG(ERROR) << "Malloc failed, size: " << databuf_size; + return -1; + } + paddle::platform::float16* data_out = + reinterpret_cast(databuf_data); + lod_tensor_out->CopyToCpu(data_out); + databuf_char = reinterpret_cast(data_out); } + // Because task scheduling requires OPs to use 'Channel' // (which is a data structure) to transfer data between OPs. // We need to copy the processed data to the 'Channel' for the next OP. @@ -532,6 +678,10 @@ class FluidInferEngine : public CloneDBReloadableInferEngine { int task_infer_impl(const void* in, void* out) { // NOLINT return infer_impl(in, out); } + + CubeCache* get_cube_cache() { + return DBReloadableInferEngine::get_cube_cache(); + } }; typedef FactoryPool StaticInferFactory; @@ -565,11 +715,13 @@ class VersionedInferEngine : public InferEngine { template T* get_core(); + CubeCache* get_cube_cache(); + // versioned inference interface int infer(const void* in, void* out, uint32_t batch_size, uint64_t version); template - T* get_core(uint64_t version); + T* get_core(const uint64_t version); int proc_initialize_impl(const configure::EngineDesc& conf, bool); @@ -600,6 +752,8 @@ class InferManager { const char* file, std::shared_ptr engine_index_ptr); + int set_taskexecutor_num(size_t total_engine_num); + int thrd_initialize(); int thrd_clear(); @@ -616,9 +770,13 @@ class InferManager { void* out, uint32_t batch_size = -1); + // get engine core template T* get_core(const char* model_name); + // get cube cache + CubeCache* get_cube_cache(const char* model_name); + // Versioned inference interface int infer(const char* model_name, const void* in, @@ -626,9 +784,11 @@ class InferManager { uint32_t batch_size, uint64_t version); + // Versioned get engine core template - T* get_core(const char* model_name, uint64_t version); + T* get_core(const char* model_name, const uint64_t version); + // query model version int query_version(const std::string& model, uint64_t& version); private: diff --git a/core/predictor/framework/resource.cpp b/core/predictor/framework/resource.cpp old mode 100644 new mode 100755 index 1da9783888fa379b653eaa46311c10f3d6c6ec66..0f5539d18e1942ffde31714333fe0ce89a49ff6f --- a/core/predictor/framework/resource.cpp +++ b/core/predictor/framework/resource.cpp @@ -135,6 +135,17 @@ int Resource::initialize(const std::string& path, const 
std::string& file) { if (FLAGS_enable_model_toolkit) { size_t model_toolkit_num = resource_conf.model_toolkit_path_size(); + // 此处暂时认为,每个model_toolkit仅包含一个engine + // 故认为 model_toolkit_num == engine总数 + // 若以后出现model_toolkit仅包含多个engine + // 则应先for循环统计engine总数,再set_taskexecutor_num + // 切不可动态im::bsf::TaskExecutorVector::instance().resize + // TaskExecutor是线程池,内含锁,在engine进程初始化时已开始work加锁循环运行了 + // 之后再resize内存搬运,会导致work使用原锁,而搬运后的TaskExecutor的锁内存已改变 + if (InferManager::instance().set_taskexecutor_num(model_toolkit_num) != 0) { + LOG(ERROR) << "failed set_taskexecutor_num"; + return -1; + } std::shared_ptr engine_index_ptr(new int(0)); for (size_t mi = 0; mi < model_toolkit_num; ++mi) { std::string model_toolkit_path = resource_conf.model_toolkit_path(mi); @@ -165,18 +176,18 @@ int Resource::initialize(const std::string& path, const std::string& file) { rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance(); std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() + "/" + resource_conf.cube_config_file(); - this->cube_config_fullpath = cube_config_fullpath; - this->cube_quant_bits = resource_conf.has_cube_quant_bits() - ? resource_conf.cube_quant_bits() - : 0; - if (this->cube_quant_bits != 0 && this->cube_quant_bits != 8) { + this->_cube_config_fullpath = cube_config_fullpath; + this->_cube_quant_bits = resource_conf.has_cube_quant_bits() + ? resource_conf.cube_quant_bits() + : 0; + if (this->_cube_quant_bits != 0 && this->_cube_quant_bits != 8) { LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8."; return -1; } - if (this->cube_quant_bits == 0) { + if (this->_cube_quant_bits == 0) { LOG(INFO) << "cube quant mode OFF"; } else { - LOG(INFO) << "cube quant mode ON, quant bits: " << this->cube_quant_bits; + LOG(INFO) << "cube quant mode ON, quant bits: " << this->_cube_quant_bits; } } @@ -187,10 +198,10 @@ int Resource::initialize(const std::string& path, const std::string& file) { // model config int Resource::general_model_initialize(const std::string& path, const std::string& file) { - if (this->cube_config_fullpath.size() != 0) { - LOG(INFO) << "init cube by config file : " << this->cube_config_fullpath; + if (this->_cube_config_fullpath.size() != 0) { + LOG(INFO) << "init cube by config file : " << this->_cube_config_fullpath; rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance(); - int ret = cube->init(this->cube_config_fullpath.c_str()); + int ret = cube->init(this->_cube_config_fullpath.c_str()); if (ret != 0) { LOG(ERROR) << "cube init error"; return -1; @@ -315,7 +326,7 @@ int Resource::thread_clear() { } return 0; } -size_t Resource::get_cube_quant_bits() { return this->cube_quant_bits; } +size_t Resource::get_cube_quant_bits() { return this->_cube_quant_bits; } int Resource::reload() { if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) { diff --git a/core/predictor/framework/resource.h b/core/predictor/framework/resource.h index d8a114dab581b71182c1a510db16aa0d2e818b0a..3fd3d5030d72ebbac6ec229a56c645dcbaf31b92 100644 --- a/core/predictor/framework/resource.h +++ b/core/predictor/framework/resource.h @@ -16,8 +16,10 @@ #include #include #include +#include #include #include + #include "core/cube/cube-api/include/cube_api.h" #include "core/predictor/common/inner_common.h" #include "core/predictor/framework/infer.h" @@ -27,6 +29,8 @@ namespace baidu { namespace paddle_serving { namespace predictor { +// Paddle general model configuration, read the model configuration information +// from the general_model_config.proto file class 
PaddleGeneralModelConfig { public: PaddleGeneralModelConfig() {} @@ -34,23 +38,47 @@ class PaddleGeneralModelConfig { ~PaddleGeneralModelConfig() {} public: + // feed/fetch name and alias_name std::vector _feed_name; std::vector _feed_alias_name; - std::vector _feed_type; // 0 int64, 1 float - std::vector _is_lod_feed; // true lod tensor - std::vector _is_lod_fetch; // whether a fetch var is lod_tensor - std::vector _capacity; // capacity for each tensor - /* - feed_shape_ for feeded variable - feed_shape_[i][j] represents the jth dim for ith input Tensor - if is_lod_feed_[i] == False, feed_shape_[i][0] = -1 - */ - std::vector> _feed_shape; - std::vector _fetch_name; std::vector _fetch_alias_name; + + // Be consistent with model saving interface var type conversion + // (python/paddle serving client/io/__init__) + // int64 => 0; + // float32 => 1; + // int32 => 2; + // float64 => 3; + // int16 => 4; + // float16 => 5; + // bfloat16 => 6; + // uint8 => 7; + // int8 => 8; + // bool => 9; + // complex64 => 10, + // complex128 => 11; + std::vector _feed_type; + + // whether a feed or fetch var is lod_tensor. + std::vector _is_lod_feed; + std::vector _is_lod_fetch; + + // capacity for each tensor + std::vector _capacity; + + // _feed_shape and _fetch_shape are used to represent the dimensional + // information of tensor. + // for examples, feed_shape_[i][j] represents the j(th) dim for i(th) input + // tensor. + // if is_lod_feed_[i] == False, feed_shape_[i][0] = -1 + std::vector> _feed_shape; std::vector> _fetch_shape; + + // fetch name -> index of fetch_name vector. std::map _fetch_name_to_index; + + // fetch alias name -> index of fetch_alias_name vector. std::map _fetch_alias_name_to_index; }; @@ -73,33 +101,50 @@ class Resource { return ins; } + // initialize resource int initialize(const std::string& path, const std::string& file); + // loading all models configurations from prototxt int general_model_initialize(const std::string& path, const std::string& file); + // initialize thread local data int thread_initialize(); + // clear thread local data int thread_clear(); + // reload resources int reload(); + // finalize int finalize(); + // get all model configs std::vector> get_general_model_config(); + // print all configurations of all models void print_general_model_config( const std::shared_ptr& config); + // get cube quantity bit size size_t get_cube_quant_bits(); private: int thread_finalize() { return 0; } + + private: + // configuration infermation of all models, loading from prototxt files std::vector> _configs; - std::string cube_config_fullpath; - int cube_quant_bits; // 0 if no empty + // full path of cube configuration file. + std::string _cube_config_fullpath; + + // cube quantify bit size, support 0/8. set 0 if no quant. 
+ size_t _cube_quant_bits; + + // bthread local key THREAD_KEY_T _tls_bspec_key; }; diff --git a/core/predictor/tools/ocrtools/preprocess_op.cpp b/core/predictor/tools/ocrtools/preprocess_op.cpp index ab69e4d23abbcbebfbfb5c453fbca46ff5e51967..045984e4c004f965d52badc8b8a0b8996224ab7c 100644 --- a/core/predictor/tools/ocrtools/preprocess_op.cpp +++ b/core/predictor/tools/ocrtools/preprocess_op.cpp @@ -82,14 +82,14 @@ void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img, else if (resize_h / 32 < 1 + 1e-5) resize_h = 32; else - resize_h = (resize_h / 32) * 32; + resize_h = (resize_h / 32 - 1) * 32; if (resize_w % 32 == 0) resize_w = resize_w; else if (resize_w / 32 < 1 + 1e-5) resize_w = 32; else - resize_w = (resize_w / 32) * 32; + resize_w = (resize_w / 32 - 1) * 32; if (!use_tensorrt) { cv::resize(img, resize_img, cv::Size(resize_w, resize_h)); ratio_h = float(resize_h) / float(h); diff --git a/core/sdk-cpp/proto/general_model_service.proto b/core/sdk-cpp/proto/general_model_service.proto index 92032ab77e88a515c48db312e20b8acb13c9cddc..5340f4226e12b0b99147bc2972928b7d7c733057 100755 --- a/core/sdk-cpp/proto/general_model_service.proto +++ b/core/sdk-cpp/proto/general_model_service.proto @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -syntax = "proto2"; +syntax = "proto3"; import "pds_option.proto"; import "builtin_format.proto"; package baidu.paddle_serving.predictor.general_model; @@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model; option cc_generic_services = true; message Tensor { - repeated string data = 1; - repeated int32 int_data = 2; - repeated int64 int64_data = 3; - repeated float float_data = 4; - optional int32 elem_type = - 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string - repeated int32 shape = 6; // shape should include batch - repeated int32 lod = 7; // only for fetch tensor currently - optional string name = 8; // get from the Model prototxt - optional string alias_name = 9; // get from the Model prototxt + // VarType: INT64 + repeated int64 int64_data = 1; + + // VarType: FP32 + repeated float float_data = 2; + + // VarType: INT32 + repeated int32 int_data = 3; + + // VarType: FP64 + repeated double float64_data = 4; + + // VarType: UINT32 + repeated uint32 uint32_data = 5; + + // VarType: BOOL + repeated bool bool_data = 6; + + // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated float complex64_data = 7; + + // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated double complex128_data = 8; + + // VarType: STRING + repeated string data = 9; + + // Element types: + // 0 => INT64 + // 1 => FP32 + // 2 => INT32 + // 3 => FP64 + // 4 => INT16 + // 5 => FP16 + // 6 => BF16 + // 7 => UINT8 + // 8 => INT8 + // 9 => BOOL + // 10 => COMPLEX64 + // 11 => COMPLEX128 + // 20 => STRING + int32 elem_type = 10; + + // Shape of the tensor, including batch dimensions. + repeated int32 shape = 11; + + // Level of data(LOD), support variable length data, only for fetch tensor + // currently. + repeated int32 lod = 12; + + // Correspond to the variable 'name' in the model description prototxt. + string name = 13; + + // Correspond to the variable 'alias_name' in the model description prototxt. 
+ string alias_name = 14; // get from the Model prototxt + + // VarType: FP16, INT16, INT8, BF16, UINT8 + bytes tensor_content = 15; }; message Request { repeated Tensor tensor = 1; repeated string fetch_var_names = 2; - optional bool profile_server = 3 [ default = false ]; - required uint64 log_id = 4 [ default = 0 ]; + bool profile_server = 3; + uint64 log_id = 4; }; message Response { repeated ModelOutput outputs = 1; repeated int64 profile_time = 2; + // Error code + int32 err_no = 3; + + // Error messages + string err_msg = 4; }; message ModelOutput { repeated Tensor tensor = 1; - optional string engine_name = 2; + string engine_name = 2; } service GeneralModelService { diff --git a/doc/HTTP_SERVICE_CN.md b/doc/HTTP_SERVICE_CN.md old mode 100644 new mode 100755 index ff7082b0c6c2f091a199420be45ce83403befdd4..ef35eff2f3d9cd259a7d66800dc6866605d4cf6d --- a/doc/HTTP_SERVICE_CN.md +++ b/doc/HTTP_SERVICE_CN.md @@ -12,7 +12,7 @@ BRPC-Server会尝试去JSON字符串中再去反序列化出Proto格式的数据 ### Http+protobuf方式 各种语言都提供了对ProtoBuf的支持,如果您对此比较熟悉,您也可以先将数据使用ProtoBuf序列化,再将序列化后的数据放入Http请求数据体中,然后指定Content-Type: application/proto,从而使用http/h2+protobuf二进制串访问服务。 -实测随着数据量的增大,使用JSON方式的Http的数据量和反序列化的耗时会大幅度增加,推荐当您的数据量较大时,使用Http+protobuf方式,后续我们会在框架的HttpClient中增加该功能,目前暂没有支持。 +实测随着数据量的增大,使用JSON方式的Http的数据量和反序列化的耗时会大幅度增加,推荐当您的数据量较大时,使用Http+protobuf方式,目前已经在Java和Python的Client端提供了支持。 **理论上讲,序列化/反序列化的性能从高到底排序为:protobuf > http/h2+protobuf > http** @@ -42,7 +42,7 @@ python3.6 -m paddle_serving_server.serve --model uci_housing_model --thread 10 - 为了方便用户快速的使用Http方式请求Server端预测服务,我们已经将常用的Http请求的数据体封装、压缩、请求加密等功能封装为一个HttpClient类提供给用户,方便用户使用。 -使用HttpClient最简单只需要三步,1、创建一个HttpClient对象。2、加载Client端的prototxt配置文件(本例中为python/examples/fit_a_line/目录下的uci_housing_client/serving_client_conf.prototxt),3、调用Predict函数,通过Http方式请求预测服务。 +使用HttpClient最简单只需要四步,1、创建一个HttpClient对象。2、加载Client端的prototxt配置文件(本例中为python/examples/fit_a_line/目录下的uci_housing_client/serving_client_conf.prototxt)。3、调用connect函数。4、调用Predict函数,通过Http方式请求预测服务。 此外,您可以根据自己的需要配置Server端IP、Port、服务名称(此服务名称需要与[`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto)文件中的Service服务名和rpc方法名对应,即`GeneralModelService`字段和`inference`字段),设置Request数据体压缩,设置Response支持压缩传输,模型加密预测(需要配置Server端使用模型加密)、设置响应超时时间等功能。 @@ -52,7 +52,9 @@ Java的HttpClient使用示例见[`java/examples/src/main/java/PaddleServingClien 如果不能满足您的需求,您也可以在此基础上添加一些功能。 -如需支持https或者自定义Response的Status Code等,则需要对C++端brpc-Server进行一定的二次开发,请参考https://github.com/apache/incubator-brpc/blob/master/docs/cn/http_service.md,后续如果需求很大,我们也会将这部分功能加入到Server中,尽情期待。 +如需支持https或者自定义Response的Status Code等,则需要对C++端brpc-Server进行一定的二次开发,请参考https://github.com/apache/incubator-brpc/blob/master/docs/cn/http_service.md + +后续如果需求很大,我们也会将这部分功能加入到Server中,尽情期待。 ### curl方式发送Http请求(基本原理) @@ -101,7 +103,7 @@ repeated int32 numbers = 1; ``` #### elem_type -表示数据类型,0 means int64, 1 means float32, 2 means int32, 3 means bytes(string) +表示数据类型,0 means int64, 1 means float32, 2 means int32, 20 means bytes(string) #### fetch_var_names diff --git a/java/README_CN.md b/java/README_CN.md old mode 100644 new mode 100755 index 70162e8778595456c3a006055c751dd051b2d93c..ef53ac9b1b020940679db9fecbfe1d33111b79f1 --- a/java/README_CN.md +++ b/java/README_CN.md @@ -7,8 +7,8 @@ 为了方便用户使用java进行开发,我们提供了编译好的Serving工程放置在java镜像当中,获取镜像并进入开发环境的方式是 ``` -docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-java -docker run --rm -dit --name java_serving registry.baidubce.com/paddlepaddle/serving:0.5.0-java +docker pull 
registry.baidubce.com/paddlepaddle/serving:0.6.0-java +docker run --rm -dit --name java_serving registry.baidubce.com/paddlepaddle/serving:0.6.0-java docker exec -it java_serving bash cd Serving/java ``` @@ -29,7 +29,7 @@ mvn install ## 请求BRPC-Server -###服务端启动 +### 服务端启动 以fit_a_line模型为例,服务端启动与常规BRPC-Server端启动命令一样。 @@ -39,7 +39,7 @@ sh get_data.sh python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 ``` -###客户端预测 +### 客户端预测 客户端目前支持多种请求方式,目前支持HTTP(数据为JSON格式)、HTTP(数据为PROTO格式)、GRPC 推荐您使用HTTP(数据为PROTO格式),此时数据体为PROTO格式,传输的数据量小,速度快,目前已经帮用户实现了HTTP/GRPC的数据体(JSON/PROTO)的封装函数,详见[Client.java](./src/main/java/io/paddle/serving/client/Client.java) @@ -47,14 +47,14 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po cd ../../../java/examples/target java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample http_proto ``` -**注意 为客户端配置文件,一般是名为serving_client_conf.prototxt的文件。** +**注意 ``为客户端配置文件,一般是名为serving_client_conf.prototxt的文件。** 更多示例详见[PaddleServingClientExample.java](./examples/src/main/java/PaddleServingClientExample.java) ## 请求Pipeline-Server -###服务端启动 +### 服务端启动 对于input data type = string类型,以IMDB model ensemble模型为例,服务端启动 @@ -66,14 +66,14 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow. python test_pipeline_server.py &>pipeline.log & ``` -客户端预测(同步) +### 客户端预测(同步) ``` cd ../../../java/examples/target java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict ``` -客户端预测(异步) +### 客户端预测(异步) ``` cd ../../../java/examples/target @@ -81,7 +81,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli ``` -对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动 +### 对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动 ``` cd ../../python/examples/pipeline/simple_web_service @@ -89,7 +89,7 @@ sh get_data.sh python web_service_java.py &>log.txt & ``` -客户端预测(同步) +### 客户端预测(同步) ``` cd ../../../java/examples/target @@ -98,7 +98,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli ### 注意事项 -1.在示例中,端口号都是9393,ip默认设置为了0.0.0.0表示本机,注意ip和port需要与Server端对应。 +1.在示例中,端口号都是9393,ip默认设置为了127.0.0.1表示本机,注意ip和port需要与Server端对应。 2.目前Serving已推出Pipeline模式(原理详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),面向Java的Pipeline Serving Client已发布。 diff --git a/java/examples/src/main/java/PaddleServingClientExample.java b/java/examples/src/main/java/PaddleServingClientExample.java index 73fe94856ff90177f7fa34535f2a2733c92068e3..153cc4afdd172b524383a78b8d1340ecebf1cc44 100755 --- a/java/examples/src/main/java/PaddleServingClientExample.java +++ b/java/examples/src/main/java/PaddleServingClientExample.java @@ -25,7 +25,7 @@ public class PaddleServingClientExample { List fetch = Arrays.asList("price"); Client client = new Client(); - client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.loadClientConfig(model_config_path); String result = client.predict(feed_data, fetch, true, 0); @@ -49,7 +49,7 @@ public class PaddleServingClientExample { Client client = new Client(); //注意:跨docker,需要设置--net-host或直接访问另一个docker的ip - client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.set_http_proto(false); client.loadClientConfig(model_config_path); @@ -73,7 +73,7 @@ public class PaddleServingClientExample { List fetch = Arrays.asList("price"); Client client = new Client(); 
- client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.loadClientConfig(model_config_path); client.set_use_grpc_client(true); @@ -97,7 +97,7 @@ public class PaddleServingClientExample { List fetch = Arrays.asList("price"); Client client = new Client(); - client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.loadClientConfig(model_config_path); client.use_key(keyFilePath); @@ -125,7 +125,7 @@ public class PaddleServingClientExample { List fetch = Arrays.asList("price"); Client client = new Client(); - client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.loadClientConfig(model_config_path); client.set_request_compress(true); @@ -176,7 +176,7 @@ public class PaddleServingClientExample { }}; List fetch = Arrays.asList("save_infer_model/scale_0.tmp_0"); Client client = new Client(); - client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.loadClientConfig(model_config_path); String result = client.predict(feed_data, fetch, true, 0); @@ -198,7 +198,7 @@ public class PaddleServingClientExample { }}; List fetch = Arrays.asList("pooled_output"); Client client = new Client(); - client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.loadClientConfig(model_config_path); String result = client.predict(feed_data, fetch, true, 0); @@ -268,7 +268,7 @@ public class PaddleServingClientExample { }}; List fetch = Arrays.asList("prob"); Client client = new Client(); - client.setIP("0.0.0.0"); + client.setIP("127.0.0.1"); client.setPort("9393"); client.loadClientConfig(model_config_path); String result = client.predict(feed_data, fetch, true, 0); diff --git a/java/src/main/java/io/paddle/serving/client/Client.java b/java/src/main/java/io/paddle/serving/client/Client.java index 5c283635f3c3146b5c9e3d227f1ee6485e24d5c6..af4ccc5246262336ef9df05aa65beb5b91de33fd 100755 --- a/java/src/main/java/io/paddle/serving/client/Client.java +++ b/java/src/main/java/io/paddle/serving/client/Client.java @@ -59,9 +59,20 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -enum ElementType -{ - Int64_type, Float32_type, Int32_type, Bytes_type; +class ElementType { + public static final int Int64_type = 0; + public static final int Float32_type = 1; + public static final int Int32_type = 2; + public static final int String_type = 20; + public static final Map feedTypeToDataKey_; + static + { + feedTypeToDataKey_ = new HashMap(); + feedTypeToDataKey_.put(ElementType.Int64_type, "int64_data"); + feedTypeToDataKey_.put(ElementType.Float32_type, "float_data"); + feedTypeToDataKey_.put(ElementType.Int32_type, "int_data"); + feedTypeToDataKey_.put(ElementType.String_type, "data"); + } } class Profiler { @@ -104,7 +115,6 @@ public class Client { private Map feedTypes_; private Map> feedShapes_; private Map feedNameToIndex_; - private Map feedTypeToDataKey_; private List fetchNames_; private Map fetchTypes_; private Set lodTensorSet_; @@ -134,7 +144,7 @@ public class Client { feedTensorLen_ = null; feedNameToIndex_ = null; timeoutS_ = 200000; - ip = "0.0.0.0"; + ip = "127.0.0.1"; port = "9393"; serverPort = "9393"; serviceName = "/GeneralModelService/inference"; @@ -147,12 +157,6 @@ public class Client { channel_ = null; blockingStub_ = null; - feedTypeToDataKey_ = new HashMap(); - feedTypeToDataKey_.put(0, "int64_data"); - feedTypeToDataKey_.put(1, "float_data"); - feedTypeToDataKey_.put(2, "int_data"); - feedTypeToDataKey_.put(3, "data"); - profiler_ = new 
Profiler(); boolean is_profile = false; String FLAGS_profile_client = System.getenv("FLAGS_profile_client"); @@ -525,7 +529,7 @@ public class Client { jsonTensor.put("elem_type", element_type); // 处理数据与shape - String protoDataKey = feedTypeToDataKey_.get(element_type); + String protoDataKey = ElementType.feedTypeToDataKey_.get(element_type); // 如果是INDArray类型,先转为一维. // 此时shape为INDArray的shape if(objectValue instanceof INDArray){ @@ -535,11 +539,11 @@ public class Client { for(long dim:indarrayShape){ shape.add((int)dim); } - if(element_type == ElementType.Int64_type.ordinal()){ + if(element_type == ElementType.Int64_type){ objectValue = tempIndArray.data().asLong(); - }else if(element_type == ElementType.Int32_type.ordinal()){ + }else if(element_type == ElementType.Int32_type){ objectValue = tempIndArray.data().asInt(); - }else if(element_type == ElementType.Float32_type.ordinal()){ + }else if(element_type == ElementType.Float32_type){ objectValue = tempIndArray.data().asFloat(); }else{ throw new Exception("INDArray 类型不支持"); @@ -564,11 +568,11 @@ public class Client { // 此时无法获取batch信息,故对shape不处理 // 由于Proto中为Repeated,需要把数据包装成list if(objectValue instanceof String){ - if(feedTypes_.get(protoDataKey)!= ElementType.Bytes_type.ordinal()){ + if(feedTypes_.get(protoDataKey)!= ElementType.String_type){ throw new Exception("feedvar is not string-type,feed can`t be a single string."); } }else{ - if(feedTypes_.get(protoDataKey)== ElementType.Bytes_type.ordinal()){ + if(feedTypes_.get(protoDataKey)== ElementType.String_type){ throw new Exception("feedvar is string-type,feed, feed can`t be a single int or others."); } } @@ -662,17 +666,17 @@ public class Client { for(long dim:indarrayShape){ shape.add((int)dim); } - if(element_type == ElementType.Int64_type.ordinal()){ + if(element_type == ElementType.Int64_type){ List iter = Arrays.stream(tempIndArray.data().asLong()).boxed().collect(Collectors.toList()); tensor_builder.addAllInt64Data(iter); - }else if(element_type == ElementType.Int32_type.ordinal()){ + }else if(element_type == ElementType.Int32_type){ List iter = Arrays.stream(tempIndArray.data().asInt()).boxed().collect(Collectors.toList()); tensor_builder.addAllIntData(iter); - }else if(element_type == ElementType.Float32_type.ordinal()){ + }else if(element_type == ElementType.Float32_type){ List iter = Arrays.asList(ArrayUtils.toObject(tempIndArray.data().asFloat())); tensor_builder.addAllFloatData(iter); @@ -684,13 +688,13 @@ public class Client { // 如果是数组类型,则无须处理,直接使用即可。 // 且数组无法嵌套,此时batch无法从数据中获取 // 默认batch维度为1,或者feedVar的shape信息中已包含batch - if(element_type == ElementType.Int64_type.ordinal()){ + if(element_type == ElementType.Int64_type){ List iter = Arrays.stream((long[])objectValue).boxed().collect(Collectors.toList()); tensor_builder.addAllInt64Data(iter); - }else if(element_type == ElementType.Int32_type.ordinal()){ + }else if(element_type == ElementType.Int32_type){ List iter = Arrays.stream((int[])objectValue).boxed().collect(Collectors.toList()); tensor_builder.addAllIntData(iter); - }else if(element_type == ElementType.Float32_type.ordinal()){ + }else if(element_type == ElementType.Float32_type){ List iter = Arrays.asList(ArrayUtils.toObject((float[])objectValue)); tensor_builder.addAllFloatData(iter); }else{ @@ -707,11 +711,11 @@ public class Client { // 在index=0处,加上batch shape.add(0, list.size()); } - if(element_type == ElementType.Int64_type.ordinal()){ + if(element_type == ElementType.Int64_type){ tensor_builder.addAllInt64Data((List)(List)recursiveExtract(objectValue)); - }else 
if(element_type == ElementType.Int32_type.ordinal()){ + }else if(element_type == ElementType.Int32_type){ tensor_builder.addAllIntData((List)(List)recursiveExtract(objectValue)); - }else if(element_type == ElementType.Float32_type.ordinal()){ + }else if(element_type == ElementType.Float32_type){ tensor_builder.addAllFloatData((List)(List)recursiveExtract(objectValue)); }else{ // 看接口是String还是Bytes @@ -723,11 +727,11 @@ public class Client { // 由于Proto中为Repeated,需要把数据包装成list List tempList = new ArrayList<>(); tempList.add(objectValue); - if(element_type == ElementType.Int64_type.ordinal()){ + if(element_type == ElementType.Int64_type){ tensor_builder.addAllInt64Data((List)(List)tempList); - }else if(element_type == ElementType.Int32_type.ordinal()){ + }else if(element_type == ElementType.Int32_type){ tensor_builder.addAllIntData((List)(List)tempList); - }else if(element_type == ElementType.Float32_type.ordinal()){ + }else if(element_type == ElementType.Float32_type){ tensor_builder.addAllFloatData((List)(List)tempList); }else{ // 看接口是String还是Bytes diff --git a/java/src/main/proto/general_model_service.proto b/java/src/main/proto/general_model_service.proto index 89ac489f8ae3b90b74c94a3f9f3c82711086cd64..aa06d388a468d71e968aa53b19f25c55f8c42ee1 100644 --- a/java/src/main/proto/general_model_service.proto +++ b/java/src/main/proto/general_model_service.proto @@ -12,41 +12,96 @@ // See the License for the specific language governing permissions and // limitations under the License. -syntax = "proto2"; +syntax = "proto3"; package baidu.paddle_serving.predictor.general_model; option java_multiple_files = true; message Tensor { - repeated string data = 1; - repeated int32 int_data = 2; - repeated int64 int64_data = 3; - repeated float float_data = 4; - optional int32 elem_type = - 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string - repeated int32 shape = 6; // shape should include batch - repeated int32 lod = 7; // only for fetch tensor currently - optional string name = 8; // get from the Model prototxt - optional string alias_name = 9; // get from the Model prototxt + // VarType: INT64 + repeated int64 int64_data = 1; + + // VarType: FP32 + repeated float float_data = 2; + + // VarType: INT32 + repeated int32 int_data = 3; + + // VarType: FP64 + repeated double float64_data = 4; + + // VarType: UINT32 + repeated uint32 uint32_data = 5; + + // VarType: BOOL + repeated bool bool_data = 6; + + // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated float complex64_data = 7; + + // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated double complex128_data = 8; + + // VarType: STRING + repeated string data = 9; + + // Element types: + // 0 => INT64 + // 1 => FP32 + // 2 => INT32 + // 3 => FP64 + // 4 => INT16 + // 5 => FP16 + // 6 => BF16 + // 7 => UINT8 + // 8 => INT8 + // 9 => BOOL + // 10 => COMPLEX64 + // 11 => COMPLEX128 + // 20 => STRING + int32 elem_type = 10; + + // Shape of the tensor, including batch dimensions. + repeated int32 shape = 11; + + // Level of data(LOD), support variable length data, only for fetch tensor + // currently. + repeated int32 lod = 12; + + // Correspond to the variable 'name' in the model description prototxt. + string name = 13; + + // Correspond to the variable 'alias_name' in the model description prototxt. 
+ string alias_name = 14; // get from the Model prototxt + + // VarType: FP16, INT16, INT8, BF16, UINT8 + bytes tensor_content = 15; }; message Request { repeated Tensor tensor = 1; repeated string fetch_var_names = 2; - optional bool profile_server = 3 [ default = false ]; - required uint64 log_id = 4 [ default = 0 ]; + bool profile_server = 3; + uint64 log_id = 4; }; message Response { repeated ModelOutput outputs = 1; repeated int64 profile_time = 2; + // Error code + int32 err_no = 3; + + // Error messages + string err_msg = 4; }; message ModelOutput { repeated Tensor tensor = 1; - optional string engine_name = 2; + string engine_name = 2; } service GeneralModelService { - rpc inference(Request) returns (Response) {} - rpc debug(Request) returns (Response) {} + rpc inference(Request) returns (Response); + rpc debug(Request) returns (Response); }; diff --git a/paddle_inference/paddle/include/paddle_engine.h b/paddle_inference/paddle/include/paddle_engine.h index d2027ed2823ace230a14e85eb6ee37fe82e5a21f..7cc8120f4eb818905c303b22a0b00d6b205bddb4 100644 --- a/paddle_inference/paddle/include/paddle_engine.h +++ b/paddle_inference/paddle/include/paddle_engine.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -69,7 +70,33 @@ PrecisionType GetPrecision(const std::string& precision_data) { return PrecisionType::kFloat32; } -// Engine Base +const std::string getFileBySuffix( + const std::string& path, const std::vector& suffixVector) { + DIR* dp = nullptr; + std::string fileName = ""; + struct dirent* dirp = nullptr; + if ((dp = opendir(path.c_str())) == nullptr) { + return fileName; + } + while ((dirp = readdir(dp)) != nullptr) { + if (dirp->d_type == DT_REG) { + for (int idx = 0; idx < suffixVector.size(); ++idx) { + if (std::string(dirp->d_name).find(suffixVector[idx]) != + std::string::npos) { + fileName = static_cast(dirp->d_name); + break; + } + } + } + if (fileName.length() != 0) break; + } + closedir(dp); + return fileName; +} + +// Engine Core is the base class of inference engines, which can be derived from +// paddle Inference Engine, or inference engines of other machine learning +// platforms class EngineCore { public: virtual ~EngineCore() {} @@ -116,6 +143,11 @@ class EngineCore { virtual void* get() { return _predictor.get(); } protected: + // _predictor is a prediction instance of Paddle Inference. + // when inferring on the CPU, _predictor is bound to a model. + // when inferring on the GPU, _predictor is bound to a model and a GPU card. + // Therefore, when using GPU multi-card inference, you need to create multiple + // EngineCore. 
std::shared_ptr _predictor; }; @@ -131,9 +163,21 @@ class PaddleInferenceEngine : public EngineCore { } Config config; - // todo, auto config(zhangjun) - if (engine_conf.has_encrypted_model() && engine_conf.encrypted_model()) { + std::vector suffixParaVector = {".pdiparams", "__params__"}; + std::vector suffixModelVector = {".pdmodel", "__model__"}; + std::string paraFileName = getFileBySuffix(model_path, suffixParaVector); + std::string modelFileName = getFileBySuffix(model_path, suffixModelVector); + + std::string encryParaPath = model_path + "/encrypt_model"; + std::string encryModelPath = model_path + "/encrypt_params"; + std::string encryKeyPath = model_path + "/key"; + + // encrypt model + if (access(encryParaPath.c_str(), F_OK) != -1 && + access(encryModelPath.c_str(), F_OK) != -1 && + access(encryKeyPath.c_str(), F_OK) != -1) { // decrypt model + std::string model_buffer, params_buffer, key_buffer; predictor::ReadBinaryFile(model_path + "/encrypt_model", &model_buffer); predictor::ReadBinaryFile(model_path + "/encrypt_params", &params_buffer); @@ -147,16 +191,11 @@ real_model_buffer.size(), &real_params_buffer[0], real_params_buffer.size()); - } else if (engine_conf.has_combined_model()) { - if (!engine_conf.combined_model()) { - config.SetModel(model_path); - } else { - config.SetParamsFile(model_path + "/__params__"); - config.SetProgFile(model_path + "/__model__"); - } + } else if (paraFileName.length() != 0 && modelFileName.length() != 0) { + config.SetParamsFile(model_path + "/" + paraFileName); + config.SetProgFile(model_path + "/" + modelFileName); } else { - config.SetParamsFile(model_path + "/__params__"); - config.SetProgFile(model_path + "/__model__"); + config.SetModel(model_path); } config.SwitchSpecifyInputNames(true); diff --git a/python/examples/bert/README.md b/python/examples/bert/README.md old mode 100644 new mode 100755 index 7bada93876f8f043b0046b83c3dc3707129079a7..5d3242837f6d8be08f321d68890587e4bba725e8 --- a/python/examples/bert/README.md +++ b/python/examples/bert/README.md @@ -1,4 +1,4 @@ -## Bert as service +## Bert as service ([简体中文](./README_CN.md)|English) @@ -42,48 +42,36 @@ sh get_data.sh ``` this script will download Chinese Dictionary File vocab.txt and Chinese Sample Data data-c.txt -### RPC Inference Service +### Inference Service(Support BRPC-Client、GRPC-Client、Http-Client) start cpu inference service,Run ``` -python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service +python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service ``` Or,start gpu inference service,Run ``` -python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0 +python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0 ``` -### RPC Inference +### BRPC-Client Inference before prediction we should install paddle_serving_app. This module provides data preprocessing for BERT model. ``` -pip install paddle_serving_app +pip3 install paddle_serving_app ``` Run ``` -head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt +head data-c.txt | python3 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt ``` the client reads data from data-c.txt and send prediction request, the prediction is given by word vector.
(Due to massive data in the word vector, we do not print it). -### HTTP Inference Service -start cpu HTTP inference service,Run -``` - python bert_web_service.py bert_seq128_model/ 9292 #launch cpu inference service +#### GRPC-Client/HTTP-Client +Run ``` +head data-c.txt | python3 bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt -Or,start gpu HTTP inference service,Run -``` - export CUDA_VISIBLE_DEVICES=0,1 -``` -set environmental variable to specify which gpus are used, the command above means gpu 0 and gpu 1 is used. ``` - python bert_web_service_gpu.py bert_seq128_model/ 9292 #launch gpu inference service -``` -### HTTP Inference -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction -``` ## Benchmark ``` shell diff --git a/python/examples/bert/README_CN.md b/python/examples/bert/README_CN.md old mode 100644 new mode 100755 index a03b577493fc763c43d1ce96766d4e9eb260565e..42bc3ffab0ad51e304b11a78634b5a90415d1ace --- a/python/examples/bert/README_CN.md +++ b/python/examples/bert/README_CN.md @@ -40,15 +40,15 @@ sh get_data.sh ``` 脚本将下载中文词典vocab.txt和中文样例数据data-c.txt -### 启动RPC预测服务 +### 启动预测服务(支持BRPC-Client、GRPC-Client、HTTP-Client三种方式访问) 启动cpu预测服务,执行 ``` -python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #启动cpu预测服务 +python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #启动cpu预测服务 ``` 或者,启动gpu预测服务,执行 ``` -python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 +python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 ``` @@ -56,37 +56,22 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --g 执行预测前需要安装paddle_serving_app,模块中提供了BERT模型的数据预处理方法。 ``` -pip install paddle_serving_app +pip3 install paddle_serving_app ``` + +#### BRPC-Client 执行 ``` -head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt +head data-c.txt | python3 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt ``` 启动client读取data-c.txt中的数据进行预测,预测结果为文本的向量表示(由于数据较多,脚本中没有将输出进行打印),server端的地址在脚本中修改。 - - -### 启动HTTP预测服务 -启动cpu HTTP预测服务,执行 -``` -python bert_web_service.py bert_seq128_model/ 9292 #启动CPU预测服务 - -``` - -或者,启动gpu HTTP预测服务,执行 -``` - export CUDA_VISIBLE_DEVICES=0,1 -``` -通过环境变量指定gpu预测服务使用的gpu,示例中指定索引为0和1的两块gpu -``` -python bert_web_service_gpu.py bert_seq128_model/ 9292 #启动gpu预测服务 +#### GRPC-Client/HTTP-Client +执行 ``` +head data-c.txt | python3 bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt -### 执行预测 - -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction ``` ## 性能测试 diff --git a/python/examples/bert/bert_httpclient.py b/python/examples/bert/bert_httpclient.py new file mode 100644 index 0000000000000000000000000000000000000000..255c78ec0ca7e33ddd1486f05cf6d9d225a5f406 --- /dev/null +++ b/python/examples/bert/bert_httpclient.py @@ -0,0 +1,58 @@ +# coding:utf-8 +# pylint: disable=doc-string-missing +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from paddle_serving_client import HttpClient +from paddle_serving_client.utils import benchmark_args +from paddle_serving_app.reader import ChineseBertReader +import numpy as np +args = benchmark_args() + +reader = ChineseBertReader({"max_seq_len": 128}) +fetch = ["pooled_output"] +endpoint_list = ['127.0.0.1:9292'] +client = HttpClient() +client.load_client_config(args.model) +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) +as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) +''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) +client.connect(endpoint_list) + +for line in sys.stdin: + feed_dict = reader.process(line) + for key in feed_dict.keys(): + feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1)) + #print(feed_dict) + result = client.predict(feed=feed_dict, fetch=fetch, batch=False) +print(result) diff --git a/python/examples/bert/bert_web_service.py b/python/examples/bert/bert_web_service.py deleted file mode 100644 index 7cd34fb99e0ecebbf2f6bec47e9c9d163ac3a44c..0000000000000000000000000000000000000000 --- a/python/examples/bert/bert_web_service.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader import ChineseBertReader -import sys -import os -import numpy as np - - -class BertService(WebService): - def load(self): - self.reader = ChineseBertReader({ - "vocab_file": "vocab.txt", - "max_seq_len": 128 - }) - - def preprocess(self, feed=[], fetch=[]): - feed_res = [] - is_batch = False - for ins in feed: - feed_dict = self.reader.process(ins["words"].encode("utf-8")) - for key in feed_dict.keys(): - feed_dict[key] = np.array(feed_dict[key]).reshape( - (len(feed_dict[key]), 1)) - feed_res.append(feed_dict) - return feed_res, fetch, is_batch - - -bert_service = BertService(name="bert") -bert_service.load() -bert_service.load_model_config(sys.argv[1]) -bert_service.prepare_server( - workdir="workdir", port=int(sys.argv[2]), device="cpu") -bert_service.run_rpc_service() -bert_service.run_web_service() diff --git a/python/examples/bert/bert_web_service_gpu.py b/python/examples/bert/bert_web_service_gpu.py deleted file mode 100644 index 5f0b40856263f939e8a341c93898e4c0bfa9a244..0000000000000000000000000000000000000000 --- a/python/examples/bert/bert_web_service_gpu.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader import ChineseBertReader -import sys -import os -import numpy as np - - -class BertService(WebService): - def load(self): - self.reader = ChineseBertReader({ - "vocab_file": "vocab.txt", - "max_seq_len": 128 - }) - - def preprocess(self, feed=[], fetch=[]): - feed_res = [] - is_batch = False - for ins in feed: - feed_dict = self.reader.process(ins["words"].encode("utf-8")) - for key in feed_dict.keys(): - feed_dict[key] = np.array(feed_dict[key]).reshape( - (len(feed_dict[key]), 1)) - feed_res.append(feed_dict) - return feed_res, fetch, is_batch - - -bert_service = BertService(name="bert") -bert_service.load() -bert_service.load_model_config(sys.argv[1]) -bert_service.set_gpus("0") -bert_service.prepare_server( - workdir="workdir", port=int(sys.argv[2]), device="gpu") -bert_service.run_rpc_service() -bert_service.run_web_service() diff --git a/python/examples/blazeface/README.md b/python/examples/blazeface/README.md index 6f9d3c5adab5f3275989479078cb4329d14589fd..29e3026b4d972e141eabcc1a180d7a5cdb804a52 100644 --- a/python/examples/blazeface/README.md +++ b/python/examples/blazeface/README.md @@ -2,7 +2,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model blazeface +python3 -m paddle_serving_app.package --get_model blazeface tar -xf blazeface.tar.gz ``` @@ -11,13 +11,13 @@ tar -xf blazeface.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model serving_server --port 9494 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 ``` ### Client Prediction ``` -python test_client.py serving_client/serving_client_conf.prototxt test.jpg +python3 test_client.py serving_client/serving_client_conf.prototxt test.jpg ``` the result is in `output` folder, including a json file and image file with bounding boxes. diff --git a/python/examples/cascade_rcnn/README.md b/python/examples/cascade_rcnn/README.md index f8aa79e8bf97da5dd998ac6d340c0abd398931c0..8029f39a11fcbadefe7f7c77ad709b4a0080707e 100644 --- a/python/examples/cascade_rcnn/README.md +++ b/python/examples/cascade_rcnn/README.md @@ -10,12 +10,12 @@ If you want to have more detection models, please refer to [Paddle Detection Mod ### Start the service ``` -python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 ``` ### Perform prediction ``` -python test_client.py +python3 test_client.py 000000570688.jpg ``` Image with bounding boxes and json result would be saved in `output` folder. 
diff --git a/python/examples/cascade_rcnn/README_CN.md b/python/examples/cascade_rcnn/README_CN.md index 99606de41812cb591a46e443c8a2f72c30ba19e0..828aba8a9546465c89ef673625b8b2b5140f96a6 100644 --- a/python/examples/cascade_rcnn/README_CN.md +++ b/python/examples/cascade_rcnn/README_CN.md @@ -10,12 +10,12 @@ sh get_data.sh ### 启动服务 ``` -python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 ``` ### 执行预测 ``` -python test_client.py +python3 test_client.py 000000570688.jpg ``` 客户端已经为图片做好了后处理,在`output`文件夹下存放各个框的json格式信息还有后处理结果图片。 diff --git a/python/examples/cascade_rcnn/get_data.sh b/python/examples/cascade_rcnn/get_data.sh index 0aa9c7dc340367790eb52f5cc0074cb5d6fd0d05..204ae1a269e00a0156141946db7cfed37475564f 100644 --- a/python/examples/cascade_rcnn/get_data.sh +++ b/python/examples/cascade_rcnn/get_data.sh @@ -1,2 +1,2 @@ -wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/cascade_rcnn_r50_fpx_1x_serving.tar.gz -tar xf cascade_rcnn_r50_fpx_1x_serving.tar.gz +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco_serving.tar.gz +tar xf cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco_serving.tar.gz diff --git a/python/examples/cascade_rcnn/label_list.txt b/python/examples/cascade_rcnn/label_list.txt index d7d43a94adf73208f997f0efd6581bef11ca734e..941cb4e1392266f6a6c09b1fdc5f79503b2e5df6 100644 --- a/python/examples/cascade_rcnn/label_list.txt +++ b/python/examples/cascade_rcnn/label_list.txt @@ -1,4 +1,3 @@ -background person bicycle car diff --git a/python/examples/cascade_rcnn/test_client.py b/python/examples/cascade_rcnn/test_client.py index b40e97acc3e84a7dc7411a7ad3c3f8c1dc8171a6..aac9f67216863c5f4ecb6bd45dc57dfc8c50ab32 100644 --- a/python/examples/cascade_rcnn/test_client.py +++ b/python/examples/cascade_rcnn/test_client.py @@ -12,29 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import sys +import numpy as np from paddle_serving_client import Client from paddle_serving_app.reader import * -import numpy as np +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), Div(255.0), - Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), - Resize(800, 1333), Transpose((2, 0, 1)), PadStride(32) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionResize((800, 1333), True, interpolation=2), + DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True), + DetectionTranspose((2,0,1)), + DetectionPadStride(32) ]) + postprocess = RCNNPostprocess("label_list.txt", "output") client = Client() + client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9292']) -im = preprocess('000000570688.jpg') + +im, im_info = preprocess(sys.argv[1]) fetch_map = client.predict( feed={ "image": im, - "im_info": np.array(list(im.shape[1:]) + [1.0]), - "im_shape": np.array(list(im.shape[1:]) + [1.0]) + "im_shape": np.array(list(im.shape[1:])).reshape(-1), + "scale_factor": im_info['scale_factor'], }, - fetch=["multiclass_nms_0.tmp_0"], + fetch=["save_infer_model/scale_0.tmp_1"], batch=False) -fetch_map["image"] = '000000570688.jpg' print(fetch_map) +fetch_map["image"] = sys.argv[1] postprocess(fetch_map) -print(fetch_map) diff --git a/python/examples/criteo_ctr/README.md b/python/examples/criteo_ctr/README.md index 46be4d0ae9d3167bc107ec45b0000520920d6dea..6c1d79e7362a0240a49a9f0243f3de3340119ce3 100644 --- a/python/examples/criteo_ctr/README.md +++ b/python/examples/criteo_ctr/README.md @@ -19,13 +19,13 @@ the directories like `ctr_serving_model` and `ctr_client_conf` will appear. ### Start RPC Inference Service ``` -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0 +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0 ``` ### RPC Infer ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 ``` the latency will display in the end. diff --git a/python/examples/criteo_ctr/README_CN.md b/python/examples/criteo_ctr/README_CN.md index c7d6255e0b21aa447c5decc823a9bbb5bdb4ad65..c5b1da76055e64bd08bcf2a00dffe537bc931ee9 100644 --- a/python/examples/criteo_ctr/README_CN.md +++ b/python/examples/criteo_ctr/README_CN.md @@ -19,13 +19,13 @@ mv models/ctr_serving_model . 
### 启动RPC预测服务 ``` -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务 -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务 +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务 +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务 ``` ### 执行预测 ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 ``` 预测完毕会输出预测过程的耗时。 diff --git a/python/examples/criteo_ctr_with_cube/README.md b/python/examples/criteo_ctr_with_cube/README.md index 493b3d72c1fff9275c2a99cfee45efd4bef1af4c..de5c3269228a8d7ef619a8c46f2252208e53b982 100755 --- a/python/examples/criteo_ctr_with_cube/README.md +++ b/python/examples/criteo_ctr_with_cube/README.md @@ -32,13 +32,13 @@ Here, the sparse parameter is loaded by cube sparse parameter indexing service C ### Start RPC Predictor, the number of serving thread is 4(configurable in test_server.py) ``` -python test_server.py ctr_serving_model_kv +python3 test_server.py ctr_serving_model_kv ``` ### Run Prediction ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data ``` ### Benchmark diff --git a/python/examples/criteo_ctr_with_cube/README_CN.md b/python/examples/criteo_ctr_with_cube/README_CN.md index 7a0eb43c203aafeb38b64d249954cdabf7bf7a38..15d61160317f866aae25a4d777d76e14725424d3 100644 --- a/python/examples/criteo_ctr_with_cube/README_CN.md +++ b/python/examples/criteo_ctr_with_cube/README_CN.md @@ -30,13 +30,13 @@ sh cube_prepare.sh & ### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置) ``` -python test_server.py ctr_serving_model_kv +python3 test_server.py ctr_serving_model_kv ``` ### 执行预测 ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data ``` ### Benchmark diff --git a/python/examples/criteo_ctr_with_cube/cube/conf/cube.conf b/python/examples/criteo_ctr_with_cube/cube/conf/cube.conf new file mode 100755 index 0000000000000000000000000000000000000000..b70f6e34247e410f9b80054010338d3c8f452ec6 --- /dev/null +++ b/python/examples/criteo_ctr_with_cube/cube/conf/cube.conf @@ -0,0 +1,13 @@ +[{ + "dict_name": "test_dict", + "shard": 1, + "dup": 1, + "timeout": 200, + "retry": 3, + "backup_request": 100, + "type": "ipport_list", + "load_balancer": "rr", + "nodes": [{ + "ipport_list": "list://127.0.0.1:8027" + }] +}] diff --git a/python/examples/criteo_ctr_with_cube/cube/conf/gflags.conf b/python/examples/criteo_ctr_with_cube/cube/conf/gflags.conf new file mode 100755 index 0000000000000000000000000000000000000000..21c7bddebd8f22b91d0ba26a6121007f96a4380b --- /dev/null +++ b/python/examples/criteo_ctr_with_cube/cube/conf/gflags.conf @@ -0,0 +1,4 @@ +--port=8027 +--dict_split=1 +--in_mem=true +--log_dir=./log/ diff --git a/python/examples/criteo_ctr_with_cube/cube/keys b/python/examples/criteo_ctr_with_cube/cube/keys new file mode 100755 index 0000000000000000000000000000000000000000..f00c965d8307308469e537302baa73048488f162 --- /dev/null +++ b/python/examples/criteo_ctr_with_cube/cube/keys @@ -0,0 +1,10 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 diff --git a/python/examples/criteo_ctr_with_cube/test_client.py b/python/examples/criteo_ctr_with_cube/test_client.py index 
bef04807e9b5d5c2cdc316828ed6f960f0eeb0f8..f12d727a3d2c4f6fce013d1f815f8b589a327dd5 100755 --- a/python/examples/criteo_ctr_with_cube/test_client.py +++ b/python/examples/criteo_ctr_with_cube/test_client.py @@ -16,7 +16,7 @@ from paddle_serving_client import Client import sys import os -import criteo as criteo +import criteo_reader as criteo import time from paddle_serving_client.metric import auc import numpy as np @@ -35,22 +35,23 @@ reader = dataset.infer_reader(test_filelists, batch, buf_size) label_list = [] prob_list = [] start = time.time() -for ei in range(10000): +for ei in range(100): if py_version == 2: data = reader().next() else: data = reader().__next__() feed_dict = {} - feed_dict['dense_input'] = data[0][0] + feed_dict['dense_input'] = np.array(data[0][0]).reshape(1, len(data[0][0])) + for i in range(1, 27): - feed_dict["embedding_{}.tmp_0".format(i - 1)] = np.array(data[0][i]).reshape(-1) + feed_dict["embedding_{}.tmp_0".format(i - 1)] = np.array(data[0][i]).reshape(len(data[0][i])) feed_dict["embedding_{}.tmp_0.lod".format(i - 1)] = [0, len(data[0][i])] - fetch_map = client.predict(feed=feed_dict, fetch=["prob"]) + fetch_map = client.predict(feed=feed_dict, fetch=["prob"],batch=True) print(fetch_map) prob_list.append(fetch_map['prob'][0][1]) label_list.append(data[0][-1][0]) -print(auc(label_list, prob_list)) + end = time.time() print(end - start) diff --git a/python/examples/deeplabv3/README.md b/python/examples/deeplabv3/README.md index 28bec77bb500e42919734433617ea2df1b9e95c0..08022618fcec5220667ca19bfb803cba36519c7b 100644 --- a/python/examples/deeplabv3/README.md +++ b/python/examples/deeplabv3/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model deeplabv3 +python3 -m paddle_serving_app.package --get_model deeplabv3 tar -xzvf deeplabv3.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf deeplabv3.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 ``` ### Client Prediction ``` -python deeplabv3_client.py +python3 deeplabv3_client.py ``` diff --git a/python/examples/deeplabv3/README_CN.md b/python/examples/deeplabv3/README_CN.md index 6de3c420833d31f871ad79122e1d77aee4208e35..16f11daba354349f1b73f8bba00cac8ff5c88864 100644 --- a/python/examples/deeplabv3/README_CN.md +++ b/python/examples/deeplabv3/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model deeplabv3 +python3 -m paddle_serving_app.package --get_model deeplabv3 tar -xzvf deeplabv3.tar.gz ``` @@ -12,10 +12,10 @@ tar -xzvf deeplabv3.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 ``` ### 客户端预测 ``` -python deeplabv3_client.py +python3 deeplabv3_client.py diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md index ff4eb10139b4843c81fa2a256f3e6ff116e32472..3c0fb8dbee6c0d6eac7b09cb16428679cb8b9e5d 100644 --- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md +++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md @@ -4,13 +4,13 @@ ### Get The Faster RCNN HRNet Model ``` -wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/faster_rcnn_hrnetv2p_w18_1x.tar +wget --no-check-certificate 
https://paddle-serving.bj.bcebos.com/pddet_demo/faster_rcnn_hrnetv2p_w18_1x.tar.gz ``` ### Start the service ``` -tar xf faster_rcnn_hrnetv2p_w18_1x.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +tar xf faster_rcnn_hrnetv2p_w18_1x.tar.gz +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. But you need to do some extra work. @@ -19,5 +19,5 @@ Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/m ### Prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md index 4bd51128f4a5cb8c09b624c5a1f3dc82b5556b23..11dcbd85fe62f4dae5a4714ad3996424499024c0 100644 --- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md +++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md @@ -4,19 +4,19 @@ ## 获得Faster RCNN HRNet模型 ``` -wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/faster_rcnn_hrnetv2p_w18_1x.tar +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/faster_rcnn_hrnetv2p_w18_1x.tar.gz ``` ### 启动服务 ``` -tar xf faster_rcnn_hrnetv2p_w18_1x.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +tar xf faster_rcnn_hrnetv2p_w18_1x.tar.gz +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape. 请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/test_client.py b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/test_client.py index da21478a5de0feef816a4dedb9e3aab7cd011719..329f6effb4cb8a8a163cada106f6aaacc1cc3857 100644 --- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/test_client.py +++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/test_client.py @@ -1,12 +1,29 @@ -from paddle_serving_client import Client -from paddle_serving_app.reader import * +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), Div(255.0), - Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), - Resize(640, 640), Transpose((2, 0, 1)) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionResize((800, 1333), True, interpolation=2), + DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True), + DetectionTranspose((2,0,1)), + DetectionPadStride(32) ]) postprocess = RCNNPostprocess("label_list.txt", "output") @@ -15,13 +32,15 @@ client = Client() client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9494']) -im = preprocess(sys.argv[1]) +im, im_info = preprocess(sys.argv[1]) fetch_map = client.predict( feed={ "image": im, - "im_info": np.array(list(im.shape[1:]) + [1.0]), - "im_shape": np.array(list(im.shape[1:]) + [1.0]) + "im_shape": np.array(list(im.shape[1:])).reshape(-1), + "scale_factor": im_info['scale_factor'], }, - fetch=["multiclass_nms_0.tmp_0"], + fetch=["save_infer_model/scale_0.tmp_1"], batch=False) print(fetch_map) +fetch_map["image"] = sys.argv[1] +postprocess(fetch_map) diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md index 1fb0dfeecc6e82045bcaa026412f561e8a43908e..d56aa416b9e54114646f9271c27f6afde7d41259 100644 --- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md +++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md @@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf faster_rcnn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. But you need to do some extra work. @@ -19,7 +19,7 @@ Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/m ### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` ## 3. Result analysis diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md index 7617df7aa0c732d047e7cbd056f93e6a16f403d6..f8475daf029ae2230432871237281970052fe3e3 100644 --- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md +++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md @@ -11,14 +11,14 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf faster_rcnn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape. 请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` ## 3. 
结果分析 diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/test_client.py b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/test_client.py index a81f53dced2076615e14d05c844a91e40da19321..b6b2c534b0609692fea34bafcf4059222738debd 100644 --- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/test_client.py +++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/test_client.py @@ -12,15 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle_serving_client import Client -from paddle_serving_app.reader import * import sys import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), Div(255.0), - Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), - Resize(640, 640), Transpose((2, 0, 1)) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True), + DetectionResize( + (800, 1333), True, interpolation=cv2.INTER_LINEAR), + DetectionTranspose((2,0,1)), + DetectionPadStride(128) ]) postprocess = RCNNPostprocess("label_list.txt", "output") @@ -29,15 +33,14 @@ client = Client() client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9494']) -im = preprocess(sys.argv[1]) +im, im_info = preprocess(sys.argv[1]) fetch_map = client.predict( feed={ "image": im, "im_shape": np.array(list(im.shape[1:])).reshape(-1), - "scale_factor": np.array([1.0, 1.0]).reshape(-1), + "scale_factor": im_info['scale_factor'], }, fetch=["save_infer_model/scale_0.tmp_1"], batch=False) -print(fetch_map) fetch_map["image"] = sys.argv[1] postprocess(fetch_map) diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/000000014439.jpg b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/000000014439.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0abbdab06eb5950b93908cc91adfa640e8a3ac78 Binary files /dev/null and b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/000000014439.jpg differ diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/000000570688.jpg b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/000000570688.jpg deleted file mode 100644 index cb304bd56c4010c08611a30dcca58ea9140cea54..0000000000000000000000000000000000000000 Binary files a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/000000570688.jpg and /dev/null differ diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md index d0cdb1582584cb7e0e95d00231c2c8a5fb33d464..58d13e53fe9ac3b177a3b6e6661a1370efa796b9 100644 --- a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md +++ b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md @@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf fcos_dcn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. 
### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000014439.jpg ``` diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md index 56c2505c8c7ee2be7627a2f6fd9e108868428805..af2fd8753cc56ef9c732c21020712674313ac4fa 100644 --- a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md +++ b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md @@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf fcos_dcn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000014439.jpg ``` diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/test_client.py b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/test_client.py index bf5504105a61df7912e6c34037287610a1939479..7ad59d75b84cad081449df31393e06a26d7441dd 100644 --- a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/test_client.py +++ b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/test_client.py @@ -12,15 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle_serving_client import Client -from paddle_serving_app.reader import * import sys import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), Div(255.0), - Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), - Resize(640, 640), Transpose((2, 0, 1)) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True), + DetectionResize( + (800, 1333), True, interpolation=cv2.INTER_LINEAR), + DetectionTranspose((2,0,1)), + DetectionPadStride(128) ]) postprocess = RCNNPostprocess("label_list.txt", "output") @@ -29,12 +33,14 @@ client = Client() client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9494']) -im = preprocess(sys.argv[1]) +im, im_info = preprocess(sys.argv[1]) fetch_map = client.predict( feed={ "image": im, - "scale_factor": np.array([1.0, 1.0]).reshape(-1), + "scale_factor": im_info['scale_factor'], }, fetch=["save_infer_model/scale_0.tmp_1"], batch=False) print(fetch_map) +fetch_map["image"] = sys.argv[1] +postprocess(fetch_map) diff --git a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md index 8c3d5142ad2a88dc151478965e41def5075e4b2f..8060e087107e54bc401849fd576497e9fc9cd421 100644 --- a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md +++ b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md @@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf ppyolo_r50vd_dcn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. 
### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md index 1aebb8db9a0a3b3523d233a70ff42afe4f40a610..3071db7b124fd998d15901be7a78a67018d0de0f 100644 --- a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md +++ b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md @@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf ppyolo_r50vd_dcn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/test_client.py b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/test_client.py index dc3b2b77dc07822a3d69a85a7c38690c58a442c2..f40f2d5c87bdd64f588620d2d6f6ebf98a3894a7 100644 --- a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/test_client.py +++ b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/test_client.py @@ -12,15 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle_serving_client import Client -from paddle_serving_app.reader import * import sys import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), Div(255.0), - Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), - Resize((608, 608)), Transpose((2, 0, 1)) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True), + DetectionResize( + (608, 608), False, interpolation=2), + DetectionTranspose((2,0,1)) ]) postprocess = RCNNPostprocess("label_list.txt", "output") @@ -29,15 +32,14 @@ client = Client() client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9494']) -im = preprocess(sys.argv[1]) +im, im_info = preprocess(sys.argv[1]) fetch_map = client.predict( feed={ "image": im, "im_shape": np.array(list(im.shape[1:])).reshape(-1), - "scale_factor": np.array([1.0, 1.0]).reshape(-1), + "scale_factor": im_info['scale_factor'], }, fetch=["save_infer_model/scale_0.tmp_1"], batch=False) -print(fetch_map) fetch_map["image"] = sys.argv[1] postprocess(fetch_map) diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/000000014439.jpg b/python/examples/detection/ssd_vgg16_300_240e_voc/000000014439.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0abbdab06eb5950b93908cc91adfa640e8a3ac78 Binary files /dev/null and b/python/examples/detection/ssd_vgg16_300_240e_voc/000000014439.jpg differ diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/000000570688.jpg b/python/examples/detection/ssd_vgg16_300_240e_voc/000000570688.jpg deleted file mode 100644 index cb304bd56c4010c08611a30dcca58ea9140cea54..0000000000000000000000000000000000000000 Binary files a/python/examples/detection/ssd_vgg16_300_240e_voc/000000570688.jpg and /dev/null differ diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/README.md b/python/examples/detection/ssd_vgg16_300_240e_voc/README.md index 
062941bfb8deff3a09c938e9c43cd2b710cbb0e5..8a9a766c7b24d8468cbc72d6affd90263e86b013 100644 --- a/python/examples/detection/ssd_vgg16_300_240e_voc/README.md +++ b/python/examples/detection/ssd_vgg16_300_240e_voc/README.md @@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf ssd_vgg16_300_240e_voc.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. ### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000014439.jpg ``` diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md b/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md index 32c19b5159a497e52df1c5fd01a87fd43f7d67e4..d3df37d774bd1a478af0a41a9fca9f238ca69aac 100644 --- a/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md +++ b/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md @@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf ssd_vgg16_300_240e_voc.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000014439.jpg ``` diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/label_list.txt b/python/examples/detection/ssd_vgg16_300_240e_voc/label_list.txt index 941cb4e1392266f6a6c09b1fdc5f79503b2e5df6..8420ab35ede7400974f25836a6bb543024686a0e 100644 --- a/python/examples/detection/ssd_vgg16_300_240e_voc/label_list.txt +++ b/python/examples/detection/ssd_vgg16_300_240e_voc/label_list.txt @@ -1,80 +1,20 @@ -person +aeroplane bicycle -car -motorcycle -airplane -bus -train -truck -boat -traffic light -fire hydrant -stop sign -parking meter -bench bird +boat +bottle +bus +car cat +chair +cow +diningtable dog horse +motorbike +person +pottedplant sheep -cow -elephant -bear -zebra -giraffe -backpack -umbrella -handbag -tie -suitcase -frisbee -skis -snowboard -sports ball -kite -baseball bat -baseball glove -skateboard -surfboard -tennis racket -bottle -wine glass -cup -fork -knife -spoon -bowl -banana -apple -sandwich -orange -broccoli -carrot -hot dog -pizza -donut -cake -chair -couch -potted plant -bed -dining table -toilet -tv -laptop -mouse -remote -keyboard -cell phone -microwave -oven -toaster -sink -refrigerator -book -clock -vase -scissors -teddy bear -hair drier -toothbrush +sofa +train +tvmonitor diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/test_client.py b/python/examples/detection/ssd_vgg16_300_240e_voc/test_client.py index 3409ef8859e1e408a5a5c6ac833d3ad2d4b508d9..1df635c89d7228d10d8b08d4e011713200e9c828 100644 --- a/python/examples/detection/ssd_vgg16_300_240e_voc/test_client.py +++ b/python/examples/detection/ssd_vgg16_300_240e_voc/test_client.py @@ -12,15 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from paddle_serving_client import Client -from paddle_serving_app.reader import * import sys import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), - Normalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False), - Resize((512, 512)), Transpose((2, 0, 1)) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionResize( + (300, 300), False, interpolation=cv2.INTER_LINEAR), + DetectionNormalize([104.0, 117.0, 123.0], [1.0, 1.0, 1.0], False), + DetectionTranspose((2,0,1)), ]) postprocess = RCNNPostprocess("label_list.txt", "output") @@ -29,13 +32,15 @@ client = Client() client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9494']) -im = preprocess(sys.argv[1]) +im, im_info = preprocess(sys.argv[1]) fetch_map = client.predict( feed={ "image": im, - "im_shape": np.array([512, 512]), - "scale_factor": np.array([1.0, 1.0]).reshape(-1), + "im_shape": np.array(list(im.shape[1:])).reshape(-1), + "scale_factor": im_info['scale_factor'], }, fetch=["save_infer_model/scale_0.tmp_1"], batch=False) print(fetch_map) +fetch_map["image"] = sys.argv[1] +postprocess(fetch_map) diff --git a/python/examples/detection/ttfnet_darknet53_1x_coco/README.md b/python/examples/detection/ttfnet_darknet53_1x_coco/README.md index 58c538b7cdc5ff7975b57d292b1d8b0c7d5dd2b7..adf5de2abd39c3b440ac43ab9b1c1c58aba69c51 100644 --- a/python/examples/detection/ttfnet_darknet53_1x_coco/README.md +++ b/python/examples/detection/ttfnet_darknet53_1x_coco/README.md @@ -4,18 +4,17 @@ ### Get Model ``` -wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ttfnet_darknet53_1x_coco.tar +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/ttfnet_darknet53_1x_coco.tar ``` ### Start the service ``` tar xf ttfnet_darknet53_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. 
### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md b/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md index 641086cd2eba4b274325bca47791a60c6a5ec97f..7a2c860967643585023ce0f644a36e9c056c21a2 100644 --- a/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md +++ b/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md @@ -4,20 +4,19 @@ ## 获得模型 ``` -wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ttfnet_darknet53_1x_coco.tar +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/ttfnet_darknet53_1x_coco.tar ``` ### 启动服务 ``` tar xf ttfnet_darknet53_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ttfnet_darknet53_1x_coco/test_client.py b/python/examples/detection/ttfnet_darknet53_1x_coco/test_client.py index 59024d010a27c1569b5a07afd4508ad19894d89e..f735c01bc52db529a6823dbff4e72eb236525344 100644 --- a/python/examples/detection/ttfnet_darknet53_1x_coco/test_client.py +++ b/python/examples/detection/ttfnet_darknet53_1x_coco/test_client.py @@ -11,16 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from paddle_serving_client import Client -from paddle_serving_app.reader import * import sys import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), - Normalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False), - Resize((512, 512)), Transpose((2, 0, 1)) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionResize( + (512, 512), False, interpolation=cv2.INTER_LINEAR), + DetectionNormalize([123.675, 116.28, 103.53], [58.395, 57.12, 57.375], False), + DetectionTranspose((2,0,1)) ]) postprocess = RCNNPostprocess("label_list.txt", "output") @@ -29,11 +31,14 @@ client = Client() client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9494']) -im = preprocess(sys.argv[1]) +im, im_info = preprocess(sys.argv[1]) + + fetch_map = client.predict( feed={ "image": im, - "scale_factor": np.array([1.0, 1.0]).reshape(-1), + "im_shape": np.array(list(im.shape[1:])).reshape(-1), + "scale_factor": im_info['scale_factor'], }, fetch=["save_infer_model/scale_0.tmp_1"], batch=False) diff --git a/python/examples/detection/yolov3_darknet53_270e_coco/README.md b/python/examples/detection/yolov3_darknet53_270e_coco/README.md index 6357c3030a5936b4ec9105860dd63144bfd8098e..32670748db42336053d01e61bf087d00c03c7e06 100644 --- a/python/examples/detection/yolov3_darknet53_270e_coco/README.md +++ b/python/examples/detection/yolov3_darknet53_270e_coco/README.md @@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf yolov3_darknet53_270e_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server 
--port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. ### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md b/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md index 166d562e79a91bbc59cd7dc15b7e5667f4e0cb27..4185e0fe4963113ed0f9c0ea865705fd33226d1b 100644 --- a/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md +++ b/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md @@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf yolov3_darknet53_270e_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/yolov3_darknet53_270e_coco/test_client.py b/python/examples/detection/yolov3_darknet53_270e_coco/test_client.py index dc3b2b77dc07822a3d69a85a7c38690c58a442c2..04f21b32aebbf83694fa37aa30193ec5d5b7dbac 100644 --- a/python/examples/detection/yolov3_darknet53_270e_coco/test_client.py +++ b/python/examples/detection/yolov3_darknet53_270e_coco/test_client.py @@ -12,15 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle_serving_client import Client -from paddle_serving_app.reader import * import sys import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 -preprocess = Sequential([ - File2Image(), BGR2RGB(), Div(255.0), - Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), - Resize((608, 608)), Transpose((2, 0, 1)) +preprocess = DetectionSequential([ + DetectionFile2Image(), + DetectionResize( + (608, 608), False, interpolation=2), + DetectionNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True), + DetectionTranspose((2,0,1)), ]) postprocess = RCNNPostprocess("label_list.txt", "output") @@ -29,15 +32,14 @@ client = Client() client.load_client_config("serving_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9494']) -im = preprocess(sys.argv[1]) +im, im_info = preprocess(sys.argv[1]) fetch_map = client.predict( feed={ "image": im, "im_shape": np.array(list(im.shape[1:])).reshape(-1), - "scale_factor": np.array([1.0, 1.0]).reshape(-1), + "scale_factor": im_info['scale_factor'], }, fetch=["save_infer_model/scale_0.tmp_1"], batch=False) -print(fetch_map) fetch_map["image"] = sys.argv[1] postprocess(fetch_map) diff --git a/python/examples/encryption/README.md b/python/examples/encryption/README.md index 0d92604d15070df35e0125d084e7c68e1b36ae1b..3120422ebfaa2a88851eda18c42e7740fe29e884 100644 --- a/python/examples/encryption/README.md +++ b/python/examples/encryption/README.md @@ -12,9 +12,9 @@ sh get_data.sh ## Encrypt Model -The `paddlepaddle` package is used in this example, you may need to download the corresponding package(`pip install paddlepaddle`). +The `paddlepaddle` package is used in this example, you may need to download the corresponding package(`pip3 install paddlepaddle`). 
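For quick reference, the feed and fetch keys used by the updated detection clients above, collected from the per-example `test_client.py` diffs (a summary sketch, not an exhaustive spec):

```python
# Feed/fetch keys per detection example after this patch. `image` is the tensor
# returned by DetectionSequential, `scale_factor` comes from the accompanying
# im_info dict, and `im_shape` is derived from the preprocessed image shape.
DETECTION_FEED_KEYS = {
    "cascade_rcnn":                 ["image", "im_shape", "scale_factor"],
    "faster_rcnn_hrnetv2p_w18_1x":  ["image", "im_shape", "scale_factor"],
    "faster_rcnn_r50_fpn_1x_coco":  ["image", "im_shape", "scale_factor"],
    "fcos_dcn_r50_fpn_1x_coco":     ["image", "scale_factor"],
    "ppyolo_r50vd_dcn_1x_coco":     ["image", "im_shape", "scale_factor"],
    "ssd_vgg16_300_240e_voc":       ["image", "im_shape", "scale_factor"],
    "ttfnet_darknet53_1x_coco":     ["image", "im_shape", "scale_factor"],
    "yolov3_darknet53_270e_coco":   ["image", "im_shape", "scale_factor"],
}
DETECTION_FETCH_KEY = "save_infer_model/scale_0.tmp_1"  # same for every example above
```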
-[python encrypt.py](./encrypt.py) +[python3 encrypt.py](./encrypt.py) [//file]:#encrypt.py ``` python @@ -35,14 +35,14 @@ client-side configuration file are stored in the `encrypt_client` directory. ## Start Encryption Service CPU Service ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model ``` GPU Service ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 ``` ## Prediction ``` -python test_client.py encrypt_client/serving_client_conf.prototxt +python3 test_client.py encrypt_client/serving_client_conf.prototxt ``` diff --git a/python/examples/encryption/README_CN.md b/python/examples/encryption/README_CN.md index b6f8fb8411a8d93097a9e1dc28393096f3ebccc2..ad82d49b61cb70093a9423ad83dbc30663b6d4f1 100644 --- a/python/examples/encryption/README_CN.md +++ b/python/examples/encryption/README_CN.md @@ -11,9 +11,9 @@ sh get_data.sh ``` ## 模型加密 -本示例中使用了`paddlepaddle`包中的模块,需要进行下载(`pip install paddlepaddle`)。 +本示例中使用了`paddlepaddle`包中的模块,需要进行下载(`pip3 install paddlepaddle`)。 -运行[python encrypt.py](./encrypt.py)进行模型加密 +运行[python3 encrypt.py](./encrypt.py)进行模型加密 [//file]:#encrypt.py ``` python @@ -36,14 +36,14 @@ def serving_encryption(): ## 启动加密预测服务 CPU预测服务 ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model ``` GPU预测服务 ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 ``` ## 预测 ``` -python test_client.py encrypt_client/serving_client_conf.prototxt +python3 test_client.py encrypt_client/serving_client_conf.prototxt ``` diff --git a/python/examples/fit_a_line/README.md b/python/examples/fit_a_line/README.md index 3a16316ea8b0bafdaa43736e11662d8c6b5165f5..9586cd670240eb43e4a706ff89ea435b7a8c6d1c 100644 --- a/python/examples/fit_a_line/README.md +++ b/python/examples/fit_a_line/README.md @@ -15,22 +15,22 @@ sh get_data.sh ### Start server ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 ``` ## Client prediction ### RPC Client -The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`). +The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip3 install paddlepaddle`). 
``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` ### Http Client ``` shell -python test_httpclient.py uci_housing_client/serving_client_conf.prototxt +python3 test_httpclient.py uci_housing_client/serving_client_conf.prototxt ``` diff --git a/python/examples/fit_a_line/README_CN.md b/python/examples/fit_a_line/README_CN.md old mode 100644 new mode 100755 index 20e3c5d63b4a73634a7940ecef0e39232fe75d30..d1cace5e2c5b5cee2195deaa1667af68e5f1f067 --- a/python/examples/fit_a_line/README_CN.md +++ b/python/examples/fit_a_line/README_CN.md @@ -9,28 +9,26 @@ sh get_data.sh ``` - - -## 开启服务端 +## 开启服务端(支持BRPC-Client/GRPC Client/Http-Client) ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 ``` ## 客户端预测 -### 客户端RPC +### BRPC-Client -`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 +`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip3 install paddlepaddle`)。 ``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` -### 客户端Http预测 +### GRPC-Client/Http-Client ``` shell -python test_httpclient.py uci_housing_client/serving_client_conf.prototxt +python3 test_httpclient.py uci_housing_client/serving_client_conf.prototxt ``` diff --git a/python/examples/fit_a_line/test_httpclient.py b/python/examples/fit_a_line/test_httpclient.py index 96145b6343c7936774b8e3ad8841f8019e54eaec..c9f785dc99e2699027862fd2a28bd429e8b1a0a5 100755 --- a/python/examples/fit_a_line/test_httpclient.py +++ b/python/examples/fit_a_line/test_httpclient.py @@ -13,12 +13,12 @@ # limitations under the License. 
# pylint: disable=doc-string-missing -from paddle_serving_client.httpclient import GeneralClient +from paddle_serving_client.httpclient import HttpClient import sys import numpy as np import time -client = GeneralClient() +client = HttpClient() client.load_client_config(sys.argv[1]) ''' if you want use GRPC-client, set_use_grpc_client(True) @@ -41,13 +41,14 @@ we recommend use Proto data format in HTTP-body, set True(which is default) if you want use JSON data format in HTTP-body, set False ''' #client.set_http_proto(True) +client.connect(["127.0.0.1:9393"]) +fetch_list = client.get_fetch_names() import paddle test_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.uci_housing.test(), buf_size=500), batch_size=1) -fetch_list = client.get_fetch_names() for data in test_reader(): new_data = np.zeros((1, 13)).astype("float32") new_data[0] = data[0][0] diff --git a/python/examples/imagenet/README.md b/python/examples/imagenet/README.md old mode 100644 new mode 100755 index ad8b12b5bb8bf5669a34cf88637f34e640ca0a65..eaff522a5ae31eab08786489cbce0fa83f85e91d --- a/python/examples/imagenet/README.md +++ b/python/examples/imagenet/README.md @@ -12,38 +12,30 @@ sh get_model.sh ### Install preprocess module ``` -pip install paddle_serving_app +pip3 install paddle_serving_app ``` -### HTTP Service - -launch server side -``` -python resnet50_web_service.py ResNet50_vd_model cpu 9696 #cpu inference service -``` -``` -python resnet50_web_service.py ResNet50_vd_model gpu 9696 #gpu inference service -``` +### Inference Service(Support BRPC-Client/GRPC-Client/Http-Client) -client send inference request +launch server side ``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:9696/image/prediction +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu inference service ``` -### RPC Service - -launch server side ``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu inference service +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service ``` +### BRPC-Client +client send inference request ``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service +python3 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` +*the port of server side in this example is 9696 +### GRPC-Client/Http-Client client send inference request ``` -python resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt +python3 resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` -*the port of server side in this example is 9696 diff --git a/python/examples/imagenet/README_CN.md b/python/examples/imagenet/README_CN.md old mode 100644 new mode 100755 index 8650d51a6b41ac3ad68d49e3a7c966f0c0425ad1..642bee3d0cbab98a48f2f09284ea887751752667 --- a/python/examples/imagenet/README_CN.md +++ b/python/examples/imagenet/README_CN.md @@ -12,38 +12,30 @@ sh get_model.sh ### 安装数据预处理模块 ``` -pip install paddle_serving_app +pip3 install paddle_serving_app ``` -### HTTP服务 +### 启动服务端(支持BRPC-Client、GRPC-Client、Http-Client) 启动server端 ``` -python resnet50_web_service.py ResNet50_vd_model cpu 9696 #cpu预测服务 +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu预测服务 ``` -``` -python resnet50_web_service.py ResNet50_vd_model gpu 9696 #gpu预测服务 -``` - -发送HTTP 
POST请求 ``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:9696/image/prediction +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务 ``` -### RPC服务 - -启动server端 +### BRPC-Client预测 +client端进行预测 ``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu预测服务 +python3 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` +*server端示例中服务端口为9696端口 -``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务 -``` +### GRPC-Client/Http-Client预测 client端进行预测 ``` -python resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt +python3 resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` -*server端示例中服务端口为9696端口 diff --git a/python/examples/imagenet/resnet50_http_client.py b/python/examples/imagenet/resnet50_http_client.py new file mode 100644 index 0000000000000000000000000000000000000000..77782671b72a1fa41e65ca02b3edeb2a7753face --- /dev/null +++ b/python/examples/imagenet/resnet50_http_client.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from paddle_serving_client import HttpClient +from paddle_serving_app.reader import Sequential, URL2Image, Resize +from paddle_serving_app.reader import CenterCrop, RGB2BGR, Transpose, Div, Normalize +import time + +client = HttpClient() +client.load_client_config(sys.argv[1]) +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) +as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) 
+''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) +client.connect(["127.0.0.1:9696"]) + +label_dict = {} +label_idx = 0 +with open("imagenet.label") as fin: + for line in fin: + label_dict[label_idx] = line.strip() + label_idx += 1 + +seq = Sequential([ + URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)), + Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True) +]) + +start = time.time() +image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg" +for i in range(10): + img = seq(image_file) + fetch_map = client.predict( + feed={"image": img}, fetch=["score"], batch=False) + print(fetch_map) + +end = time.time() +print(end - start) diff --git a/python/examples/imagenet/resnet50_web_service.py b/python/examples/imagenet/resnet50_web_service.py deleted file mode 100644 index ca111615deb9d240f9d8b042f1f7edb599a1b775..0000000000000000000000000000000000000000 --- a/python/examples/imagenet/resnet50_web_service.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
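The commented-out switches in the new `resnet50_http_client.py` (and in the other HttpClient examples in this patch) are the full set of client-side knobs. Gathered in one place as a sketch, reusing the ResNet50 client config and port 9696 from the example above, with a random tensor standing in for a preprocessed image:

```python
# HttpClient setup with every optional switch shown, all left at their
# defaults, mirroring the commented lines in resnet50_http_client.py.
import numpy as np
from paddle_serving_client import HttpClient

client = HttpClient()
client.load_client_config("ResNet50_vd_client_config/serving_client_conf.prototxt")

# client.set_use_grpc_client(True)    # use GRPC instead of HTTP for the same service
# client.use_key("./key")             # only for servers started with --use_encryption_model
# client.set_request_compress(True)   # compress the request body
# client.set_response_compress(True)  # ask the server to compress the response
# client.set_http_proto(False)        # JSON HTTP body instead of the default Proto

client.connect(["127.0.0.1:9696"])

img = np.random.rand(3, 224, 224).astype("float32")   # stand-in for a preprocessed image
fetch_map = client.predict(feed={"image": img}, fetch=["score"], batch=False)
print(fetch_map)
```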
-import sys -from paddle_serving_client import Client -import numpy as np -from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage - -if len(sys.argv) != 4: - print("python resnet50_web_service.py model device port") - sys.exit(-1) - -device = sys.argv[2] - -if device == "cpu": - from paddle_serving_server.web_service import WebService -else: - from paddle_serving_server.web_service import WebService - - -class ImageService(WebService): - def init_imagenet_setting(self): - self.seq = Sequential([ - URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose( - (2, 0, 1)), Div(255), Normalize([0.485, 0.456, 0.406], - [0.229, 0.224, 0.225], True) - ]) - self.label_dict = {} - label_idx = 0 - with open("imagenet.label") as fin: - for line in fin: - self.label_dict[label_idx] = line.strip() - label_idx += 1 - - def preprocess(self, feed=[], fetch=[]): - feed_batch = [] - is_batch = True - for ins in feed: - if "image" not in ins: - raise ("feed data error!") - img = self.seq(ins["image"]) - feed_batch.append({"image": img[np.newaxis, :]}) - return feed_batch, fetch, is_batch - - def postprocess(self, feed=[], fetch=[], fetch_map={}): - score_list = fetch_map["score"] - result = {"label": [], "prob": []} - for score in score_list: - score = score.tolist() - max_score = max(score) - result["label"].append(self.label_dict[score.index(max_score)] - .strip().replace(",", "")) - result["prob"].append(max_score) - return result - - -image_service = ImageService(name="image") -image_service.load_model_config(sys.argv[1]) -image_service.init_imagenet_setting() -if device == "gpu": - image_service.set_gpus("0") -image_service.prepare_server( - workdir="workdir", port=int(sys.argv[3]), device=device) -image_service.run_rpc_service() -image_service.run_web_service() diff --git a/python/examples/imdb/README.md b/python/examples/imdb/README.md old mode 100644 new mode 100755 index e2b9a74c98e8993f19b14888f3e21343f526b81d..573ac47db37d23406e66fb1605ac60ea58189ffa --- a/python/examples/imdb/README.md +++ b/python/examples/imdb/README.md @@ -9,24 +9,20 @@ sh get_data.sh ``` the package downloaded contains cnn, lstm and bow model config along with their test_data and train_data. -### Start RPC inference service +### Start inference service(Support BRPC-Client/GRPC-Client/Http-Client) ``` -python -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 +python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 ``` -### RPC Infer +### BRPC-Client Infer ``` -head test_data/part-0 | python test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab +head test_data/part-0 | python3 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` it will get predict results of the first 10 test cases. 
-### Start HTTP inference service -``` -python text_classify_service.py imdb_cnn_model/ workdir/ 9292 imdb.vocab -``` -### HTTP Infer +### GRPC-Client/Http-Client Infer ``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:9292/imdb/prediction +head test_data/part-0 | python3 test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` diff --git a/python/examples/imdb/README_CN.md b/python/examples/imdb/README_CN.md old mode 100644 new mode 100755 index a669e29e94f6c6cce238473a8fc33405e29e8471..a1fecc8af35dcd2f5a38f47480b9b80b3cf96054 --- a/python/examples/imdb/README_CN.md +++ b/python/examples/imdb/README_CN.md @@ -9,23 +9,18 @@ sh get_data.sh ``` 脚本会下载和解压出cnn、lstm和bow三种模型的配置文文件以及test_data和train_data。 -### 启动RPC预测服务 +### 启动预测服务(支持BRPC-Client/GRPC-Client/Http-Client) ``` -python -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 +python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 ``` -### 执行预测 +### BRPC-Client预测 ``` -head test_data/part-0 | python test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab +head test_data/part-0 | python3 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` 预测test_data/part-0的前十个样例。 -### 启动HTTP预测服务 +### BRPC-Client预测 ``` -python text_classify_service.py imdb_cnn_model/ workdir/ 9292 imdb.vocab -``` -### 执行预测 - -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:9292/imdb/prediction +head test_data/part-0 | python3 test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` diff --git a/python/examples/imdb/imdb_web_service_demo.sh b/python/examples/imdb/imdb_web_service_demo.sh deleted file mode 100644 index 05d1b729c64359025119e443ed601c902a87ae4d..0000000000000000000000000000000000000000 --- a/python/examples/imdb/imdb_web_service_demo.sh +++ /dev/null @@ -1,5 +0,0 @@ -wget https://paddle-serving.bj.bcebos.com/imdb-demo/imdb_service.tar.gz -tar -xzf imdb_service.tar.gz -wget --no-check-certificate https://fleet.bj.bcebos.com/text_classification_data.tar.gz -tar -zxvf text_classification_data.tar.gz -python text_classify_service.py serving_server_model/ workdir imdb.vocab diff --git a/python/examples/imdb/test_http_client.py b/python/examples/imdb/test_http_client.py new file mode 100755 index 0000000000000000000000000000000000000000..e3cc705150ccc197ab1be24bf11e0a92e1d62380 --- /dev/null +++ b/python/examples/imdb/test_http_client.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing +from paddle_serving_client import HttpClient +from paddle_serving_app.reader.imdb_reader import IMDBDataset +import sys +import numpy as np + +client = HttpClient() +client.load_client_config(sys.argv[1]) +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) +as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) +''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) +client.connect(["127.0.0.1:9292"]) + +# you can define any english sentence or dataset here +# This example reuses imdb reader in training, you +# can define your own data preprocessing easily. +imdb_dataset = IMDBDataset() +imdb_dataset.load_resource(sys.argv[2]) + +for line in sys.stdin: + word_ids, label = imdb_dataset.get_words_and_label(line) + word_len = len(word_ids) + feed = { + "words": np.array(word_ids).reshape(word_len, 1), + "words.lod": [0, word_len] + } + #print(feed) + fetch = ["prediction"] + fetch_map = client.predict(feed=feed, fetch=fetch, batch=True) + print(fetch_map) diff --git a/python/examples/imdb/text_classify_service.py b/python/examples/imdb/text_classify_service.py deleted file mode 100644 index ca1e26002baf0284f282add235706080f7902c33..0000000000000000000000000000000000000000 --- a/python/examples/imdb/text_classify_service.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
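The new `test_http_client.py` above also shows the convention for variable-length inputs: the flattened ids go under the tensor name and the batch offsets under a companion `<name>.lod` key, with `batch=True` on the request. A minimal self-contained version (single hard-coded sentence instead of stdin):

```python
# One-off imdb prediction over HTTP, following test_http_client.py above.
import numpy as np
from paddle_serving_app.reader.imdb_reader import IMDBDataset
from paddle_serving_client import HttpClient

client = HttpClient()
client.load_client_config("imdb_cnn_client_conf/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

imdb_dataset = IMDBDataset()
imdb_dataset.load_resource("imdb.vocab")

word_ids, label = imdb_dataset.get_words_and_label("i am very sad | 0")
word_len = len(word_ids)
feed = {
    "words": np.array(word_ids).reshape(word_len, 1),  # flattened token ids
    "words.lod": [0, word_len],                         # offsets: one sample of word_len tokens
}
fetch_map = client.predict(feed=feed, fetch=["prediction"], batch=True)
print(fetch_map)
```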
-# pylint: disable=doc-string-missing - -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader.imdb_reader import IMDBDataset -import sys -import numpy as np - - -class IMDBService(WebService): - def prepare_dict(self, args={}): - if len(args) == 0: - exit(-1) - self.dataset = IMDBDataset() - self.dataset.load_resource(args["dict_file_path"]) - - def preprocess(self, feed={}, fetch=[]): - feed_batch = [] - words_lod = [0] - is_batch = True - for ins in feed: - words = self.dataset.get_words_only(ins["words"]) - words = np.array(words).reshape(len(words), 1) - words_lod.append(words_lod[-1] + len(words)) - feed_batch.append(words) - feed = {"words": np.concatenate(feed_batch), "words.lod": words_lod} - return feed, fetch, is_batch - - -imdb_service = IMDBService(name="imdb") -imdb_service.load_model_config(sys.argv[1]) -imdb_service.prepare_server( - workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu") -imdb_service.prepare_dict({"dict_file_path": sys.argv[4]}) -imdb_service.run_rpc_service() -imdb_service.run_web_service() diff --git a/python/examples/lac/README.md b/python/examples/lac/README.md old mode 100644 new mode 100755 index 8d7adfb583f8e8e1fde0681a73f2bba65452fa87..108d5051b50b2b639e28c023364d36ec9a0a0a44 --- a/python/examples/lac/README.md +++ b/python/examples/lac/README.md @@ -4,28 +4,23 @@ ### Get Model ``` -python -m paddle_serving_app.package --get_model lac +python3 -m paddle_serving_app.package --get_model lac tar -xzvf lac.tar.gz ``` -#### Start RPC inference service +#### Start inference service(Support BRPC-Client/GRPC-Client/Http-Client) ``` -python -m paddle_serving_server.serve --model lac_model/ --port 9292 +python3 -m paddle_serving_server.serve --model lac_model/ --port 9292 ``` -### RPC Infer +### BRPC Infer ``` -echo "我爱北京天安门" | python lac_client.py lac_client/serving_client_conf.prototxt +echo "我爱北京天安门" | python3 lac_client.py lac_client/serving_client_conf.prototxt ``` It will get the segmentation result. 
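`lac_client.py` itself is not modified by this patch, so the BRPC command above depends on the script already in the repository. A minimal sketch of the call sequence it is assumed to perform, mirroring the `words` / `words.lod` feed layout used by the new `lac_http_client.py` later in this patch (the checked-in script remains authoritative):

```python
# Assumed BRPC flow behind `echo ... | python3 lac_client.py <client_conf>`;
# a sketch, not the script shipped in the repository.
import sys
import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import LACReader

client = Client()
client.load_client_config(sys.argv[1])  # lac_client/serving_client_conf.prototxt
client.connect(["127.0.0.1:9292"])

reader = LACReader()
for line in sys.stdin:
    feed_data = reader.process(line)  # map the sentence to word ids
    if len(feed_data) <= 0:
        continue
    fetch_map = client.predict(
        feed={
            "words": np.array(feed_data).reshape(len(feed_data), 1),
            "words.lod": [0, len(feed_data)]  # one LoD sequence per sentence
        },
        fetch=["crf_decode"],
        batch=True)
    print(fetch_map)
```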
-### Start HTTP inference service +### GRPC/Http Infer ``` -python lac_web_service.py lac_model/ lac_workdir 9292 -``` -### HTTP Infer - -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction +echo "我爱北京天安门" | python3 lac_http_client.py lac_client/serving_client_conf.prototxt ``` diff --git a/python/examples/lac/README_CN.md b/python/examples/lac/README_CN.md old mode 100644 new mode 100755 index 2379aa8ed69c026c6afd94b8b791774882eaf567..5634128c80c23126836677f4cb434df68dde9056 --- a/python/examples/lac/README_CN.md +++ b/python/examples/lac/README_CN.md @@ -4,28 +4,23 @@ ### 获取模型 ``` -python -m paddle_serving_app.package --get_model lac +python3 -m paddle_serving_app.package --get_model lac tar -xzvf lac.tar.gz ``` -#### 开启RPC预测服务 +#### 开启预测服务(支持BRPC-Client/GRPC-Client/Http-Client) ``` -python -m paddle_serving_server.serve --model lac_model/ --port 9292 +python3 -m paddle_serving_server.serve --model lac_model/ --port 9292 ``` -### 执行RPC预测 +### 执行BRPC预测 ``` -echo "我爱北京天安门" | python lac_client.py lac_client/serving_client_conf.prototxt +echo "我爱北京天安门" | python3 lac_client.py lac_client/serving_client_conf.prototxt ``` 我们就能得到分词结果 -### 开启HTTP预测服务 +### 执行GRPC/Http预测 ``` -python lac_web_service.py lac_model/ lac_workdir 9292 -``` -### 执行HTTP预测 - -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction +echo "我爱北京天安门" | python3 lac_http_client.py lac_client/serving_client_conf.prototxt ``` diff --git a/python/examples/lac/lac_http_client.py b/python/examples/lac/lac_http_client.py old mode 100644 new mode 100755 index 852d785f368e95bb16bfd5804e3153b022945f59..5cdfaf1df46a43d04b7e09f0f6376364a9dcb89f --- a/python/examples/lac/lac_http_client.py +++ b/python/examples/lac/lac_http_client.py @@ -1,3 +1,4 @@ +# encoding=utf-8 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,17 +12,55 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#coding=utf-8 -import requests -import json -import time +# pylint: disable=doc-string-missing -if __name__ == "__main__": - server = "http://127.0.0.1:9280/lac/prediction" - fin = open("jieba_test.txt", "r") - start = time.time() - for line in fin: - req_data = {"words": line.strip(), "fetch": ["crf_decode"]} - r = requests.post(server, json=req_data) - end = time.time() - print(end - start) +from paddle_serving_client import HttpClient +from paddle_serving_app.reader import LACReader +import sys +import os +import io +import numpy as np + +client = HttpClient() +client.load_client_config(sys.argv[1]) +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) +as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) 
+''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) +client.connect(["127.0.0.1:9292"]) + +reader = LACReader() +for line in sys.stdin: + if len(line) <= 0: + continue + feed_data = reader.process(line) + if len(feed_data) <= 0: + continue + print(feed_data) + #fetch_map = client.predict(feed={"words": np.array(feed_data).reshape(len(feed_data), 1), "words.lod": [0, len(feed_data)]}, fetch=["crf_decode"], batch=True) + fetch_map = client.predict( + feed={ + "words": np.array(feed_data + feed_data).reshape( + len(feed_data) * 2, 1), + "words.lod": [0, len(feed_data), 2 * len(feed_data)] + }, + fetch=["crf_decode"], + batch=True) + print(fetch_map) diff --git a/python/examples/lac/lac_web_service.py b/python/examples/lac/lac_web_service.py deleted file mode 100644 index cf37f66294bd154324f2c7cacd1a35571b6c6350..0000000000000000000000000000000000000000 --- a/python/examples/lac/lac_web_service.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle_serving_server.web_service import WebService -import sys -from paddle_serving_app.reader import LACReader -import numpy as np - - -class LACService(WebService): - def load_reader(self): - self.reader = LACReader() - - def preprocess(self, feed={}, fetch=[]): - feed_batch = [] - fetch = ["crf_decode"] - lod_info = [0] - is_batch = True - for ins in feed: - if "words" not in ins: - raise ("feed data error!") - feed_data = self.reader.process(ins["words"]) - feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1)) - lod_info.append(lod_info[-1] + len(feed_data)) - feed_dict = { - "words": np.concatenate( - feed_batch, axis=0), - "words.lod": lod_info - } - return feed_dict, fetch, is_batch - - def postprocess(self, feed={}, fetch=[], fetch_map={}): - batch_ret = [] - for idx, ins in enumerate(feed): - begin = fetch_map['crf_decode.lod'][idx] - end = fetch_map['crf_decode.lod'][idx + 1] - segs = self.reader.parse_result(ins["words"], - fetch_map["crf_decode"][begin:end]) - batch_ret.append({"word_seg": "|".join(segs)}) - return batch_ret - - -lac_service = LACService(name="lac") -lac_service.load_model_config(sys.argv[1]) -lac_service.load_reader() -lac_service.prepare_server( - workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu") -lac_service.run_rpc_service() -lac_service.run_web_service() diff --git a/python/examples/low_precision/resnet50/README.md b/python/examples/low_precision/resnet50/README.md index 9e1ff16c676b067437183e6e19446e8a526feed5..b4ae2552c3dcd1c30c67b5731d81095e05ca9a86 100644 --- a/python/examples/low_precision/resnet50/README.md +++ b/python/examples/low_precision/resnet50/README.md @@ -11,15 +11,15 @@ Firstly, download the [Resnet50 int8 model](https://paddle-inference-dist.bj.bce wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz tar zxvf ResNet50_quant.tar.gz -python -m paddle_serving_client.convert --dirname ResNet50_quant +python3 -m paddle_serving_client.convert --dirname ResNet50_quant ``` Start RPC service, specify the GPU id and precision mode ``` -python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 +python3 -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 ``` Request the serving service with Client ``` -python resnet50_client.py +python3 resnet50_client.py ``` ## Reference diff --git a/python/examples/low_precision/resnet50/README_CN.md b/python/examples/low_precision/resnet50/README_CN.md index 1c1a3be1de1690e9736d994016ac05cfba12bcab..648b64dd2b0a5089ce8539c42c0222862e89d8f3 100644 --- a/python/examples/low_precision/resnet50/README_CN.md +++ b/python/examples/low_precision/resnet50/README_CN.md @@ -10,15 +10,15 @@ wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz tar zxvf ResNet50_quant.tar.gz -python -m paddle_serving_client.convert --dirname ResNet50_quant +python3 -m paddle_serving_client.convert --dirname ResNet50_quant ``` 启动rpc服务, 设定所选GPU id、部署模型精度 ``` -python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 +python3 -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 ``` 使用client进行请求 ``` -python resnet50_client.py +python3 resnet50_client.py ``` ## 参考文档 diff --git a/python/examples/mobilenet/README.md b/python/examples/mobilenet/README.md index 
4a808026af0ca5cc1920a292c3f85c82962a3f41..1a16b749220bdf8e6db0dd8950fc505620cbc8fc 100644 --- a/python/examples/mobilenet/README.md +++ b/python/examples/mobilenet/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model mobilenet_v2_imagenet +python3 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet tar -xzvf mobilenet_v2_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 ``` ### Client Prediction ``` -python mobilenet_tutorial.py +python3 mobilenet_tutorial.py ``` diff --git a/python/examples/mobilenet/README_CN.md b/python/examples/mobilenet/README_CN.md index d4f91837ec5e03c4ef32041580e5d6b30039480e..68474e5d80afdec183cb5bac0e9ebfc13a7f9ac6 100644 --- a/python/examples/mobilenet/README_CN.md +++ b/python/examples/mobilenet/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model mobilenet_v2_imagenet +python3 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet tar -xzvf mobilenet_v2_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 ``` ### 客户端预测 ``` -python mobilenet_tutorial.py +python3 mobilenet_tutorial.py ``` diff --git a/python/examples/ocr/README.md b/python/examples/ocr/README.md old mode 100644 new mode 100755 index dfa836fdc7ae29747ad400d21e585a775e0593e4..95cc210a7e68d5582e68460f2eec89419bf7fd7c --- a/python/examples/ocr/README.md +++ b/python/examples/ocr/README.md @@ -4,9 +4,9 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model ocr_rec +python3 -m paddle_serving_app.package --get_model ocr_rec tar -xzvf ocr_rec.tar.gz -python -m paddle_serving_app.package --get_model ocr_det +python3 -m paddle_serving_app.package --get_model ocr_det tar -xzvf ocr_det.tar.gz ``` @@ -23,16 +23,16 @@ tar xf test_imgs.tar ``` #choose one of cpu/gpu commands as following #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 -python ocr_web_server.py cpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 +python3 ocr_web_server.py cpu #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 -python ocr_web_server.py gpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 +python3 ocr_web_server.py gpu ``` ### Client Prediction ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` If you want a faster web service, please try Web LocalPredictor Service @@ -40,14 +40,14 @@ If you want a faster web service, please try Web LocalPredictor Service ``` #choose one of cpu/gpu commands as following #for cpu user -python ocr_debugger_server.py cpu +python3 ocr_debugger_server.py cpu #for gpu user -python ocr_debugger_server.py gpu +python3 ocr_debugger_server.py gpu ``` ## Web LocalPredictor Client Prediction ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` ## Benchmark @@ -69,34 +69,34 @@ if you are going to detect images not recognize it or directly recognize the wor ### Det Server ``` -python det_web_server.py cpu #for cpu user -python det_web_server.py gpu #for 
gpu user +python3 det_web_server.py cpu #for cpu user +python3 det_web_server.py gpu #for gpu user #or -python det_debugger_server.py cpu #for cpu user -python det_debugger_server.py gpu #for gpu user +python3 det_debugger_server.py cpu #for cpu user +python3 det_debugger_server.py gpu #for gpu user ``` ### Det Client ``` # also use ocr_web_client.py -python ocr_web_client.py +python3 ocr_web_client.py ``` ### Rec Server ``` -python rec_web_server.py cpu #for cpu user -python rec_web_server.py gpu #for gpu user +python3 rec_web_server.py cpu #for cpu user +python3 rec_web_server.py gpu #for gpu user #or -python rec_debugger_server.py cpu #for cpu user -python rec_debugger_server.py gpu #for gpu user +python3 rec_debugger_server.py cpu #for cpu user +python3 rec_debugger_server.py gpu #for gpu user ``` ### Rec Client ``` -python rec_web_client.py +python3 rec_web_client.py ``` ## C++ OCR Service @@ -109,9 +109,9 @@ Select a startup mode according to CPU / GPU device After the -- model parameter, the folder path of multiple model files is passed in to start the prediction service of multiple model concatenation. ``` #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 ``` ### Client Prediction @@ -119,9 +119,9 @@ The pre-processing and post-processing is in the C + + server part, the image's so the value of parameter `feed_var` which is in the file `ocr_det_client/serving_client_conf.prototxt` should be changed. -for this case, `feed_type` should be 3(which means the data type is string),`shape` should be 1. +for this case, `feed_type` should be 20(which means the data type is string),`shape` should be 1. By passing in multiple client folder paths, the client can be started for multi model prediction. 
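For reference, the `feed_var` entry being described is assumed to end up roughly as sketched below once `feed_type` is set to 20 and `shape` to 1; the variable name and the remaining fields come from the generated `ocr_det_client/serving_client_conf.prototxt` and may differ, so the generated file is authoritative. The multi-model client command follows right after.

```
feed_var {
  name: "image"          # name and alias_name as generated; shown here as an assumption
  alias_name: "image"
  is_lod_tensor: false
  feed_type: 20          # 20 marks the input as a string (the raw image data)
  shape: 1
}
```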
``` -python ocr_cpp_client.py ocr_det_client ocr_rec_client +python3 ocr_cpp_client.py ocr_det_client ocr_rec_client ``` diff --git a/python/examples/ocr/README_CN.md b/python/examples/ocr/README_CN.md old mode 100644 new mode 100755 index 7e02109252c37bb972b36214713a65dd334524dd..5c0734c94aa6d61e1fdb9e8f87d5ee187c805ff0 --- a/python/examples/ocr/README_CN.md +++ b/python/examples/ocr/README_CN.md @@ -4,9 +4,9 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model ocr_rec +python3 -m paddle_serving_app.package --get_model ocr_rec tar -xzvf ocr_rec.tar.gz -python -m paddle_serving_app.package --get_model ocr_det +python3 -m paddle_serving_app.package --get_model ocr_det tar -xzvf ocr_det.tar.gz ``` ## 获取数据集(可选) @@ -22,16 +22,16 @@ tar xf test_imgs.tar ``` #根据CPU/GPU设备选择一种启动方式 #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 -python ocr_web_server.py cpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 +python3 ocr_web_server.py cpu #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 -python ocr_web_server.py gpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 +python3 ocr_web_server.py gpu ``` ### 启动客户端 ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` 如果用户需要更快的执行速度,请尝试LocalPredictor版Web服务 @@ -39,14 +39,14 @@ python ocr_web_client.py ``` #根据CPU/GPU设备选择一种启动方式 #for cpu user -python ocr_debugger_server.py cpu +python3 ocr_debugger_server.py cpu #for gpu user -python ocr_debugger_server.py gpu +python3 ocr_debugger_server.py gpu ``` ## 启动客户端 ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` ## 性能指标 @@ -69,34 +69,34 @@ GPU: Nvidia Tesla V100单卡 ### 启动检测服务 ``` -python det_web_server.py cpu #for cpu user -python det_web_server.py gpu #for gpu user +python3 det_web_server.py cpu #for cpu user +python3 det_web_server.py gpu #for gpu user #or -python det_debugger_server.py cpu #for cpu user -python det_debugger_server.py gpu #for gpu user +python3 det_debugger_server.py cpu #for cpu user +python3 det_debugger_server.py gpu #for gpu user ``` ### 检测服务客户端 ``` # also use ocr_web_client.py -python ocr_web_client.py +python3 ocr_web_client.py ``` ### 启动识别服务 ``` -python rec_web_server.py cpu #for cpu user -python rec_web_server.py gpu #for gpu user +python3 rec_web_server.py cpu #for cpu user +python3 rec_web_server.py gpu #for gpu user #or -python rec_debugger_server.py cpu #for cpu user -python rec_debugger_server.py gpu #for gpu user +python3 rec_debugger_server.py cpu #for cpu user +python3 rec_debugger_server.py gpu #for gpu user ``` ### 识别服务客户端 ``` -python rec_web_client.py +python3 rec_web_client.py ``` ## C++ OCR Service服务 @@ -108,9 +108,9 @@ python rec_web_client.py 通过--model后,指定多个模型文件的文件夹路径来启动多模型串联的预测服务。 ``` #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 ``` ### 启动客户端 @@ -118,9 +118,9 @@ python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 即`ocr_det_client/serving_client_conf.prototxt`中`feed_var`字段 -对于本示例而言,`feed_type`应修改为3(数据类型为string),`shape`为1. +对于本示例而言,`feed_type`应修改为20(数据类型为string),`shape`为1. 
通过在客户端启动后加入多个client模型的client配置文件夹路径,启动client进行预测。 ``` -python ocr_cpp_client.py ocr_det_client ocr_rec_client +python3 ocr_cpp_client.py ocr_det_client ocr_rec_client ``` diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/README.md b/python/examples/pipeline/PaddleClas/DarkNet53/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/README.md +++ b/python/examples/pipeline/PaddleClas/DarkNet53/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md b/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md +++ b/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py b/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py index d643b90f5b7ac6ef6892bb83e7dfb20b650df49b..3e5db19b69fc8693adfe77a84297436bfb497642 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py +++ b/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py b/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py index 3e43ce8608e5e0edac1802910856be2ed6e6b635..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/README.md b/python/examples/pipeline/PaddleClas/MobileNetV1/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/README.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md b/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py b/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py index 3e43ce8608e5e0edac1802910856be2ed6e6b635..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/README.md b/python/examples/pipeline/PaddleClas/MobileNetV2/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/README.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md b/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py b/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py index 3e43ce8608e5e0edac1802910856be2ed6e6b635..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py index 3e43ce8608e5e0edac1802910856be2ed6e6b635..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py index 3e43ce8608e5e0edac1802910856be2ed6e6b635..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
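A note on the import change repeated in every client and benchmark script above: the `try/except` fallback to `paddle_serving_server_gpu` is dropped because the CPU and GPU wheels now share the `paddle_serving_server` package name. A minimal sketch of driving the pipeline RPC client with the unified import follows; the feed key `"image"`, the fetch name, and the exact `predict()` keyword arguments are illustrative assumptions, not taken from this patch.

```
# Minimal sketch: pipeline RPC call with the unified package name.
# Feed/fetch names and the predict() signature are assumptions to verify
# against the installed paddle_serving_server version.
import base64
from paddle_serving_server.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18080'])   # gRPC endpoint used by these examples

with open("daisy.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf8")

ret = client.predict(feed_dict={"image": img_b64}, fetch=["res"])
print(ret)
```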
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py index 3e43ce8608e5e0edac1802910856be2ed6e6b635..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
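The other change applied to every `benchmark.py` is `yaml.load(fin)` becoming `yaml.load(fin, yaml.FullLoader)`. Since PyYAML 5.1, calling `yaml.load` without an explicit `Loader` emits a warning because the legacy default can construct arbitrary Python objects; `FullLoader` (or `yaml.safe_load`) keeps parsing restricted to plain data. A short sketch of the safer pattern:

```
import yaml

with open("config.yml") as fin:
    # Explicit loader: silences the PyYAML >= 5.1 warning and avoids the
    # unsafe legacy default.
    config = yaml.load(fin, Loader=yaml.FullLoader)

# For plain config files, safe_load is an equivalent, stricter alternative.
with open("config.yml") as fin:
    config = yaml.safe_load(fin)
```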
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py index b89c2cc74f4c57906ff871e1dde244d5b37098c4..43dac2a27c64d79f85f73011755c418cc6a59f1e 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
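The `gen_yml` helpers touched in these hunks all follow one pattern: load `config.yml`, enable the tracer, switch the op's `local_service_conf` to GPU when requested, and write the result to `config2.yml`. A condensed sketch of that pattern is below; the op key `"imagenet"` matches the PaddleClas examples, while other examples substitute their own op name.

```
import yaml

def gen_yml(device, gpu_id, op_name="imagenet"):
    # Rewrite config.yml -> config2.yml with tracing enabled and the requested
    # device, mirroring the gen_yml helpers in these benchmark scripts.
    with open("config.yml") as fin:
        config = yaml.load(fin, yaml.FullLoader)
    config["dag"]["tracer"] = {"interval_s": 10}
    local_conf = config["op"][op_name]["local_service_conf"]
    if device == "gpu":
        local_conf["device_type"] = 1   # GPU in these configs
        local_conf["devices"] = gpu_id  # e.g. "0" or "0,1"
    else:
        local_conf["device_type"] = 0   # CPU
    with open("config2.yml", "w") as fout:
        yaml.dump(config, fout, default_flow_style=False)
```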
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py index 7aade27ea198afe1cbac7b775cfe3a6cbcb3b1df..569b15bcfa61a1a1732de303e2980e9b4387c9a0 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py index 2734521dda15fe1c491fc66c5536203888d00d23..debc1753cc9174dd79bf3a0072681b352c8be17b 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md index 5b909301d9e114019ae8c6ac2bbfcec3cb188b33..1297abfb7a649e3eced26ea4c08848e0a51fbdbf 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md @@ -4,17 +4,17 @@ This document will takes Imagenet service as an example to introduce how to use ## Get model ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md index cc2fcdd7514fc197ec892826ec56b76906150578..d547b289281cb13a3abb49343b6b77230a2f3d2c 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md @@ -4,18 +4,17 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py index 98babc4acddb9a548afeafed1dfee16a88244714..4b0336f97c2c520a46d596bf5e435c2b9e3094a9 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18000/imagenet/prediction" + url = "http://127.0.0.1:18000/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py index 
34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..c80da12ce36618e75897b33d58e4f4febd382861 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
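Load generation in all of these benchmarks goes through `MultiThreadRunner` from `paddle_serving_client.utils`: `run(worker, thread, batch_size)` starts `thread` copies of the worker and aggregates each slot of the workers' return values, which is why the scripts pass `result[1]` (the collected latencies) to `show_latency`. A stripped-down sketch of that driving pattern follows; the worker body is a stand-in, not the examples' real request code.

```
import time
from paddle_serving_client.utils import MultiThreadRunner, show_latency

def run_worker(idx, batch_size):
    # Stand-in worker: a real benchmark sends batch_size-sized requests in a
    # loop and records per-request latency in milliseconds.
    latency_list = []
    start = time.time()
    time.sleep(0.01)                       # placeholder for one request
    latency_list.append((time.time() - start) * 1000.0)
    total_num = 1
    return [[time.time() - start], latency_list, [total_num]]

if __name__ == "__main__":
    thread, batch_size = 4, 1
    runner = MultiThreadRunner()
    # result[i] gathers element i of every worker's return value.
    result = runner.run(run_worker, thread, batch_size)
    show_latency(result[1])
```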
+ import sys import os import base64 @@ -5,16 +19,16 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -24,9 +38,10 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..a816eb8eed49d922d5caf729dfd089fc28936853 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py @@ -11,10 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py index 3e43ce8608e5e0edac1802910856be2ed6e6b635..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 diff --git a/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md b/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md index 4d242be2f3f7550c3bb64053a5689894a6b2c76c..a56ecbef06d82eef59510a1242de7f19c0915d55 100644 --- a/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md +++ b/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md @@ -8,11 +8,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf faster_rcnn_r50_fpn_1x_coco.tar -python web_service.py +python3 web_service.py ``` ### Perform prediction ``` -python pipeline_http_client.py +python3 pipeline_http_client.py ``` diff --git a/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py b/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py index f0a55614c1390b1d4f73bd015b1ce21b85e4ba55..f8d5f2b4fd196048a139867a893b06f47d2778bb 100644 --- a/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py +++ b/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
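For the HTTP path, the detection benchmarks base64-encode the raw image bytes (`cv2_to_base64`) and POST JSON to the op's `/prediction` URL. The sketch below assumes the pipeline web service's usual `{"key": [...], "value": [...]}` request body; that body format is not shown in this patch, so treat it as an assumption to check against the served config, and the image file name is only a placeholder.

```
# Hedged sketch of a single HTTP request to a pipeline web service.
# The {"key": [...], "value": [...]} body is an assumed protocol detail,
# not something taken from this patch.
import base64
import json
import requests

def cv2_to_base64(image_bytes):
    return base64.b64encode(image_bytes).decode('utf8')

url = "http://127.0.0.1:18082/faster_rcnn/prediction"  # endpoint used by this example
with open("test.jpg", "rb") as f:                      # placeholder image file
    payload = {"key": ["image"], "value": [cv2_to_base64(f.read())]}

resp = requests.post(url=url, data=json.dumps(payload))
print(resp.json())
```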
+ import sys import os import yaml @@ -6,20 +20,20 @@ import time import json import cv2 import base64 -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 30} if device == "gpu": config["op"]["faster_rcnn"]["local_service_conf"]["device_type"] = 1 - config["op"]["faster_rcnn"]["local_service_conf"]["devices"] = gpu_id - with open("config2.yml", "w") as fout: + config["op"]["faster_rcnn"]["local_service_conf"]["devices"] = gpu_id + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) url = "http://127.0.0.1:18082/faster_rcnn/prediction" @@ -65,6 +81,7 @@ def run_http(idx, batch_size): break return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): pass + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gpu_id = sys.argv[5] gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -110,4 +130,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py b/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py index fa026000e399cf0246df4afa2a37005d40d53d70..08a9122296b801689f3d5faf2c75113b293ea220 100644 --- a/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py +++ b/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py @@ -25,7 +25,7 @@ class FasterRCNNOp(Op): self.img_preprocess = Sequential([ BGR2RGB(), Div(255.0), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False), - Resize((640, 640)), Transpose((2, 0, 1)) + Resize(640, 640), Transpose((2, 0, 1)) ]) self.img_postprocess = RCNNPostprocess("label_list.txt", "output") diff --git a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md index 
a37ca74056fb9dcd4a609f87f914e1ac71df070d..73087efca7abc75d9ed7d6178d962911b9a2b1cb 100644 --- a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md +++ b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md @@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf ppyolo_mbv3_large_coco.tar -python web_service.py +python3 web_service.py ``` ### Perform prediction ``` -python pipeline_http_client.py +python3 pipeline_http_client.py ``` - diff --git a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py index a23f64314ef448f2617f92ab40f94f75cc6e707f..611712b6754efd88fc7b51027e99b9bb3e82cf7d 100644 --- a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py +++ b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import yaml @@ -6,20 +20,20 @@ import time import json import cv2 import base64 -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 30} if device == "gpu": config["op"]["ppyolo_mbv3"]["local_service_conf"]["device_type"] = 1 config["op"]["ppyolo_mbv3"]["local_service_conf"]["devices"] = gpu_id - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) url = "http://127.0.0.1:18082/ppyolo_mbv3/prediction" @@ -65,6 +81,7 @@ def run_http(idx, batch_size): break return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): pass + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = 
multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gpu_id = sys.argv[5] gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -110,4 +130,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleDetection/yolov3/README.md b/python/examples/pipeline/PaddleDetection/yolov3/README.md index 1a1431a2a90d404c23728e5515c00aebce0fa4a7..8340f1060d0be6b100575ecbcb0270db0a6227f4 100644 --- a/python/examples/pipeline/PaddleDetection/yolov3/README.md +++ b/python/examples/pipeline/PaddleDetection/yolov3/README.md @@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf yolov3_darknet53_270e_coco.tar -python web_service.py +python3 web_service.py ``` ### Perform prediction ``` -python pipeline_http_client.py +python3 pipeline_http_client.py ``` - diff --git a/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py b/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py index ae9c5a8fb25f56eebe3c3893a4a4d251f21e5b61..cb73d2f932c12d0559af307b3ecf12ecf7986390 100644 --- a/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py +++ b/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
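Each benchmark also gains a `parse_benchmark` helper that post-processes the tracer output: it loads the profiling YAML, collects the `DAG` keys whose names contain `"call"` (the raw per-call entries), removes them, and dumps the remaining summary. The elided middle of those hunks is assumed to do the deletion via `del_list`; a condensed sketch under that assumption:

```
import yaml

def parse_benchmark(filein, fileout):
    # Strip the per-call entries from the DAG tracer stats and write the
    # summarized profile back out as plain YAML.
    with open(filein, "r") as fin:
        res = yaml.load(fin, yaml.FullLoader)
    del_list = [key for key in res["DAG"] if "call" in key]
    for key in del_list:
        del res["DAG"][key]
    with open(fileout, "w") as fout:
        yaml.dump(res, fout, default_flow_style=False)
```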
+ import sys import os import yaml @@ -6,20 +20,20 @@ import time import json import cv2 import base64 -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -29,17 +43,19 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 30} if device == "gpu": config["op"]["yolov3"]["local_service_conf"]["device_type"] = 1 - config["op"]["yolov3"]["local_service_conf"]["devices"] = gpu_id - with open("config2.yml", "w") as fout: + config["op"]["yolov3"]["local_service_conf"]["devices"] = gpu_id + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) url = "http://127.0.0.1:18082/yolov3/prediction" @@ -65,6 +81,7 @@ def run_http(idx, batch_size): break return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): pass + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gpu_id = sys.argv[5] gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -110,4 +130,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/bert/benchmark.py b/python/examples/pipeline/bert/benchmark.py index 3dece4914d6a606753c2b91db2a6d759e0ec6897..ccdbbdf599943ebf757d336b96d4f19b92e1b94a 100644 --- a/python/examples/pipeline/bert/benchmark.py +++ b/python/examples/pipeline/bert/benchmark.py @@ -1,13 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency @@ -38,9 +50,11 @@ from paddle_serving_client.utils import benchmark_args, show_latency 2021-03-16 10:26:01,840 chl0(In: ['@DAGExecutor'], Out: ['bert']) size[0/0] 2021-03-16 10:26:01,841 chl1(In: ['bert'], Out: ['@DAGExecutor']) size[0/0] ''' + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -50,20 +64,22 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": config["op"]["bert"]["local_service_conf"]["device_type"] = 1 - config["op"]["bert"]["local_service_conf"]["devices"] = "2" - with open("config2.yml", "w") as fout: + config["op"]["bert"]["local_service_conf"]["devices"] = "2" + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18082/bert/prediction" + url = "http://127.0.0.1:18082/bert/prediction" start = time.time() with open("data-c.txt", 'r') as fin: start = time.time() @@ -84,9 +100,11 @@ def run_http(idx, batch_size): end = time.time() return [[end - start]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_http , thread, batch_size) + result = multi_thread_runner.run(run_http, thread, batch_size) + def run_rpc(thread, batch_size): client = PipelineClient() @@ -110,16 +128,17 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gen_yml(device) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -130,4 +149,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/ocr/benchmark.py b/python/examples/pipeline/ocr/benchmark.py index 79ecead3801cc48714812a7a8732e8b7a2367989..3c1243a1c327a5f94544c7fa56524321cad2892f 100644 --- a/python/examples/pipeline/ocr/benchmark.py +++ b/python/examples/pipeline/ocr/benchmark.py @@ -19,10 +19,8 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import 
MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency @@ -30,7 +28,7 @@ from paddle_serving_client.utils import benchmark_args, show_latency def parse_benchmark(filein, fileout): with open(filein, "r") as fin: - res = yaml.load(fin) + res = yaml.load(fin, yaml.FullLoader) del_list = [] for key in res["DAG"].keys(): if "call" in key: @@ -43,7 +41,7 @@ def parse_benchmark(filein, fileout): def gen_yml(device): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": diff --git a/python/examples/pipeline/simple_web_service/README.md b/python/examples/pipeline/simple_web_service/README.md index f52f7a85d1c9da98572def013e8d83c5aca2419c..ce2fc841b92b27e1b310353d2b8ef31ae48a2aeb 100644 --- a/python/examples/pipeline/simple_web_service/README.md +++ b/python/examples/pipeline/simple_web_service/README.md @@ -10,7 +10,7 @@ sh get_data.sh ## Start server ``` -python web_service.py &>log.txt & +python3 web_service.py &>log.txt & ``` ## Http test diff --git a/python/examples/pipeline/simple_web_service/README_CN.md b/python/examples/pipeline/simple_web_service/README_CN.md index 8b07942c19c566f5638e4497eb7c4d5a9fc1f2b2..b7007d366e058af40e0383fb05f8cfcbca6e19d2 100644 --- a/python/examples/pipeline/simple_web_service/README_CN.md +++ b/python/examples/pipeline/simple_web_service/README_CN.md @@ -10,7 +10,7 @@ sh get_data.sh ## 启动服务 ``` -python web_service.py &>log.txt & +python3 web_service.py &>log.txt & ``` ## 测试 diff --git a/python/examples/pipeline/simple_web_service/benchmark.py b/python/examples/pipeline/simple_web_service/benchmark.py index f5041fab1c3d7f91f0b4b61a9a63fad168753dc6..88c3ea21722ad9e6420e193a69299b2cf8e443a4 100644 --- a/python/examples/pipeline/simple_web_service/benchmark.py +++ b/python/examples/pipeline/simple_web_service/benchmark.py @@ -1,28 +1,42 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
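For GPU runs, the `gen_yml()` helpers in these benchmarks set `device_type` to 1 and write the card id string into `devices` under the op's `local_service_conf`. A rough sketch of that rewrite; `gen_gpu_config` is a hypothetical helper name, the key names come from the scripts above:

```python
# Sketch of what gen_yml("gpu", gpu_id) produces for one op: load config.yml,
# enable the tracer, point the op at a GPU card, and dump config2.yml.
import yaml

def gen_gpu_config(op_name, gpu_id, interval_s=10):
    with open("config.yml", "r") as fin:
        config = yaml.load(fin, yaml.FullLoader)
    config["dag"]["tracer"] = {"interval_s": interval_s}
    conf = config["op"][op_name]["local_service_conf"]
    conf["device_type"] = 1        # 1 selects GPU in these pipeline configs
    conf["devices"] = str(gpu_id)  # e.g. "2" in the bert benchmark above
    with open("config2.yml", "w") as fout:
        yaml.dump(config, fout, default_flow_style=False)

# gen_gpu_config("bert", "2")     # mirrors the bert benchmark's gen_yml
```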
+ import sys import os import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def gen_yml(): fin = open("config.yml", "r") - config = yaml.load(fin) + config = yaml.load(fin, yaml.FullLoader) fin.close() config["dag"]["tracer"] = {"interval_s": 5} - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18082/uci/prediction" + url = "http://127.0.0.1:18082/uci/prediction" start = time.time() value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332" all_value = ";".join([value for i in range(batch_size)]) @@ -33,9 +47,11 @@ def run_http(idx, batch_size): end = time.time() return [[end - start]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_http , thread, batch_size) + result = multi_thread_runner.run(run_http, thread, batch_size) + def run_rpc(thread, batch_size): client = PipelineClient() @@ -44,25 +60,26 @@ def run_rpc(thread, batch_size): all_value = ";".join([value for i in range(batch_size)]) data = {"key": "x", "value": all_value} for i in range(1000): - ret = client.predict(feed_dict={data["key"]: data["value"]}, fetch=["res"]) + ret = client.predict( + feed_dict={data["key"]: data["value"]}, fetch=["res"]) print(ret) + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) gen_yml() elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": multithread_http(thread, batch_size) elif mode == "rpc": multithread_rpc(thread, batch_size) - - diff --git a/python/examples/resnet_v2_50/README.md b/python/examples/resnet_v2_50/README.md index 0279918b664dfc5d5d922e8d7ba6bc6aaa15106a..12144b0ea9836c9eb647fa6482db244f1030354b 100644 --- a/python/examples/resnet_v2_50/README.md +++ b/python/examples/resnet_v2_50/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 ``` ### Client Prediction ``` -python resnet50_v2_tutorial.py +python3 resnet50_v2_tutorial.py ``` diff --git a/python/examples/resnet_v2_50/README_CN.md b/python/examples/resnet_v2_50/README_CN.md index c67e4f7c3e06c8fe0f3266ed51fc7d6db813ae7b..fee0e01f3cbac29052e4ae931027574ab6f778a0 100644 --- 
a/python/examples/resnet_v2_50/README_CN.md +++ b/python/examples/resnet_v2_50/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 ``` ### 客户端预测 ``` -python resnet50_v2_tutorial.py +python3 resnet50_v2_tutorial.py ``` diff --git a/python/examples/senta/README.md b/python/examples/senta/README.md index 8929a9312c17264800f299f77afb583221006068..9a159133eeb20832c1870bb949136a59ae461901 100644 --- a/python/examples/senta/README.md +++ b/python/examples/senta/README.md @@ -3,16 +3,16 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model senta_bilstm -python -m paddle_serving_app.package --get_model lac +python3 -m paddle_serving_app.package --get_model senta_bilstm +python3 -m paddle_serving_app.package --get_model lac tar -xzvf senta_bilstm.tar.gz tar -xzvf lac.tar.gz ``` ## Start HTTP Service ``` -python -m paddle_serving_server.serve --model lac_model --port 9300 -python senta_web_service.py +python3 -m paddle_serving_server.serve --model lac_model --port 9300 +python3 senta_web_service.py ``` In the Chinese sentiment classification task, the Chinese word segmentation needs to be done through [LAC task] (../lac). In this demo, the LAC task is placed in the preprocessing part of the HTTP prediction service of the sentiment classification task. diff --git a/python/examples/senta/README_CN.md b/python/examples/senta/README_CN.md index e5624dc975e6bc00de219f68cbf74dea7cac8360..a09fd117767cbdd01847d6cdef06992caf4a9715 100644 --- a/python/examples/senta/README_CN.md +++ b/python/examples/senta/README_CN.md @@ -3,16 +3,16 @@ ## 获取模型文件 ``` -python -m paddle_serving_app.package --get_model senta_bilstm -python -m paddle_serving_app.package --get_model lac +python3 -m paddle_serving_app.package --get_model senta_bilstm +python3 -m paddle_serving_app.package --get_model lac tar -xzvf lac.tar.gz tar -xzvf senta_bilstm.tar.gz ``` ## 启动HTTP服务 ``` -python -m paddle_serving_server.serve --model lac_model --port 9300 -python senta_web_service.py +python3 -m paddle_serving_server.serve --model lac_model --port 9300 +python3 senta_web_service.py ``` 中文情感分类任务中需要先通过[LAC任务](../lac)进行中文分词。 示例中将LAC任务放在情感分类任务的HTTP预测服务的预处理部分。 diff --git a/python/examples/unet_for_image_seg/README.md b/python/examples/unet_for_image_seg/README.md index 170dc133aea41a6f31696c2161d8e60ccfb4a621..59004712bd76f5388d6e57947f70ce22562f8dbe 100644 --- a/python/examples/unet_for_image_seg/README.md +++ b/python/examples/unet_for_image_seg/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model unet +python3 -m paddle_serving_app.package --get_model unet tar -xzvf unet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 ``` ### Client Prediction ``` -python seg_client.py +python3 seg_client.py ``` diff --git a/python/examples/unet_for_image_seg/README_CN.md b/python/examples/unet_for_image_seg/README_CN.md index 
eed1313eb938be67b80331e498b01a9749cb5dc6..53c2f1893a879d5585cea0b77103fc1461086784 100644 --- a/python/examples/unet_for_image_seg/README_CN.md +++ b/python/examples/unet_for_image_seg/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model unet +python3 -m paddle_serving_app.package --get_model unet tar -xzvf unet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 ``` ### 客户端预测 ``` -python seg_client.py +python3 seg_client.py ``` diff --git a/python/examples/util/README.md b/python/examples/util/README.md index 64cb44a0a84d243810be409e2efd3870c8a4f75c..678ca388df5106e57f146a9758e3ef8da485e270 100644 --- a/python/examples/util/README.md +++ b/python/examples/util/README.md @@ -13,14 +13,14 @@ In order to show the time consuming of each stage more intuitively, a script is When using, first save the output of the client to a file, taking `profile` as an example. ``` -python show_profile.py profile ${thread_num} +python3 show_profile.py profile ${thread_num} ``` Here the `thread_num` parameter is the number of processes when the client is running, and the script will calculate the average time spent in each phase according to this parameter. The script calculates the time spent in each stage, divides by the number of threads to average, and prints to standard output. ``` -python timeline_trace.py profile trace +python3 timeline_trace.py profile trace ``` The script converts the time-dot information in the log into a json format and saves it to a trace file. The trace file can be visualized through the tracing function of the Chrome browser. diff --git a/python/examples/util/README_CN.md b/python/examples/util/README_CN.md index 43acef8073148b7a4978ed5c02fa5fa05258f6a0..aaca0ae21dd1af33a3fb708efd0b2113525e5141 100644 --- a/python/examples/util/README_CN.md +++ b/python/examples/util/README_CN.md @@ -13,14 +13,14 @@ export FLAGS_profile_server=1 #开启server端各阶段时间打点 使用时先将client的输出保存到文件,以profile为例。 ``` -python show_profile.py profile ${thread_num} +python3 show_profile.py profile ${thread_num} ``` 这里thread_num参数为client运行时的进程数,脚本将按照这个参数来计算各阶段的平均耗时。 脚本将计算各阶段的耗时,并除以线程数做平均,打印到标准输出。 ``` -python timeline_trace.py profile trace +python3 timeline_trace.py profile trace ``` 脚本将日志中的时间打点信息转换成json格式保存到trace文件,trace文件可以通过chrome浏览器的tracing功能进行可视化。 diff --git a/python/examples/xpu/bert/bert_web_service.py b/python/examples/xpu/bert/bert_web_service.py deleted file mode 100644 index f8d805c231d193dff6543ecb0f4ba787e61703b9..0000000000000000000000000000000000000000 --- a/python/examples/xpu/bert/bert_web_service.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
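The RPC side of the pipeline benchmarks above goes through `PipelineClient`. A minimal sketch of the call pattern used by `run_rpc()` in the simple_web_service benchmark (the endpoint port is a placeholder here; the real value is the `rpc_port` from that example's config.yml):

```python
# Hedged sketch, not the full benchmark: connect to the pipeline RPC endpoint
# and send one uci request the way run_rpc() does (feed key "x", fetch "res").
from paddle_serving_server.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18070'])   # placeholder port, see config.yml

value = ("0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, "
         "-0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332")
ret = client.predict(feed_dict={"x": value}, fetch=["res"])
print(ret)
```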
-# pylint: disable=doc-string-missing -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader import ChineseBertReader -import sys -import os -import numpy as np - - -class BertService(WebService): - def load(self): - self.reader = ChineseBertReader({ - "vocab_file": "vocab.txt", - "max_seq_len": 128 - }) - - def preprocess(self, feed=[], fetch=[]): - feed_res = [] - is_batch = False - for ins in feed: - feed_dict = self.reader.process(ins["words"].encode("utf-8")) - for key in feed_dict.keys(): - feed_dict[key] = np.array(feed_dict[key]).reshape( - (len(feed_dict[key]), 1)) - feed_res.append(feed_dict) - return feed_res, fetch, is_batch - - -bert_service = BertService(name="bert") -bert_service.load() -bert_service.load_model_config(sys.argv[1]) -bert_service.prepare_server( - workdir="workdir", port=int(sys.argv[2]), use_lite=True, use_xpu=True, ir_optim=True) -bert_service.run_rpc_service() -bert_service.run_web_service() diff --git a/python/examples/xpu/ernie/ernie_web_service.py b/python/examples/xpu/ernie/ernie_web_service.py deleted file mode 100644 index f8d805c231d193dff6543ecb0f4ba787e61703b9..0000000000000000000000000000000000000000 --- a/python/examples/xpu/ernie/ernie_web_service.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader import ChineseBertReader -import sys -import os -import numpy as np - - -class BertService(WebService): - def load(self): - self.reader = ChineseBertReader({ - "vocab_file": "vocab.txt", - "max_seq_len": 128 - }) - - def preprocess(self, feed=[], fetch=[]): - feed_res = [] - is_batch = False - for ins in feed: - feed_dict = self.reader.process(ins["words"].encode("utf-8")) - for key in feed_dict.keys(): - feed_dict[key] = np.array(feed_dict[key]).reshape( - (len(feed_dict[key]), 1)) - feed_res.append(feed_dict) - return feed_res, fetch, is_batch - - -bert_service = BertService(name="bert") -bert_service.load() -bert_service.load_model_config(sys.argv[1]) -bert_service.prepare_server( - workdir="workdir", port=int(sys.argv[2]), use_lite=True, use_xpu=True, ir_optim=True) -bert_service.run_rpc_service() -bert_service.run_web_service() diff --git a/python/examples/xpu/fit_a_line_xpu/README.md b/python/examples/xpu/fit_a_line_xpu/README.md old mode 100644 new mode 100755 index e54dc69f1042a6031e9f5a1570d67c5696817191..2640344de82ab5c5f6a9d8e267b0496b603e91f5 --- a/python/examples/xpu/fit_a_line_xpu/README.md +++ b/python/examples/xpu/fit_a_line_xpu/README.md @@ -13,28 +13,13 @@ sh get_data.sh ### Start server You can use the following code to start the RPC service ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim ``` ### Client prediction -The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`). +The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip3 install paddlepaddle`). 
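`test_client.py` itself is not part of this patch; a minimal client along the following lines would exercise the XPU service started above. The feed var `x` and fetch var `price` are taken from the HTTP example removed further down, the port from the serve command above, and the rest is the usual `paddle_serving_client` flow:

```python
# Hedged sketch of a test_client.py-style call against the uci_housing XPU
# service; pass uci_housing_client/serving_client_conf.prototxt as argv[1].
import sys
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config(sys.argv[1])
client.connect(["127.0.0.1:9393"])

x = np.array([0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
              -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], dtype=np.float32)
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=False)
print(fetch_map)
```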
``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt -``` - -## HTTP service - -### Start server - -Start a web service with default web service hosting modules: -``` shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci -``` - -### Client prediction - -``` shell -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` diff --git a/python/examples/xpu/fit_a_line_xpu/README_CN.md b/python/examples/xpu/fit_a_line_xpu/README_CN.md old mode 100644 new mode 100755 index e19a17afb643db84129d20979b5822931ee335d7..268acf5a92a54a7c93541a0eaa1d3ae2f2d2656e --- a/python/examples/xpu/fit_a_line_xpu/README_CN.md +++ b/python/examples/xpu/fit_a_line_xpu/README_CN.md @@ -15,35 +15,19 @@ sh get_data.sh ### 开启服务端 ``` shell -python test_server.py uci_housing_model/ +python3 test_server.py uci_housing_model/ ``` 也可以通过下面的一行代码开启默认RPC服务: ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim ``` ### 客户端预测 -`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 +`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip3 install paddlepaddle`)。 ``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt -``` - -## HTTP服务 - -### 开启服务端 - -通过下面的一行代码开启默认web服务: - -``` shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci -``` - -### 客户端预测 - -``` shell -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` diff --git a/python/examples/xpu/resnet_v2_50_xpu/README.md b/python/examples/xpu/resnet_v2_50_xpu/README.md index ba19b6d7e442346fbc4ee890c34f6fa6c5b55bf7..76b04d614bd4513e806d9a139c38d66b8bce6569 100644 --- a/python/examples/xpu/resnet_v2_50_xpu/README.md +++ b/python/examples/xpu/resnet_v2_50_xpu/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim ``` ### Client Prediction ``` -python resnet50_client.py +python3 resnet50_client.py ``` diff --git a/python/examples/xpu/resnet_v2_50_xpu/README_CN.md b/python/examples/xpu/resnet_v2_50_xpu/README_CN.md index 007c90e4a498dc576982fc26a2814918ec1a7b91..652c4f672fd82b494a2240f327463e50dca8829c 100644 --- a/python/examples/xpu/resnet_v2_50_xpu/README_CN.md +++ b/python/examples/xpu/resnet_v2_50_xpu/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m 
paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim ``` ### 客户端预测 ``` -python resnet50_client.py +python3 resnet50_client.py ``` diff --git a/python/examples/xpu/vgg19/README.md b/python/examples/xpu/vgg19/README.md index 338a80562df3a74033c839cf42ab66e87982595c..d8520684f55a9caf88818905f4cc309f55304fe0 100644 --- a/python/examples/xpu/vgg19/README.md +++ b/python/examples/xpu/vgg19/README.md @@ -26,5 +26,5 @@ python3 -m paddle_serving_server.serve --model serving_server --port 7702 --use_ ### Client Prediction ``` -python vgg19_client.py +python3 vgg19_client.py ``` diff --git a/python/examples/yolov4/README.md b/python/examples/yolov4/README.md index fb1bc7622da88cc827b64cfc37336a4de3331831..0c7cfa7c0ffb4938456aa908015aff2daf367727 100644 --- a/python/examples/yolov4/README.md +++ b/python/examples/yolov4/README.md @@ -5,19 +5,19 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model yolov4 +python3 -m paddle_serving_app.package --get_model yolov4 tar -xzvf yolov4.tar.gz ``` ## Start RPC Service ``` -python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 ``` ## Prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` After the prediction is completed, a json file to save the prediction result and a picture with the detection result box will be generated in the `./outpu folder. diff --git a/python/examples/yolov4/README_CN.md b/python/examples/yolov4/README_CN.md index 72923c5af51d2584ae151cbc15ba62efb48adced..1c773033418b9d072a7096a91d47b665b465c322 100644 --- a/python/examples/yolov4/README_CN.md +++ b/python/examples/yolov4/README_CN.md @@ -5,20 +5,20 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model yolov4 +python3 -m paddle_serving_app.package --get_model yolov4 tar -xzvf yolov4.tar.gz ``` ## 启动RPC服务 ``` -python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 ``` ## 预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` 预测完成会在`./output`文件夹下生成保存预测结果的json文件以及标出检测结果框的图片。 diff --git a/python/paddle_serving_app/local_predict.py b/python/paddle_serving_app/local_predict.py index b4bc96e2b96f724a9d871b5a843635eba7aff4a2..afe4ba62d69850482e82ba97d43ac747e0f69aaf 100644 --- a/python/paddle_serving_app/local_predict.py +++ b/python/paddle_serving_app/local_predict.py @@ -22,6 +22,7 @@ import argparse from .proto import general_model_config_pb2 as m_config import paddle.inference as paddle_infer import logging +import glob logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("LocalPredictor") @@ -51,6 +52,23 @@ class LocalPredictor(object): self.fetch_names_to_idx_ = {} self.fetch_names_to_type_ = {} + def search_suffix_files(self, model_path, target_suffix): + """ + Find all files with the suffix xxx in the specified directory. + + Args: + model_path: model directory, not None. 
+ target_suffix: filenames with target suffix, not None. e.g: *.pdmodel + + Returns: + file_list, None, [] or [path, ] . + """ + if model_path is None or target_suffix is None: + return None + + file_list = glob.glob(os.path.join(model_path, target_suffix)) + return file_list + def load_model_config(self, model_path, use_gpu=False, @@ -97,11 +115,30 @@ class LocalPredictor(object): f = open(client_config, 'r') model_conf = google.protobuf.text_format.Merge( str(f.read()), model_conf) + + # Init paddle_infer config + # Paddle's model files and parameter files have multiple naming rules: + # 1) __model__, __params__ + # 2) *.pdmodel, *.pdiparams + # 3) __model__, conv2d_1.w_0, conv2d_2.w_0, fc_1.w_0, conv2d_1.b_0, ... + pdmodel_file_list = self.search_suffix_files(model_path, "*.pdmodel") + pdiparams_file_list = self.search_suffix_files(model_path, + "*.pdiparams") if os.path.exists(os.path.join(model_path, "__params__")): + # case 1) initializing config = paddle_infer.Config( os.path.join(model_path, "__model__"), os.path.join(model_path, "__params__")) + elif pdmodel_file_list and len( + pdmodel_file_list) > 0 and pdiparams_file_list and len( + pdiparams_file_list) > 0: + # case 2) initializing + logger.info("pdmodel_file_list:{}, pdiparams_file_list:{}".format( + pdmodel_file_list, pdiparams_file_list)) + config = paddle_infer.Config(pdmodel_file_list[0], + pdiparams_file_list[0]) else: + # case 3) initializing. config = paddle_infer.Config(model_path) logger.info( @@ -201,8 +238,9 @@ class LocalPredictor(object): Run model inference by Paddle Inference API. Args: - feed: feed var - fetch: fetch var + feed: feed var list, None is not allowed. + fetch: fetch var list, None allowed. when it is None, all fetch + vars are returned. Otherwise, return fetch specified result. batch: batch data or not, False default.If batch is False, a new dimension is added to header of the shape[np.newaxis]. log_id: for logging @@ -210,16 +248,8 @@ class LocalPredictor(object): Returns: fetch_map: dict """ - if feed is None or fetch is None: - raise ValueError("You should specify feed and fetch for prediction.\ - log_id:{}".format(log_id)) - fetch_list = [] - if isinstance(fetch, str): - fetch_list = [fetch] - elif isinstance(fetch, list): - fetch_list = fetch - else: - raise ValueError("Fetch only accepts string and list of string.\ + if feed is None: + raise ValueError("You should specify feed vars for prediction.\ log_id:{}".format(log_id)) feed_batch = [] @@ -231,18 +261,20 @@ class LocalPredictor(object): raise ValueError("Feed only accepts dict and list of dict.\ log_id:{}".format(log_id)) - fetch_names = [] + fetch_list = [] + if fetch is not None: + if isinstance(fetch, str): + fetch_list = [fetch] + elif isinstance(fetch, list): + fetch_list = fetch + # Filter invalid fetch names + fetch_names = [] for key in fetch_list: if key in self.fetch_names_: fetch_names.append(key) - if len(fetch_names) == 0: - raise ValueError( - "Fetch names should not be empty or out of saved fetch list.\ - log_id:{}".format(log_id)) - - # Assemble the input data of paddle predictor + # Assemble the input data of paddle predictor, and filter invalid inputs. 
input_names = self.predictor.get_input_names() for name in input_names: if isinstance(feed[name], list): @@ -282,11 +314,15 @@ class LocalPredictor(object): input_tensor_handle.copy_from_cpu(feed[name][np.newaxis, :]) else: input_tensor_handle.copy_from_cpu(feed[name]) + + # set output tensor handlers output_tensor_handles = [] + output_name_to_index_dict = {} output_names = self.predictor.get_output_names() - for output_name in output_names: + for i, output_name in enumerate(output_names): output_tensor_handle = self.predictor.get_output_handle(output_name) output_tensor_handles.append(output_tensor_handle) + output_name_to_index_dict[output_name] = i # Run inference self.predictor.run() @@ -296,10 +332,43 @@ class LocalPredictor(object): for output_tensor_handle in output_tensor_handles: output = output_tensor_handle.copy_to_cpu() outputs.append(output) + outputs_len = len(outputs) + + # Copy fetch vars. If fetch is None, it will copy all results from output_tensor_handles. + # Otherwise, it will copy the fields specified from output_tensor_handles. fetch_map = {} - for i, name in enumerate(fetch): - fetch_map[name] = outputs[i] - if len(output_tensor_handles[i].lod()) > 0: - fetch_map[name + ".lod"] = np.array(output_tensor_handles[i] - .lod()[0]).astype('int32') + if fetch is None: + for i, name in enumerate(output_names): + fetch_map[name] = outputs[i] + if len(output_tensor_handles[i].lod()) > 0: + fetch_map[name + ".lod"] = np.array(output_tensor_handles[ + i].lod()[0]).astype('int32') + else: + # Because the save_inference_model interface will increase the scale op + # in the network, the name of fetch_var is different from that in prototxt. + # Therefore, it is compatible with v0.6.x and the previous model save format, + # and here is compatible with the results that do not match. + fetch_match_num = 0 + for i, name in enumerate(fetch): + output_index = output_name_to_index_dict.get(name) + if output_index is None: + continue + + fetch_map[name] = outputs[output_index] + fetch_match_num += 1 + if len(output_tensor_handles[output_index].lod()) > 0: + fetch_map[name + ".lod"] = np.array(output_tensor_handles[ + output_index].lod()[0]).astype('int32') + + # Compatible with v0.6.x and lower versions model saving formats. + if fetch_match_num == 0: + logger.debug("fetch match num is 0. Retrain the model please!") + for i, name in enumerate(fetch): + if i >= outputs_len: + break + fetch_map[name] = outputs[i] + if len(output_tensor_handles[i].lod()) > 0: + fetch_map[name + ".lod"] = np.array( + output_tensor_handles[i].lod()[0]).astype('int32') + return fetch_map diff --git a/python/paddle_serving_app/reader/__init__.py b/python/paddle_serving_app/reader/__init__.py index 7d7ebad6f74407238ba908ff787b9a96dd7457cc..185b20992ab0efb1d474286260d5570041784007 100644 --- a/python/paddle_serving_app/reader/__init__.py +++ b/python/paddle_serving_app/reader/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. 
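The `local_predict.py` changes above make `fetch` optional and teach the loader about the `*.pdmodel`/`*.pdiparams` layout. A rough usage sketch; the class and method names are as they appear in that file, while the model directory and feed name are assumed examples:

```python
# Sketch only: load a model directory in either the __model__/__params__ or
# the *.pdmodel/*.pdiparams layout, then pass fetch=None to get every output,
# as the updated predict() now allows.
import numpy as np
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config("uci_housing_model")   # assumed example directory

x = np.array([[0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
               -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]], dtype=np.float32)
fetch_map = predictor.predict(feed={"x": x}, fetch=None, batch=True)
print(fetch_map.keys())   # all output names, plus "<name>.lod" where present
```

`image_reader.py`, next in this patch, adds `Detection*` counterparts of the existing transforms that thread an `im_info` dict (scale_factor, im_shape) through every step. A hedged sketch of chaining them with `DetectionSequential`; the image name comes from the yolov4 example above, while the target size and mean/std values are illustrative assumptions:

```python
# Sketch of the Detection* preprocessing chain added below: each transform
# takes (im, im_info) and returns both, so scale_factor/im_shape survive.
from paddle_serving_app.reader import (
    DetectionFile2Image, DetectionSequential, DetectionResize,
    DetectionNormalize, DetectionTranspose)

preprocess = DetectionSequential([
    DetectionFile2Image(),                          # path -> RGB ndarray + im_info
    DetectionResize((608, 608), keep_ratio=False),  # updates scale_factor/im_shape
    DetectionNormalize([0.485, 0.456, 0.406],       # assumed mean/std, is_scale
                       [0.229, 0.224, 0.225], is_scale=True),
    DetectionTranspose((2, 0, 1)),                  # HWC -> CHW
])

im, im_info = preprocess("000000570688.jpg")
print(im.shape, im_info["scale_factor"], im_info["im_shape"])
```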
from .chinese_bert_reader import ChineseBertReader from .image_reader import ImageReader, File2Image, URL2Image, Sequential, Normalize, Base64ToImage +from .image_reader import DetectionFile2Image, DetectionSequential, DetectionNormalize, DetectionTranspose, DetectionResize, DetectionBGR2RGB, DetectionPadStride from .image_reader import CenterCrop, Resize, Transpose, Div, RGB2BGR, BGR2RGB, ResizeByFactor from .image_reader import RCNNPostprocess, SegPostprocess, PadStride, BlazeFacePostprocess from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes diff --git a/python/paddle_serving_app/reader/image_reader.py b/python/paddle_serving_app/reader/image_reader.py index ca97af94f1de7edf4dc53caccbcf2c601eb31163..24d1759a052420fa01b52f377b77cc21475cc04c 100644 --- a/python/paddle_serving_app/reader/image_reader.py +++ b/python/paddle_serving_app/reader/image_reader.py @@ -498,6 +498,42 @@ class Sequential(object): return format_string_ +class DetectionSequential(object): + """ + Args: + sequence (sequence of ``Transform`` objects): list of transforms to chain. + + This API references some of the design pattern of torchvision + Users can simply use this API in training as well + + Example: + >>> image_reader.Sequnece([ + >>> transforms.CenterCrop(10), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, im): + im_info = { + 'scale_factor': np.array( + [1., 1.], dtype=np.float32), + 'im_shape': None, + } + for t in self.transforms: + im, im_info = t(im, im_info) + return im, im_info + + def __repr__(self): + format_string_ = self.__class__.__name__ + '(' + for t in self.transforms: + format_string_ += '\n' + format_string_ += ' {0}'.format(t) + format_string_ += '\n)' + return format_string_ + + class RGB2BGR(object): def __init__(self): pass @@ -520,6 +556,17 @@ class BGR2RGB(object): return self.__class__.__name__ + "()" +class DetectionBGR2RGB(object): + def __init__(self): + pass + + def __call__(self, img, img_info=None): + return img[:, :, ::-1], img_info + + def __repr__(self): + return self.__class__.__name__ + "()" + + class String2Image(object): def __init__(self): pass @@ -556,6 +603,33 @@ class File2Image(object): def __repr__(self): return self.__class__.__name__ + "()" +class DetectionFile2Image(object): + def __init__(self): + pass + + def __call__(self, img_path, im_info=None): + if py_version == 2: + fin = open(img_path) + else: + fin = open(img_path, "rb") + sample = fin.read() + data = np.fromstring(sample, np.uint8) + img = cv2.imdecode(data, cv2.IMREAD_COLOR) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + ''' + img = cv2.imread(img_path, -1) + channels = img.shape[2] + ori_h = img.shape[0] + ori_w = img.shape[1] + ''' + if im_info is not None: + im_info['im_shape'] = np.array(img.shape[:2], dtype=np.float32) + im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return img, im_info + + def __repr__(self): + return self.__class__.__name__ + "()" + class URL2Image(object): def __init__(self): @@ -607,6 +681,27 @@ class Div(object): def __repr__(self): return self.__class__.__name__ + "({})".format(self.value) +class DetectionDiv(object): + """ divide by some float number """ + + def __init__(self, value): + self.value = value + + def __call__(self, img, img_info=None): + """ + Args: + img (numpy array): (int8 numpy array) + + Returns: + img (numpy array): (float32 numpy array) + """ + img = img.astype('float32') / self.value + + return img, img_info + + def __repr__(self): + return 
self.__class__.__name__ + "({})".format(self.value) + class Normalize(object): """Normalize a tensor image with mean and standard deviation. @@ -643,6 +738,51 @@ class Normalize(object): self.std) +class DetectionNormalize(object): + """Normalize a tensor image with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform + will normalize each channel of the input ``torch.*Tensor`` i.e. + ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` + + .. note:: + This transform acts out of place, i.e., it does not mutate the input tensor. + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + is_scale (bool): whether need im / 255 + + """ + + def __init__(self, mean, std, is_scale=True): + self.mean = mean + self.std = std + self.is_scale = is_scale + + def __call__(self, im, im_info=None): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.astype(np.float32, copy=False) + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + + if self.is_scale: + im = im / 255.0 + im -= mean + im /= std + return im, im_info + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, + self.std) + + class Lambda(object): """Apply a user-defined lambda as a transform. Very shame to just copy from @@ -716,6 +856,124 @@ class Resize(object): self.size, self.max_size, _cv2_interpolation_to_str[self.interpolation]) +class DetectionResize(object): + """resize image by target_size and max_size + Args: + target_size (int): the target size of image + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): method of resize + """ + + def __init__(self, target_size, keep_ratio=True, interpolation=cv2.INTER_LINEAR): + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + self.keep_ratio = keep_ratio + self.interpolation = interpolation + + def __call__(self, im, im_info=None): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + im_channel = im.shape[2] + im_scale_y, im_scale_x = self.generate_scale(im) + im = cv2.resize( + im, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interpolation) + if im_info is not None: + im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') + im_info['scale_factor'] = np.array( + [im_scale_y, im_scale_x]).astype('float32') + return im, im_info + + def generate_scale(self, im): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + im_c = im.shape[2] + if self.keep_ratio: + im_size_min = np.min(origin_shape) + im_size_max = np.max(origin_shape) + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + im_scale = float(target_size_min) / float(im_size_min) + if np.round(im_scale * im_size_max) > target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale 
+ else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / float(origin_shape[0]) + im_scale_x = resize_w / float(origin_shape[1]) + return im_scale_y, im_scale_x + + def __repr__(self): + return self.__class__.__name__ + '(size={0}, max_size={1}, interpolation={2})'.format( + self.size, self.max_size, + _cv2_interpolation_to_str[self.interpolation]) + + +class PadStride(object): + def __init__(self, stride): + self.coarsest_stride = stride + + def __call__(self, img): + coarsest_stride = self.coarsest_stride + if coarsest_stride == 0: + return img + im_c, im_h, im_w = img.shape + pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) + pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) + padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = img + im_info = {} + im_info['resize_shape'] = padding_im.shape[1:] + return padding_im + +class DetectionPadStride(object): + """ padding image for model with FPN, instead PadBatch(pad_to_stride) in original config + Args: + stride (bool): model with FPN need image shape % stride == 0 + """ + + def __init__(self, stride=0): + self.coarsest_stride = stride + + def __call__(self, im, im_info=None): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + coarsest_stride = self.coarsest_stride + if coarsest_stride <= 0: + return im + im_c, im_h, im_w = im.shape + pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) + pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) + padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + return padding_im, im_info + class ResizeByFactor(object): """Resize the input numpy array Image to a size multiple of factor which is usually required by a network @@ -768,24 +1026,6 @@ class ResizeByFactor(object): self.factor, self.max_side_len) -class PadStride(object): - def __init__(self, stride): - self.coarsest_stride = stride - - def __call__(self, img): - coarsest_stride = self.coarsest_stride - if coarsest_stride == 0: - return img - im_c, im_h, im_w = img.shape - pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) - pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) - padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) - padding_im[:, :im_h, :im_w] = img - im_info = {} - im_info['resize_shape'] = padding_im.shape[1:] - return padding_im - - class Transpose(object): def __init__(self, transpose_target): self.transpose_target = transpose_target @@ -799,6 +1039,19 @@ class Transpose(object): "({})".format(self.transpose_target) return format_string +class DetectionTranspose(object): + def __init__(self, transpose_target): + self.transpose_target = transpose_target + + def __call__(self, im, im_info=None): + im = F.transpose(im, self.transpose_target) + return im, im_info + + def __repr__(self): + format_string = self.__class__.__name__ + \ + "({})".format(self.transpose_target) + return format_string + class SortedBoxes(object): """ diff --git a/python/paddle_serving_client/client.py b/python/paddle_serving_client/client.py index 42ba64198a04d70ec511ddd03edd9c9ef26d21a5..648678f3afd9ffdc0af4c505779fc5eca0c42a37 100755 --- a/python/paddle_serving_client/client.py +++ b/python/paddle_serving_client/client.py @@ -31,15 +31,21 @@ sys.path.append( #param 'type'(which is in feed_var or 
fetch_var) = 0 means dataType is int64 #param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32 #param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32 -#param 'type'(which is in feed_var or fetch_var) = 3 means dataType is string(also called bytes in proto) +#param 'type'(which is in feed_var or fetch_var) = 5 means dataType is float16 +#param 'type'(which is in feed_var or fetch_var) = 7 means dataType is uint8 +#param 'type'(which is in feed_var or fetch_var) = 8 means dataType is int8 +#param 'type'(which is in feed_var or fetch_var) = 20 means dataType is string(also called bytes in proto) int64_type = 0 float32_type = 1 int32_type = 2 -bytes_type = 3 +float16_type = 5 +uint8_type = 7 +int8_type = 8 +bytes_type = 20 #int_type,float_type,string_type are the set of each subdivision classes. int_type = set([int64_type, int32_type]) float_type = set([float32_type]) -string_type = set([bytes_type]) +string_type = set([bytes_type, float16_type, uint8_type, int8_type]) class _NOPProfiler(object): @@ -289,31 +295,39 @@ class Client(object): log_id=0): self.profile_.record('py_prepro_0') - if feed is None or fetch is None: - raise ValueError("You should specify feed and fetch for prediction") + # fetch 可以为空,此时会取所有的输出结果 + if feed is None: + raise ValueError("You should specify feed for prediction") fetch_list = [] if isinstance(fetch, str): fetch_list = [fetch] elif isinstance(fetch, list): fetch_list = fetch + # fetch 可以为空,此时会取所有的输出结果 + elif fetch == None: + pass else: - raise ValueError("Fetch only accepts string and list of string") + raise ValueError("Fetch only accepts string or list of string") feed_batch = [] if isinstance(feed, dict): feed_batch.append(feed) elif isinstance(feed, list): - # if input is a list and the number of feed_var is 1. - # create a temp_dict { key = feed_var_name, value = list} - # put the temp_dict into the feed_batch. - if len(self.feed_names_) != 1: - raise ValueError( - "input is a list, but we got 0 or 2+ feed_var, don`t know how to divide the feed list" - ) - temp_dict = {} - temp_dict[self.feed_names_[0]] = feed - feed_batch.append(temp_dict) + # feed = [dict] + if len(feed) == 1 and isinstance(feed[0], dict): + feed_batch = feed + else: + # if input is a list and the number of feed_var is 1. + # create a temp_dict { key = feed_var_name, value = list} + # put the temp_dict into the feed_batch. 
+ if len(self.feed_names_) != 1: + raise ValueError( + "input is a list, but we got 0 or 2+ feed_var, don`t know how to divide the feed list" + ) + temp_dict = {} + temp_dict[self.feed_names_[0]] = feed + feed_batch.append(temp_dict) else: raise ValueError("Feed only accepts dict and list of dict") @@ -321,10 +335,15 @@ class Client(object): if len(feed_batch) != 1: raise ValueError("len of feed_batch can only be 1.") - int_slot = [] - int_feed_names = [] - int_shape = [] - int_lod_slot_batch = [] + int32_slot = [] + int32_feed_names = [] + int32_shape = [] + int32_lod_slot_batch = [] + + int64_slot = [] + int64_feed_names = [] + int64_shape = [] + int64_lod_slot_batch = [] float_slot = [] float_feed_names = [] @@ -341,10 +360,6 @@ class Client(object): if key in self.fetch_names_: fetch_names.append(key) - if len(fetch_names) == 0: - raise ValueError( - "Fetch names should not be empty or out of saved fetch list.") - feed_dict = feed_batch[0] for key in feed_dict: if ".lod" not in key and key not in self.feed_names_: @@ -354,27 +369,39 @@ class Client(object): self.shape_check(feed_dict, key) if self.feed_types_[key] in int_type: - int_feed_names.append(key) shape_lst = [] if batch == False: feed_dict[key] = np.expand_dims(feed_dict[key], 0).repeat( 1, axis=0) - if isinstance(feed_dict[key], np.ndarray): - shape_lst.extend(list(feed_dict[key].shape)) - int_shape.append(shape_lst) - else: - int_shape.append(self.feed_shapes_[key]) - if "{}.lod".format(key) in feed_dict: - int_lod_slot_batch.append(feed_dict["{}.lod".format(key)]) - else: - int_lod_slot_batch.append([]) - - if isinstance(feed_dict[key], np.ndarray): - int_slot.append(np.ascontiguousarray(feed_dict[key])) - self.has_numpy_input = True + # verify different input int_type + if(self.feed_types_[key] == int64_type): + int64_feed_names.append(key) + if isinstance(feed_dict[key], np.ndarray): + shape_lst.extend(list(feed_dict[key].shape)) + int64_shape.append(shape_lst) + self.has_numpy_input = True + else: + int64_shape.append(self.feed_shapes_[key]) + self.all_numpy_input = False + if "{}.lod".format(key) in feed_dict: + int64_lod_slot_batch.append(feed_dict["{}.lod".format(key)]) + else: + int64_lod_slot_batch.append([]) + int64_slot.append(np.ascontiguousarray(feed_dict[key])) else: - int_slot.append(np.ascontiguousarray(feed_dict[key])) - self.all_numpy_input = False + int32_feed_names.append(key) + if isinstance(feed_dict[key], np.ndarray): + shape_lst.extend(list(feed_dict[key].shape)) + int32_shape.append(shape_lst) + self.has_numpy_input = True + else: + int32_shape.append(self.feed_shapes_[key]) + self.all_numpy_input = False + if "{}.lod".format(key) in feed_dict: + int32_lod_slot_batch.append(feed_dict["{}.lod".format(key)]) + else: + int32_lod_slot_batch.append([]) + int32_slot.append(np.ascontiguousarray(feed_dict[key])) elif self.feed_types_[key] in float_type: float_feed_names.append(key) @@ -407,7 +434,10 @@ class Client(object): key)]) else: string_lod_slot_batch.append([]) - string_slot.append(feed_dict[key]) + if type(feed_dict[key]) is np.ndarray: + string_slot.append(feed_dict[key].tostring()) + else: + string_slot.append(feed_dict[key]) self.has_numpy_input = True self.profile_.record('py_prepro_1') @@ -417,7 +447,8 @@ class Client(object): if self.all_numpy_input: res = self.client_handle_.numpy_predict( float_slot, float_feed_names, float_shape, float_lod_slot_batch, - int_slot, int_feed_names, int_shape, int_lod_slot_batch, + int32_slot, int32_feed_names, int32_shape, int32_lod_slot_batch, + int64_slot, 
int64_feed_names, int64_shape, int64_lod_slot_batch, string_slot, string_feed_names, string_shape, string_lod_slot_batch, fetch_names, result_batch_handle, self.pid, log_id) @@ -439,6 +470,9 @@ class Client(object): model_engine_names = result_batch_handle.get_engine_names() for mi, engine_name in enumerate(model_engine_names): result_map = {} + # fetch 为空,则会取所有的输出结果 + if len(fetch_names) == 0: + fetch_names = result_batch_handle.get_tensor_alias_names(mi) # result map needs to be a numpy array for i, name in enumerate(fetch_names): if self.fetch_names_to_type_[name] == int64_type: @@ -485,6 +519,54 @@ class Client(object): tmp_lod = result_batch_handle.get_lod(mi, name) if np.size(tmp_lod) > 0: result_map["{}.lod".format(name)] = tmp_lod + elif self.fetch_names_to_type_[name] == uint8_type: + # result_map[name] will be py::array(numpy array) + tmp_str = result_batch_handle.get_string_by_name( + mi, name) + result_map[name] = np.fromstring(tmp_str, dtype = np.uint8) + if result_map[name].size == 0: + raise ValueError( + "Failed to fetch, maybe the type of [{}]" + " is wrong, please check the model file".format( + name)) + shape = result_batch_handle.get_shape(mi, name) + result_map[name].shape = shape + if name in self.lod_tensor_set: + tmp_lod = result_batch_handle.get_lod(mi, name) + if np.size(tmp_lod) > 0: + result_map["{}.lod".format(name)] = tmp_lod + elif self.fetch_names_to_type_[name] == int8_type: + # result_map[name] will be py::array(numpy array) + tmp_str = result_batch_handle.get_string_by_name( + mi, name) + result_map[name] = np.fromstring(tmp_str, dtype = np.int8) + if result_map[name].size == 0: + raise ValueError( + "Failed to fetch, maybe the type of [{}]" + " is wrong, please check the model file".format( + name)) + shape = result_batch_handle.get_shape(mi, name) + result_map[name].shape = shape + if name in self.lod_tensor_set: + tmp_lod = result_batch_handle.get_lod(mi, name) + if np.size(tmp_lod) > 0: + result_map["{}.lod".format(name)] = tmp_lod + elif self.fetch_names_to_type_[name] == float16_type: + # result_map[name] will be py::array(numpy array) + tmp_str = result_batch_handle.get_string_by_name( + mi, name) + result_map[name] = np.fromstring(tmp_str, dtype = np.float16) + if result_map[name].size == 0: + raise ValueError( + "Failed to fetch, maybe the type of [{}]" + " is wrong, please check the model file".format( + name)) + shape = result_batch_handle.get_shape(mi, name) + result_map[name].shape = shape + if name in self.lod_tensor_set: + tmp_lod = result_batch_handle.get_lod(mi, name) + if np.size(tmp_lod) > 0: + result_map["{}.lod".format(name)] = tmp_lod multi_result_map.append(result_map) ret = None if len(model_engine_names) == 1: diff --git a/python/paddle_serving_client/convert.py b/python/paddle_serving_client/convert.py index e3cd3a05f8e09155b0c884e3ddf12b57234de3dd..984deec609e884a4222a0be1609d068505d97f62 100644 --- a/python/paddle_serving_client/convert.py +++ b/python/paddle_serving_client/convert.py @@ -23,6 +23,12 @@ from .io import inference_model_to_serving def parse_args(): # pylint: disable=doc-string-missing parser = argparse.ArgumentParser("convert") + parser.add_argument( + "--show_proto", + type=bool, + default=False, + help='If yes, you can preview the proto and then determine your feed var alias name and fetch var alias name.' + ) parser.add_argument( "--dirname", type=str, @@ -53,6 +59,18 @@ def parse_args(): # pylint: disable=doc-string-missing default=None, help='The name of file to load all parameters. 
It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.' ) + parser.add_argument( + "--feed_alias_names", + type=str, + default=None, + help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of feed vars' + ) + parser.add_argument( + "--fetch_alias_names", + type=str, + default=None, + help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of fetch vars' + ) return parser.parse_args() @@ -63,4 +81,7 @@ if __name__ == "__main__": serving_server=args.serving_server, serving_client=args.serving_client, model_filename=args.model_filename, - params_filename=args.params_filename) + params_filename=args.params_filename, + show_proto=args.show_proto, + feed_alias_names=args.feed_alias_names, + fetch_alias_names=args.fetch_alias_names) diff --git a/python/paddle_serving_client/httpclient.py b/python/paddle_serving_client/httpclient.py index 06564e0ace7659ba88665b2593259dc3bb85b79e..bb056a99732aeb1fa855b6ce1e020ada82072ed0 100755 --- a/python/paddle_serving_client/httpclient.py +++ b/python/paddle_serving_client/httpclient.py @@ -22,6 +22,7 @@ import gzip from collections import Iterable import base64 import sys +import re import grpc from .proto import general_model_service_pb2 @@ -31,13 +32,18 @@ from .proto import general_model_service_pb2_grpc #param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64 #param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32 #param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32 -#param 'type'(which is in feed_var or fetch_var) = 3 means dataType is string(also called bytes in proto) +#param 'type'(which is in feed_var or fetch_var) = 20 means dataType is string(also called bytes in proto) int64_type = 0 float32_type = 1 int32_type = 2 -bytes_type = 3 +bytes_type = 20 # this is corresponding to the proto -proto_data_key_list = ["int64_data", "float_data", "int_data", "data"] +proto_data_key_list = { + 0: "int64_data", + 1: "float_data", + 2: "int_data", + 20: "data" +} def list_flatten(items, ignore_types=(str, bytes)): @@ -73,9 +79,9 @@ def data_bytes_number(datalist): # 可以直接调用需要的http_client_predict/grpc_client_predict # 例如,如果想使用GRPC方式,set_use_grpc_client(True) # 或者直接调用grpc_client_predict() -class GeneralClient(object): +class HttpClient(object): def __init__(self, - ip="0.0.0.0", + ip="127.0.0.1", port="9393", service_name="/GeneralModelService/inference"): self.feed_names_ = [] @@ -84,7 +90,7 @@ class GeneralClient(object): self.feed_shapes_ = {} self.feed_types_ = {} self.feed_names_to_idx_ = {} - self.timeout_ms = 200000 + self.timeout_ms = 20000 self.ip = ip self.port = port self.server_port = port @@ -93,9 +99,24 @@ class GeneralClient(object): self.try_request_gzip = False self.try_response_gzip = False self.total_data_number = 0 + self.headers = {} self.http_proto = True + self.headers["Content-Type"] = "application/proto" self.max_body_size = 512 * 1024 * 1024 self.use_grpc_client = False + self.http_s = "http://" + + # 使用连接池能够不用反复建立连接 + self.requests_session = requests.session() + # 初始化grpc_stub + options = [('grpc.max_receive_message_length', self.max_body_size), + ('grpc.max_send_message_length', self.max_body_size)] + + endpoints = [self.ip + ":" + self.server_port] + g_endpoint = 'ipv4:{}'.format(','.join(endpoints)) + self.channel_ = grpc.insecure_channel(g_endpoint, 
options=options) + self.stub_ = general_model_service_pb2_grpc.GeneralModelServiceStub( + self.channel_) def load_client_config(self, model_config_path_list): if isinstance(model_config_path_list, str): @@ -162,14 +183,57 @@ class GeneralClient(object): else: self.timeout_ms = timeout_ms - def set_ip(self, ip): - self.ip = ip + def set_max_retries(self, retry_times=3): + if not isinstance(retry_times, int): + raise ValueError("retry_times must be int type.") + else: + self.requests_session.mount( + self.http_s, HTTPAdapter(max_retries=retry_times)) def set_service_name(self, service_name): self.service_name = service_name - def set_port(self, port): - self.port = port + def connect(self, url=None, encryption=False): + if isinstance(url, (list, tuple)): + if len(url) > 1: + raise ValueError("HttpClient only support 1 endpoint") + else: + url = url[0] + if isinstance(url, str): + if url.startswith("https://"): + url = url[8:] + self.http_s = "https://" + if url.startswith("http://"): + url = url[7:] + self.http_s = "http://" + url_parts = url.split(':') + if len(url_parts) != 2 or self.check_ip(url_parts[0]) == False: + raise ValueError( + "url not right, it should be like 127.0.0.1:9393 or http://127.0.0.1:9393" + ) + else: + self.ip = url_parts[0] + self.port = url_parts[1] + self.server_port = url_parts[1] + if encryption: + self.get_serving_port() + if self.use_grpc_client: + self.init_grpc_stub() + + def check_ip(self, ipAddr): + compile_ip = re.compile( + '^(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|[1-9])\.(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.(1\d{2}|2[0-4]\d|25[0-5]|[1-9]\d|\d)$' + ) + if compile_ip.match(ipAddr): + return True + else: + return False + + def add_http_headers(self, headers): + if isinstance(headers, dict): + self.headers.update(headers) + else: + print("headers must be a dict") def set_request_compress(self, try_request_gzip): self.try_request_gzip = try_request_gzip @@ -179,6 +243,10 @@ class GeneralClient(object): def set_http_proto(self, http_proto): self.http_proto = http_proto + if self.http_proto: + self.headers["Content-Type"] = "application/proto" + else: + self.headers["Content-Type"] = "application/json" def set_use_grpc_client(self, use_grpc_client): self.use_grpc_client = use_grpc_client @@ -187,21 +255,21 @@ class GeneralClient(object): def use_key(self, key_filename): with open(key_filename, "rb") as f: self.key = f.read() - self.get_serving_port() def get_serving_port(self): - encrypt_url = "http://" + str(self.ip) + ":" + str(self.port) + encrypt_url = self.http_s + str(self.ip) + ":" + str(self.port) if self.key is not None: req = json.dumps({"key": base64.b64encode(self.key).decode()}) else: req = json.dumps({}) - r = requests.post(encrypt_url, req) - result = r.json() - if "endpoint_list" not in result: - raise ValueError("server not ready") - else: - self.server_port = str(result["endpoint_list"][0]) - print("rpc port is ", self.server_port) + with requests.post( + encrypt_url, data=req, timeout=self.timeout_ms / 1000) as r: + result = r.json() + if "endpoint_list" not in result: + raise ValueError("server not ready") + else: + self.server_port = str(result["endpoint_list"][0]) + print("rpc port is ", self.server_port) def get_feed_names(self): return self.feed_names_ @@ -210,35 +278,34 @@ class GeneralClient(object): return self.fetch_names_ def get_legal_fetch(self, fetch): - if fetch is None: - raise ValueError("You should specify feed and fetch for prediction") fetch_list = [] if isinstance(fetch, str): fetch_list = 
[fetch] elif isinstance(fetch, (list, tuple)): fetch_list = fetch + elif fetch == None: + pass else: - raise ValueError("Fetch only accepts string and list of string") + raise ValueError("Fetch only accepts string/list/tuple of string") fetch_names = [] for key in fetch_list: if key in self.fetch_names_: fetch_names.append(key) - - if len(fetch_names) == 0: - raise ValueError( - "Fetch names should not be empty or out of saved fetch list.") - return {} return fetch_names def get_feedvar_dict(self, feed): if feed is None: - raise ValueError("You should specify feed and fetch for prediction") + raise ValueError("You should specify feed for prediction") feed_dict = {} if isinstance(feed, dict): feed_dict = feed elif isinstance(feed, (list, str, tuple)): + # feed = [dict] + if len(feed) == 1 and isinstance(feed[0], dict): + feed_dict = feed[0] + return feed_dict # if input is a list or str or tuple, and the number of feed_var is 1. # create a feed_dict { key = feed_var_name, value = list} if len(self.feed_names_) == 1: @@ -376,17 +443,19 @@ class GeneralClient(object): # 此时先统一处理为一个list # 由于输入比较特殊,shape保持原feedvar中不变 data_value = [] - data_value.append(feed_dict[key]) - if isinstance(feed_dict[key], str): + if isinstance(feed_dict[key], (str, bytes)): if self.feed_types_[key] != bytes_type: raise ValueError( "feedvar is not string-type,feed can`t be a single string." ) + if isinstance(feed_dict[key], bytes): + feed_dict[key] = feed_dict[key].decode() else: if self.feed_types_[key] == bytes_type: raise ValueError( - "feedvar is string-type,feed, feed can`t be a single int or others." + "feedvar is string-type,feed can`t be a single int or others." ) + data_value.append(feed_dict[key]) # 如果不压缩,那么不需要统计数据量。 if self.try_request_gzip: self.total_data_number = self.total_data_number + data_bytes_number( @@ -427,36 +496,42 @@ class GeneralClient(object): feed_dict = self.get_feedvar_dict(feed) fetch_list = self.get_legal_fetch(fetch) - headers = {} postData = '' if self.http_proto == True: postData = self.process_proto_data(feed_dict, fetch_list, batch, log_id).SerializeToString() - headers["Content-Type"] = "application/proto" + else: postData = self.process_json_data(feed_dict, fetch_list, batch, log_id) - headers["Content-Type"] = "application/json" - web_url = "http://" + self.ip + ":" + self.server_port + self.service_name + web_url = self.http_s + self.ip + ":" + self.server_port + self.service_name # 当数据区长度大于512字节时才压缩. 
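The httpclient.py changes here compress the request body only once it exceeds 512 bytes: a serialized proto body is gzipped as raw bytes, a JSON body is UTF-8 encoded first, and the Accept-Encoding header lets requests transparently decompress the reply. A minimal standalone sketch of that behaviour (the helper name, endpoint and threshold argument are illustrative assumptions, not part of the patch):

```python
# Hedged sketch of the optional request compression described in this hunk;
# the helper and its arguments are assumptions, not the HttpClient code itself.
import gzip
import json

import requests


def post_with_optional_gzip(url, payload, http_proto=False, threshold=512):
    """POST `payload`, gzip-compressing the body only when it is big enough."""
    headers = {
        "Content-Type": "application/proto" if http_proto else "application/json"
    }
    # A serialized proto payload is already bytes; JSON is encoded first.
    body = payload if http_proto else json.dumps(payload).encode("utf-8")
    if len(body) > threshold:
        body = gzip.compress(body)
        headers["Content-Encoding"] = "gzip"
    # Ask for a gzip response; requests decompresses it transparently.
    headers["Accept-Encoding"] = "gzip"
    return requests.post(url=url, headers=headers, data=body, timeout=20)
```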
+ self.headers.pop("Content-Encoding", "nokey") try: if self.try_request_gzip and self.total_data_number > 512: - origin_data = postData - postData = gzip.compress(bytes(postData, 'utf-8')) - headers["Content-Encoding"] = "gzip" + + if self.http_proto: + postData = gzip.compress(postData) + else: + postData = gzip.compress(bytes(postData, 'utf-8')) + self.headers["Content-Encoding"] = "gzip" if self.try_response_gzip: - headers["Accept-encoding"] = "gzip" + self.headers["Accept-encoding"] = "gzip" # 压缩异常,使用原始数据 except: print("compress error, we will use the no-compress data") - headers.pop("Content-Encoding", "nokey") - postData = origin_data - + self.headers.pop("Content-Encoding", "nokey") # requests支持自动识别解压 try: - result = requests.post(url=web_url, headers=headers, data=postData) + result = self.requests_session.post( + url=web_url, + headers=self.headers, + data=postData, + timeout=self.timeout_ms / 1000, + verify=False) + result.raise_for_status() except: print("http post error") return None @@ -484,6 +559,16 @@ class GeneralClient(object): postData = self.process_proto_data(feed_dict, fetch_list, batch, log_id) + try: + resp = self.stub_.inference( + postData, timeout=self.timeout_ms / 1000) + except: + print("Grpc inference error occur") + return None + else: + return resp + + def init_grpc_stub(self): # https://github.com/tensorflow/serving/issues/1382 options = [('grpc.max_receive_message_length', self.max_body_size), ('grpc.max_send_message_length', self.max_body_size)] @@ -493,10 +578,7 @@ class GeneralClient(object): self.channel_ = grpc.insecure_channel(g_endpoint, options=options) self.stub_ = general_model_service_pb2_grpc.GeneralModelServiceStub( self.channel_) - try: - resp = self.stub_.inference(postData, timeout=self.timeout_ms) - except: - print("Grpc inference error occur") - return None - else: - return resp + + def __del__(self): + self.requests_session.close() + self.channel_.close() diff --git a/python/paddle_serving_client/io/__init__.py b/python/paddle_serving_client/io/__init__.py index 7e09a53c77510a21fba993de74a4517b7267372d..07f443196d5e460d5158112dda33bb9c186394b5 100644 --- a/python/paddle_serving_client/io/__init__.py +++ b/python/paddle_serving_client/io/__init__.py @@ -67,7 +67,6 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model): } config = model_conf.GeneralModelConfig() - #int64 = 0; float32 = 1; int32 = 2; for key in feed_var_dict: feed_var = model_conf.FeedVar() feed_var.alias_name = key @@ -127,7 +126,6 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model): def var_type_conversion(dtype): """ Variable type conversion - Args: dtype: type of core.VarDesc.VarType.xxxxx (https://github.com/PaddlePaddle/Paddle/blob/release/2.1/python/paddle/framework/dtype.py) @@ -184,7 +182,12 @@ def save_model(server_model_folder, main_program=None, encryption=False, key_len=128, - encrypt_conf=None): + encrypt_conf=None, + model_filename=None, + params_filename=None, + show_proto=False, + feed_alias_names=None, + fetch_alias_names=None): executor = Executor(place=CPUPlace()) feed_var_names = [feed_var_dict[x].name for x in feed_var_dict] @@ -194,16 +197,30 @@ def save_model(server_model_folder, target_vars.append(fetch_var_dict[key]) target_var_names.append(key) - if not encryption: - save_inference_model( - server_model_folder, - feed_var_names, - target_vars, - executor, - model_filename="__model__", - params_filename="__params__", - main_program=main_program) - else: + if not encryption and not show_proto: + if not 
os.path.exists(server_model_folder): + os.makedirs(server_model_folder) + if not model_filename: + model_filename = "model.pdmodel" + if not params_filename: + params_filename = "params.pdiparams" + + new_model_path = os.path.join(server_model_folder, model_filename) + new_params_path = os.path.join(server_model_folder, params_filename) + + with open(new_model_path, "wb") as new_model_file: + new_model_file.write(main_program.desc.serialize_to_string()) + + paddle.static.save_vars( + executor=executor, + dirname=server_model_folder, + main_program=main_program, + vars=None, + predicate=paddle.static.io.is_persistable, + filename=params_filename) + elif not show_proto: + if not os.path.exists(server_model_folder): + os.makedirs(server_model_folder) if encrypt_conf == None: aes_cipher = CipherFactory.create_cipher() else: @@ -221,10 +238,19 @@ def save_model(server_model_folder, os.chdir("..") config = model_conf.GeneralModelConfig() - - for key in feed_var_dict: + if feed_alias_names is None: + feed_alias = list(feed_var_dict.keys()) + else: + feed_alias = feed_alias_names.split(',') + if fetch_alias_names is None: + fetch_alias = target_var_names + else: + fetch_alias = fetch_alias_names.split(',') + if len(feed_alias) != len(feed_var_dict.keys()) or len(fetch_alias) != len(target_var_names): + raise ValueError("please check the input --feed_alias_names and --fetch_alias_names, should be same size with feed_vars and fetch_vars") + for i, key in enumerate(feed_var_dict): feed_var = model_conf.FeedVar() - feed_var.alias_name = key + feed_var.alias_name = feed_alias[i] feed_var.name = feed_var_dict[key].name feed_var.feed_type = var_type_conversion(feed_var_dict[key].dtype) @@ -239,9 +265,9 @@ def save_model(server_model_folder, feed_var.shape.extend(tmp_shape) config.feed_var.extend([feed_var]) - for key in target_var_names: + for i, key in enumerate(target_var_names): fetch_var = model_conf.FetchVar() - fetch_var.alias_name = key + fetch_var.alias_name = fetch_alias[i] fetch_var.name = fetch_var_dict[key].name fetch_var.fetch_type = var_type_conversion(fetch_var_dict[key].dtype) @@ -257,6 +283,9 @@ def save_model(server_model_folder, fetch_var.shape.extend(tmp_shape) config.fetch_var.extend([fetch_var]) + if show_proto: + print(str(config)) + return try: save_dirname = os.path.normpath(client_config_folder) os.makedirs(save_dirname) @@ -284,7 +313,10 @@ def inference_model_to_serving(dirname, params_filename=None, encryption=False, key_len=128, - encrypt_conf=None): + encrypt_conf=None, + show_proto=False, + feed_alias_names=None, + fetch_alias_names=None): paddle.enable_static() place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -296,7 +328,8 @@ def inference_model_to_serving(dirname, } fetch_dict = {x.name: x for x in fetch_targets} save_model(serving_server, serving_client, feed_dict, fetch_dict, - inference_program, encryption, key_len, encrypt_conf) + inference_program, encryption, key_len, encrypt_conf, + model_filename, params_filename, show_proto, feed_alias_names, fetch_alias_names) feed_names = feed_dict.keys() fetch_names = fetch_dict.keys() return feed_names, fetch_names diff --git a/python/paddle_serving_server/parse_profile.py b/python/paddle_serving_server/parse_profile.py index 37e801c255272778c6926642beabdcf2f3f92cf0..e718e4685e13ed35f8dba16eb0d5f8a3ff6fd305 100644 --- a/python/paddle_serving_server/parse_profile.py +++ b/python/paddle_serving_server/parse_profile.py @@ -96,7 +96,7 @@ if __name__ == "__main__": args = parse_args() benchmark_cfg_filename = 
args.benchmark_cfg f = open(benchmark_cfg_filename, 'r') - benchmark_config = yaml.load(f) + benchmark_config = yaml.load(f, yaml.FullLoader) f.close() benchmark_log_filename = args.benchmark_log f = open(benchmark_log_filename, 'r') diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index abccf5e179ccd4946fdf51a5f74ff6e5ee685b4c..8531e83fc1bb3a330d276e0c0d72616a810eea72 100755 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -37,7 +37,7 @@ import socket def port_is_available(port): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) + result = sock.connect_ex(('127.0.0.1', port)) if result != 0: return True else: diff --git a/python/paddle_serving_server/server.py b/python/paddle_serving_server/server.py old mode 100644 new mode 100755 index d1d3155112e44b0c71faa0bdd704dffa826aa077..f21e13aaf40dd7720cab87da046c6754845a9bd4 --- a/python/paddle_serving_server/server.py +++ b/python/paddle_serving_server/server.py @@ -228,7 +228,8 @@ class Server(object): engine.batch_infer_size = self.op_max_batch[index % len(self.op_max_batch)] - engine.enable_batch_align = 1 + engine.enable_overrun = False + engine.allow_split_request = True engine.model_dir = model_config_path engine.enable_memory_optimization = self.memory_optimization engine.enable_ir_optimization = self.ir_optimization @@ -537,7 +538,7 @@ class Server(object): def port_is_available(self, port): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) + result = sock.connect_ex(('127.0.0.1', port)) if result != 0: return True else: @@ -563,7 +564,7 @@ class Server(object): "-num_threads {} " \ "-port {} " \ "-precision {} " \ - "-use_calib {} " \ + "-use_calib={} " \ "-reload_interval_s {} " \ "-resource_path {} " \ "-resource_file {} " \ diff --git a/python/paddle_serving_server/web_service.py b/python/paddle_serving_server/web_service.py old mode 100644 new mode 100755 index e6405fbf2f06a895241ed1bf4b813ade7b42ed7a..52e394b2c07fd85b0da7be591c0b9a77669e19b6 --- a/python/paddle_serving_server/web_service.py +++ b/python/paddle_serving_server/web_service.py @@ -33,7 +33,7 @@ from paddle_serving_server.serve import format_gpu_to_strlist def port_is_available(port): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) + result = sock.connect_ex(('127.0.0.1', port)) if result != 0: return True else: diff --git a/python/pipeline/analyse.py b/python/pipeline/analyse.py index 814b43acaf52bbf0c066ff4bbdce2a0165508a2d..a571ccfe9018fac70523803c40d05df1cf16e271 100644 --- a/python/pipeline/analyse.py +++ b/python/pipeline/analyse.py @@ -274,7 +274,7 @@ class OpAnalyst(object): """ import yaml with open(op_config_yaml) as f: - op_config = yaml.load(f) + op_config = yaml.load(f, yaml.FullLoader) # check that each model is deployed on a different card card_set = set() diff --git a/python/pipeline/channel.py b/python/pipeline/channel.py index f1851d6281f4422848eb81ef3224ef2f93ccc01c..b7c229ea6f85794849e156524616647f4bc121ec 100644 --- a/python/pipeline/channel.py +++ b/python/pipeline/channel.py @@ -28,6 +28,7 @@ import logging import enum import os import copy +import time _LOGGER = logging.getLogger(__name__) @@ -45,7 +46,9 @@ class ChannelDataErrcode(enum.Enum): CLOSED_ERROR = 6 NO_SERVICE = 7 UNKNOW = 8 - PRODUCT_ERROR 
= 9 + INPUT_PARAMS_ERROR = 9 + + PRODUCT_ERROR = 100 class ProductErrCode(enum.Enum): @@ -124,7 +127,6 @@ class ChannelData(object): def get_size(self): size = 0 - dict_data = None if isinstance(self.dictdata, dict): for k in self.dictdata: size += sys.getsizeof(self.dictdata[k]) + sys.getsizeof(k) @@ -259,7 +261,11 @@ class ProcessChannel(object): maintains the data obtained from queue. """ - def __init__(self, manager, name=None, maxsize=0): + def __init__(self, + manager, + name=None, + maxsize=0, + channel_recv_frist_arrive=False): # For queue multiprocess: after putting an object on # an empty queue there may be an infinitessimal delay # before the queue's :meth:`~Queue.empty` @@ -283,6 +289,9 @@ class ProcessChannel(object): self._base_cursor = manager.Value('i', 0) self._output_buf = manager.list() + self._cur_max_dataid = manager.Value('i', -1) + self._channel_recv_frist_arrive = channel_recv_frist_arrive + def get_maxsize(self): return self._maxsize @@ -325,9 +334,10 @@ class ProcessChannel(object): def push(self, channeldata, op_name=None): _LOGGER.debug( self._log( - "(data_id={} log_id={}) Op({}) Enter channel::push producers:{}". + "(data_id={} log_id={}) Op({}) Enter channel::push producers:{}, time:{}". format(channeldata.id, channeldata.log_id, op_name, - len(self._producers)))) + len(self._producers), time.time()))) + if len(self._producers) == 0: _LOGGER.critical( self._log( @@ -355,16 +365,55 @@ class ProcessChannel(object): self._cv.notify_all() notify_all_time = _time() _LOGGER.debug( - "(data_id={}) Op({}) channel push cost! enter_cv:{} ms, push_que:{} ms, notify:{} ms, data_size:{}". + "(data_id={}) Op({}) channel push cost! enter_cv:{} ms, push_que:{} ms, notify:{} ms, data_size:{}, time:{}". format(channeldata.id, op_name, (enter_cv_time - start_time) * 1000, (push_que_time - enter_cv_time) * 1000, ( notify_all_time - push_que_time) * 1000, - channeldata.get_size())) + channeldata.get_size(), time.time())) _LOGGER.debug( self._log( "(data_id={} log_id={}) Op({}) Pushed data into internal queue.". format(channeldata.id, channeldata.log_id, op_name))) return True + elif self._channel_recv_frist_arrive == True: + start_time = _time() + with self._cv: + _LOGGER.debug( + "(data_id={}) Op({}) Channel({}) enter channel_recv_first_arrive. _cur_max_dataid:{}". + format(channeldata.id, op_name, self.name, + self._cur_max_dataid.value)) + if channeldata.id > self._cur_max_dataid.value: + enter_cv_time = _time() + push_que_time = enter_cv_time + while self._stop.value == 0: + try: + self._que.put((channeldata.id, { + op_name: channeldata + }), + timeout=0) + push_que_time = _time() + self._cur_max_dataid.value = channeldata.id + break + except Queue.Full: + self._cv.wait() + if self._stop.value == 1: + raise ChannelStopError() + self._cv.notify_all() + notify_all_time = _time() + _LOGGER.debug( + "(data_id={}) Op({}) channel push cost! enter_cv:{} ms, push_que:{} ms, notify:{} ms, data_size:{}, time:{}". + format(channeldata.id, op_name, ( + enter_cv_time - start_time) * 1000, ( + push_que_time - enter_cv_time) * 1000, ( + notify_all_time - push_que_time) * 1000, + channeldata.get_size(), time.time())) + else: + # log and drop it + _LOGGER.debug( + "(data_id={}) Op({}) send data is dropped! cur_max_dataid:{}". 
+ format(channeldata.id, op_name, + self._cur_max_dataid.value)) + return True elif op_name is None: _LOGGER.critical( self._log( @@ -414,8 +463,8 @@ class ProcessChannel(object): _LOGGER.debug( self._log( - "(data_id={} log_id={}) Op({}) Pushed data into internal_queue.". - format(data_id, log_id, op_name))) + "(data_id={} log_id={}) Op({}) Pushed data into internal_queue. time:{}". + format(data_id, log_id, op_name, time.time()))) self._cv.notify_all() return True @@ -464,9 +513,9 @@ class ProcessChannel(object): key = list(resp.keys())[0] data_id = resp[key].id _LOGGER.debug( - "(data_id={}) op({}) front cost enter_cv:{} ms, queue_get:{} ms". + "(data_id={}) op({}) front cost enter_cv:{} ms, queue_get:{} ms, time:{}". format(data_id, op_name, (time_2 - time_1) / 1000.0, ( - time_3 - time_2) / 1000.0)) + time_3 - time_2) / 1000.0, time.time())) if resp is not None: list_values = list(resp.values()) _LOGGER.debug( @@ -501,9 +550,9 @@ class ProcessChannel(object): list_values = list(channeldata.values()) _LOGGER.debug( self._log( - "(data_id={} log_id={}) Op({}) Pop ready item into output_buffer". + "(data_id={} log_id={}) Op({}) Pop ready item into output_buffer, time:{}". format(list_values[0].id, list_values[0].log_id, - op_name))) + op_name, time.time()))) break except Queue.Empty: if timeout is not None: @@ -561,8 +610,9 @@ class ProcessChannel(object): list_values = list(resp.values()) _LOGGER.debug( self._log( - "(data_id={} log_id={}) Op({}) Got data from output_buffer". - format(list_values[0].id, list_values[0].log_id, op_name))) + "(data_id={} log_id={}) Op({}) Got data from output_buffer, time:{}". + format(list_values[0].id, list_values[0].log_id, op_name, + time.time()))) return resp def stop(self): @@ -601,7 +651,7 @@ class ThreadChannel(Queue.PriorityQueue): maintains the data obtained from queue. """ - def __init__(self, name=None, maxsize=-1): + def __init__(self, name=None, maxsize=-1, channel_recv_frist_arrive=False): Queue.Queue.__init__(self, maxsize=maxsize) self._maxsize = maxsize self.name = name @@ -619,6 +669,9 @@ class ThreadChannel(Queue.PriorityQueue): self._base_cursor = 0 self._output_buf = [] + self._channel_recv_frist_arrive = channel_recv_frist_arrive + self._cur_max_dataid = -1 + def get_maxsize(self): return self._maxsize @@ -662,6 +715,7 @@ class ThreadChannel(Queue.PriorityQueue): _LOGGER.debug( self._log("(data_id={} log_id={}) Op({}) Pushing data".format( channeldata.id, channeldata.log_id, op_name))) + if len(self._producers) == 0: _LOGGER.critical( self._log( @@ -688,6 +742,29 @@ class ThreadChannel(Queue.PriorityQueue): "(data_id={} log_id={}) Op({}) Pushed data into internal_queue.". format(channeldata.id, channeldata.log_id, op_name))) return True + elif self._channel_recv_frist_arrive is True: + with self._cv: + if channeldata.id > self._cur_max_dataid: + while self._stop is False: + try: + self.put((channeldata.id, { + op_name: channeldata + }), + timeout=0) + self._cur_max_dataid = channeldata.id + break + except Queue.Full: + self._cv.wait() + if self._stop: + raise ChannelStopError() + self._cv.notify_all() + else: + # log and drop it + _LOGGER.debug( + "(data_id={}) Op({}) send data is dropped! cur_max_dataid:{}". 
+ format(channeldata.id, op_name, self._cur_max_dataid)) + return True + elif op_name is None: _LOGGER.critical( self._log( diff --git a/python/pipeline/dag.py b/python/pipeline/dag.py index 69ed7124f51948e643e204001c699f820bf288f4..c1a158677df8738651b6947024cf7441cff06701 100644 --- a/python/pipeline/dag.py +++ b/python/pipeline/dag.py @@ -63,6 +63,7 @@ class DAGExecutor(object): self._retry = dag_conf["retry"] self._server_use_profile = dag_conf["use_profile"] channel_size = dag_conf["channel_size"] + channel_recv_frist_arrive = dag_conf["channel_recv_frist_arrive"] self._is_thread_op = dag_conf["is_thread_op"] tracer_conf = dag_conf["tracer"] @@ -79,7 +80,7 @@ class DAGExecutor(object): self._dag = DAG(self.name, response_op, self._server_use_profile, self._is_thread_op, channel_size, build_dag_each_worker, - self._tracer) + self._tracer, channel_recv_frist_arrive) (in_channel, out_channel, pack_rpc_func, unpack_rpc_func) = self._dag.build() self._dag.start() @@ -480,7 +481,8 @@ class DAG(object): """ def __init__(self, request_name, response_op, use_profile, is_thread_op, - channel_size, build_dag_each_worker, tracer): + channel_size, build_dag_each_worker, tracer, + channel_recv_frist_arrive): self._request_name = request_name self._response_op = response_op self._use_profile = use_profile @@ -488,6 +490,7 @@ class DAG(object): self._channel_size = channel_size self._build_dag_each_worker = build_dag_each_worker self._tracer = tracer + self._channel_recv_frist_arrive = channel_recv_frist_arrive if not self._is_thread_op: self._manager = PipelineProcSyncManager() _LOGGER.info("[DAG] Succ init") @@ -543,10 +546,15 @@ class DAG(object): channel = None if self._is_thread_op: channel = ThreadChannel( - name=name_gen.next(), maxsize=self._channel_size) + name=name_gen.next(), + maxsize=self._channel_size, + channel_recv_frist_arrive=self._channel_recv_frist_arrive) else: channel = ProcessChannel( - self._manager, name=name_gen.next(), maxsize=self._channel_size) + self._manager, + name=name_gen.next(), + maxsize=self._channel_size, + channel_recv_frist_arrive=self._channel_recv_frist_arrive) _LOGGER.debug("[DAG] Generate channel: {}".format(channel.name)) return channel diff --git a/python/pipeline/gateway/proto/gateway.proto b/python/pipeline/gateway/proto/gateway.proto index e56d428ece5201c78742575dcb84501a6a7b8455..8dc27a2e953ffe66aa40818ff02ec313293b4c2a 100644 --- a/python/pipeline/gateway/proto/gateway.proto +++ b/python/pipeline/gateway/proto/gateway.proto @@ -18,22 +18,117 @@ option go_package = "./;pipeline_serving"; import "google/api/annotations.proto"; +// Tensor structure, consistent with PADDLE variable types. +// Descriptions of input and output data. +message Tensor { + + // VarType: INT64 + repeated int64 int64_data = 1; + + // VarType: FP32, FP16 + repeated float float_data = 2; + + // VarType: INT32, INT16, INT8 + repeated int32 int_data = 3; + + // VarType: FP64 + repeated double float64_data = 4; + + // VarType: BF16, UINT8 + repeated uint32 uint32_data = 5; + + // VarType: BOOL + repeated bool bool_data = 6; + + // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated float complex64_data = 7; + + // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated double complex128_data = 8; + + // VarType: STRING + repeated string str_data = 9; + + // VarType: BYTES, is suitable for big data. 
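The elem_type codes listed in the Tensor comments below are the contract shared by the client-side packer and the RequestOp unpacker added later in this patch. A hedged sketch of turning an elem_type code plus flat data and a shape back into a numpy array; it covers only a few numeric codes and is not the real implementation:

```python
# Hedged sketch of the elem_type -> numpy dtype convention; only a handful of
# numeric codes are handled here, the full mapping lives in operator.py.
import numpy as np

ELEM_TYPE_TO_DTYPE = {0: np.int64, 1: np.float32, 2: np.int32, 3: np.float64}


def flat_data_to_array(elem_type, flat_data, shape):
    dtype = ELEM_TYPE_TO_DTYPE.get(elem_type)
    if dtype is None:
        raise ValueError("elem_type {} not handled in this sketch".format(elem_type))
    return np.array(flat_data, dtype=dtype).reshape(shape)


# e.g. an INT64 tensor (elem_type 0) with shape [2, 3]
arr = flat_data_to_array(0, [1, 2, 3, 4, 5, 6], [2, 3])
```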
No need to save data types and + // dimensions + // pack method: pack by BytesIO, saved by np.save + // unpack method: load by np.load, unpack by BytesIO. + bytes byte_data = 10; + + // Element types: + // 0 => INT64 + // 1 => FP32 + // 2 => INT32 + // 3 => FP64 + // 4 => INT16 + // 5 => FP16 + // 6 => BF16 + // 7 => UINT8 + // 8 => INT8 + // 9 => BOOL + // 10 => COMPLEX64 + // 11 => COMPLEX128 + // 12 => STRING + // 13 => BYTES + int32 elem_type = 20; + + // Shape of the tensor, including batch dimensions. + repeated int32 shape = 21; + + // Level of data(LOD), support variable length data, only for fetch tensor + // currently. + repeated int32 lod = 22; + + // Correspond to the variable 'name' in the model description prototxt. + string name = 23; +}; + +// The structure of the service request. The input data can be repeated string +// pairs or tensors. +message Request { + // The input data are repeated string pairs. + // for examples. key is "words", value is the string of words. + repeated string key = 1; + repeated string value = 2; + + // The input data are repeated tensors for complex data structures. + // Becase tensors can save more data information and reduce the amount of data + // transferred. + repeated Tensor tensors = 3; + + // The name field in the RESTful API + string name = 4; + + // The method field in the RESTful API + string method = 5; + + // For tracing requests and logs + int64 logid = 6; + + // For tracking sources + string clientip = 7; +}; + +// The structure of the service response. The output data can be repeated string +// pairs or tensors. message Response { + // Error code int32 err_no = 1; + + // Error messages string err_msg = 2; + + // The results of string pairs repeated string key = 3; repeated string value = 4; -}; -message Request { - repeated string key = 1; - repeated string value = 2; - string name = 3; - string method = 4; - int64 logid = 5; - string clientip = 6; + // The results of tensors + repeated Tensor tensors = 5; }; +// Python pipeline service service PipelineService { rpc inference(Request) returns (Response) { option (google.api.http) = { diff --git a/python/pipeline/operator.py b/python/pipeline/operator.py index 87df16e060a5caec7211dba5d970afb5818c121c..4d4717d66f6877e9b9832c7d1a346eea48ddc84b 100644 --- a/python/pipeline/operator.py +++ b/python/pipeline/operator.py @@ -26,6 +26,7 @@ import collections import numpy as np import json from numpy import * +from io import BytesIO if sys.version_info.major == 2: import Queue elif sys.version_info.major == 3: @@ -40,10 +41,29 @@ from .channel import (ThreadChannel, ProcessChannel, ChannelDataErrcode, from .util import NameGenerator from .profiler import UnsafeTimeProfiler as TimeProfiler from . 
import local_service_handler +from .pipeline_client import PipelineClient as PPClient _LOGGER = logging.getLogger(__name__) _op_name_gen = NameGenerator("Op") +# data type of tensor to numpy_data +_TENSOR_DTYPE_2_NUMPY_DATA_DTYPE = { + 0: "int64", # VarType.INT64 + 1: "float32", # VarType.FP32 + 2: "int32", # VarType.INT32 + 3: "float64", # VarType.FP64 + 4: "int16", # VarType.int16 + 5: "float16", # VarType.FP32 + 6: "uint16", # VarType.BF16 + 7: "uint8", # VarType.UINT8 + 8: "int8", # VarType.INT8 + 9: "bool", # VarType.BOOL + 10: "complex64", # VarType.COMPLEX64 + 11: "complex128", # VarType.COMPLEX128 + 12: "string", # load by numpy + 13: "bytes", # load by numpy +} + class Op(object): def __init__(self, @@ -84,6 +104,9 @@ class Op(object): self._server_use_profile = False self._tracer = None + # for grpc_pipeline predict mode. False, string key/val; True, tensor format. + self._pack_tensor_format = False + # only for thread op self._for_init_op_lock = threading.Lock() self._for_close_op_lock = threading.Lock() @@ -330,9 +353,8 @@ class Op(object): if self.client_type == 'brpc': client = Client() client.load_client_config(client_config) - # 待测试完成后,使用brpc-http替代。 - # elif self.client_type == 'grpc': - # client = MultiLangClient() + elif self.client_type == 'pipeline_grpc': + client = PPClient() elif self.client_type == 'local_predictor': if self.local_predictor is None: raise ValueError("local predictor not yet created") @@ -372,6 +394,9 @@ class Op(object): os._exit(-1) self._input_ops.append(op) + def set_pack_tensor_format(self, is_tensor_format=False): + self._pack_tensor_format = is_tensor_format + def get_jump_to_ops(self): return self._jump_to_ops @@ -483,7 +508,7 @@ class Op(object): os._exit(-1) channel.add_producer(self.name) self._outputs.append(channel) - _LOGGER.info("op:{} add output_channel {}".format(self.name, channel)) + _LOGGER.debug("op:{} add output_channel {}".format(self.name, channel)) def clean_output_channels(self): self._outputs = [] @@ -531,32 +556,73 @@ class Op(object): Returns: call_result: predict result """ - err, err_info = ChannelData.check_batch_npdata(feed_batch) - if err != 0: - _LOGGER.critical( - self._log("Failed to run process: {}. Please override " - "preprocess func.".format(err_info))) - os._exit(-1) + + call_result = None + err_code = ChannelDataErrcode.OK.value + err_info = "" + if self.client_type == "local_predictor": + err, err_info = ChannelData.check_batch_npdata(feed_batch) + if err != 0: + _LOGGER.error( + self._log("Failed to run process: {}. feed_batch must be \ + npdata in process for local_predictor mode." + .format(err_info))) + return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be npdata" + call_result = self.client.predict( feed=feed_batch[0], fetch=self._fetch_names, batch=True, log_id=typical_logid) - else: + + elif self.client_type == "brpc": + err, err_info = ChannelData.check_batch_npdata(feed_batch) + if err != 0: + _LOGGER.error( + self._log("Failed to run process: {}. 
feed_batch must be \ + npdata in process for brpc mode.".format(err_info))) + return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be npdata" call_result = self.client.predict( - feed=feed_batch, + feed=feed_batch[0], fetch=self._fetch_names, batch=True, log_id=typical_logid) - # 后续用HttpClient替代 - ''' - if isinstance(self.client, MultiLangClient): - if call_result is None or call_result["serving_status_code"] != 0: - return None - call_result.pop("serving_status_code") - ''' - return call_result + + elif self.client_type == "pipeline_grpc": + err, err_info = ChannelData.check_dictdata(feed_batch) + if err != 0: + _LOGGER.error( + self._log("Failed to run process: {}. feed_batch must be \ + npdata in process for pipeline_grpc mode." + .format(err_info))) + return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be dict" + + call_result = self.client.predict( + feed_dict=feed_batch[0], + fetch=self._fetch_names, + asyn=False, + pack_tensor_format=self._pack_tensor_format, + profile=False) + if call_result is None: + _LOGGER.error( + self._log("Failed in pipeline_grpc. call_result is None.")) + return call_result, ChannelDataErrcode.UNKNOW.value, "pipeline_grpc error" + if call_result.err_no != 0: + _LOGGER.error( + self._log("Failed in pipeline_grpc. err_no:{}, err_info:{}". + format(call_result.err_no, call_result.err_msg))) + return call_result, ChannelDataErrcode( + call_result.err_no).value, call_result.err_msg + + new_dict = {} + err_code = ChannelDataErrcode(call_result.err_no).value + err_info = call_result.err_msg + for idx, key in enumerate(call_result.key): + new_dict[key] = [call_result.value[idx]] + call_result = new_dict + + return call_result, err_code, err_info def postprocess(self, input_data, fetch_data, data_id=0, log_id=0): """ @@ -891,16 +957,20 @@ class Op(object): midped_batch = None error_code = ChannelDataErrcode.OK.value + error_info = "" if self._timeout <= 0: # No retry try: if batch_input is False: - midped_batch = self.process(feed_batch, typical_logid) + midped_batch, error_code, error_info = self.process( + feed_batch, typical_logid) else: midped_batch = [] for idx in range(len(feed_batch)): - predict_res = self.process([feed_batch[idx]], - typical_logid) + predict_res, error_code, error_info = self.process( + [feed_batch[idx]], typical_logid) + if error_code != ChannelDataErrcode.OK.value: + break midped_batch.append(predict_res) except Exception as e: error_code = ChannelDataErrcode.UNKNOW.value @@ -913,14 +983,14 @@ class Op(object): try: # time out for each process if batch_input is False: - midped_batch = func_timeout.func_timeout( + midped_batch, error_code, error_info = func_timeout.func_timeout( self._timeout, self.process, args=(feed_batch, typical_logid)) else: midped_batch = [] for idx in range(len(feed_batch)): - predict_res = func_timeout.func_timeout( + predict_res, error_code, error_info = func_timeout.func_timeout( self._timeout, self.process, args=([feed_batch[idx]], typical_logid)) @@ -1265,6 +1335,8 @@ class Op(object): break end = int(round(_time() * 1000000)) in_time = end - start + _LOGGER.debug("op:{} in_time_end:{}".format(op_info_prefix, + time.time())) # parse channeldata batch try: @@ -1278,6 +1350,8 @@ class Op(object): if len(parsed_data_dict) == 0: # data in the whole batch is all error data continue + _LOGGER.debug("op:{} parse_end:{}".format(op_info_prefix, + time.time())) # print front_cost = int(round(_time() * 1000000)) - start @@ -1292,6 +1366,8 @@ class Op(object): = 
self._run_preprocess(parsed_data_dict, op_info_prefix, logid_dict) end = profiler.record("prep#{}_1".format(op_info_prefix)) prep_time = end - start + _LOGGER.debug("op:{} preprocess_end:{}, cost:{}".format( + op_info_prefix, time.time(), prep_time)) try: # put error requests into output channel, skip process and postprocess stage for data_id, err_channeldata in err_channeldata_dict.items(): @@ -1313,6 +1389,8 @@ class Op(object): = self._run_process(preped_data_dict, op_info_prefix, skip_process_dict, logid_dict) end = profiler.record("midp#{}_1".format(op_info_prefix)) midp_time = end - start + _LOGGER.debug("op:{} process_end:{}, cost:{}".format( + op_info_prefix, time.time(), midp_time)) try: for data_id, err_channeldata in err_channeldata_dict.items(): self._push_to_output_channels( @@ -1334,6 +1412,8 @@ class Op(object): end = profiler.record("postp#{}_1".format(op_info_prefix)) postp_time = end - start after_postp_time = _time() + _LOGGER.debug("op:{} postprocess_end:{}, cost:{}".format( + op_info_prefix, time.time(), postp_time)) try: for data_id, err_channeldata in err_channeldata_dict.items(): self._push_to_output_channels( @@ -1486,6 +1566,90 @@ class RequestOp(Op): _LOGGER.critical("Op(Request) Failed to init: {}".format(e)) os._exit(-1) + def proto_tensor_2_numpy(self, tensor): + """ + Convert proto tensor to numpy array, The supported types are as follows: + INT64 + FP32 + INT32 + FP64 + INT16 + FP16 + BF16 + UINT8 + INT8 + BOOL + BYTES + Unsupported type: + STRING + COMPLEX64 + COMPLEX128 + + Args: + tensor: one tensor in request.tensors. + + Returns: + np.ndnumpy + """ + if tensor is None or tensor.elem_type is None or tensor.name is None: + _LOGGER.error("input params of tensor is wrong. tensor: {}".format( + tensor)) + return None + + dims = [] + if tensor.shape is None: + dims.append(1) + else: + for one_dim in tensor.shape: + dims.append(one_dim) + + np_data = None + _LOGGER.info("proto_to_numpy, name:{}, type:{}, dims:{}".format( + tensor.name, tensor.elem_type, dims)) + if tensor.elem_type == 0: + # VarType: INT64 + np_data = np.array(tensor.int64_data).astype(int64).reshape(dims) + elif tensor.elem_type == 1: + # VarType: FP32 + np_data = np.array(tensor.float_data).astype(float32).reshape(dims) + elif tensor.elem_type == 2: + # VarType: INT32 + np_data = np.array(tensor.int_data).astype(int32).reshape(dims) + elif tensor.elem_type == 3: + # VarType: FP64 + np_data = np.array(tensor.float64_data).astype(float64).reshape( + dims) + elif tensor.elem_type == 4: + # VarType: INT16 + np_data = np.array(tensor.int_data).astype(int16).reshape(dims) + elif tensor.elem_type == 5: + # VarType: FP16 + np_data = np.array(tensor.float_data).astype(float16).reshape(dims) + elif tensor.elem_type == 6: + # VarType: BF16 + np_data = np.array(tensor.uint32_data).astype(uint16).reshape(dims) + elif tensor.elem_type == 7: + # VarType: UINT8 + np_data = np.array(tensor.uint32_data).astype(uint8).reshape(dims) + elif tensor.elem_type == 8: + # VarType: INT8 + np_data = np.array(tensor.int_data).astype(int8).reshape(dims) + elif tensor.elem_type == 9: + # VarType: BOOL + np_data = np.array(tensor.bool_data).astype(bool).reshape(dims) + elif tensor.elem_type == 13: + # VarType: BYTES + byte_data = BytesIO(tensor.byte_data) + np_data = np.load(byte_data, allow_pickle=True) + else: + _LOGGER.error("Sorry, the type {} of tensor {} is not supported.". 
+ format(tensor.elem_type, tensor.name)) + raise ValueError( + "Sorry, the type {} of tensor {} is not supported.".format( + tensor.elem_type, tensor.name)) + + return np_data + def unpack_request_package(self, request): """ Unpack request package by gateway.proto @@ -1506,12 +1670,47 @@ class RequestOp(Op): _LOGGER.critical("request is None") raise ValueError("request is None") + # unpack key/value string list for idx, key in enumerate(request.key): dict_data[key] = request.value[idx] log_id = request.logid - _LOGGER.debug("RequestOp unpack one request. log_id:{}, clientip:{} \ - name:{}, method:{}".format(log_id, request.clientip, request.name, - request.method)) + + # unpack proto.tensors data. + for one_tensor in request.tensors: + name = one_tensor.name + elem_type = one_tensor.elem_type + + if one_tensor.name is None: + _LOGGER.error("Tensor name is None.") + raise ValueError("Tensor name is None.") + + numpy_dtype = _TENSOR_DTYPE_2_NUMPY_DATA_DTYPE.get(elem_type) + if numpy_dtype is None: + _LOGGER.error( + "elem_type:{} is dismatch in unpack_request_package.", + format(elem_type)) + raise ValueError("elem_type:{} error".format(elem_type)) + + if numpy_dtype == "string": + new_string = "" + if one_tensor.str_data is None: + _LOGGER.error( + "str_data of tensor:{} is None, elem_type is {}.". + format(name, elem_type)) + raise ValueError( + "str_data of tensor:{} is None, elem_type is {}.". + format(name, elem_type)) + for one_str in one_tensor.str_data: + new_string += one_str + + dict_data[name] = new_string + else: + dict_data[name] = self.proto_tensor_2_numpy(one_tensor) + + _LOGGER.info("RequestOp unpack one request. log_id:{}, clientip:{} \ + name:{}, method:{}, time:{}" + .format(log_id, request.clientip, request.name, + request.method, time.time())) return dict_data, log_id, None, "" @@ -1530,6 +1729,7 @@ class ResponseOp(Op): """ super(ResponseOp, self).__init__( name="@DAGExecutor", input_ops=input_ops) + # init op try: self.init_op() @@ -1538,6 +1738,12 @@ class ResponseOp(Op): e, exc_info=True)) os._exit(-1) + # init ResponseOp + self.is_pack_tensor = False + + def set_pack_format(self, isTensor=False): + self.is_pack_tensor = isTensor + def pack_response_package(self, channeldata): """ Getting channeldata from the last channel, packting the response diff --git a/python/pipeline/pipeline_client.py b/python/pipeline/pipeline_client.py index 132cf043cd49f097c4ee47e36ce67f53f022b82a..cff7e9c3f77db88c762383b83dc466ed2a8240ae 100644 --- a/python/pipeline/pipeline_client.py +++ b/python/pipeline/pipeline_client.py @@ -14,6 +14,7 @@ # pylint: disable=doc-string-missing import grpc import sys +import time import numpy as np from numpy import * import logging @@ -24,6 +25,7 @@ from .channel import ChannelDataErrcode from .proto import pipeline_service_pb2 from .proto import pipeline_service_pb2_grpc import six +from io import BytesIO _LOGGER = logging.getLogger(__name__) @@ -46,7 +48,8 @@ class PipelineClient(object): self._stub = pipeline_service_pb2_grpc.PipelineServiceStub( self._channel) - def _pack_request_package(self, feed_dict, profile): + def _pack_request_package(self, feed_dict, pack_tensor_format, + use_tensor_bytes, profile): req = pipeline_service_pb2.Request() logid = feed_dict.get("logid") @@ -69,38 +72,120 @@ class PipelineClient(object): feed_dict.pop("clientip") np.set_printoptions(threshold=sys.maxsize) - for key, value in feed_dict.items(): - req.key.append(key) - - if (sys.version_info.major == 2 and isinstance(value, - (str, unicode)) or - 
((sys.version_info.major == 3) and isinstance(value, str))): - req.value.append(value) - continue - - if isinstance(value, np.ndarray): - req.value.append(value.__repr__()) - elif isinstance(value, list): - req.value.append(np.array(value).__repr__()) - else: - raise TypeError("only str and np.ndarray type is supported: {}". - format(type(value))) - if profile: - req.key.append(self._profile_key) - req.value.append(self._profile_value) + if pack_tensor_format is False: + # pack string key/val format + for key, value in feed_dict.items(): + req.key.append(key) + + if (sys.version_info.major == 2 and + isinstance(value, (str, unicode)) or + ((sys.version_info.major == 3) and isinstance(value, str))): + req.value.append(value) + continue + + if isinstance(value, np.ndarray): + req.value.append(value.__repr__()) + elif isinstance(value, list): + req.value.append(np.array(value).__repr__()) + else: + raise TypeError( + "only str and np.ndarray type is supported: {}".format( + type(value))) + + if profile: + req.key.append(self._profile_key) + req.value.append(self._profile_value) + else: + # pack tensor format + for key, value in feed_dict.items(): + one_tensor = req.tensors.add() + one_tensor.name = key + + if isinstance(value, str): + one_tensor.string_data.add(value) + one_tensor.elem_type = 12 #12 => string in proto + continue + + if isinstance(value, np.ndarray): + # copy shape + _LOGGER.debug( + "key:{}, use_tensor_bytes:{}, value.shape:{}, value.dtype:{}". + format(key, use_tensor_bytes, value.shape, value.dtype)) + for one_dim in value.shape: + one_tensor.shape.append(one_dim) + + # packed into bytes + if use_tensor_bytes is True: + np_bytes = BytesIO() + np.save(np_bytes, value, allow_pickle=True) + one_tensor.byte_data = np_bytes.getvalue() + one_tensor.elem_type = 13 #13 => bytes in proto + continue + + flat_value = value.flatten().tolist() + # copy data + if value.dtype == "int64": + one_tensor.int64_data.extend(flat_value) + one_tensor.elem_type = 0 + elif value.dtype == "float32": + one_tensor.float_data.extend(flat_value) + one_tensor.elem_type = 1 + elif value.dtype == "int32": + one_tensor.int_data.extend(flat_value) + one_tensor.elem_type = 2 + elif value.dtype == "float64": + one_tensor.float64_data.extend(flat_value) + one_tensor.elem_type = 3 + elif value.dtype == "int16": + one_tensor.int_data.extend(flat_value) + one_tensor.elem_type = 4 + elif value.dtype == "float16": + one_tensor.float_data.extend(flat_value) + one_tensor.elem_type = 5 + elif value.dtype == "uint16": + one_tensor.uint32_data.extend(flat_value) + one_tensor.elem_type = 6 + elif value.dtype == "uint8": + one_tensor.uint32_data.extend(flat_value) + one_tensor.elem_type = 7 + elif value.dtype == "int8": + one_tensor.int_data.extend(flat_value) + one_tensor.elem_type = 8 + elif value.dtype == "bool": + one_tensor.bool_data.extend(flat_value) + one_tensor.elem_type = 9 + else: + _LOGGER.error( + "value type {} of tensor {} is not supported.". 
+ format(value.dtype, key)) + else: + raise TypeError( + "only str and np.ndarray type is supported: {}".format( + type(value))) return req def _unpack_response_package(self, resp, fetch): return resp - def predict(self, feed_dict, fetch=None, asyn=False, profile=False): + def predict(self, + feed_dict, + fetch=None, + asyn=False, + pack_tensor_format=False, + use_tensor_bytes=False, + profile=False, + log_id=0): if not isinstance(feed_dict, dict): raise TypeError( "feed must be dict type with format: {name: value}.") if fetch is not None and not isinstance(fetch, list): raise TypeError("fetch must be list type with format: [name].") - req = self._pack_request_package(feed_dict, profile) + print("PipelineClient::predict pack_data time:{}".format(time.time())) + req = self._pack_request_package(feed_dict, pack_tensor_format, + use_tensor_bytes, profile) + req.logid = log_id if not asyn: + print("PipelineClient::predict before time:{}".format(time.time())) resp = self._stub.inference(req) return self._unpack_response_package(resp, fetch) else: diff --git a/python/pipeline/pipeline_server.py b/python/pipeline/pipeline_server.py index 5fcc31879e4a6e55968f40c44058b37e48d03475..5d3fa3540149412186b9335741964910a7ed56d2 100644 --- a/python/pipeline/pipeline_server.py +++ b/python/pipeline/pipeline_server.py @@ -22,6 +22,7 @@ from contextlib import closing import multiprocessing import yaml import io +import time from .proto import pipeline_service_pb2_grpc, pipeline_service_pb2 from . import operator @@ -47,8 +48,9 @@ class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer): _LOGGER.info("[PipelineServicer] succ init") def inference(self, request, context): - _LOGGER.info("(log_id={}) inference request name:{} self.name:{}". - format(request.logid, request.name, self._name)) + _LOGGER.info( + "(log_id={}) inference request name:{} self.name:{} time:{}".format( + request.logid, request.name, self._name, time.time())) if request.name != "" and request.name != self._name: _LOGGER.error("(log_id={}) name dismatch error. request.name:{}," "server.name={}".format(request.logid, request.name, @@ -339,7 +341,7 @@ class ServerYamlConfChecker(object): " or yml_dict can be selected as the parameter.") if yml_file is not None: with io.open(yml_file, encoding='utf-8') as f: - conf = yaml.load(f.read()) + conf = yaml.load(f.read(), yaml.FullLoader) elif yml_dict is not None: conf = yml_dict else: @@ -469,6 +471,7 @@ class ServerYamlConfChecker(object): "channel_size": 0, "is_thread_op": True, "tracer": {}, + "channel_recv_frist_arrive": False, } conf_type = { @@ -477,6 +480,7 @@ class ServerYamlConfChecker(object): "use_profile": bool, "channel_size": int, "is_thread_op": bool, + "channel_recv_frist_arrive": bool, } conf_qualification = { diff --git a/python/pipeline/proto/pipeline_service.proto b/python/pipeline/proto/pipeline_service.proto index e29cc1f7f688a0d3fdaa0c01edf21eab94c0cd74..ff3eda8bf9f5357a2930b05da0b7f893d916b2bd 100644 --- a/python/pipeline/proto/pipeline_service.proto +++ b/python/pipeline/proto/pipeline_service.proto @@ -12,25 +12,120 @@ // See the License for the specific language governing permissions and // limitations under the License. -syntax = "proto2"; +syntax = "proto3"; package baidu.paddle_serving.pipeline_serving; +// Tensor structure, consistent with PADDLE variable types. +// Descriptions of input and output data. 
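The Tensor message that follows mirrors the one added to gateway.proto above. For BYTES data (elem_type 13) the convention described in its comments is: pack with np.save into a BytesIO buffer on the client, restore with np.load in the RequestOp. A minimal round-trip sketch that exercises only numpy, not the serving stack:

```python
# Round-trip sketch for the BYTES (elem_type = 13) packing convention.
from io import BytesIO

import numpy as np

value = np.random.rand(1, 3, 224, 224).astype("float32")

# pack: np.save into a BytesIO buffer, the raw bytes become Tensor.byte_data
buf = BytesIO()
np.save(buf, value, allow_pickle=True)
byte_data = buf.getvalue()

# unpack: wrap the bytes again and np.load restores dtype and shape
restored = np.load(BytesIO(byte_data), allow_pickle=True)
assert restored.dtype == value.dtype and restored.shape == value.shape
```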
+message Tensor { + + // VarType: INT64 + repeated int64 int64_data = 1; + + // VarType: FP32, FP16 + repeated float float_data = 2; + + // VarType: INT32, INT16, INT8 + repeated int32 int_data = 3; + + // VarType: FP64 + repeated double float64_data = 4; + + // VarType: BF16, UINT8 + repeated uint32 uint32_data = 5; + + // VarType: BOOL + repeated bool bool_data = 6; + + // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated float complex64_data = 7; + + // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1 + // represents the imaginary part + repeated double complex128_data = 8; + + // VarType: STRING + repeated string str_data = 9; + + // VarType: BYTES, is suitable for big data. No need to save data types and + // dimensions + // pack method: pack by BytesIO, saved by np.save + // unpack method: load by np.load, unpack by BytesIO. + bytes byte_data = 10; + + // Element types: + // 0 => INT64 + // 1 => FP32 + // 2 => INT32 + // 3 => FP64 + // 4 => INT16 + // 5 => FP16 + // 6 => BF16 + // 7 => UINT8 + // 8 => INT8 + // 9 => BOOL + // 10 => COMPLEX64 + // 11 => COMPLEX128 + // 12 => STRING + // 13 => BYTES + int32 elem_type = 20; + + // Shape of the tensor, including batch dimensions. + repeated int32 shape = 21; + + // Level of data(LOD), support variable length data, only for fetch tensor + // currently. + repeated int32 lod = 22; + + // Correspond to the variable 'name' in the model description prototxt. + string name = 23; +}; + +// The structure of the service request. The input data can be repeated string +// pairs or tensors. message Request { + // The input data are repeated string pairs. + // for examples. key is "words", value is the string of words. repeated string key = 1; repeated string value = 2; - optional string name = 3; - optional string method = 4; - optional int64 logid = 5; - optional string clientip = 6; + + // The input data are repeated tensors for complex data structures. + // Becase tensors can save more data information and reduce the amount of data + // transferred. + repeated Tensor tensors = 3; + + // The name field in the RESTful API + string name = 4; + + // The method field in the RESTful API + string method = 5; + + // For tracing requests and logs + int64 logid = 6; + + // For tracking sources + string clientip = 7; }; +// The structure of the service response. The output data can be repeated string +// pairs or tensors. 
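With the Request message above accepting either key/value string pairs or typed tensors, PipelineClient.predict() in this patch chooses the representation through pack_tensor_format and use_tensor_bytes. A hedged client-side sketch; the import path, connect() call, endpoint and the feed/fetch names are assumptions taken from typical pipeline examples, while the predict() keywords come from this patch:

```python
# Hedged usage sketch; endpoint, connect() and the feed/fetch names are
# illustrative assumptions, the predict() keywords are from this patch.
import numpy as np
from paddle_serving_server.pipeline import PipelineClient  # assumed import path

client = PipelineClient()
client.connect(["127.0.0.1:9998"])   # assumed pipeline service endpoint

feed = {"image": np.random.rand(1, 3, 224, 224).astype("float32")}

# Default string key/value request: ndarray inputs are sent as repr() strings.
resp = client.predict(feed_dict=feed, fetch=["prediction"])

# Tensor request: typed data goes into Request.tensors; use_tensor_bytes packs
# each array as BYTES (np.save into BytesIO) instead of flat repeated fields.
resp = client.predict(
    feed_dict=feed,
    fetch=["prediction"],
    pack_tensor_format=True,
    use_tensor_bytes=True)
print(resp.err_no, resp.err_msg)
```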
 message Response {
-  optional int32 err_no = 1;
-  optional string err_msg = 2;
+  // Error code
+  int32 err_no = 1;
+
+  // Error messages
+  string err_msg = 2;
+
+  // The results of string pairs
   repeated string key = 3;
   repeated string value = 4;
+
+  // The results of tensors
+  repeated Tensor tensors = 5;
 };
 
+// Python pipeline service
 service PipelineService {
   rpc inference(Request) returns (Response) {}
 };
diff --git a/python/pipeline/util.py b/python/pipeline/util.py
old mode 100644
new mode 100755
index d7847f179de7557b5446958536008adc3c981f95..8bc15446b81c24162bbe2e236f204ffd1d0c23d1
--- a/python/pipeline/util.py
+++ b/python/pipeline/util.py
@@ -39,7 +39,7 @@ class AvailablePortGenerator(object):
     def port_is_available(port):
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
             sock.settimeout(2)
-            result = sock.connect_ex(('0.0.0.0', port))
+            result = sock.connect_ex(('127.0.0.1', port))
         if result != 0:
             return True
         else:
diff --git a/python/requirements.txt b/python/requirements.txt
index c28133c67f6c85e3dd12b08914c7aa0848a4cad7..ba7cf42d9e0a6b4cd713ef245108bb45e7244dda 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -7,7 +7,7 @@ protobuf>=3.12.2
 grpcio-tools>=1.28.1
 grpcio>=1.28.1
 func-timeout>=4.3.5
-pyyaml>=1.3.0
+pyyaml>=5.1
 flask>=1.1.2
 click==7.1.2
 itsdangerous==1.1.0
diff --git a/python/requirements_mac.txt b/python/requirements_mac.txt
index b14fbd5fc5b779e6f2d216df434bfeb615d59b05..6a396239c5e68e545bd5af0928b3e6f42b19c82b 100644
--- a/python/requirements_mac.txt
+++ b/python/requirements_mac.txt
@@ -6,7 +6,7 @@ google>=2.0.3
 opencv-python==4.2.0.32
 protobuf>=3.12.2
 func-timeout>=4.3.5
-pyyaml>=1.3.0
+pyyaml>=5.1
 flask>=1.1.2
 click==7.1.2
 itsdangerous==1.1.0
diff --git a/python/setup.py.server.in b/python/setup.py.server.in
index cf579db0ba082606e289eb49f8713b9441053743..dfe3761035c18cad0d74f25f9a17b268003dd201 100644
--- a/python/setup.py.server.in
+++ b/python/setup.py.server.in
@@ -33,7 +33,7 @@ util.gen_pipeline_code("paddle_serving_server")
 REQUIRED_PACKAGES = [
     'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
-    'flask >= 1.1.1', 'click==7.1.2', 'itsdangerous==1.1.0', 'Jinja2==2.11.3',
+    'flask >= 1.1.1,<2.0.0', 'click==7.1.2', 'itsdangerous==1.1.0', 'Jinja2==2.11.3',
     'MarkupSafe==1.1.1', 'Werkzeug==1.0.1', 'func_timeout', 'pyyaml'
 ]
diff --git a/tools/scripts/ipipe_py3.sh b/tools/scripts/ipipe_py3.sh
index 9778f87fe7d707aaa893911976921477e6a65a21..cbc7a11abc0b8b848f5d0b9f62595ed6d09284c6 100644
--- a/tools/scripts/ipipe_py3.sh
+++ b/tools/scripts/ipipe_py3.sh
@@ -40,10 +40,10 @@ go env -w GO111MODULE=auto
 
 build_whl_list=(build_cpu_server build_gpu_server build_client build_app)
 rpc_model_list=(grpc_fit_a_line grpc_yolov4 pipeline_imagenet bert_rpc_gpu bert_rpc_cpu ResNet50_rpc \
-lac_rpc cnn_rpc bow_rpc lstm_rpc fit_a_line_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \
+lac_rpc_asyn cnn_rpc_asyn bow_rpc lstm_rpc fit_a_line_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \
 criteo_ctr_rpc_cpu criteo_ctr_rpc_gpu ocr_rpc yolov4_rpc_gpu faster_rcnn_hrnetv2p_w18_1x_encrypt \
-faster_rcnn_model_rpc low_precision_resnet50_int8 ocr_c++_service)
-http_model_list=(fit_a_line_http lac_http cnn_http bow_http lstm_http ResNet50_http bert_http \
+faster_rcnn_model_rpc low_precision_resnet50_int8 ocr_c++_service ocr_c++_service_asyn)
+http_model_list=(fit_a_line_http lac_http imdb_http_proto imdb_http_json imdb_grpc ResNet50_http bert_http \
 pipeline_ocr_cpu_http)
 
 function setproxy() {
@@ -492,7 +492,7 @@ function ResNet101_rpc() {
     kill_server_process
 }
 
-function cnn_rpc() {
+function cnn_rpc_asyn() {
     dir=${log_dir}rpc_model/cnn_rpc/
     check_dir ${dir}
     unsetproxy
@@ -500,8 +500,9 @@ function cnn_rpc() {
     data_dir=${data}imdb/
     link_data ${data_dir}
     sed -i 's/9292/8865/g' test_client.py
-    ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 8865 > ${dir}server_log.txt 2>&1 &
-    check_result server 5
+    ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 8865 --op_num 4 --thread 10 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
+    check_result server 8
+    check_gpu_memory 0
     head test_data/part-0 | ${py_version} test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
     check_result client "cnn_CPU_RPC server test completed"
     kill_server_process
@@ -537,7 +538,7 @@ function lstm_rpc() {
     kill_server_process
 }
 
-function lac_rpc() {
+function lac_rpc_asyn() {
     dir=${log_dir}rpc_model/lac_rpc/
     check_dir ${dir}
     unsetproxy
@@ -545,8 +546,9 @@ function lac_rpc() {
     data_dir=${data}lac/
     link_data ${data_dir}
     sed -i 's/9292/8868/g' lac_client.py
-    ${py_version} -m paddle_serving_server.serve --model lac_model/ --port 8868 > ${dir}server_log.txt 2>&1 &
-    check_result server 5
+    ${py_version} -m paddle_serving_server.serve --model lac_model/ --port 8868 --gpu_ids 0 --op_num 2 > ${dir}server_log.txt 2>&1 &
+    check_result server 8
+    check_gpu_memory 0
     echo "我爱北京天安门" | ${py_version} lac_client.py lac_client/serving_client_conf.prototxt lac_dict/ > ${dir}client_log.txt 2>&1
     check_result client "lac_CPU_RPC server test completed"
     kill_server_process
@@ -768,10 +770,9 @@ function fit_a_line_http() {
     check_dir ${dir}
     unsetproxy
     cd ${build_path}/python/examples/fit_a_line
-    sed -i "s/9393/8871/g" test_server.py
-    ${py_version} test_server.py > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 > ${dir}server_log.txt 2>&1 &
     check_result server 10
-    curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:8871/uci/prediction > ${dir}client_log.txt 2>&1
+    ${py_version} test_httpclient.py uci_housing_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
     check_result client "fit_a_line_CPU_HTTP server test completed"
     kill_server_process
 }
@@ -781,61 +782,64 @@ function lac_http() {
     check_dir ${dir}
     unsetproxy
     cd ${build_path}/python/examples/lac
-    ${py_version} lac_web_service.py lac_model/ lac_workdir 8872 > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model lac_model/ --port 9292 > ${dir}server_log.txt 2>&1 &
     check_result server 10
-    curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:8872/lac/prediction > ${dir}client_log.txt 2>&1
+    echo "我爱北京天安门" | ${py_version} lac_http_client.py lac_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
     check_result client "lac_CPU_HTTP server test completed"
     kill_server_process
 }
 
-function cnn_http() {
-    dir=${log_dir}http_model/cnn_http/
+function imdb_http_proto() {
+    dir=${log_dir}http_model/imdb_http_proto/
     check_dir ${dir}
     unsetproxy
     cd ${build_path}/python/examples/imdb
-    ${py_version} text_classify_service.py imdb_cnn_model/ workdir/ 8873 imdb.vocab > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 > ${dir}server_log.txt 2>&1 &
     check_result server 10
-    curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8873/imdb/prediction > ${dir}client_log.txt 2>&1
-    check_result client "cnn_CPU_HTTP server test completed"
+    head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
+    check_result client "imdb_CPU_HTTP-proto server test completed"
     kill_server_process
 }
 
-function bow_http() {
-    dir=${log_dir}http_model/bow_http/
+function imdb_http_json() {
+    dir=${log_dir}http_model/imdb_http_json/
     check_dir ${dir}
     unsetproxy
     cd ${build_path}/python/examples/imdb
-    ${py_version} text_classify_service.py imdb_bow_model/ workdir/ 8874 imdb.vocab > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 > ${dir}server_log.txt 2>&1 &
     check_result server 10
-    curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8874/imdb/prediction > ${dir}client_log.txt 2>&1
-    check_result client "bow_CPU_HTTP server test completed"
+    sed -i "s/#client.set_http_proto(True)/client.set_http_proto(False)/g" test_http_client.py
+    head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
+    check_result client "imdb_CPU_HTTP-json server test completed"
     kill_server_process
 }
 
-function lstm_http() {
-    dir=${log_dir}http_model/lstm_http/
+function imdb_grpc() {
+    dir=${log_dir}http_model/imdb_grpc/
     check_dir ${dir}
     unsetproxy
     cd ${build_path}/python/examples/imdb
-    ${py_version} text_classify_service.py imdb_bow_model/ workdir/ 8875 imdb.vocab > ${dir}server_log.txt 2>&1 &
+    ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 --gpu_ids 1 > ${dir}server_log.txt 2>&1 &
     check_result server 10
-    curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8875/imdb/prediction > ${dir}client_log.txt 2>&1
-    check_result client "lstm_CPU_HTTP server test completed"
+    check_gpu_memory 1
+    sed -i "s/client.set_http_proto(False)/#client.set_http_proto(False)/g" test_http_client.py
+    sed -i "s/#client.set_use_grpc_client(True)/client.set_use_grpc_client(True)/g" test_http_client.py
+    head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1
+    check_result client "imdb_GPU_GRPC server test completed"
    kill_server_process
 }
 
 function ResNet50_http() {
-    echo "pass"
-#    dir=${log_dir}http_model/ResNet50_http/
-#    check_dir ${dir}
-#    unsetproxy
-#    cd ${build_path}/python/examples/imagenet
-#    ${py_version} resnet50_web_service.py ResNet50_vd_model gpu 8876 > ${dir}server_log.txt 2>&1 &
-#    check_result server 10
-#    check_gpu_memory 0
-#    curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:8876/image/prediction > ${dir}client_log.txt 2>&1
-#    check_result client "ResNet50_GPU_HTTP server test completed"
-#    kill_server_process
+    dir=${log_dir}http_model/ResNet50_http/
+    check_dir ${dir}
+    unsetproxy
+    cd ${build_path}/python/examples/imagenet
+    ${py_version} -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
+    check_result server 10
+    check_gpu_memory 0
+    ${py_version} resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
+    check_result client "ResNet50_GPU_HTTP server test completed"
+    kill_server_process
 }
 
 function bert_http() {
@@ -845,10 +849,11 @@ function bert_http() {
     cd ${build_path}/python/examples/bert
     cp data-c.txt.1 data-c.txt
     cp vocab.txt.1 vocab.txt
-    export CUDA_VISIBLE_DEVICES=0
-    ${py_version} bert_web_service.py bert_seq128_model/ 8878 > ${dir}server_log.txt 2>&1 &
-    check_result server 8
-    curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:8878/bert/prediction > ${dir}client_log.txt 2>&1
+    export CUDA_VISIBLE_DEVICES=0,1
+    ${py_version} -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 > ${dir}server_log.txt 2>&1 &
+    check_result server 10
+    check_gpu_memory 0
+    head data-c.txt | ${py_version} bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1
     check_result client "bert_GPU_HTTP server test completed"
     kill_server_process
 }
@@ -904,7 +909,7 @@ function ocr_c++_service() {
     cp -r ocr_det_client/ ./ocr_det_client_cp
     rm -rf ocr_det_client
     mv ocr_det_client_cp ocr_det_client
-    sed -i "s/feed_type: 1/feed_type: 3/g" ocr_det_client/serving_client_conf.prototxt
+    sed -i "s/feed_type: 1/feed_type: 20/g" ocr_det_client/serving_client_conf.prototxt
     sed -i "s/shape: 3/shape: 1/g" ocr_det_client/serving_client_conf.prototxt
     sed -i '7,8d' ocr_det_client/serving_client_conf.prototxt
     echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC server started${RES}"
@@ -920,6 +925,23 @@ function ocr_c++_service() {
     kill_server_process
 }
 
+function ocr_c++_service_asyn() {
+    dir=${log_dir}rpc_model/ocr_c++_serving/
+    cd ${build_path}/python/examples/ocr
+    check_dir ${dir}
+    echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC asyn_server started${RES}"
+    $py_version -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 --op_num 4 > ${dir}server_log.txt 2>&1 &
+    check_result server 8
+    check_gpu_memory 0
+    echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC client started${RES}"
+    echo "------------------first:"
+    $py_version ocr_cpp_client.py ocr_det_client ocr_rec_client
+    echo "------------------second:"
+    $py_version ocr_cpp_client.py ocr_det_client ocr_rec_client > ${dir}client_log.txt 2>&1
+    check_result client "OCR_C++_Service_GPU_RPC server test completed"
+    kill_server_process
+}
+
 function build_all_whl() {
     for whl in ${build_whl_list[@]}
     do