diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100755 new mode 100644 index 4cb661faf996bc32424f88103f238088efd08520..cad0bb5bc638e08bd05a573fe548c7a81323435c --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,7 @@ find_package(Threads REQUIRED) find_package(CUDA QUIET) include(simd) - +# SET(CMAKE_BUILD_TYPE "Debug") # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING diff --git a/README.md b/README.md index ab6b1c0148315f2d19838b67a84cc732f175c944..6c6d0924bf44137dc463fb68599713835d4cb0f2 100644 --- a/README.md +++ b/README.md @@ -175,9 +175,12 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p | Argument | Type | Default | Description | | ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- | -| `thread` | int | `4` | Concurrency of current service | +| `thread` | int | `2` | Number of brpc service threads | +| `op_num` | int[]| `0` | Thread number for each model in asynchronous mode | +| `op_max_batch` | int[]| `32` | Max batch size for each model in asynchronous mode | +| `gpu_ids` | str[]| `"-1"` | GPU card ids for each model | | `port` | int | `9292` | Exposed port of current service to users | -| `model` | str | `""` | Path of paddle model directory to be served | +| `model` | str[]| `""` | Path of paddle model directory to be served | | `mem_optim_off` | - | - | Disable memory / graphic memory optimization | | `ir_optim` | bool | False | Enable analysis and optimization of calculation graph | | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL | @@ -186,7 +189,24 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p | `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU | | `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 | | `use_calib` | bool | False | Only for deployment with TensorRT | - +| `gpu_multi_stream` | bool | False | Enable GPU multi-stream to get higher QPS | + +#### Description of asynchronous mode + Asynchronous mode is suitable when (1) the number of requests is very large, or (2) multiple models are chained and you want to specify the concurrency of each model. + Asynchronous mode helps to improve the throughput (QPS) of the service, but the latency of a single request increases slightly. + In asynchronous mode, each model starts the number of threads you specify, and each thread holds one model instance. In other words, each model is equivalent to a thread pool with N threads, and tasks are taken from the pool's task queue for execution. + In asynchronous mode, each RPC server thread is only responsible for putting the request into the task queue of the model thread pool; after a task is executed, the completed task is removed from the task queue. + In the above table, the number of RPC server threads is specified by --thread, and the default value is 2. + --op_num specifies the number of threads in the thread pool of each model. The default value is 0, which means asynchronous mode is not used. + --op_max_batch specifies the maximum batch size for each model. The default value is 32. It takes effect only when --op_num is not 0. +#### When you want a model to use multiple GPU cards. +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2 +#### When you want 2 models. 
+python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 +#### When you want 2 models, and want each of them to use multiple GPU cards. +python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 +#### When a service contains two models, each model uses multiple GPU cards, and asynchronous mode is needed with a different concurrency for each model. +python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8 ```python diff --git a/README_CN.md index d728071dbd80ae2400a6e95b5ccb06010fd7ef06..a1bb9f9e7c513a3d772cce2d56d0bcd76e3548f9 100644 --- a/README_CN.md +++ b/README_CN.md @@ -172,19 +172,40 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p ```
-| Argument | Type | Default | Description | -| ---------------------------------------------- | ---- | ------- | ------------------------------------------------------ | -| `thread` | int | `4` | Concurrency of current service | -| `port` | int | `9292` | Exposed port of current service to users | -| `name` | str | `""` | Service name, can be used to generate HTTP request url | -| `model` | str | `""` | Path of paddle model directory to be served | -| `mem_optim_off` | - | - | Disable memory optimization | -| `ir_optim` | bool | False | Enable analysis and optimization of calculation graph | -| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL | -| `use_trt` (Only for Cuda>=10.1 version) | - | - | Run inference with TensorRT | -| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference | -| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU | -| `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 | +| Argument | Type | Default | Description | +| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- | +| `thread` | int | `2` | Number of brpc service thread | +| `op_num` | int[]| `0` | Thread Number for each model in asynchronous mode | +| `op_max_batch` | int[]| `32` | Batch Number for each model in asynchronous mode | +| `gpu_ids` | str[]| `"-1"` | Gpu card id for each model | +| `port` | int | `9292` | Exposed port of current service to users | +| `model` | str[]| `""` | Path of paddle model directory to be served | +| `mem_optim_off` | - | - | Disable memory / graphic memory optimization | +| `ir_optim` | bool | False | Enable analysis and optimization of calculation graph | +| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL | +| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT | +| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference | +| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU | +| `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 | +| `use_calib` | bool | False | Only for deployment with TensorRT | +| `gpu_multi_stream` | bool | False | EnableGpuMultiStream to get larger QPS | + +#### 异步模型的说明 + 异步模式适用于1、请求数量非常大的情况,2、多模型串联,想要分别指定每个模型的并发数的情况。 + 异步模式有助于提高Service服务的吞吐(QPS),但对于单次请求而言,时延会有少量增加。 + 异步模式中,每个模型会启动您指定个数的N个线程,每个线程中包含一个模型实例,换句话说每个模型相当于包含N个线程的线程池,从线程池的任务队列中取任务来执行。 + 异步模式中,各个RPC Server的线程只负责将Request请求放入模型线程池的任务队列中,等任务被执行完毕后,再从任务队列中取出已完成的任务。 + 上表中通过 --thread 10 指定的是RPC Server的线程数量,默认值为2,--op_num 指定的是各个模型的线程池中线程数N,默认值为0,表示不使用异步模式。 + --op_max_batch 指定的各个模型的batch数量,默认值为32,该参数只有当--op_num不为0时才生效。 + +#### 当您的某个模型想使用多张GPU卡部署时. +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2 +#### 当您的一个服务包含两个模型部署时. +python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 +#### 当您的一个服务包含两个模型,且每个模型都需要指定多张GPU卡部署时. +python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 +#### 当您的一个服务包含两个模型,且每个模型都需要指定多张GPU卡,且需要异步模式每个模型指定不同的并发数时. +python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8
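For reference, below is a minimal, illustrative client-side sketch of calling a service launched with the serve commands above; whether the server runs one model or several, synchronously or asynchronously, the client call looks the same. It assumes the quickstart `uci_housing` example, whose client config (`serving_client_conf.prototxt`) names the feed variable `x` and the fetch variable `price`; substitute your own config path and variable names.

```python
# Minimal sketch of a Python client for a server started with
# `python3 -m paddle_serving_server.serve ... --port 9292`.
# The feed/fetch names ("x", "price") follow the uci_housing quickstart and
# are assumptions here; replace them with the names from your own client config.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])  # port used in the serve commands above

# one 13-dimensional sample, matching the uci_housing model input
x = np.array([0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
              -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], dtype="float32")
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
print(fetch_map)
```

The `--op_num`, `--op_max_batch`, and `--gpu_ids` flags only change how requests are scheduled and executed on the server side; the client-side API and request format stay the same.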
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake old mode 100644 new mode 100755 index 9fe5e89cbc89edd2238653b6cf5aeda41184a8a6..5f217566921adf3a6235ed11d7d126b56f58f506 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -39,11 +39,11 @@ INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") if(WITH_LITE) - set(BRPC_REPO "https://github.com/zhangjun/incubator-brpc.git") - set(BRPC_TAG "master") + set(BRPC_REPO "https://github.com/apache/incubator-brpc") + set(BRPC_TAG "1.0.0-rc01") else() - set(BRPC_REPO "https://github.com/wangjiawei04/brpc") - set(BRPC_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47") + set(BRPC_REPO "https://github.com/apache/incubator-brpc") + set(BRPC_TAG "1.0.0-rc01") endif() # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake old mode 100755 new mode 100644 diff --git a/core/configure/CMakeLists.txt b/core/configure/CMakeLists.txt old mode 100644 new mode 100755 index 32534fee141ee5b4b0b7b1eed580e1769deb5cff..846f5e2af66b821f71c3364e08ecce3edb70eaa7 --- a/core/configure/CMakeLists.txt +++ b/core/configure/CMakeLists.txt @@ -33,9 +33,9 @@ if (WITH_PYTHON) add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(general_model_config_py_proto general_model_config_py_proto_init) - py_grpc_proto_compile(multi_lang_general_model_service_py_proto SRCS proto/multi_lang_general_model_service.proto) - add_custom_target(multi_lang_general_model_service_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(multi_lang_general_model_service_py_proto multi_lang_general_model_service_py_proto_init) + py_grpc_proto_compile(general_model_service_py_proto SRCS proto/general_model_service.proto) + add_custom_target(general_model_service_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) + add_dependencies(general_model_service_py_proto general_model_service_py_proto_init) if (CLIENT) py_proto_compile(sdk_configure_py_proto SRCS proto/sdk_configure.proto) @@ -52,12 +52,13 @@ if (WITH_PYTHON) COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - - add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD + + add_custom_command(TARGET general_model_service_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto - COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto." + COMMENT "Copy generated general_model_service proto file into directory paddle_serving_client/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + endif() if (APP) @@ -83,12 +84,13 @@ if (WITH_PYTHON) COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto." 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - - add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD + + add_custom_command(TARGET general_model_service_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto - COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto." + COMMENT "Copy generated general_model_service proto file into directory paddle_serving_server/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + endif() endif() diff --git a/core/configure/proto/general_model_service.proto b/core/configure/proto/general_model_service.proto new file mode 100644 index 0000000000000000000000000000000000000000..89ac489f8ae3b90b74c94a3f9f3c82711086cd64 --- /dev/null +++ b/core/configure/proto/general_model_service.proto @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package baidu.paddle_serving.predictor.general_model; +option java_multiple_files = true; + +message Tensor { + repeated string data = 1; + repeated int32 int_data = 2; + repeated int64 int64_data = 3; + repeated float float_data = 4; + optional int32 elem_type = + 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string + repeated int32 shape = 6; // shape should include batch + repeated int32 lod = 7; // only for fetch tensor currently + optional string name = 8; // get from the Model prototxt + optional string alias_name = 9; // get from the Model prototxt +}; + +message Request { + repeated Tensor tensor = 1; + repeated string fetch_var_names = 2; + optional bool profile_server = 3 [ default = false ]; + required uint64 log_id = 4 [ default = 0 ]; +}; + +message Response { + repeated ModelOutput outputs = 1; + repeated int64 profile_time = 2; +}; + +message ModelOutput { + repeated Tensor tensor = 1; + optional string engine_name = 2; +} + +service GeneralModelService { + rpc inference(Request) returns (Response) {} + rpc debug(Request) returns (Response) {} +}; diff --git a/core/configure/proto/multi_lang_general_model_service.proto b/core/configure/proto/multi_lang_general_model_service.proto deleted file mode 100755 index 18fbcf760647e1694e738c0832fe45f4f7d9934f..0000000000000000000000000000000000000000 --- a/core/configure/proto/multi_lang_general_model_service.proto +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto2"; - -package baidu.paddle_serving.multi_lang; - -option java_multiple_files = true; -option java_package = "io.paddle.serving.grpc"; -option java_outer_classname = "ServingProto"; - -message Tensor { - optional bytes data = 1; - repeated int32 int_data = 2; - repeated int64 int64_data = 3; - repeated float float_data = 4; - optional int32 elem_type = 5; - repeated int32 shape = 6; - repeated int32 lod = 7; // only for fetch tensor currently -}; - -message FeedInst { repeated Tensor tensor_array = 1; }; - -message FetchInst { repeated Tensor tensor_array = 1; }; - -message InferenceRequest { - repeated FeedInst insts = 1; - repeated string feed_var_names = 2; - repeated string fetch_var_names = 3; - required bool is_python = 4 [ default = false ]; - required uint64 log_id = 5 [ default = 0 ]; -}; - -message InferenceResponse { - repeated ModelOutput outputs = 1; - optional string tag = 2; - required int32 err_code = 3; -}; - -message ModelOutput { - repeated FetchInst insts = 1; - optional string engine_name = 2; -} - -message SetTimeoutRequest { required int32 timeout_ms = 1; } - -message SimpleResponse { required int32 err_code = 1; } - -message GetClientConfigRequest {} - -message GetClientConfigResponse { required string client_config_str = 1; } - -service MultiLangGeneralModelService { - rpc Inference(InferenceRequest) returns (InferenceResponse) {} - rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {} - rpc GetClientConfig(GetClientConfigRequest) - returns (GetClientConfigResponse) {} -}; diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto old mode 100755 new mode 100644 index 24fb62806476effdcf453cb7b4047122731106ea..5cace06420e29e1590218f63777c85bbcf504b29 --- a/core/configure/proto/server_configure.proto +++ b/core/configure/proto/server_configure.proto @@ -21,11 +21,12 @@ message EngineDesc { required string reloadable_meta = 3; required string reloadable_type = 4; required string model_dir = 5; - required int32 runtime_thread_num = 6; - required int32 batch_infer_size = 7; - required int32 enable_batch_align = 8; - optional string version_file = 9; - optional string version_type = 10; + repeated int32 gpu_ids = 6; + required int32 runtime_thread_num = 7; + required int32 batch_infer_size = 8; + required int32 enable_batch_align = 9; + optional string version_file = 10; + optional string version_type = 11; /* * Sparse Parameter Service type. 
Valid types are: @@ -38,16 +39,17 @@ message EngineDesc { LOCAL = 1; REMOTE = 2; } - optional SparseParamServiceType sparse_param_service_type = 11; - optional string sparse_param_service_table_name = 12; - optional bool enable_memory_optimization = 13; - optional bool enable_ir_optimization = 14; - optional bool use_trt = 15; - optional bool use_lite = 16; - optional bool use_xpu = 17; - optional bool use_gpu = 18; - optional bool combined_model = 19; - optional bool encrypted_model = 20; + optional SparseParamServiceType sparse_param_service_type = 12; + optional string sparse_param_service_table_name = 13; + optional bool enable_memory_optimization = 14; + optional bool enable_ir_optimization = 15; + optional bool use_trt = 16; + optional bool use_lite = 17; + optional bool use_xpu = 18; + optional bool use_gpu = 19; + optional bool combined_model = 20; + optional bool encrypted_model = 21; + optional bool gpu_multi_stream = 22; }; // model_toolkit conf diff --git a/core/cube/CMakeLists.txt b/core/cube/CMakeLists.txt index f9dc4d2c2508720f450b4aee3aba5dfdd7ccd43b..a61d2df92a92bc26fabd4a3cf87c6db1dc1cc3f0 100644 --- a/core/cube/CMakeLists.txt +++ b/core/cube/CMakeLists.txt @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License - -#execute_process(COMMAND go env -w GO111MODULE=off) add_subdirectory(cube-server) add_subdirectory(cube-api) add_subdirectory(cube-builder) -#add_subdirectory(cube-transfer) -#add_subdirectory(cube-agent) +add_subdirectory(cube-transfer) +add_subdirectory(cube-agent) diff --git a/core/cube/cube-agent/CMakeLists.txt b/core/cube/cube-agent/CMakeLists.txt index 30158aa506e53ec8a37d10aef4f29bfcd5a60d06..701f0c8a55e3326e1327f3b1f68458f99c60143b 100644 --- a/core/cube/cube-agent/CMakeLists.txt +++ b/core/cube/cube-agent/CMakeLists.txt @@ -15,7 +15,6 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") project(cube-agent Go) - include(cmake/golang.cmake) ExternalGoProject_Add(agent-docopt-go github.com/docopt/docopt-go) diff --git a/core/cube/cube-agent/src/agent/http.go b/core/cube/cube-agent/src/agent/http.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-agent/src/agent/http_get.go b/core/cube/cube-agent/src/agent/http_get.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-agent/src/agent/http_post.go b/core/cube/cube-agent/src/agent/http_post.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-builder/CMakeLists.txt b/core/cube/cube-builder/CMakeLists.txt old mode 100755 new mode 100644 index 65f77f4eb0ff16299d5ee54f192c2171ac5b956c..00278939b78235ba5f5b3042d347ad905ac3c8fe --- a/core/cube/cube-builder/CMakeLists.txt +++ b/core/cube/cube-builder/CMakeLists.txt @@ -22,7 +22,7 @@ include_directories(SYSTEM ${CMAKE_CURRENT_BINARY_DIR}/../) add_executable(cube-builder src/main.cpp include/cube-builder/util.h src/util.cpp src/builder_job.cpp include/cube-builder/builder_job.h include/cube-builder/define.h src/seqfile_reader.cpp include/cube-builder/seqfile_reader.h include/cube-builder/raw_reader.h include/cube-builder/vtext.h src/crovl_builder_increment.cpp include/cube-builder/crovl_builder_increment.h src/curl_simple.cpp include/cube-builder/curl_simple.h) -add_dependencies(cube-builder jsoncpp boost) +add_dependencies(cube-builder jsoncpp boost brpc) set(DYNAMIC_LIB gflags @@ -39,4 +39,8 @@ target_link_libraries(cube-builder ${DYNAMIC_LIB}) # install 
install(TARGETS cube-builder RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin) -install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/tool DESTINATION ${PADDLE_SERVING_INSTALL_DIR}) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/tool/kvtool.py DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool) + +install(FILES ${CMAKE_CURRENT_LIST_DIR}/tool/kv_to_seqfile.py DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool) + +install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/tool/source DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool) diff --git a/core/cube/cube-server/include/cube/slim_hash_map.h b/core/cube/cube-server/include/cube/slim_hash_map.h index 761ce9214f628a824f257611c07b07dab2503a48..26e9cd8c5702810a3fcaa83eeaeac17cdae97ba1 100644 --- a/core/cube/cube-server/include/cube/slim_hash_map.h +++ b/core/cube/cube-server/include/cube/slim_hash_map.h @@ -212,7 +212,7 @@ class slim_hash_map { int copy_data_from(const slim_hash_map& rhs) { destroy(); - + LOG(INFO) << "start copy data, rhs info, mHashSize: " << rhs.m_nHashSize; if (rhs.m_nHashSize > 0) { m_hashTable = new (std::nothrow) uint32_t[rhs.m_nHashSize]; if (!m_hashTable) { @@ -231,7 +231,7 @@ class slim_hash_map { << sizeof(hash_node_t) * BLOCK_SIZE; return -1; } - + LOG(INFO) << "copy data, m_nBlockNum: " << m_nBlockNum << " , copy size:" << sizeof(hash_node_t) * BLOCK_SIZE; memcpy(m_blockAddr[m_nBlockNum], rhs.m_blockAddr[m_nBlockNum], sizeof(hash_node_t) * BLOCK_SIZE); @@ -265,11 +265,13 @@ class slim_hash_map { } size_type index = key % m_nHashSize; hash_node_t* node = get_node(m_hashTable[index]); - + int node_cnt = 0; while (node != NULL && node->data.first != key) { + LOG(INFO) << "node link get:" << node->data.first; + node_cnt++; node = get_node(node->next); } - + LOG(INFO) << "key: " << key << " , found count: " << node_cnt; if (node == NULL) { return end(); } @@ -390,7 +392,6 @@ class slim_hash_map { if (node != NULL) { return node->data.second; } - return add_node(index, key)->data.second; } void clear() { @@ -399,16 +400,16 @@ class slim_hash_map { m_nFreeEntries = 0; m_nSize = 0; } - bool load(const char* file) { + bool load(const char* file, uint32_t block_id) { // clear(); + // bias = 0 means base mode, bias = K means patch mode, and base dict has size K int size = sizeof(key_t) + sizeof(value_t); FILE* fp = fopen(file, "rb"); char* buf = reinterpret_cast(malloc(size * 100000)); - + LOG(INFO) << "current block id: " << block_id; if (fp == NULL || buf == NULL) { return false; } - size_t read_count; bool err = false; key_t key; @@ -423,6 +424,8 @@ class slim_hash_map { for (int i = 0; i < static_cast(read_count); ++i) { key = *(reinterpret_cast(buf + i * size)); value = *(reinterpret_cast(buf + i * size + sizeof(key_t))); + value = ((uint64_t)block_id << 32) | value; + LOG(INFO) << "slim map key: " << key << " , value: " << value; (*this)[key] = value; } } @@ -557,7 +560,6 @@ class slim_hash_map { } hash_node_t* add_node(uint32_t index, const key_type& key) { ++m_nSize; - if (m_nFreeEntries) { uint32_t addr = m_nFreeEntries; hash_node_t* node = get_node(addr); @@ -569,7 +571,7 @@ class slim_hash_map { } uint32_t block = ((m_nNextEntry & 0xFF800000) >> 23); - + //LOG(INFO) << "key: " << key << " here. 
index: " << index << " , m_nNextEntry: "<< m_nNextEntry << " , block:" << block<< ", m_nBlockNum:" << m_nBlockNum; if (block >= m_nBlockNum) { try { m_blockAddr[m_nBlockNum++] = new hash_node_t[BLOCK_SIZE]; @@ -581,7 +583,6 @@ class slim_hash_map { return NULL; } } - uint32_t addr = m_nNextEntry; ++m_nNextEntry; hash_node_t* node = get_node(addr); diff --git a/core/cube/cube-server/src/dict.cpp b/core/cube/cube-server/src/dict.cpp index 05f401115ab5e95f8b014bf30bda71d8a10a74cb..dd21d518e61bd199108032c2e382d76d3b8b55a7 100644 --- a/core/cube/cube-server/src/dict.cpp +++ b/core/cube/cube-server/src/dict.cpp @@ -51,13 +51,12 @@ int Dict::load(const std::string& dict_path, bool in_mem, const std::string& v_path) { TIME_FLAG(load_start); - int ret = load_index(dict_path, v_path); if (ret != E_OK) { LOG(WARNING) << "load index failed"; return ret; } - + LOG(INFO) << "load index in mem mode: " << in_mem ; if (in_mem) { ret = load_data(dict_path, v_path); if (ret != E_OK) { @@ -81,8 +80,11 @@ int Dict::load_index(const std::string& dict_path, const std::string& v_path) { std::string index_n_path(dict_path); index_n_path.append(v_path); index_n_path.append("/index.n"); + + uint32_t cur_block_id = 0; + if (_base_dict) cur_block_id = _base_dict->_block_set.size(); LOG(INFO) << "index file path: " << index_n_path; - + //ERR HERE std::unique_ptr pf(fopen(index_n_path.c_str(), "rb"), &fclose); if (pf.get() == NULL) { @@ -150,12 +152,16 @@ int Dict::load_index(const std::string& dict_path, const std::string& v_path) { return E_DATA_ERROR; } } else { + if (_slim_table.copy_data_from(_base_dict->_slim_table) != 0) { + LOG(ERROR) << "copy data from old index failed in patch mode"; + return E_DATA_ERROR; + } file_idx = 0; LOG(INFO) - << "index check file len failed in patch mode, set file_idx to 0"; + << "index check fail, direct copy"; } } - + LOG(INFO) << "resize slim table, new count: " << count/2; _slim_table.resize(count / 2); char file[1024]; @@ -167,6 +173,7 @@ int Dict::load_index(const std::string& dict_path, const std::string& v_path) { dict_path.c_str(), v_path.c_str(), file_idx); + LOG(INFO) << "load file str: " << file; if (stat(file, &fstat) < 0) { if (errno == ENOENT) { LOG(WARNING) << "index." << file_idx << " not exist"; @@ -181,8 +188,8 @@ int Dict::load_index(const std::string& dict_path, const std::string& v_path) { << (uint64_t)fstat.st_size; return E_DATA_ERROR; } - LOG(INFO) << "loading from index." << file_idx; - if (!_slim_table.load(file) || _slim_table.size() > count) { + LOG(INFO) << "loading from index." << file_idx << " . 
table size: " << _slim_table.size(); + if (!_slim_table.load(file, cur_block_id)) { return E_DATA_ERROR; } @@ -193,8 +200,15 @@ int Dict::load_index(const std::string& dict_path, const std::string& v_path) { } int Dict::load_data(const std::string& dict_path, const std::string& v_path) { + std::vector block_size; + uint64_t total_data_size = 0; if (_base_dict) { _block_set = _base_dict->_block_set; + LOG(INFO)<< "load data base dict block set size: " << _block_set[0].size; + for (size_t i = 0; i < _block_set.size(); ++i) { + block_size.push_back(_block_set[i].size); + total_data_size += _block_set[i].size; + } } std::string data_n_path(dict_path); @@ -212,8 +226,6 @@ int Dict::load_data(const std::string& dict_path, const std::string& v_path) { return E_DATA_ERROR; } - std::vector block_size; - uint64_t total_data_size = 0; for (uint32_t i = 0; i < count; ++i) { uint32_t size = 0; if (fread(reinterpret_cast(&size), sizeof(uint32_t), 1, pf) != 1) { @@ -222,6 +234,7 @@ int Dict::load_data(const std::string& dict_path, const std::string& v_path) { return E_DATA_ERROR; } block_size.push_back(size); + LOG(INFO) << "new block size: " << size; total_data_size += size; } g_data_size << (total_data_size / 1024 / 1024); @@ -229,36 +242,35 @@ int Dict::load_data(const std::string& dict_path, const std::string& v_path) { pf = NULL; uint32_t old_size = _block_set.size(); + LOG(INFO) << "load data old size: " << old_size; for (size_t i = 0; i < old_size; ++i) { if (_block_set[i].size != block_size[i]) { old_size = 0; break; } } - _block_set.resize(count); + LOG(INFO) << "load data block set count: " << count << " , old size: " << old_size; + _block_set.resize(count + old_size); for (size_t i = old_size; i < _block_set.size(); ++i) { char data_path[1024]; LOG(INFO) << "load from data." 
<< i; - snprintf( - data_path, 1024, "%s%s/data.%lu", dict_path.c_str(), v_path.c_str(), i); - + //snprintf( + // data_path, 1024, "%s%s/data.%lu", dict_path.c_str(), v_path.c_str(), i); + snprintf(data_path, 1024, "%s%s/data.%lu", dict_path.c_str(), v_path.c_str(), i - old_size); FILE* data_file = fopen(data_path, "rb"); if (data_file == NULL) { - LOG(WARNING) << "open data file [" << data_path << " failed"; + LOG(WARNING) << "open data file [" << data_path << " ]failed"; _block_set[i].s_data.reset(); _block_set[i].size = 0; continue; } - - _block_set[i].s_data.reset( - reinterpret_cast(malloc(block_size[i] * sizeof(char)))); + _block_set[i].s_data.reset(reinterpret_cast(malloc(block_size[i] * sizeof(char)))); if (_block_set[i].s_data.get() == NULL) { LOG(ERROR) << "malloc data failed"; fclose(data_file); return E_OOM; } _block_set[i].size = block_size[i]; - if (fread(reinterpret_cast(_block_set[i].s_data.get()), sizeof(char), _block_set[i].size, @@ -267,7 +279,10 @@ int Dict::load_data(const std::string& dict_path, const std::string& v_path) { fclose(data_file); return E_DATA_ERROR; } - + LOG(INFO) << "load new data to BlockSet succ"; + for (size_t ii = 0; ii < 20; ++ii) { + LOG(INFO) << "data ptr: " << (int)(_block_set[i].s_data.get()[ii]); + } fclose(data_file); } @@ -386,12 +401,11 @@ bool Dict::seek(uint64_t key, char* buff, uint64_t* buff_size) { uint64_t flag = it->second; uint32_t id = (uint32_t)(flag >> 32); uint64_t addr = (uint32_t)(flag); - + LOG(INFO) << "search key: " << id << " , addr: " << addr; if (_block_set.size() > id) { uint32_t block_size = _block_set[id].size; char* block_data = NULL; block_data = _block_set[id].s_data.get(); - if (block_data && addr + sizeof(uint32_t) <= block_size) { uint32_t len = *(reinterpret_cast(block_data + addr)); if (addr + len <= block_size && len >= sizeof(uint32_t)) { @@ -405,6 +419,7 @@ bool Dict::seek(uint64_t key, char* buff, uint64_t* buff_size) { << default_buffer_size; return false; } + LOG(INFO) << "seek key: " << key << " , addr: " << addr; memcpy(buff, (block_data + addr + sizeof(uint32_t)), len - sizeof(uint32_t)); diff --git a/core/cube/cube-transfer/CMakeLists.txt b/core/cube/cube-transfer/CMakeLists.txt index 78e47c5b840631a3092f3a799e2424d370444a2e..2e9d3dede03c5b27bcd0e24eaa6584df343c09e2 100644 --- a/core/cube/cube-transfer/CMakeLists.txt +++ b/core/cube/cube-transfer/CMakeLists.txt @@ -18,11 +18,9 @@ project(cube-transfer Go) include(cmake/golang.cmake) -ExternalGoProject_Add(rfw github.com/mipearson/rfw) -ExternalGoProject_Add(docopt-go github.com/docopt/docopt-go) -add_custom_target(logex - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get github.com/Badangel/logex - DEPENDS rfw) +ExternalGoProject_Add(transfer-rfw github.com/mipearson/rfw) +ExternalGoProject_Add(transfer-docopt-go github.com/docopt/docopt-go) +ExternalGoProject_Add(transfer-logex github.com/Badangel/logex) add_subdirectory(src) install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/conf DESTINATION ${PADDLE_SERVING_INSTALL_DIR}) diff --git a/core/cube/cube-transfer/conf/transfer.conf b/core/cube/cube-transfer/conf/transfer.conf old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/CMakeLists.txt b/core/cube/cube-transfer/src/CMakeLists.txt index 62d3f7ef7759a0d2a09eb4fe32a064694ece5408..b71278537a2ee03468019e7bd7e5ec4d786becf2 100644 --- a/core/cube/cube-transfer/src/CMakeLists.txt +++ b/core/cube/cube-transfer/src/CMakeLists.txt @@ -14,6 +14,6 @@ set(SOURCE_FILE cube-transfer.go) add_go_executable(cube-transfer ${SOURCE_FILE}) 
-add_dependencies(cube-transfer docopt-go) -add_dependencies(cube-transfer rfw) -add_dependencies(cube-transfer logex) +add_dependencies(cube-transfer transfer-docopt-go) +add_dependencies(cube-transfer transfer-rfw) +add_dependencies(cube-transfer transfer-logex) diff --git a/core/cube/cube-transfer/src/cube-transfer.go b/core/cube/cube-transfer/src/cube-transfer.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/builder.go b/core/cube/cube-transfer/src/transfer/builder.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/config.go b/core/cube/cube-transfer/src/transfer/config.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/deployer.go b/core/cube/cube-transfer/src/transfer/deployer.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/dict/cube_agent_server.go b/core/cube/cube-transfer/src/transfer/dict/cube_agent_server.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/dict/define.go b/core/cube/cube-transfer/src/transfer/dict/define.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_info.go b/core/cube/cube-transfer/src/transfer/dict/dict_info.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_instance_status.go b/core/cube/cube-transfer/src/transfer/dict/dict_instance_status.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_shard_info.go b/core/cube/cube-transfer/src/transfer/dict/dict_shard_info.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/dict/dict_version_info.go b/core/cube/cube-transfer/src/transfer/dict/dict_version_info.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/global.go b/core/cube/cube-transfer/src/transfer/global.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/http.go b/core/cube/cube-transfer/src/transfer/http.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/http_get.go b/core/cube/cube-transfer/src/transfer/http_get.go old mode 100755 new mode 100644 diff --git a/core/cube/cube-transfer/src/transfer/transfer.go b/core/cube/cube-transfer/src/transfer/transfer.go old mode 100755 new mode 100644 index 84ab7427333b3a639efd2e48df3dd248209924be..d29c29a496a62930a86ae1dcb44c02a4d32f1552 --- a/core/cube/cube-transfer/src/transfer/transfer.go +++ b/core/cube/cube-transfer/src/transfer/transfer.go @@ -17,68 +17,56 @@ package transfer import ( "fmt" "github.com/Badangel/logex" - "os" - "time" "transfer/dict" ) func Start() { - go BackupTransfer() - logex.Notice(">>> starting server...") - addr := ":" + Port - err := startHttp(addr) - if err != nil { - logex.Fatalf("start http(addr=%v) failed: %v", addr, err) - os.Exit(255) - } - - logex.Notice(">>> start server succ") + BackupTransfer() } func BackupTransfer() { - for { - //trigger - version, err := TriggerStart(Dict.DonefileAddress) - if err != nil { - logex.Fatalf("[trigger err]trigger err:%v ", err) - fmt.Printf("[error]trigger err:%v \n", err) - break - } - logex.Noticef("[trigger] get version:%v \n", version) - if version.Id == 0 { - logex.Noticef("[sleep]no new version, sleep 5 min") - fmt.Printf("[sleep]no new version, wait 5 min\n") - time.Sleep(5 * time.Minute) - continue - } + //trigger + version, err := TriggerStart(Dict.DonefileAddress) + if err != nil { + 
logex.Fatalf("[trigger err]trigger err:%v ", err) + fmt.Printf("[error]trigger err:%v \n", err) + fmt.Print("transfer over!") + logex.Noticef("[transfer]status machine exit!") + return + } + logex.Noticef("[trigger] get version:%v \n", version) Dict.WaitVersionInfo = version - logex.Noticef("[trigger finish] WaitVersionInfo version:%v \n", Dict.WaitVersionInfo) - WriteWaitVersionInfoToFile() + logex.Noticef("[trigger finish] WaitVersionInfo version:%v \n", Dict.WaitVersionInfo) + WriteWaitVersionInfoToFile() - //builder - Dict.WaitVersionInfo.Status = dict.Dict_Status_Building - Dict.WaitVersionInfo.MetaInfos = make(map[int]string) - WriteWaitVersionInfoToFile() - if err = BuilderStart(Dict.WaitVersionInfo); err != nil { - logex.Fatalf("builder err:%v \n", err) - } + //builder + Dict.WaitVersionInfo.Status = dict.Dict_Status_Building + Dict.WaitVersionInfo.MetaInfos = make(map[int]string) + WriteWaitVersionInfoToFile() + if err = BuilderStart(Dict.WaitVersionInfo); err != nil { + logex.Fatalf("builder err:%v \n", err) + } - if Dict.WaitVersionInfo.Mode == dict.BASE { - var newCurrentVersion []dict.DictVersionInfo - Dict.CurrentVersionInfo = newCurrentVersion - WriteCurrentVersionInfoToFile() - } - logex.Noticef("[builder finish] WaitVersionInfo version:%v \n", Dict.WaitVersionInfo) + if Dict.WaitVersionInfo.Mode == dict.BASE { + var newCurrentVersion []dict.DictVersionInfo + Dict.CurrentVersionInfo = newCurrentVersion + WriteCurrentVersionInfoToFile() + } + if Dict.WaitVersionInfo.Mode == dict.DELTA { + var newCurrentVersion []dict.DictVersionInfo + Dict.CurrentVersionInfo = newCurrentVersion + WriteCurrentVersionInfoToFile() + } + logex.Noticef("[builder finish] WaitVersionInfo version:%v \n", Dict.WaitVersionInfo) - //deployer - Dict.WaitVersionInfo.Status = dict.Dict_Status_Deploying - WriteWaitVersionInfoToFile() - if err = DeployStart(Dict.WaitVersionInfo); err != nil { - logex.Fatalf("deploy err:%v \n", err) - } - logex.Noticef("[deploy finish]current version: %v\n",Dict.CurrentVersionInfo) + //deployer + Dict.WaitVersionInfo.Status = dict.Dict_Status_Deploying + WriteWaitVersionInfoToFile() + if err = DeployStart(Dict.WaitVersionInfo); err != nil { + logex.Fatalf("deploy err:%v \n", err) } + logex.Noticef("[deploy finish]current version: %v\n",Dict.CurrentVersionInfo) fmt.Print("transfer over!") logex.Noticef("[transfer]status machine exit!") } diff --git a/core/cube/cube-transfer/src/transfer/trigger.go b/core/cube/cube-transfer/src/transfer/trigger.go old mode 100755 new mode 100644 index b3696dc58b7ca33de307cbe7ea2d4509d269753c..768f7218c036d7d948c046a6763d11c68ce9a306 --- a/core/cube/cube-transfer/src/transfer/trigger.go +++ b/core/cube/cube-transfer/src/transfer/trigger.go @@ -38,18 +38,19 @@ func GetDoneFileInfo(addr string) (version dict.DictVersionInfo, err error) { Wget(addr, donefileAddr) addr = donefileAddr } - - baseDonefile := addr + "/base.txt" - fmt.Printf("[trigrer]donefile path:%v \n", baseDonefile) - logex.Noticef("[trigrer]base donefile path:%v", baseDonefile) - contents, err := ioutil.ReadFile(baseDonefile) VersionLen := len(Dict.CurrentVersionInfo) version.DictName = Dict.DictName - if err != nil { - fmt.Printf("[trigrer]read files err:%v \n", err) - logex.Fatalf("[trigrer]read files err:%v ", err) + fmt.Printf("get into mode check here\n") + if Dict.DictMode == dict.BASE_ONLY { + baseDonefile := addr + "/base.txt" + fmt.Printf("[trigrer]donefile path:%v \n", baseDonefile) + logex.Noticef("[trigrer]base donefile path:%v", baseDonefile) + contents, err_0 := 
ioutil.ReadFile(baseDonefile) + if err_0 != nil { + fmt.Printf("[trigrer]read files err:%v \n", err_0) + logex.Fatalf("[trigrer]read files err:%v ", err_0) return - } else { + } else { contentss := string(contents) lines := strings.Split(contentss, "\n") index := len(lines) - 1 @@ -80,19 +81,21 @@ func GetDoneFileInfo(addr string) (version dict.DictVersionInfo, err error) { version.Mode = dict.BASE return } - } - if Dict.DictMode == dict.BASR_DELTA && VersionLen > 0 { + } + } + if Dict.DictMode == dict.BASR_DELTA { patchDonefile := addr + "/patch.txt" fmt.Printf("[trigrer]patchDonefile path:%v \n", patchDonefile) logex.Noticef("[trigrer]patch donefile path:%v", patchDonefile) - contents, err = ioutil.ReadFile(patchDonefile) - if err != nil { - fmt.Printf("read files err:%v \n", err) + contents, err_0 := ioutil.ReadFile(patchDonefile) + if err_0 != nil { + fmt.Printf("[trigrer]read files err:%v \n", err_0) + logex.Fatalf("[trigrer]read files err:%v ", err_0) return } else { contentss := string(contents) lines := strings.Split(contentss, "\n") - + fmt.Printf("[trigger]get patch lines here\n") for index := 0; index < len(lines)-1; index++ { if len(lines[index]) < 3 { logex.Noticef("[trigrer]get patch donfile info error") @@ -106,14 +109,15 @@ func GetDoneFileInfo(addr string) (version dict.DictVersionInfo, err error) { logex.Noticef("[trigrer]donfile info:%v", donefileInfo) newId, _ := strconv.Atoi(donefileInfo.Id) newKey, _ := strconv.Atoi(donefileInfo.Key) - if newId > Dict.CurrentVersionInfo[VersionLen-1].Id && newKey == Dict.CurrentVersionInfo[VersionLen-1].Key { + fmt.Printf("[trigger]read patch id: %d, key: %d\n", newId, newKey) + if VersionLen == 0 || newId > Dict.CurrentVersionInfo[VersionLen-1].Id { version.Id = newId version.Key, _ = strconv.Atoi(donefileInfo.Key) version.Input = donefileInfo.Input deployVersion := int(time.Now().Unix()) version.CreateTime = deployVersion version.Version = deployVersion - version.Depend = Dict.CurrentVersionInfo[VersionLen-1].Depend + version.Depend = deployVersion version.Mode = dict.DELTA return } diff --git a/core/cube/cube-transfer/src/transfer/util.go b/core/cube/cube-transfer/src/transfer/util.go old mode 100755 new mode 100644 index f3c1834319ab2752a2338cda737855854cf73356..8f9e5c545f35e504f248466e84f8a7d368b80db8 --- a/core/cube/cube-transfer/src/transfer/util.go +++ b/core/cube/cube-transfer/src/transfer/util.go @@ -96,7 +96,8 @@ func ExeCommad(files string, params []string) (err error) { func Wget(ftpPath string, downPath string) { var params []string - params = append(params, "-P") + params = append(params, "--limit-rate=100m") + params = append(params, "-P") params = append(params, downPath) params = append(params, "-r") params = append(params, "-N") @@ -110,4 +111,4 @@ func Wget(ftpPath string, downPath string) { if err != nil { fmt.Printf("wget exe: %v\n", err) } -} \ No newline at end of file +} diff --git a/core/general-client/include/general_model.h b/core/general-client/include/general_model.h old mode 100755 new mode 100644 index b1c4f71f5602bed4eded49822d7afe7caac6e242..88ec7a59f1181eec32e2da800a9a1b71e3cdc084 --- a/core/general-client/include/general_model.h +++ b/core/general-client/include/general_model.h @@ -53,6 +53,9 @@ class ModelRes { res._int32_value_map.end()); _shape_map.insert(res._shape_map.begin(), res._shape_map.end()); _lod_map.insert(res._lod_map.begin(), res._lod_map.end()); + _tensor_alias_names.insert(_tensor_alias_names.end(), + res._tensor_alias_names.begin(), + res._tensor_alias_names.end()); } 
ModelRes(ModelRes&& res) { _engine_name = std::move(res._engine_name); @@ -69,6 +72,10 @@ class ModelRes { std::make_move_iterator(std::end(res._shape_map))); _lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)), std::make_move_iterator(std::end(res._lod_map))); + _tensor_alias_names.insert( + _tensor_alias_names.end(), + std::make_move_iterator(std::begin(res._tensor_alias_names)), + std::make_move_iterator(std::end(res._tensor_alias_names))); } ~ModelRes() {} const std::vector& get_int64_by_name(const std::string& name) { @@ -105,6 +112,10 @@ class ModelRes { _engine_name = engine_name; } const std::string& engine_name() { return _engine_name; } + + const std::vector& tensor_alias_names() { + return _tensor_alias_names; + } ModelRes& operator=(ModelRes&& res) { if (this != &res) { _engine_name = std::move(res._engine_name); @@ -121,6 +132,10 @@ class ModelRes { std::make_move_iterator(std::end(res._shape_map))); _lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)), std::make_move_iterator(std::end(res._lod_map))); + _tensor_alias_names.insert( + _tensor_alias_names.end(), + std::make_move_iterator(std::begin(res._tensor_alias_names)), + std::make_move_iterator(std::end(res._tensor_alias_names))); } return *this; } @@ -132,6 +147,7 @@ class ModelRes { std::map> _int32_value_map; std::map> _shape_map; std::map> _lod_map; + std::vector _tensor_alias_names; }; class PredictorRes { @@ -193,11 +209,16 @@ class PredictorRes { } const std::string& variant_tag() { return _variant_tag; } const std::vector& get_engine_names() { return _engine_names; } + const std::vector& get_tensor_alias_names(const int model_idx) { + _tensor_alias_names = _models[model_idx].tensor_alias_names(); + return _tensor_alias_names; + } private: std::vector _models; std::string _variant_tag; std::vector _engine_names; + std::vector _tensor_alias_names; }; class PredictorClient { @@ -207,7 +228,7 @@ class PredictorClient { void init_gflags(std::vector argv); - int init(const std::vector &client_conf); + int init(const std::vector& client_conf); void set_predictor_conf(const std::string& conf_path, const std::string& conf_file); @@ -218,23 +239,22 @@ class PredictorClient { int destroy_predictor(); - int numpy_predict( - const std::vector>>& float_feed_batch, - const std::vector& float_feed_name, - const std::vector>& float_shape, - const std::vector>& float_lod_slot_batch, - const std::vector>>& int_feed_batch, - const std::vector& int_feed_name, - const std::vector>& int_shape, - const std::vector>& int_lod_slot_batch, - const std::vector>& string_feed_batch, - const std::vector& string_feed_name, - const std::vector>& string_shape, - const std::vector>& string_lod_slot_batch, - const std::vector& fetch_name, - PredictorRes& predict_res_batch, // NOLINT - const int& pid, - const uint64_t log_id); + int numpy_predict(const std::vector>& float_feed, + const std::vector& float_feed_name, + const std::vector>& float_shape, + const std::vector>& float_lod_slot_batch, + const std::vector>& int_feed, + const std::vector& int_feed_name, + const std::vector>& int_shape, + const std::vector>& int_lod_slot_batch, + const std::vector& string_feed, + const std::vector& string_feed_name, + const std::vector>& string_shape, + const std::vector>& string_lod_slot_batch, + const std::vector& fetch_name, + PredictorRes& predict_res_batch, // NOLINT + const int& pid, + const uint64_t log_id); private: PredictorApi _api; @@ -243,6 +263,7 @@ class PredictorClient { std::string _predictor_path; std::string _conf_file; 
std::map _feed_name_to_idx; + std::vector _feed_name; std::map _fetch_name_to_idx; std::map _fetch_name_to_var_name; std::map _fetch_name_to_type; diff --git a/core/general-client/src/general_model.cpp b/core/general-client/src/general_model.cpp index 0ade573de6ac2da59156ba82f5ff3e04f1b7f6b2..d04ab89ae31d048e5a38ada7abec5f27d46ab62f 100644 --- a/core/general-client/src/general_model.cpp +++ b/core/general-client/src/general_model.cpp @@ -25,8 +25,6 @@ using baidu::paddle_serving::Timer; using baidu::paddle_serving::predictor::general_model::Request; using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Tensor; -using baidu::paddle_serving::predictor::general_model::FeedInst; -using baidu::paddle_serving::predictor::general_model::FetchInst; enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; std::once_flag gflags_init_flag; namespace py = pybind11; @@ -68,9 +66,13 @@ int PredictorClient::init(const std::vector &conf_file) { _fetch_name_to_idx.clear(); _shape.clear(); int feed_var_num = model_config.feed_var_size(); + _feed_name.clear(); VLOG(2) << "feed var num: " << feed_var_num; for (int i = 0; i < feed_var_num; ++i) { _feed_name_to_idx[model_config.feed_var(i).alias_name()] = i; + VLOG(2) << "feed [" << i << "]" + << " name: " << model_config.feed_var(i).name(); + _feed_name.push_back(model_config.feed_var(i).name()); VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name() << " index: " << i; std::vector tmp_feed_shape; @@ -146,15 +148,15 @@ int PredictorClient::create_predictor() { } int PredictorClient::numpy_predict( - const std::vector>> &float_feed_batch, + const std::vector> &float_feed, const std::vector &float_feed_name, const std::vector> &float_shape, const std::vector> &float_lod_slot_batch, - const std::vector>> &int_feed_batch, + const std::vector> &int_feed, const std::vector &int_feed_name, const std::vector> &int_shape, const std::vector> &int_lod_slot_batch, - const std::vector> &string_feed_batch, + const std::vector &string_feed, const std::vector &string_feed_name, const std::vector> &string_shape, const std::vector> &string_lod_slot_batch, @@ -162,16 +164,10 @@ int PredictorClient::numpy_predict( PredictorRes &predict_res_batch, const int &pid, const uint64_t log_id) { - int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size()); - batch_size = batch_size > string_feed_batch.size() ? batch_size - : string_feed_batch.size(); - VLOG(2) << "batch size: " << batch_size; predict_res_batch.clear(); Timer timeline; int64_t preprocess_start = timeline.TimeStampUS(); - int fetch_name_num = fetch_name.size(); - _api.thrd_initialize(); std::string variant_tag; _predictor = _api.fetch_predictor("general_model", &variant_tag); @@ -188,134 +184,122 @@ int PredictorClient::numpy_predict( } int vec_idx = 0; - for (int bi = 0; bi < batch_size; bi++) { - VLOG(2) << "prepare batch " << bi; - std::vector tensor_vec; - FeedInst *inst = req.add_insts(); - std::vector> float_feed = float_feed_batch[bi]; - std::vector> int_feed = int_feed_batch[bi]; - std::vector string_feed = string_feed_batch[bi]; - for (auto &name : float_feed_name) { - tensor_vec.push_back(inst->add_tensor_array()); - } - - for (auto &name : int_feed_name) { - tensor_vec.push_back(inst->add_tensor_array()); - } + // batch is already in Tensor. 
+ std::vector tensor_vec; - for (auto &name : string_feed_name) { - tensor_vec.push_back(inst->add_tensor_array()); - } + for (auto &name : float_feed_name) { + tensor_vec.push_back(req.add_tensor()); + } - VLOG(2) << "batch [" << bi << "] " - << "prepared"; + for (auto &name : int_feed_name) { + tensor_vec.push_back(req.add_tensor()); + } - vec_idx = 0; - for (auto &name : float_feed_name) { - int idx = _feed_name_to_idx[name]; - if (idx >= tensor_vec.size()) { - LOG(ERROR) << "idx > tensor_vec.size()"; - return -1; - } - int nbytes = float_feed[vec_idx].nbytes(); - void *rawdata_ptr = (void *)(float_feed[vec_idx].data(0)); - int total_number = float_feed[vec_idx].size(); - Tensor *tensor = tensor_vec[idx]; - - VLOG(2) << "prepare float feed " << name << " shape size " - << float_shape[vec_idx].size(); - for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) { - tensor->add_shape(float_shape[vec_idx][j]); - } - for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) { - tensor->add_lod(float_lod_slot_batch[vec_idx][j]); - } - tensor->set_elem_type(P_FLOAT32); + for (auto &name : string_feed_name) { + tensor_vec.push_back(req.add_tensor()); + } - tensor->mutable_float_data()->Resize(total_number, 0); - memcpy(tensor->mutable_float_data()->mutable_data(), rawdata_ptr, nbytes); - vec_idx++; + vec_idx = 0; + for (auto &name : float_feed_name) { + int idx = _feed_name_to_idx[name]; + if (idx >= tensor_vec.size()) { + LOG(ERROR) << "idx > tensor_vec.size()"; + return -1; + } + VLOG(2) << "prepare float feed " << name << " idx " << idx; + int nbytes = float_feed[vec_idx].nbytes(); + void *rawdata_ptr = (void *)(float_feed[vec_idx].data(0)); + int total_number = float_feed[vec_idx].size(); + Tensor *tensor = tensor_vec[idx]; + + VLOG(2) << "prepare float feed " << name << " shape size " + << float_shape[vec_idx].size(); + for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) { + tensor->add_shape(float_shape[vec_idx][j]); + } + for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) { + tensor->add_lod(float_lod_slot_batch[vec_idx][j]); } + tensor->set_elem_type(P_FLOAT32); - VLOG(2) << "batch [" << bi << "] " - << "float feed value prepared"; + tensor->set_name(_feed_name[idx]); + tensor->set_alias_name(name); - vec_idx = 0; - for (auto &name : int_feed_name) { - int idx = _feed_name_to_idx[name]; - if (idx >= tensor_vec.size()) { - LOG(ERROR) << "idx > tensor_vec.size()"; - return -1; - } - Tensor *tensor = tensor_vec[idx]; - int nbytes = int_feed[vec_idx].nbytes(); - void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0)); - int total_number = int_feed[vec_idx].size(); + tensor->mutable_float_data()->Resize(total_number, 0); + memcpy(tensor->mutable_float_data()->mutable_data(), rawdata_ptr, nbytes); + vec_idx++; + } - for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) { - tensor->add_shape(int_shape[vec_idx][j]); - } - for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) { - tensor->add_lod(int_lod_slot_batch[vec_idx][j]); - } - tensor->set_elem_type(_type[idx]); - - if (_type[idx] == P_INT64) { - tensor->mutable_int64_data()->Resize(total_number, 0); - memcpy( - tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes); - } else { - tensor->mutable_int_data()->Resize(total_number, 0); - memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes); - } - vec_idx++; + vec_idx = 0; + for (auto &name : int_feed_name) { + int idx = _feed_name_to_idx[name]; + if (idx >= tensor_vec.size()) { + LOG(ERROR) << "idx > 
tensor_vec.size()"; + return -1; } + Tensor *tensor = tensor_vec[idx]; + int nbytes = int_feed[vec_idx].nbytes(); + void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0)); + int total_number = int_feed[vec_idx].size(); - VLOG(2) << "batch [" << bi << "] " - << "int feed value prepared"; + for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) { + tensor->add_shape(int_shape[vec_idx][j]); + } + for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) { + tensor->add_lod(int_lod_slot_batch[vec_idx][j]); + } + tensor->set_elem_type(_type[idx]); + tensor->set_name(_feed_name[idx]); + tensor->set_alias_name(name); + + if (_type[idx] == P_INT64) { + tensor->mutable_int64_data()->Resize(total_number, 0); + memcpy(tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes); + } else { + tensor->mutable_int_data()->Resize(total_number, 0); + memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes); + } + vec_idx++; + } - vec_idx = 0; - for (auto &name : string_feed_name) { - int idx = _feed_name_to_idx[name]; - if (idx >= tensor_vec.size()) { - LOG(ERROR) << "idx > tensor_vec.size()"; - return -1; - } - Tensor *tensor = tensor_vec[idx]; + vec_idx = 0; + for (auto &name : string_feed_name) { + int idx = _feed_name_to_idx[name]; + if (idx >= tensor_vec.size()) { + LOG(ERROR) << "idx > tensor_vec.size()"; + return -1; + } + Tensor *tensor = tensor_vec[idx]; - for (uint32_t j = 0; j < string_shape[vec_idx].size(); ++j) { - tensor->add_shape(string_shape[vec_idx][j]); - } - for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) { - tensor->add_lod(string_lod_slot_batch[vec_idx][j]); - } - tensor->set_elem_type(P_STRING); - - const int string_shape_size = string_shape[vec_idx].size(); - // string_shape[vec_idx] = [1];cause numpy has no datatype of string. - // we pass string via vector >. - if (string_shape_size != 1) { - LOG(ERROR) << "string_shape_size should be 1-D, but received is : " - << string_shape_size; - return -1; - } - switch (string_shape_size) { - case 1: { - tensor->add_data(string_feed[vec_idx]); - break; - } + for (uint32_t j = 0; j < string_shape[vec_idx].size(); ++j) { + tensor->add_shape(string_shape[vec_idx][j]); + } + for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) { + tensor->add_lod(string_lod_slot_batch[vec_idx][j]); + } + tensor->set_elem_type(P_STRING); + tensor->set_name(_feed_name[idx]); + tensor->set_alias_name(name); + + const int string_shape_size = string_shape[vec_idx].size(); + // string_shape[vec_idx] = [1];cause numpy has no datatype of string. + // we pass string via vector >. 
+ if (string_shape_size != 1) { + LOG(ERROR) << "string_shape_size should be 1-D, but received is : " + << string_shape_size; + return -1; + } + switch (string_shape_size) { + case 1: { + tensor->add_data(string_feed[vec_idx]); + break; } - vec_idx++; } - - VLOG(2) << "batch [" << bi << "] " - << "string feed value prepared"; + vec_idx++; } int64_t preprocess_end = timeline.TimeStampUS(); - int64_t client_infer_start = timeline.TimeStampUS(); - Response res; int64_t client_infer_end = 0; @@ -343,52 +327,46 @@ int PredictorClient::numpy_predict( auto output = res.outputs(m_idx); ModelRes model; model.set_engine_name(output.engine_name()); - - int idx = 0; - for (auto &name : fetch_name) { + // At the ResponseOp, the output data has already been arranged according to fetch_name, + // so the output tensors correspond strictly to fetch_name and can simply be processed in order. + for (int idx = 0; idx < output.tensor_size(); ++idx) { // int idx = _fetch_name_to_idx[name]; - int shape_size = output.insts(0).tensor_array(idx).shape_size(); + const std::string name = output.tensor(idx).alias_name(); + model._tensor_alias_names.push_back(name); + int shape_size = output.tensor(idx).shape_size(); VLOG(2) << "fetch var " << name << " index " << idx << " shape size " << shape_size; model._shape_map[name].resize(shape_size); for (int i = 0; i < shape_size; ++i) { - model._shape_map[name][i] = - output.insts(0).tensor_array(idx).shape(i); + model._shape_map[name][i] = output.tensor(idx).shape(i); } - int lod_size = output.insts(0).tensor_array(idx).lod_size(); + int lod_size = output.tensor(idx).lod_size(); if (lod_size > 0) { model._lod_map[name].resize(lod_size); for (int i = 0; i < lod_size; ++i) { - model._lod_map[name][i] = output.insts(0).tensor_array(idx).lod(i); + model._lod_map[name][i] = output.tensor(idx).lod(i); } } - idx += 1; - } - idx = 0; - - for (auto &name : fetch_name) { - // int idx = _fetch_name_to_idx[name]; if (_fetch_name_to_type[name] == P_INT64) { VLOG(2) << "ferch var " << name << "type int64"; - int size = output.insts(0).tensor_array(idx).int64_data_size(); + int size = output.tensor(idx).int64_data_size(); model._int64_value_map[name] = std::vector( - output.insts(0).tensor_array(idx).int64_data().begin(), - output.insts(0).tensor_array(idx).int64_data().begin() + size); + output.tensor(idx).int64_data().begin(), + output.tensor(idx).int64_data().begin() + size); } else if (_fetch_name_to_type[name] == P_FLOAT32) { VLOG(2) << "fetch var " << name << "type float"; - int size = output.insts(0).tensor_array(idx).float_data_size(); + int size = output.tensor(idx).float_data_size(); model._float_value_map[name] = std::vector( - output.insts(0).tensor_array(idx).float_data().begin(), - output.insts(0).tensor_array(idx).float_data().begin() + size); + output.tensor(idx).float_data().begin(), + output.tensor(idx).float_data().begin() + size); } else if (_fetch_name_to_type[name] == P_INT32) { VLOG(2) << "fetch var " << name << "type int32"; - int size = output.insts(0).tensor_array(idx).int_data_size(); + int size = output.tensor(idx).int_data_size(); model._int32_value_map[name] = std::vector( - output.insts(0).tensor_array(idx).int_data().begin(), - output.insts(0).tensor_array(idx).int_data().begin() + size); + output.tensor(idx).int_data().begin(), + output.tensor(idx).int_data().begin() + size); } - idx += 1; } predict_res_batch.add_model_res(std::move(model)); } diff --git a/core/general-client/src/pybind_general_model.cpp b/core/general-client/src/pybind_general_model.cpp old mode 100755 new mode 100644 index 
499f0856ad8b7ffae5f3f037142036ac486cc035..ad26bb7d3c175f08438ee22a5a42425fd5147117 --- a/core/general-client/src/pybind_general_model.cpp +++ b/core/general-client/src/pybind_general_model.cpp @@ -69,7 +69,10 @@ PYBIND11_MODULE(serving_client, m) { }) .def("variant_tag", [](PredictorRes &self) { return self.variant_tag(); }) .def("get_engine_names", - [](PredictorRes &self) { return self.get_engine_names(); }); + [](PredictorRes &self) { return self.get_engine_names(); }) + .def("get_tensor_alias_names", [](PredictorRes &self, int model_idx) { + return self.get_tensor_alias_names(model_idx); + }); py::class_(m, "PredictorClient", py::buffer_protocol()) .def(py::init()) @@ -97,33 +100,31 @@ PYBIND11_MODULE(serving_client, m) { [](PredictorClient &self) { self.destroy_predictor(); }) .def("numpy_predict", [](PredictorClient &self, - const std::vector>> - &float_feed_batch, + const std::vector> &float_feed, const std::vector &float_feed_name, const std::vector> &float_shape, const std::vector> &float_lod_slot_batch, - const std::vector>> - &int_feed_batch, + const std::vector> &int_feed, const std::vector &int_feed_name, const std::vector> &int_shape, const std::vector> &int_lod_slot_batch, - const std::vector>& string_feed_batch, - const std::vector& string_feed_name, - const std::vector>& string_shape, - const std::vector>& string_lod_slot_batch, + const std::vector &string_feed, + const std::vector &string_feed_name, + const std::vector> &string_shape, + const std::vector> &string_lod_slot_batch, const std::vector &fetch_name, PredictorRes &predict_res_batch, const int &pid, const uint64_t log_id) { - return self.numpy_predict(float_feed_batch, + return self.numpy_predict(float_feed, float_feed_name, float_shape, float_lod_slot_batch, - int_feed_batch, + int_feed, int_feed_name, int_shape, int_lod_slot_batch, - string_feed_batch, + string_feed, string_feed_name, string_shape, string_lod_slot_batch, diff --git a/core/general-server/CMakeLists.txt b/core/general-server/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/core/general-server/op/CMakeLists.txt b/core/general-server/op/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/core/general-server/op/general_copy_op.cpp b/core/general-server/op/general_copy_op.cpp deleted file mode 100644 index 0391a98bcb7f471c0a0687dd9deb7b404a15a2bf..0000000000000000000000000000000000000000 --- a/core/general-server/op/general_copy_op.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
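The binding added above exposes `get_tensor_alias_names(model_idx)` on `PredictorRes`. As a hedged illustration of the pattern only (the `DemoRes` type and module name are hypothetical stand-ins, not the real `serving_client` target), a per-model accessor can be exported through a pybind11 lambda like this:

```cpp
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <string>
#include <vector>

namespace py = pybind11;

// Hypothetical stand-in for PredictorRes: one list of alias names per model.
struct DemoRes {
  std::vector<std::vector<std::string>> alias_names_per_model;
  const std::vector<std::string>& get_tensor_alias_names(int model_idx) const {
    return alias_names_per_model.at(model_idx);
  }
};

PYBIND11_MODULE(demo_serving_client, m) {
  py::class_<DemoRes>(m, "DemoRes")
      .def(py::init<>())
      // Same shape as the binding in the patch: forward model_idx to the
      // accessor and let pybind11/stl.h convert the vector of strings.
      .def("get_tensor_alias_names", [](DemoRes& self, int model_idx) {
        return self.get_tensor_alias_names(model_idx);
      });
}
```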
- -#include "core/general-server/op/general_copy_op.h" -#include -#include -#include -#include -#include "core/general-server/op/general_infer_helper.h" -#include "core/predictor/framework/infer.h" -#include "core/predictor/framework/memory.h" -#include "core/util/include/timer.h" - -namespace baidu { -namespace paddle_serving { -namespace serving { - -using baidu::paddle_serving::Timer; -using baidu::paddle_serving::predictor::MempoolWrapper; -using baidu::paddle_serving::predictor::general_model::Tensor; -using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FeedInst; -using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; - -int GeneralCopyOp::inference() { - // reade request from client - const std::vector pre_node_names = pre_names(); - if (pre_node_names.size() != 1) { - LOG(ERROR) << "This op(" << op_name() - << ") can only have one predecessor op, but received " - << pre_node_names.size(); - return -1; - } - const std::string pre_name = pre_node_names[0]; - - const GeneralBlob *input_blob = get_depend_argument(pre_name); - uint64_t log_id = input_blob->GetLogId(); - - VLOG(2) << "(logid=" << log_id << ") precedent name: " << pre_name; - const TensorVector *in = &input_blob->tensor_vector; - VLOG(2) << "(logid=" << log_id << ") input size: " << in->size(); - int batch_size = input_blob->GetBatchSize(); - int input_var_num = 0; - - GeneralBlob *res = mutable_data(); - res->SetLogId(log_id); - TensorVector *out = &res->tensor_vector; - - VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size; - res->SetBatchSize(batch_size); - - if (!res) { - LOG(ERROR) << "(logid=" << log_id - << ") Failed get op tls reader object output"; - } - - Timer timeline; - int64_t start = timeline.TimeStampUS(); - - VLOG(2) << "(logid=" << log_id << ") Going to init lod tensor"; - for (int i = 0; i < in->size(); ++i) { - paddle::PaddleTensor lod_tensor; - CopyLod(&in->at(i), &lod_tensor); - lod_tensor.dtype = in->at(i).dtype; - lod_tensor.name = in->at(i).name; - VLOG(2) << "(logid=" << log_id << ") lod tensor [" << i - << "].name = " << lod_tensor.name; - out->push_back(lod_tensor); - } - - VLOG(2) << "(logid=" << log_id << ") pack done."; - - for (int i = 0; i < out->size(); ++i) { - int64_t *src_ptr = static_cast(in->at(i).data.data()); - out->at(i).data.Resize(out->at(i).lod[0].back() * sizeof(int64_t)); - out->at(i).shape = {out->at(i).lod[0].back(), 1}; - int64_t *tgt_ptr = static_cast(out->at(i).data.data()); - for (int j = 0; j < out->at(i).lod[0].back(); ++j) { - tgt_ptr[j] = src_ptr[j]; - } - } - - VLOG(2) << "(logid=" << log_id << ") output done."; - - timeline.Pause(); - int64_t end = timeline.TimeStampUS(); - CopyBlobInfo(input_blob, res); - AddBlobInfo(res, start); - AddBlobInfo(res, end); - - VLOG(2) << "(logid=" << log_id << ") read data from client success"; - return 0; -} - -DEFINE_OP(GeneralCopyOp); -} // namespace serving -} // namespace paddle_serving -} // namespace baidu diff --git a/core/general-server/op/general_copy_op.h b/core/general-server/op/general_copy_op.h deleted file mode 100644 index 9b4caadc6a82f1f1a601ab66394b3f629af703ff..0000000000000000000000000000000000000000 --- a/core/general-server/op/general_copy_op.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "core/general-server/general_model_service.pb.h" -#include "core/general-server/op/general_infer_helper.h" -#include "core/predictor/framework/resource.h" -#include "paddle_inference_api.h" // NOLINT - -namespace baidu { -namespace paddle_serving { -namespace serving { - -class GeneralCopyOp - : public baidu::paddle_serving::predictor::OpWithChannel { - public: - typedef std::vector TensorVector; - - DECLARE_OP(GeneralCopyOp); - - int inference(); -}; - -} // namespace serving -} // namespace paddle_serving -} // namespace baidu diff --git a/core/general-server/op/general_detection_op.cpp b/core/general-server/op/general_detection_op.cpp old mode 100755 new mode 100644 index 7c33ec8efa8c6e89a7a778def6342415d19ffa94..46f5ddf1b508681661b69c60a25b6d7d000e6d4e --- a/core/general-server/op/general_detection_op.cpp +++ b/core/general-server/op/general_detection_op.cpp @@ -36,7 +36,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FetchInst; using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; diff --git a/core/general-server/op/general_detection_op.h b/core/general-server/op/general_detection_op.h old mode 100755 new mode 100644 diff --git a/core/general-server/op/general_dist_kv_infer_op.cpp b/core/general-server/op/general_dist_kv_infer_op.cpp index 13db69e368e775735efef0bad1e335f5d72a915d..8ee5033d976284b149a2a8bde4e64deea636311f 100644 --- a/core/general-server/op/general_dist_kv_infer_op.cpp +++ b/core/general-server/op/general_dist_kv_infer_op.cpp @@ -34,10 +34,11 @@ using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FetchInst; using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; +// DistKV Infer Op: seek cube and then call paddle inference +// op seq: general_reader-> dist_kv_infer -> general_response int GeneralDistKVInferOp::inference() { VLOG(2) << "Going to run inference"; const std::vector pre_node_names = pre_names(); @@ -52,14 +53,14 @@ int GeneralDistKVInferOp::inference() { const GeneralBlob *input_blob = get_depend_argument(pre_name); if (!input_blob) { LOG(ERROR) << "input_blob is nullptr,error"; - return -1; + return -1; } uint64_t log_id = input_blob->GetLogId(); VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; GeneralBlob *output_blob = mutable_data(); if (!output_blob) { - LOG(ERROR) << "output_blob is nullptr,error"; + LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error"; return -1; } output_blob->SetLogId(log_id); @@ -77,8 +78,8 @@ 
int GeneralDistKVInferOp::inference() { std::vector unique_keys; std::unordered_map key_map; std::vector values; - int sparse_count = 0; - int dense_count = 0; + int sparse_count = 0; // sparse inputs counts, sparse would seek cube + int dense_count = 0; // dense inputs counts, dense would directly call paddle infer std::vector> dataptr_size_pairs; size_t key_len = 0; for (size_t i = 0; i < in->size(); ++i) { @@ -97,6 +98,7 @@ int GeneralDistKVInferOp::inference() { } keys.resize(key_len); unique_keys.resize(key_len); + int key_idx = 0; for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) { std::copy(dataptr_size_pairs[i].first, @@ -120,6 +122,7 @@ int GeneralDistKVInferOp::inference() { LOG(ERROR) << "cube init error or cube config not given."; return -1; } + int64_t seek_start = timeline.TimeStampUS(); int ret = cube->seek(table_names[0], unique_keys, &values); int64_t seek_end = timeline.TimeStampUS(); @@ -131,7 +134,7 @@ int GeneralDistKVInferOp::inference() { LOG(ERROR) << "cube value return null"; } //size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float); - size_t EMBEDDING_SIZE = 9; + size_t EMBEDDING_SIZE = (values[0].buff.size() - 10) / sizeof(float); TensorVector sparse_out; sparse_out.resize(sparse_count); TensorVector dense_out; @@ -145,6 +148,7 @@ int GeneralDistKVInferOp::inference() { std::shared_ptr model_config = resource.get_general_model_config().front(); int cube_key_found = 0; int cube_key_miss = 0; + for (size_t i = 0; i < in->size(); ++i) { if (in->at(i).dtype != paddle::PaddleDType::INT64) { dense_out[dense_idx] = in->at(i); @@ -194,6 +198,7 @@ int GeneralDistKVInferOp::inference() { VLOG(2) << "(logid=" << log_id << ") sparse tensor load success."; timeline.Pause(); VLOG(2) << "dist kv, cube and datacopy time: " << timeline.ElapsedUS(); + TensorVector infer_in; infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end()); infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end()); @@ -201,10 +206,10 @@ int GeneralDistKVInferOp::inference() { output_blob->_batch_size = batch_size; int64_t start = timeline.TimeStampUS(); timeline.Start(); - + // call paddle inference here if (InferManager::instance().infer( engine_name().c_str(), &infer_in, out, batch_size)) { - LOG(ERROR) << "Failed do infer in fluid model: " << engine_name(); + LOG(ERROR) << "(logid=" << log_id << ") Failed do infer in fluid model: " << engine_name(); return -1; } int64_t end = timeline.TimeStampUS(); diff --git a/core/general-server/op/general_dist_kv_quant_infer_op.cpp b/core/general-server/op/general_dist_kv_quant_infer_op.cpp old mode 100755 new mode 100644 index 756b83d625d04b9d2c6c6faf1ab42eecf5a19073..77036c35519d9355fa5100e57e99b8b1d2916c44 --- a/core/general-server/op/general_dist_kv_quant_infer_op.cpp +++ b/core/general-server/op/general_dist_kv_quant_infer_op.cpp @@ -35,7 +35,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FetchInst; using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; @@ -117,9 +116,6 @@ int GeneralDistKVQuantInferOp::inference() { std::unordered_map in_out_map; baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource::instance(); - //TODO:Temporary addition, specific details to 
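The hard-coded `EMBEDDING_SIZE = 9` above is replaced by a value derived from the cube lookup result; per the patch, each returned buffer is treated as a 10-byte header followed by the float32 embedding payload. A tiny worked sketch of that arithmetic (the 46-byte buffer is an invented example):

```cpp
#include <cstddef>
#include <cstdio>

// Header size taken from the expression used in the patch.
constexpr size_t kCubeValueHeaderBytes = 10;

size_t EmbeddingSizeFromBuffer(size_t buff_bytes) {
  return (buff_bytes - kCubeValueHeaderBytes) / sizeof(float);
}

int main() {
  // A 46-byte buffer yields (46 - 10) / 4 = 9 floats, matching the value
  // that used to be hard-coded as EMBEDDING_SIZE = 9.
  std::printf("%zu\n", EmbeddingSizeFromBuffer(46));
  return 0;
}
```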
be studied by HexToString - std::shared_ptr model_config = - resource.get_general_model_config()[0]; int cube_quant_bits = resource.get_cube_quant_bits(); size_t EMBEDDING_SIZE = 0; if (cube_quant_bits == 0) { @@ -146,7 +142,7 @@ int GeneralDistKVQuantInferOp::inference() { sparse_out[sparse_idx].shape.push_back( sparse_out[sparse_idx].lod[0].back()); sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE); - sparse_out[sparse_idx].name = model_config->_feed_name[i]; + sparse_out[sparse_idx].name = in->at(i).name; sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() * EMBEDDING_SIZE * sizeof(float)); // END HERE diff --git a/core/general-server/op/general_infer_op.cpp b/core/general-server/op/general_infer_op.cpp old mode 100755 new mode 100644 index 46038e1fe20d5659d3061e3d7490af65f6d54092..00c408a0c5fbe6d886fc3a62285b92ff486aa154 --- a/core/general-server/op/general_infer_op.cpp +++ b/core/general-server/op/general_infer_op.cpp @@ -31,7 +31,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FetchInst; using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; @@ -49,7 +48,7 @@ int GeneralInferOp::inference() { const GeneralBlob *input_blob = get_depend_argument(pre_name); if (!input_blob) { LOG(ERROR) << "input_blob is nullptr,error"; - return -1; + return -1; } uint64_t log_id = input_blob->GetLogId(); VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; @@ -57,7 +56,7 @@ int GeneralInferOp::inference() { GeneralBlob *output_blob = mutable_data(); if (!output_blob) { LOG(ERROR) << "output_blob is nullptr,error"; - return -1; + return -1; } output_blob->SetLogId(log_id); diff --git a/core/general-server/op/general_reader_op.cpp b/core/general-server/op/general_reader_op.cpp index 3e1091dd844f0afd71c8556586f82aafc42c5097..af77df553837c594789b0e9943790fc37fc01c95 100644 --- a/core/general-server/op/general_reader_op.cpp +++ b/core/general-server/op/general_reader_op.cpp @@ -30,42 +30,8 @@ using baidu::paddle_serving::Timer; using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FeedInst; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING }; -int conf_check(const Request *req, - const std::shared_ptr &model_config) { - int var_num = req->insts(0).tensor_array_size(); - if (var_num != model_config->_feed_type.size()) { - LOG(ERROR) << "feed var number not match: model config[" - << model_config->_feed_type.size() << "] vs. 
actual[" << var_num - << "]"; - return -1; - } - - VLOG(2) << "fetch var num in reader op: " << req->fetch_var_names_size(); - - for (int i = 0; i < var_num; ++i) { - const Tensor &tensor = req->insts(0).tensor_array(i); - if (model_config->_feed_type[i] != tensor.elem_type()) { - LOG(ERROR) << "feed type not match."; - return -1; - } - if (model_config->_feed_shape[i].size() == tensor.shape_size()) { - for (int j = 0; j < model_config->_feed_shape[i].size(); ++j) { - tensor.shape(j); - if (model_config->_feed_shape[i][j] != tensor.shape(j)) { - LOG(ERROR) << "feed shape not match."; - return -1; - } - } - } else { - LOG(ERROR) << "feed shape not match."; - return -1; - } - } - return 0; -} int GeneralReaderOp::inference() { // read request from client @@ -93,7 +59,8 @@ int GeneralReaderOp::inference() { res->SetLogId(log_id); Timer timeline; int64_t start = timeline.TimeStampUS(); - int var_num = req->insts(0).tensor_array_size(); + // var_num means the number of feed_var. + int var_num = req->tensor_size(); VLOG(2) << "(logid=" << log_id << ") var num: " << var_num << ") start to call load general model_conf op"; @@ -102,19 +69,7 @@ int GeneralReaderOp::inference() { baidu::paddle_serving::predictor::Resource::instance(); VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; - // get the first InferOP's model_config as ReaderOp's model_config by default. - std::shared_ptr model_config = - resource.get_general_model_config().front(); - // TODO(guru4elephant): how to do conditional check? - /* - int ret = conf_check(req, model_config); - if (ret != 0) { - LOG(ERROR) << "model conf of server:"; - resource.print_general_model_config(model_config); - return 0; - } - */ // package tensor // prepare basic information for input // specify the memory needed for output tensor_vector @@ -125,7 +80,7 @@ int GeneralReaderOp::inference() { int64_t databuf_size = 0; for (int i = 0; i < var_num; ++i) { paddle::PaddleTensor paddleTensor; - const Tensor &tensor = req->insts(0).tensor_array(i); + const Tensor &tensor = req->tensor(i); data_len = 0; elem_type = 0; elem_size = 0; @@ -172,13 +127,16 @@ int GeneralReaderOp::inference() { VLOG(2) << "(logid=" << log_id << ") shape for var[" << i << "]: " << dim; paddleTensor.shape.push_back(dim); } - paddleTensor.name = model_config->_feed_name[i]; + paddleTensor.name = tensor.name(); out->push_back(paddleTensor); VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i << "]: " << data_len; databuf_size = data_len * elem_size; - out->at(i).data.Resize(databuf_size); + void *databuf_char = MempoolWrapper::instance().malloc(databuf_size); + paddle::PaddleBuf paddleBuf(databuf_char, databuf_size); + out->at(i).data = paddleBuf; + // out->at(i).data.Resize(databuf_size); if (out->at(i).lod.size() > 0) { VLOG(2) << "(logid=" << log_id << ") var[" << i << "] has lod_tensor and len=" << out->at(i).lod[0].back(); diff --git a/core/general-server/op/general_response_op.cpp b/core/general-server/op/general_response_op.cpp old mode 100755 new mode 100644 index d8fece0f7e25a967a6a72f41a9090b0977bf252a..9f6c8aabd72c7e1e9b8ff933c807ee7fcdc0662f --- a/core/general-server/op/general_response_op.cpp +++ b/core/general-server/op/general_response_op.cpp @@ -34,7 +34,6 @@ using baidu::paddle_serving::predictor::MempoolWrapper; using baidu::paddle_serving::predictor::general_model::Tensor; using baidu::paddle_serving::predictor::general_model::Response; using baidu::paddle_serving::predictor::general_model::Request; -using 
baidu::paddle_serving::predictor::general_model::FetchInst; using baidu::paddle_serving::predictor::general_model::ModelOutput; using baidu::paddle_serving::predictor::InferManager; using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; @@ -49,7 +48,6 @@ int GeneralResponseOp::inference() { get_depend_argument(pre_node_names[0])->GetLogId(); const Request *req = dynamic_cast(get_request_message()); - // response inst with only fetch_var_names Response *res = mutable_data(); Timer timeline; @@ -63,7 +61,8 @@ int GeneralResponseOp::inference() { baidu::paddle_serving::predictor::Resource::instance(); VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; - //get the last InferOP's model_config as ResponseOp's model_config by default. + // get the last InferOP's model_config as ResponseOp's model_config by + // default. std::shared_ptr model_config = resource.get_general_model_config().back(); @@ -71,10 +70,23 @@ int GeneralResponseOp::inference() { << ") max body size : " << brpc::fLU64::FLAGS_max_body_size; std::vector fetch_index; - fetch_index.resize(req->fetch_var_names_size()); - for (int i = 0; i < req->fetch_var_names_size(); ++i) { - fetch_index[i] = - model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; + // this is based on GetOutPutNames() is ordered map. + // and the order of Output is the same as the prototxt FetchVar. + // otherwise, you can only get the Output by the corresponding of + // Name -- Alias_name. + if (req->fetch_var_names_size() > 0) { + fetch_index.resize(req->fetch_var_names_size()); + for (int i = 0; i < req->fetch_var_names_size(); ++i) { + fetch_index[i] = + model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; + } + } else { + fetch_index.resize(model_config->_fetch_alias_name.size()); + for (int i = 0; i < model_config->_fetch_alias_name.size(); ++i) { + fetch_index[i] = + model_config + ->_fetch_alias_name_to_index[model_config->_fetch_alias_name[i]]; + } } for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { @@ -95,40 +107,41 @@ int GeneralResponseOp::inference() { ModelOutput *output = res->add_outputs(); // To get the order of model return values output->set_engine_name(pre_name); - FetchInst *fetch_inst = output->add_insts(); + var_idx = 0; + // idx is the real index of FetchVar. + // idx is not the index of FetchList. + // fetch_index is the real index in FetchVar of Fetchlist + // for example, FetchVar = {0:A, 1:B, 2:C} + // FetchList = {0:C,1:A}, at this situation. 
+ // fetch_index = [2,0], C`index = 2 and A`index = 0 for (auto &idx : fetch_index) { - Tensor *tensor = fetch_inst->add_tensor_array(); - //tensor->set_elem_type(1); - if (model_config->_is_lod_fetch[idx]) { - VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] " - << model_config->_fetch_name[idx] << " is lod_tensor"; - for (int k = 0; k < in->at(idx).shape.size(); ++k) { - VLOG(2) << "(logid=" << log_id << ") shape[" << k - << "]: " << in->at(idx).shape[k]; - tensor->add_shape(in->at(idx).shape[k]); - } - } else { - VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] " - << model_config->_fetch_name[idx] << " is tensor"; - for (int k = 0; k < in->at(idx).shape.size(); ++k) { - VLOG(2) << "(logid=" << log_id << ") shape[" << k - << "]: " << in->at(idx).shape[k]; - tensor->add_shape(in->at(idx).shape[k]); + Tensor *tensor = output->add_tensor(); + tensor->set_name(in->at(idx).name); + tensor->set_alias_name(model_config->_fetch_alias_name[idx]); + for (int k = 0; k < in->at(idx).shape.size(); ++k) { + VLOG(2) << "(logid=" << log_id << ") shape[" << k + << "]: " << in->at(idx).shape[k]; + tensor->add_shape(in->at(idx).shape[k]); + } + std::string str_tensor_type = "is tensor"; + if (model_config->_is_lod_fetch[idx] && in->at(idx).lod.size() > 0) { + str_tensor_type = "is lod_tensor"; + for (int j = 0; j < in->at(idx).lod[0].size(); ++j) { + tensor->add_lod(in->at(idx).lod[0][j]); } } - } + VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] " + << model_config->_fetch_name[idx] << str_tensor_type; - var_idx = 0; - for (auto &idx : fetch_index) { cap = 1; for (int j = 0; j < in->at(idx).shape.size(); ++j) { cap *= in->at(idx).shape[j]; } - FetchInst *fetch_p = output->mutable_insts(0); auto dtype = in->at(idx).dtype; if (dtype == paddle::PaddleDType::INT64) { + tensor->set_elem_type(0); VLOG(2) << "(logid=" << log_id << ") Prepare int64 var [" << model_config->_fetch_name[idx] << "]."; int64_t *data_ptr = static_cast(in->at(idx).data.data()); @@ -137,35 +150,24 @@ int GeneralResponseOp::inference() { // `Swap` method is faster than `{}` method. 
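A compilable sketch of the fetch_index mapping spelled out in the comment above; the alias names A/B/C and the requested list are the comment's own example, not real model variables:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // FetchVar order from the model prototxt: {0:A, 1:B, 2:C}.
  std::map<std::string, int> fetch_alias_name_to_index = {
      {"A", 0}, {"B", 1}, {"C", 2}};
  // FetchList requested by the client: {0:C, 1:A}.
  std::vector<std::string> fetch_list = {"C", "A"};

  // fetch_index holds, for each requested alias, its index among FetchVar.
  std::vector<int> fetch_index(fetch_list.size());
  for (size_t i = 0; i < fetch_list.size(); ++i) {
    fetch_index[i] = fetch_alias_name_to_index[fetch_list[i]];
  }
  // Prints "2 0": C sits at FetchVar index 2, A at index 0.
  for (int idx : fetch_index) std::printf("%d ", idx);
  return 0;
}
```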
google::protobuf::RepeatedField tmp_data(data_ptr, data_ptr + cap); - fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap( - &tmp_data); + output->mutable_tensor(var_idx)->mutable_int64_data()->Swap(&tmp_data); } else if (dtype == paddle::PaddleDType::FLOAT32) { + tensor->set_elem_type(1); VLOG(2) << "(logid=" << log_id << ") Prepare float var [" << model_config->_fetch_name[idx] << "]."; - + float *data_ptr = static_cast(in->at(idx).data.data()); google::protobuf::RepeatedField tmp_data(data_ptr, data_ptr + cap); - fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap( - &tmp_data); + output->mutable_tensor(var_idx)->mutable_float_data()->Swap(&tmp_data); } else if (dtype == paddle::PaddleDType::INT32) { - + tensor->set_elem_type(2); VLOG(2) << "(logid=" << log_id << ")Prepare int32 var [" << model_config->_fetch_name[idx] << "]."; int32_t *data_ptr = static_cast(in->at(idx).data.data()); google::protobuf::RepeatedField tmp_data(data_ptr, data_ptr + cap); - fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap( - &tmp_data); - } - - if (model_config->_is_lod_fetch[idx]) { - if (in->at(idx).lod.size() > 0) { - for (int j = 0; j < in->at(idx).lod[0].size(); ++j) { - fetch_p->mutable_tensor_array(var_idx)->add_lod( - in->at(idx).lod[0][j]); - } - } + output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data); } VLOG(2) << "(logid=" << log_id << ") fetch var [" @@ -205,4 +207,4 @@ DEFINE_OP(GeneralResponseOp); } // namespace serving } // namespace paddle_serving -} // namespace baidu \ No newline at end of file +} // namespace baidu diff --git a/core/general-server/op/general_text_reader_op.cpp b/core/general-server/op/general_text_reader_op.cpp deleted file mode 100755 index 6c305c18c0cb56bc5dd841c9c6a09807c6dbf518..0000000000000000000000000000000000000000 --- a/core/general-server/op/general_text_reader_op.cpp +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
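The copy-then-Swap pattern above fills a protobuf repeated field in one pass instead of per-element `add_*()` calls. A standalone sketch of the same idiom, with made-up data and a bare `RepeatedField` standing in for the generated `mutable_float_data()` target:

```cpp
#include <google/protobuf/repeated_field.h>

#include <cstdio>

int main() {
  float data[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  // Build a temporary RepeatedField from a contiguous range ...
  google::protobuf::RepeatedField<float> tmp(data, data + 4);
  google::protobuf::RepeatedField<float> dst;
  // ... then hand its storage to the destination without a second copy.
  dst.Swap(&tmp);
  std::printf("%d elements, first=%f\n", dst.size(), dst.Get(0));
  return 0;
}
```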
- -#include "core/general-server/op/general_text_reader_op.h" -#include -#include -#include -#include -#include "core/predictor/framework/infer.h" -#include "core/predictor/framework/memory.h" -#include "core/util/include/timer.h" - -namespace baidu { -namespace paddle_serving { -namespace serving { - -using baidu::paddle_serving::Timer; -using baidu::paddle_serving::predictor::MempoolWrapper; -using baidu::paddle_serving::predictor::general_model::Tensor; -using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FeedInst; -using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; - -int GeneralTextReaderOp::inference() { - // reade request from client - const Request *req = dynamic_cast(get_request_message()); - uint64_t log_id = req->log_id(); - - int batch_size = req->insts_size(); - int input_var_num = 0; - - std::vector elem_type; - std::vector elem_size; - std::vector capacity; - - GeneralBlob *res = mutable_data(); - - if (!res) { - LOG(ERROR) << "(logid=" << log_id - << ") Failed get op tls reader object output"; - } - - TensorVector *out = &res->tensor_vector; - res->SetBatchSize(batch_size); - res->SetLogId(log_id); - - if (batch_size <= 0) { - LOG(ERROR) << "(logid=" << log_id << ") Batch size < 0"; - return -1; - } - - Timer timeline; - int64_t start = timeline.TimeStampUS(); - - int var_num = req->insts(0).tensor_array_size(); - VLOG(2) << "(logid=" << log_id << ") var num: " << var_num; - - VLOG(2) << "(logid=" << log_id - << ") start to call load general model_conf op"; - baidu::paddle_serving::predictor::Resource &resource = - baidu::paddle_serving::predictor::Resource::instance(); - - VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; - std::shared_ptr model_config = - resource.get_general_model_config()[0]; - - VLOG(2) << "(logid=" << log_id << ") print general model config done."; - - elem_type.resize(var_num); - elem_size.resize(var_num); - capacity.resize(var_num); - for (int i = 0; i < var_num; ++i) { - paddle::PaddleTensor lod_tensor; - elem_type[i] = req->insts(0).tensor_array(i).elem_type(); - VLOG(2) << "(logid=" << log_id << ") var[" << i - << "] has elem type: " << elem_type[i]; - if (elem_type[i] == 0) { // int64 - elem_size[i] = sizeof(int64_t); - lod_tensor.dtype = paddle::PaddleDType::INT64; - } else { - elem_size[i] = sizeof(float); - lod_tensor.dtype = paddle::PaddleDType::FLOAT32; - } - - if (req->insts(0).tensor_array(i).shape(0) == -1) { - lod_tensor.lod.resize(1); - lod_tensor.lod[0].push_back(0); - VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor"; - } else { - lod_tensor.shape.push_back(batch_size); - capacity[i] = 1; - for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) { - int dim = req->insts(0).tensor_array(i).shape(k); - VLOG(2) << "(logid=" << log_id << ") shape for var[" << i - << "]: " << dim; - capacity[i] *= dim; - lod_tensor.shape.push_back(dim); - } - VLOG(2) << "(logid=" << log_id << ") var[" << i - << "] is tensor, capacity: " << capacity[i]; - } - lod_tensor.name = model_config->_feed_name[i]; - out->push_back(lod_tensor); - } - - for (int i = 0; i < var_num; ++i) { - if (out->at(i).lod.size() == 1) { - for (int j = 0; j < batch_size; ++j) { - const Tensor &tensor = req->insts(j).tensor_array(i); - int data_len = tensor.int_data_size(); - int cur_len = out->at(i).lod[0].back(); - out->at(i).lod[0].push_back(cur_len + data_len); - } - out->at(i).data.Resize(out->at(i).lod[0].back() * elem_size[i]); - out->at(i).shape 
= {out->at(i).lod[0].back(), 1}; - VLOG(2) << "(logid=" << log_id << ") var[" << i - << "] is lod_tensor and len=" << out->at(i).lod[0].back(); - } else { - out->at(i).data.Resize(batch_size * capacity[i] * elem_size[i]); - VLOG(2) << "(logid=" << log_id << ") var[" << i - << "] is tensor and capacity=" << batch_size * capacity[i]; - } - } - - for (int i = 0; i < var_num; ++i) { - if (elem_type[i] == 0) { - int64_t *dst_ptr = static_cast(out->at(i).data.data()); - int offset = 0; - for (int j = 0; j < batch_size; ++j) { - for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size(); - ++k) { - dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k); - } - if (out->at(i).lod.size() == 1) { - offset = out->at(i).lod[0][j + 1]; - } else { - offset += capacity[i]; - } - } - } else { - float *dst_ptr = static_cast(out->at(i).data.data()); - int offset = 0; - for (int j = 0; j < batch_size; ++j) { - for (int k = 0; k < req->insts(j).tensor_array(i).int_data_size(); - ++k) { - dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k); - } - if (out->at(i).lod.size() == 1) { - offset = out->at(i).lod[0][j + 1]; - } else { - offset += capacity[i]; - } - } - } - } - - int64_t end = timeline.TimeStampUS(); - res->p_size = 0; - AddBlobInfo(res, start); - AddBlobInfo(res, end); - - VLOG(2) << "(logid=" << log_id << ") read data from client success"; - return 0; -} -DEFINE_OP(GeneralTextReaderOp); -} // namespace serving -} // namespace paddle_serving -} // namespace baidu diff --git a/core/general-server/op/general_text_reader_op.h b/core/general-server/op/general_text_reader_op.h deleted file mode 100644 index af822993dc37fae23c1fa584d640cbfe8d9950c8..0000000000000000000000000000000000000000 --- a/core/general-server/op/general_text_reader_op.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "core/general-server/general_model_service.pb.h" -#include "core/general-server/load_general_model_service.pb.h" -#include "core/general-server/op/general_infer_helper.h" -#include "core/predictor/framework/resource.h" -#include "paddle_inference_api.h" // NOLINT - -namespace baidu { -namespace paddle_serving { -namespace serving { - -class GeneralTextReaderOp - : public baidu::paddle_serving::predictor::OpWithChannel { - public: - typedef std::vector TensorVector; - - DECLARE_OP(GeneralTextReaderOp); - - int inference(); -}; - -} // namespace serving -} // namespace paddle_serving -} // namespace baidu diff --git a/core/general-server/op/general_text_response_op.cpp b/core/general-server/op/general_text_response_op.cpp deleted file mode 100755 index 03ab08cd361ea9eb8060c4ba5372d319a34df1f6..0000000000000000000000000000000000000000 --- a/core/general-server/op/general_text_response_op.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "core/general-server/op/general_text_response_op.h" -#include -#include -#include -#include -#include "core/predictor/framework/infer.h" -#include "core/predictor/framework/memory.h" -#include "core/predictor/framework/resource.h" -#include "core/util/include/timer.h" - -namespace baidu { -namespace paddle_serving { -namespace serving { - -using baidu::paddle_serving::Timer; -using baidu::paddle_serving::predictor::MempoolWrapper; -using baidu::paddle_serving::predictor::general_model::Tensor; -using baidu::paddle_serving::predictor::general_model::Response; -using baidu::paddle_serving::predictor::general_model::Request; -using baidu::paddle_serving::predictor::general_model::FetchInst; -using baidu::paddle_serving::predictor::general_model::ModelOutput; -using baidu::paddle_serving::predictor::InferManager; -using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; - -int GeneralTextResponseOp::inference() { - VLOG(2) << "Going to run inference"; - const std::vector pre_node_names = pre_names(); - VLOG(2) << "pre node names size: " << pre_node_names.size(); - const GeneralBlob *input_blob; - uint64_t log_id = - get_depend_argument(pre_node_names[0])->GetLogId(); - - const Request *req = dynamic_cast(get_request_message()); - // response inst with only fetch_var_names - Response *res = mutable_data(); - - Timer timeline; - int64_t start = timeline.TimeStampUS(); - - VLOG(2) << "(logid=" << log_id - << ") start to call load general model_conf op"; - baidu::paddle_serving::predictor::Resource &resource = - baidu::paddle_serving::predictor::Resource::instance(); - - VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; - std::shared_ptr model_config = - resource.get_general_model_config().back(); - - std::vector fetch_index; - fetch_index.resize(req->fetch_var_names_size()); - for (int i = 0; i < req->fetch_var_names_size(); ++i) { - fetch_index[i] = - model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; - } - - for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { - const std::string &pre_name = pre_node_names[pi]; - VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name - << " (" << pre_node_names.size() << ")"; - input_blob = get_depend_argument(pre_name); - if (!input_blob) { - LOG(ERROR) << "(logid=" << log_id - << ") Failed mutable depended argument, op: " << pre_name; - return -1; - } - - const TensorVector *in = &input_blob->tensor_vector; - int batch_size = input_blob->GetBatchSize(); - VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size; - - ModelOutput *output = res->add_outputs(); - output->set_engine_name( - pre_name); // To get the order of model return values - for (int i = 0; i < batch_size; ++i) { - FetchInst *fetch_inst = output->add_insts(); - for (auto &idx : fetch_index) { - Tensor *tensor = fetch_inst->add_tensor_array(); - // currently only response float tensor or lod_tensor - tensor->set_elem_type(1); - if 
(model_config->_is_lod_fetch[idx]) { - VLOG(2) << "(logid=" << log_id << ") out[" << idx << " is lod_tensor"; - tensor->add_shape(-1); - } else { - VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] is tensor"; - for (int k = 1; k < in->at(idx).shape.size(); ++k) { - VLOG(2) << "(logid=" << log_id << ") shape[" << k - 1 - << "]: " << in->at(idx).shape[k]; - tensor->add_shape(in->at(idx).shape[k]); - } - } - } - } - - int var_idx = 0; - for (auto &idx : fetch_index) { - float *data_ptr = static_cast(in->at(idx).data.data()); - int cap = 1; - for (int j = 1; j < in->at(idx).shape.size(); ++j) { - cap *= in->at(idx).shape[j]; - } - if (model_config->_is_lod_fetch[idx]) { - for (int j = 0; j < batch_size; ++j) { - for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1]; - k++) { - output->mutable_insts(j) - ->mutable_tensor_array(var_idx) - ->add_float_data(data_ptr[k]); - } - } - } else { - for (int j = 0; j < batch_size; ++j) { - for (int k = j * cap; k < (j + 1) * cap; ++k) { - output->mutable_insts(j) - ->mutable_tensor_array(var_idx) - ->add_float_data(data_ptr[k]); - } - } - } - var_idx++; - } - } - - if (req->profile_server()) { - int64_t end = timeline.TimeStampUS(); - // TODO(barriery): multi-model profile_time. - // At present, only the response_op is multi-input, so here we get - // the profile_time by hard coding. It needs to be replaced with - // a more elegant way. - for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { - input_blob = get_depend_argument(pre_node_names[pi]); - VLOG(2) << "(logid=" << log_id - << ") p size for input blob: " << input_blob->p_size; - int profile_time_idx = -1; - if (pi == 0) { - profile_time_idx = 0; - } else { - profile_time_idx = input_blob->p_size - 2; - } - for (; profile_time_idx < input_blob->p_size; ++profile_time_idx) { - res->add_profile_time(input_blob->time_stamp[profile_time_idx]); - } - } - // TODO(guru4elephant): find more elegant way to do this - res->add_profile_time(start); - res->add_profile_time(end); - } - - return 0; -} -DEFINE_OP(GeneralTextResponseOp); - -} // namespace serving -} // namespace paddle_serving -} // namespace baidu diff --git a/core/general-server/op/general_text_response_op.h b/core/general-server/op/general_text_response_op.h deleted file mode 100644 index 334d98476e67f745635f7d66d7b8682de62da355..0000000000000000000000000000000000000000 --- a/core/general-server/op/general_text_response_op.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include "core/general-server/general_model_service.pb.h" -#include "core/general-server/op/general_infer_helper.h" -#include "paddle_inference_api.h" // NOLINT - -namespace baidu { -namespace paddle_serving { -namespace serving { - -class GeneralTextResponseOp - : public baidu::paddle_serving::predictor::OpWithChannel< - baidu::paddle_serving::predictor::general_model::Response> { - public: - typedef std::vector TensorVector; - - DECLARE_OP(GeneralTextResponseOp); - - int inference(); -}; - -} // namespace serving -} // namespace paddle_serving -} // namespace baidu diff --git a/core/general-server/proto/general_model_service.proto b/core/general-server/proto/general_model_service.proto old mode 100644 new mode 100755 index e7dd5fccf54be43db8e65a9ed1112ceaece93700..8fedb60e97ec5b81263687b47ff0794880da8671 --- a/core/general-server/proto/general_model_service.proto +++ b/core/general-server/proto/general_model_service.proto @@ -20,21 +20,20 @@ package baidu.paddle_serving.predictor.general_model; option cc_generic_services = true; message Tensor { - repeated bytes data = 1; + repeated string data = 1; repeated int32 int_data = 2; repeated int64 int64_data = 3; repeated float float_data = 4; - optional int32 elem_type = 5; - repeated int32 shape = 6; - repeated int32 lod = 7; // only for fetch tensor currently + optional int32 elem_type = + 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string + repeated int32 shape = 6; // shape should include batch + repeated int32 lod = 7; // only for fetch tensor currently + optional string name = 8; // get from the Model prototxt + optional string alias_name = 9; // get from the Model prototxt }; -message FeedInst { repeated Tensor tensor_array = 1; }; - -message FetchInst { repeated Tensor tensor_array = 1; }; - message Request { - repeated FeedInst insts = 1; + repeated Tensor tensor = 1; repeated string fetch_var_names = 2; optional bool profile_server = 3 [ default = false ]; required uint64 log_id = 4 [ default = 0 ]; @@ -46,7 +45,7 @@ message Response { }; message ModelOutput { - repeated FetchInst insts = 1; + repeated Tensor tensor = 1; optional string engine_name = 2; } diff --git a/core/pdcodegen/src/pdcodegen.cpp b/core/pdcodegen/src/pdcodegen.cpp index c505ca66385dd363ad0a76470012f07a925bcd17..a99828ee3466a32d45dcabb61a2700f9362539d4 100644 --- a/core/pdcodegen/src/pdcodegen.cpp +++ b/core/pdcodegen/src/pdcodegen.cpp @@ -280,6 +280,7 @@ class PdsCodeGenerator : public CodeGenerator { " baidu::rpc::ClosureGuard done_guard(done);\n" " baidu::rpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n" " uint64_t log_id = request->log_id();\n" " cntl->set_log_id(log_id);\n" " ::baidu::paddle_serving::predictor::InferService* svr = \n" @@ -322,6 +323,7 @@ class PdsCodeGenerator : public CodeGenerator { " baidu::rpc::ClosureGuard done_guard(done);\n" " baidu::rpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n" " uint64_t log_id = equest->log_id();\n" " cntl->set_log_id(log_id);\n" " ::baidu::paddle_serving::predictor::InferService* svr = \n" @@ -1023,6 +1025,7 @@ class PdsCodeGenerator : public CodeGenerator { " brpc::ClosureGuard done_guard(done);\n" " brpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n" " uint64_t log_id = request->log_id();\n" " cntl->set_log_id(log_id);\n" " 
::baidu::paddle_serving::predictor::InferService* svr = \n" @@ -1067,6 +1070,7 @@ class PdsCodeGenerator : public CodeGenerator { " brpc::ClosureGuard done_guard(done);\n" " brpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " cntl->set_response_compress_type(brpc::COMPRESS_TYPE_GZIP);\n" " uint64_t log_id = request->log_id();\n" " cntl->set_log_id(log_id);\n" " ::baidu::paddle_serving::predictor::InferService* svr = \n" diff --git a/core/predictor/common/constant.cpp b/core/predictor/common/constant.cpp index 5fa1277de1a4b0d33d14a9c33d3cb4b280bc3b5c..70f0096ba002ebb8f185cd73f8fe4f8d4d06b83f 100644 --- a/core/predictor/common/constant.cpp +++ b/core/predictor/common/constant.cpp @@ -25,7 +25,7 @@ DEFINE_int32(port, 8010, ""); DEFINE_string(workflow_path, "./conf", ""); DEFINE_string(workflow_file, "workflow.prototxt", ""); DEFINE_string(inferservice_path, "./conf", ""); -DEFINE_string(inferservice_file, "service.prototxt", ""); +DEFINE_string(inferservice_file, "infer_service.prototxt", ""); DEFINE_string(logger_path, "./conf", ""); DEFINE_string(logger_file, "log.conf", ""); DEFINE_string(resource_path, "./conf", ""); diff --git a/core/predictor/framework/bsf-inl-tensor.h b/core/predictor/framework/bsf-inl-tensor.h deleted file mode 100644 index b7c725b443281f355addffb8f2fcb36651b6d9b6..0000000000000000000000000000000000000000 --- a/core/predictor/framework/bsf-inl-tensor.h +++ /dev/null @@ -1,373 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
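Given the reworked `Tensor` and `Request` messages above, here is a hedged sketch of filling a request directly through the generated C++ API. The feed/fetch names (`x`, `input`, `price`) and the shape are illustrative only, and the include path assumes the header generated from this proto:

```cpp
#include "core/general-server/general_model_service.pb.h"

using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Tensor;

Request BuildDemoRequest() {
  Request req;
  req.set_log_id(0);
  Tensor* t = req.add_tensor();       // flattened: no FeedInst wrapper anymore
  t->set_name("x");                   // feed var name from the model prototxt
  t->set_alias_name("input");         // alias_name from the model prototxt
  t->set_elem_type(1);                // 1 means float32 (see the proto comment)
  t->add_shape(1);                    // shape includes the batch dimension
  t->add_shape(13);
  for (int i = 0; i < 13; ++i) {
    t->add_float_data(0.0f);
  }
  // Optional now: with no fetch_var_names, GeneralResponseOp falls back to
  // returning every fetch var declared in the model config.
  req.add_fetch_var_names("price");
  return req;
}
```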
- -#pragma once - -#ifdef BCLOUD -#include -#else -#include -#endif - -#include -#include -#include -#include -#include "core/predictor/common/inner_common.h" -#include "core/predictor/framework/infer_data.h" -#include "core/predictor/framework/memory.h" - -#include - -namespace im { -namespace bsf { - -template <> -struct Task { - typedef Task - TaskT; - typedef baidu::paddle_serving::predictor::Tensor Tensor; - typedef baidu::paddle_serving::predictor::Tensor InType; - typedef baidu::paddle_serving::predictor::Tensor OutType; - typedef baidu::paddle_serving::predictor::BatchTensor BatchTensor; - typedef baidu::paddle_serving::predictor::BatchTensor InArrayT; - typedef baidu::paddle_serving::predictor::BatchTensor OutArrayT; - - struct Segment { - Segment(void* p, size_t b, size_t s) : ptr(p), begin(b), size(s) {} - void* ptr; - size_t begin; - size_t size; - }; - - int read_fd; - int write_fd; - - pid_t owner_tid; - - const InArrayT* in; - OutArrayT* out; - - size_t rem; - size_t size; - - butil::atomic index; - - const BatchTensor* get(bool is_in) const { - if (is_in) { - return in; - } else { - return out; - } - } - - BatchTensor* get(bool is_in) { - if (is_in) { - return const_cast(in); - } else { - return out; - } - } - - Task() { - read_fd = -1; - write_fd = -1; - owner_tid = -1; - in = NULL; - out = NULL; - rem = -1; - size = -1; - index.store(0, butil::memory_order_relaxed); - } -}; - -template <> -class BatchTasks> { - public: - typedef baidu::paddle_serving::predictor::Tensor Tensor; - typedef baidu::paddle_serving::predictor::Tensor InType; - typedef baidu::paddle_serving::predictor::Tensor OutType; - typedef baidu::paddle_serving::predictor::DataBuf DataBuf; - typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper; - - typedef Task - TaskT; - typedef TaskMeta TaskMetaT; - typedef TaskT::InArrayT InArrayT; - typedef TaskT::OutArrayT OutArrayT; - - explicit BatchTasks(size_t batch_size, bool batch_align = false) - : _batch_size(batch_size), - _rem_size(batch_size), - _batch_align(batch_align) { - _batch_in.clear(); - _batch_out.clear(); - _tasks.clear(); - } - - ~BatchTasks() { - _batch_in.clear(); - _batch_out.clear(); - _tasks.clear(); - } - - static bool check_valid(const InArrayT& in, - OutArrayT& out, // NOLINT - bool align) { // NOLINT - if (align) { - if (out.count() <= 0 || out.size() <= 0) { - LOG(ERROR) << "Out tensor is empty, when aligned"; - return false; - } - - if (out.size() != in.size()) { - LOG(ERROR) << "In/Out tensor size not eq: " << out.size() - << "!=" << in.size(); - return false; - } - - for (size_t fi = 0, shape0 = 0; fi < out.count(); ++fi) { - if (!out[fi].valid()) { - LOG(ERROR) << "Out[" << fi << "] tensor not valid"; - return false; - } - - if (out.size() != out[fi].shape0()) { - LOG(ERROR) << "Shape0 not consistency, " << out.size() - << "!=" << out[fi].shape0() << ", " << fi; - return false; - } - } - } - - return true; - } - - size_t append_task(TaskT* task) { - size_t add = std::min(task->rem, _rem_size); - if (!_batch_align) { - add = task->rem; - } - TaskMetaT tm(task, task->in->size() - task->rem, add); - _tasks.push_back(tm); - - task->rem -= add; - _rem_size -= add; - return _rem_size; - } - - void merge_tasks() { - merge_input(); - merge_output(); - } - - void merge_input() { - if (_tasks.size() <= 0 || _tasks[0].task->in->count() <= 0) { - return; - } - - if (_tasks.size() == 1 && !_batch_align) { - TaskMetaT& tm = _tasks[0]; - _batch_in = *(tm.task->in); - return; - } - - merge_tensor(true); - } - - void merge_output() 
{ - if (_batch_align) { - if (_tasks.size() <= 0 || _tasks[0].task->out->count() <= 0) { - return; - } - } - - if (_tasks.size() <= 0 || _tasks[0].task->out->count() <= 0) { - return; - } - - TaskMetaT& tm = _tasks[0]; - if (_tasks.size() == 1 && !_batch_align) { - _batch_out = *(tm.task->out); - return; - } - - if (tm.task->out->size() <= 0) { - // shape is empty - _batch_out = *(tm.task->out); - return; - } - - if ((*tm.task->out)[0].data.data() == 0 || - (*tm.task->out)[0].data.size() == 0) { - _batch_out = *(tm.task->out); - return; - } - - merge_tensor(false); - } - - void merge_tensor(bool is_in) { - // accumulate batch size from fetched tasks - size_t batch_size = 0; - for (size_t ti = 0; ti < _tasks.size(); ++ti) { - TaskMetaT& tm = _tasks[ti]; - size_t add = tm.end - tm.begin; - batch_size += add; - } - - // merge all instanses in each tensor data - size_t tensor_count = _tasks[0].task->get(is_in)->count(); - for (size_t fi = 0; fi < tensor_count; ++fi) { - const Tensor& head = (*(_tasks[0].task->get(is_in)))[fi]; - Tensor batch_tensor; - batch_tensor.name = head.name; - batch_tensor.type = head.type; - batch_tensor.shape.push_back(batch_size); - - size_t ins_ele_count = 1; - for (size_t si = 1; si < head.shape.size(); ++si) { - batch_tensor.shape.push_back(head.shape[si]); - ins_ele_count *= head.shape[si]; - } - - size_t tensor_ele_count = ins_ele_count * batch_size; - size_t ins_byte = ins_ele_count * head.ele_byte(); - - size_t tensor_byte = tensor_ele_count * head.ele_byte(); - void* data_buf = MempoolWrapper::instance().malloc(tensor_byte); - if (!data_buf) { - LOG(ERROR) << "Malloc failed, size: " << tensor_byte; - return; - } - - size_t data_byte = 0; - for (size_t ti = 0; ti < _tasks.size(); ++ti) { - TaskMetaT& tm = _tasks[ti]; - size_t acc_byte = ins_byte * (tm.end - tm.begin); - if (data_byte + acc_byte > tensor_byte) { - LOG(ERROR) << "Invalid bytes: " << data_byte << " + " << acc_byte - << " >= " << tensor_byte; - return; - } - - const Tensor& tensor = (*(tm.task->get(is_in)))[fi]; - memcpy( - reinterpret_cast(data_buf) + data_byte, - reinterpret_cast(tensor.data.data()) + tm.begin * ins_byte, - acc_byte); - data_byte += acc_byte; - } - - if (data_byte != tensor_byte) { - LOG(ERROR) << "Invalid tensor byte: " << data_byte - << " != " << tensor_byte; - return; - } - - batch_tensor.data = - DataBuf(reinterpret_cast(data_buf), tensor_byte); - if (is_in) { - _batch_in.push_back(batch_tensor); - } else { - _batch_out.push_back(batch_tensor); - } - } - - LOG(INFO) << "merge input(" << is_in << ") samples: " << batch_size - << " from " << _tasks.size() << " pvs"; - } - - void notify_tasks() { - if (_batch_out.size() != _batch_in.size()) { - LOG(ERROR) << "batch size not consistency: " << _batch_out.size() - << " != " << _batch_in.size(); - return; - } - - size_t tensor_count = _batch_out.count(); - size_t batch_size = _batch_out.size(); - for (size_t fi = 0; fi < tensor_count; ++fi) { - const Tensor& tensor = _batch_out[fi]; - size_t ins_byte = tensor.ele_byte(); - for (size_t si = 1; si < tensor.shape.size(); ++si) { - ins_byte *= tensor.shape[si]; - } - - for (size_t ti = 0, bi = 0, add = 0; ti < _tasks.size(); - ++ti, bi += add) { - OutArrayT* dst = _tasks[ti].task->out; - add = _tasks[ti].end - _tasks[ti].begin; - size_t offset_src = ins_byte * bi; - size_t add_byte = add * ins_byte; - - if (_batch_align) { // merge all batchs - size_t offset_dst = ins_byte * _tasks[ti].begin; - void* ptr = const_cast((*dst)[fi].data.data()); - memcpy( - reinterpret_cast(ptr) + 
offset_dst, - reinterpret_cast(_batch_out[fi].data.data()) + offset_src, - add_byte); - } else { // overwrite - if (dst->count() <= 0) { - dst->push_back(_batch_out[fi]); - } else { - (*dst)[fi] = _batch_out[fi]; - } - - (*dst)[fi].shape[0] = add; - (*dst)[fi].data = DataBuf( - reinterpret_cast(_batch_out[fi].data.data()) + offset_src, - add_byte); - } - } - } - - for (size_t ti = 0; ti < _tasks.size(); ++ti) { - TaskT* task = _tasks[ti].task; - size_t begin = _tasks[ti].begin; - size_t end = _tasks[ti].end; - size_t add = end - begin; - - size_t index = task->index.fetch_add(add); - if ((index + add) >= task->in->size()) { - char c = 0; - while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) { - } - butil::return_object(task); - } - } - } - - const typename TaskT::InArrayT& in() const { return _batch_in; } - - typename TaskT::OutArrayT& out() { return _batch_out; } - - size_t task_size() { return _tasks.size(); } - - private: - std::vector _tasks; - InArrayT _batch_in; - OutArrayT _batch_out; - size_t _batch_size; - size_t _rem_size; - bool _batch_align; -}; - -} // namespace bsf -} // namespace im diff --git a/core/predictor/framework/bsf-inl.h b/core/predictor/framework/bsf-inl.h index 1193ce4860e595598b738adab738c7af9664cc26..1f5d272d2875ee878f09ac2882364afe9fd899fb 100644 --- a/core/predictor/framework/bsf-inl.h +++ b/core/predictor/framework/bsf-inl.h @@ -24,6 +24,7 @@ #include #include "core/predictor/common/inner_common.h" +#include "core/predictor/framework/memory.h" namespace im { namespace bsf { @@ -35,7 +36,7 @@ void* TaskExecutor::thread_entry(void* args) { static_cast*>(context->executor); executor->work(context); - return NULL; + return nullptr; } template @@ -70,7 +71,7 @@ int TaskExecutor::start(uint32_t thread_num, uint32_t init_timeout_sec) { _thread_contexts.push_back(&contexts[i]); } - int init_timeout = init_timeout_sec * 1000 * 1000; + size_t init_timeout = init_timeout_sec * 1000 * 1000; bool has_error = false; bool has_timeout = true; @@ -102,7 +103,7 @@ int TaskExecutor::start(uint32_t thread_num, uint32_t init_timeout_sec) { } // 100ms - const int sleep_interval = 100 * 1000; + const size_t sleep_interval = 100 * 1000; usleep(sleep_interval); init_timeout -= sleep_interval; } @@ -125,18 +126,21 @@ void TaskExecutor::stop() { } template -TaskHandler TaskExecutor::schedule(const InArrayT& in, - OutArrayT& out) { // NOLINT +TaskHandler TaskExecutor::schedule( + const void* inVectorT_ptr, + void* outVectorT_ptr) { // NOLINT TaskT* task = butil::get_object(); if (!task) { LOG(ERROR) << "Failed get TaskT from object pool"; return TaskHandler::valid_handle(); } + /* if (!BatchTasks::check_valid(in, out, _batch_align)) { LOG(ERROR) << "Invalid input & output"; return TaskHandler::valid_handle(); } + */ int fds[2]; int rc = pipe(fds); @@ -150,10 +154,9 @@ TaskHandler TaskExecutor::schedule(const InArrayT& in, task->write_fd = fds[1]; task->owner_tid = ::syscall(SYS_gettid); - task->in = ∈ - task->out = &out; - task->rem = in.size(); - task->size = in.size(); + task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr; + task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr; + task->rem = task->batch_size(); task->index.store(0, butil::memory_order_relaxed); AutoMutex lock(_mut); @@ -163,8 +166,13 @@ TaskHandler TaskExecutor::schedule(const InArrayT& in, return TaskHandler(*task); } +// this function is accessed by multi thread. +// so AutoMutex at first. +// so batch.append_task is thread safe. 
+// you dont need to add extra lock in append_task() template -bool TaskExecutor::fetch_batch(BatchTasks& batch) { // NOLINT +bool TaskExecutor::move_task_to_batch( + BatchTasks& batch) { // NOLINT AutoMutex lock(_mut); while (_task_queue.empty()) { THREAD_COND_WAIT(&_cond, &_mut); @@ -187,8 +195,30 @@ bool TaskExecutor::fetch_batch(BatchTasks& batch) { // NOLINT return true; } +// this function is accessed by multi thread. +// move_task_to_batch have add lock inside the function. +// Packaging 1 TaskT as 1 or Several TaskMeta. +// TaskT is from the SingleTon TaskExecutor`s _task_queue +// although TaskMeta is a local variable, but several TaskMeta may points to +// the same TaskT which is get from the SingleTon TaskExecutor`s _task_queue. +// put TaskMeta to the local variable BatchTasks batch. + +// batch.merge_tasks() and batch.notify_tasks() has no lock. +// BatchTasks batch itself is a local variable, it`s thread safe. +// If batch.merge_tasks() and batch.notify_tasks() do something to TaskMeta +// you need to pay attention to that. +// Multi-Thread deal with different TaskMeta(cause it`s created as local +// variable) +// But different TaskMeta may points to the same TaskT +// which is get from the SingleTon TaskExecutor`s _task_queue. + template int TaskExecutor::work(ThreadContext* context) { + if (MempoolWrapper::instance().thread_initialize() != 0) { + LOG(ERROR) << "Failed thread initialize mempool"; + return -1; + } + if (_thread_init_fn != NULL) { if (_thread_init_fn(context->user_thread_context) != 0) { LOG(ERROR) << "execute thread init thunk failed, BSF thread will exit"; @@ -207,10 +237,15 @@ int TaskExecutor::work(ThreadContext* context) { } } + if (MempoolWrapper::instance().thread_clear() != 0) { + LOG(ERROR) << "Failed thread clear mempool"; + return -1; + } + BatchTasks batch(_batch_size, _batch_align); - if (fetch_batch(batch)) { + if (move_task_to_batch(batch)) { batch.merge_tasks(); - _fn(batch.in(), batch.out()); + _fn(&batch.in(), &batch.out()); batch.notify_tasks(); } } @@ -219,9 +254,10 @@ int TaskExecutor::work(ThreadContext* context) { } template -bool TaskManager::schedule(const InArrayT& in, - OutArrayT& out) { // NOLINT - TaskHandler handler = _executor.schedule(in, out); +bool TaskManager::schedule(const void* in, + void* out) { // NOLINT + TaskHandler handler = + TaskExecutorVector::instance()[_model_index].schedule(in, out); if (handler.valid()) { _task_owned = handler; diff --git a/core/predictor/framework/bsf.h b/core/predictor/framework/bsf.h index 36a00c381130c191de713e5024c7247d64cb96e7..7a8629e75b87aec889a1cce98b6392dddad32ce0 100644 --- a/core/predictor/framework/bsf.h +++ b/core/predictor/framework/bsf.h @@ -16,7 +16,7 @@ #include #include -#include +#include #include #ifdef BCLOUD @@ -29,46 +29,186 @@ #include "boost/function.hpp" +#include "core/predictor/framework/memory.h" +#include "paddle_inference_api.h" + namespace im { namespace bsf { static const size_t DEFAULT_BATCH_SIZE = 100; +// InItemT is paddle::PaddleTensor +// InVectorT std::vector +// InVectorT means different feedvar, but not batch. +// Batch is already inside the paddle::PaddleTensor. + +// size_t `rem` records how many batch have not been put in BatchTasks. +// `rem` don`t need to be atomic, cause the operation `put` is synchronous. +// actually, the reason is that lock have been added outside the operation +// `put`. + +// size_t `index` records how many batch have been processing completed. +// `index` need to be atomic, cause the operation 'notify' is asynchronous. 
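The comment block above packs a lot in: each request is a vector with one `paddle::PaddleTensor` per feedvar, and the batch dimension lives inside each tensor's `shape[0]` rather than in the vector itself. As a minimal sketch (not part of the patch; `make_example_feed` is an invented helper, only `paddle::PaddleTensor`/`PaddleBuf` are real Paddle Inference types), an `InVectorT` for two feedvars with batch 4 might be built like this:

```cpp
#include <cstdint>
#include <vector>

#include "paddle_inference_api.h"

// One entry per feedvar; the batch dimension lives inside each tensor's shape[0].
std::vector<paddle::PaddleTensor> make_example_feed() {
  paddle::PaddleTensor x;  // feedvar 0
  x.name = "x";
  x.dtype = paddle::PaddleDType::FLOAT32;
  x.shape = {4, 13};  // shape[0] == batch == 4
  x.data.Resize(4 * 13 * sizeof(float));

  paddle::PaddleTensor ids;  // feedvar 1
  ids.name = "ids";
  ids.dtype = paddle::PaddleDType::INT64;
  ids.shape = {4, 1};  // every feedvar carries the same batch
  ids.data.Resize(4 * 1 * sizeof(int64_t));

  return {x, ids};  // this is what Task::inVectorT_ptr points at
}
```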
template struct Task { - typedef std::vector InArrayT; - typedef std::vector OutArrayT; + typedef std::vector InVectorT; + typedef std::vector OutVectorT; typedef InItemT InType; typedef OutItemT OutType; typedef Task TaskT; + typedef std::vector ShapeVector; + typedef std::vector VectorOfShapeVector; int read_fd; int write_fd; - pid_t owner_tid; - - const InArrayT* in; - OutArrayT* out; - + const InVectorT* inVectorT_ptr; + OutVectorT* outVectorT_ptr; size_t rem; - size_t size; - - size_t batch_size() { return in->size(); } - butil::atomic index; Task() { read_fd = -1; write_fd = -1; owner_tid = -1; - in = NULL; - out = NULL; + inVectorT_ptr = NULL; + outVectorT_ptr = NULL; rem = -1; - size = -1; index.store(0, butil::memory_order_relaxed); } + + bool check_feedvar_valid(int feedvar_index) { + if (feedvar_index < 0 || inVectorT_ptr->size() <= feedvar_index) { + LOG(ERROR) << "feedvar doesnt exsit or feedvar_index error"; + return 0; + } + + if ((*inVectorT_ptr)[feedvar_index].shape.size() <= 0) { + LOG(ERROR) << "feedvar[" << feedvar_index << "].shape.size()<=0,error"; + return 0; + } + + return 1; + } + + // Now, it simply assume that the first dimension of data is batch. + // so the batch is PaddleTensor.shape[0] + + // If batch information is added into feedvar.prototxt. + // we can get the information from the feedvar.prototxt instead of assume. + size_t feedvar_batch_size(int feedvar_index) { + if (!check_feedvar_valid(feedvar_index)) { + return 0; + } + + return (*inVectorT_ptr)[feedvar_index].shape[0]; + } + + size_t feedvar_element_bytesize(int feedvar_index) { + if (!check_feedvar_valid(feedvar_index)) { + return 0; + } + int dtype = (*inVectorT_ptr)[feedvar_index].dtype; + if (dtype == paddle::PaddleDType::INT64) { + return sizeof(int64_t); + } + if (dtype == paddle::PaddleDType::FLOAT32) { + return sizeof(float); + } + if (dtype == paddle::PaddleDType::INT32) { + return sizeof(int32_t); + } + if (dtype == paddle::PaddleDType::UINT8) { + return sizeof(char); + } + return 0; + } + + // Now, the implementation of this function is based on assumption + // that shape [0] = batch_size. + size_t feedvar_element_num(int feedvar_index) { + if (!check_feedvar_valid(feedvar_index)) { + return 0; + } + size_t element_num = 1; + if ((*inVectorT_ptr)[feedvar_index].shape.size() == 1) { + // cause shape[0] is batch_size. + // [10,1] = [10], so if shape[1] doesn`t exist. + // should return 1. + return 1; + } + // start from shape[1], cause shape[0] = batch_size. + for (int i = 1; i < (*inVectorT_ptr)[feedvar_index].shape.size(); ++i) { + element_num *= (*inVectorT_ptr)[feedvar_index].shape[i]; + } + return element_num; + } + + size_t feedvar_bytesize(int feedvar_index) { + return feedvar_element_num(feedvar_index) * + feedvar_element_bytesize(feedvar_index); + } + + ShapeVector feedvar_shape_nobatch(int feedvar_index) { + if (!check_feedvar_valid(feedvar_index)) { + return ShapeVector(); + } + return ShapeVector{(*inVectorT_ptr)[feedvar_index].shape.begin() + 1, + (*inVectorT_ptr)[feedvar_index].shape.end()}; + } + + VectorOfShapeVector feedvar_shape_nobatch() { + VectorOfShapeVector vector_of_feedvar_shape_nobatch(inVectorT_ptr->size()); + for (int index = 0; index < inVectorT_ptr->size(); ++index) { + vector_of_feedvar_shape_nobatch.push_back(feedvar_shape_nobatch(index)); + } + return vector_of_feedvar_shape_nobatch; + } + + // At present, it is considered that the batch of all feedvar is consistent. + // so for each feedvar, PaddleTensor.shape[0] should be the same. 
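To make the helper arithmetic above concrete, here is a small worked example (illustration only; the shape and dtype are made up): for a FLOAT32 feedvar of shape {8, 3, 224, 224}, shape[0] is the batch and the remaining dimensions give the per-sample element count and byte size.

```cpp
#include <cstddef>

// Assume a FLOAT32 feedvar with shape {8, 3, 224, 224}.
constexpr std::size_t kBatch = 8;                         // shape[0] -> feedvar_batch_size()
constexpr std::size_t kElemNumPerSample = 3 * 224 * 224;  // shape[1..] -> feedvar_element_num()
constexpr std::size_t kBytesPerSample =
    kElemNumPerSample * sizeof(float);                    // feedvar_bytesize()

static_assert(kElemNumPerSample == 150528, "elements per sample");
static_assert(kBytesPerSample == 602112, "bytes per sample");
```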
+ bool check_batch_align() { + int batch_size_align = feedvar_batch_size(0); + for (int feedvar_index = 0; feedvar_index < inVectorT_ptr->size(); + ++feedvar_index) { + if (feedvar_batch_size(feedvar_index) != batch_size_align) { + return 0; + } + } + /* + for(int fetchvar_index = 0; fetchvar_index < outVectorT_ptr->size(); + ++fetchvar_index) { + if(fetchvar_batch_size(fetchvar_index) != batch_size_align) { + return 0; + } + } + */ + return 1; + } + + size_t batch_size() { + if (check_batch_align()) { + return feedvar_batch_size(0); + } + return 0; + } }; +// `Several Task` or `part of batch in Task` can be a TaskMeta. +// Task is the original Request from User. +// For example, the batch of Task is 30. There are 4 Requests. +// The batch of BatchTasks is 100, which means we can deal 100 batch 1 time. +// TaskMeta-1:{task-1,0,30} TaskMeta-2:{task-2,0,30} TaskMeta-3:{task-3,0,30} +// but the last Task will be divided to 2 TaskMeta. +// TaskMeta-4:{task-4,0,10} TaskMeta-5:{task-4,10,30}. +// TaskMeta-1 ~ TaskMeta-4 will be inside BatchTasks-1. +// TaskMeta-5 will be inside BatchTasks-2. + +// TaskMeta is necessary. +// cause we need know the the corresponding relationship between +// `batch_out`(which is in BatchTasks) and `outVectorT_ptr`(which is in Task). +// especially when 1 Task be divided into several TaskMeta and be put into +// several different BatchTasks. template struct TaskMeta { TaskMeta(TaskT* ptr, size_t start, size_t add) @@ -79,6 +219,11 @@ struct TaskMeta { size_t end; }; +// each TaskT is already include batch in itself +// BatchTasks need to combine several `small TaskMeta` into a new `big TaskT`. +// The only difference between the `big TaskT` and `small TaskT` is that +// the TaskT.inVectorT_ptr->[feedvar_index].shape[0] +// which is actually batch_size is different. template class BatchTasks { public: @@ -91,33 +236,38 @@ class BatchTasks { _rem_size(batch_size), _batch_align(batch_align) { _batch_in.clear(); + _batch_in_offset.clear(); _batch_out.clear(); - _tasks.clear(); + _batch_out_offset.clear(); + _taskmeta_vector.clear(); } ~BatchTasks() { _batch_in.clear(); + _batch_in_offset.clear(); _batch_out.clear(); - _tasks.clear(); + _batch_out_offset.clear(); + _taskmeta_vector.clear(); } // synchronized operation + // because Upper level callers of this function have already locked. size_t append_task(TaskT* task) { size_t add = std::min(task->rem, _rem_size); if (!_batch_align) { add = task->rem; } - - TaskMetaT tm(task, task->in->size() - task->rem, add); - _tasks.push_back(tm); + int start_index = task->batch_size() - task->rem; + TaskMetaT tm(task, start_index, add); + _taskmeta_vector.push_back(tm); task->rem -= add; _rem_size -= add; return _rem_size; } - static bool check_valid(const typename TaskT::InArrayT& in, - const typename TaskT::OutArrayT& out, + static bool check_valid(const typename TaskT::InVectorT& in, + const typename TaskT::OutVectorT& out, bool align) { (void)in; (void)out; @@ -125,40 +275,220 @@ class BatchTasks { return true; } + // this should be modified totally. + // maybe we don`t need to do this inside the BatchTasks. + // we can do the copy work outside the BatchTasks. + // cause maybe next time we don`t need to do the extra copy. + // directly copy the every Task into the Predictor. + + // lod is not supported. + // if lod is set, we should not allow to use the bsf task. + + // batch.merge_tasks() is thread-safe function + // cause batch is a local variable and Task is just read, not written. 
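The splitting rule in append_task() is easiest to see with numbers. The standalone sketch below (not part of the patch) replays the example from the TaskMeta comment above: four tasks of batch 30 fed into a BatchTasks with capacity 100 under `_batch_align == true`, so the fourth task is split and only its first 10 samples land in this BatchTasks.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// Toy restatement of the append_task() bookkeeping.
int main() {
  std::size_t rem_size = 100;                        // _rem_size of BatchTasks-1
  const std::size_t task_batch[4] = {30, 30, 30, 30};
  for (std::size_t ti = 0; ti < 4 && rem_size > 0; ++ti) {
    std::size_t task_rem = task_batch[ti];           // task->rem of a fresh task
    std::size_t add = std::min(task_rem, rem_size);  // how much of it still fits
    std::size_t begin = task_batch[ti] - task_rem;   // == 0 for a fresh task
    std::printf("TaskMeta{task-%zu, %zu, %zu}\n", ti + 1, begin, begin + add);
    rem_size -= add;                                 // 100 -> 70 -> 40 -> 10 -> 0
  }
  return 0;  // prints ... TaskMeta{task-4, 0, 10}; {task-4, 10, 30} goes to BatchTasks-2
}
```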
void merge_tasks() { - for (size_t ti = 0; ti < _tasks.size(); ++ti) { - TaskMetaT& tm = _tasks[ti]; - for (size_t vi = tm.begin; vi < tm.end; ++vi) { - _batch_in.push_back((*tm.task->in)[vi]); - _batch_out.push_back((*tm.task->out)[vi]); + if (_taskmeta_vector.size() <= 0) { + return; + } + + // Temporarily, the batch of each feedvar is consistent + // If not consistent, use feedvar_batch_size instead of task->batch_size(). + int temp_batch = 0; + for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) { + TaskMetaT& tm = _taskmeta_vector[ti]; + temp_batch += tm.task->batch_size(); + } + if (temp_batch > _batch_size) { + LOG(ERROR) << "_realNumber_batch_in >_batch_size, error."; + return; + } + + int feedvar_num = _taskmeta_vector[0].task->inVectorT_ptr->size(); + if (_batch_in_offset.size() == 0) { + _batch_in_offset.resize(feedvar_num, 0); + _realNumber_batch_in.resize(feedvar_num, temp_batch); + } + + for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) { + TaskMetaT& tm = _taskmeta_vector[ti]; + + for (int index = 0; index < feedvar_num; ++index) { + const paddle::PaddleTensor& feedVarTensor = + (*tm.task->inVectorT_ptr)[index]; + size_t feedvar_bytesize = tm.task->feedvar_bytesize(index); + + if (ti == 0) { + if (feedVarTensor.lod.size() > 0 && feedVarTensor.lod[0].size() > 0) { + LOG(ERROR) << "lod Tensor is not supported now."; + return; + } + // for now, we assume that every task feedvar_bytesize is the same. + // which means we dont support auto embedding. + // but for different feedvar, it is different. + paddle::PaddleTensor paddleTensor; + paddleTensor.dtype = feedVarTensor.dtype; + paddleTensor.name = feedVarTensor.name; + paddleTensor.lod = feedVarTensor.lod; + paddleTensor.shape = feedVarTensor.shape; + paddleTensor.shape[0] = _realNumber_batch_in[index]; + paddleTensor.data.Resize(feedvar_bytesize * + _realNumber_batch_in[index]); + _batch_in.push_back(paddleTensor); + } + + void* dst_ptr = _batch_in[index].data.data() + _batch_in_offset[index]; + void* source_ptr = + feedVarTensor.data.data() + feedvar_bytesize * tm.begin; + size_t length = feedvar_bytesize * (tm.end - tm.begin); + memcpy(dst_ptr, source_ptr, length); + _batch_in_offset[index] += length; } } } + bool check_fetchvar_valid(int fetchvar_index) { + if (fetchvar_index < 0 || _batch_out.size() <= fetchvar_index) { + LOG(ERROR) << "fetchvar doesnt exsit or fetchvar_index error"; + return 0; + } + + if (_batch_out[fetchvar_index].shape.size() <= 0) { + LOG(ERROR) << "fetchvar[" << fetchvar_index << "].shape.size()<=0,error"; + return 0; + } + + return 1; + } + + size_t fetchvar_batch_size(int fetchvar_index) { + if (!check_fetchvar_valid(fetchvar_index)) { + return 0; + } + + return _batch_out[fetchvar_index].shape[0]; + } + + size_t fetchvar_element_bytesize(int fetchvar_index) { + if (!check_fetchvar_valid(fetchvar_index)) { + return 0; + } + int dtype = _batch_out[fetchvar_index].dtype; + if (dtype == paddle::PaddleDType::INT64) { + return sizeof(int64_t); + } + if (dtype == paddle::PaddleDType::FLOAT32) { + return sizeof(float); + } + if (dtype == paddle::PaddleDType::INT32) { + return sizeof(int32_t); + } + if (dtype == paddle::PaddleDType::UINT8) { + return sizeof(char); + } + return 0; + } + + // Now, the implementation of this function is based on assumption + // that shape [0] = batch_size. 
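merge_tasks() above is essentially a per-feedvar gather: every TaskMeta contributes rows [begin, end) of its task's buffer, copied into the merged tensor at a running byte offset. The simplified free function below (an illustration, not the actual member; `bytes_per_sample` stands in for `feedvar_bytesize(index)`) shows the same copy pattern for a single feedvar:

```cpp
#include <cstddef>
#include <cstring>
#include <utility>
#include <vector>

// Gather one feedvar from several TaskMeta slices into a contiguous buffer.
void gather_one_feedvar(const std::vector<const char*>& task_buffers,
                        const std::vector<std::pair<std::size_t, std::size_t>>& ranges,
                        std::size_t bytes_per_sample,
                        char* merged) {
  std::size_t offset = 0;  // plays the role of _batch_in_offset[index]
  for (std::size_t ti = 0; ti < task_buffers.size(); ++ti) {
    const char* src = task_buffers[ti] + ranges[ti].first * bytes_per_sample;
    const std::size_t len = (ranges[ti].second - ranges[ti].first) * bytes_per_sample;
    std::memcpy(merged + offset, src, len);
    offset += len;
  }
}
```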
+ size_t fetchvar_element_num(int fetchvar_index) { + if (!check_fetchvar_valid(fetchvar_index)) { + return 0; + } + size_t element_num = 1; + if (_batch_out[fetchvar_index].shape.size() == 1) { + // cause shape[0] is batch_size. + return 1; + } + // start from shape[1], cause shape[0] = batch_size. + for (int i = 1; i < _batch_out[fetchvar_index].shape.size(); ++i) { + element_num *= _batch_out[fetchvar_index].shape[i]; + } + return element_num; + } + + size_t fetchvar_bytesize(int fetchvar_index) { + return fetchvar_element_num(fetchvar_index) * + fetchvar_element_bytesize(fetchvar_index); + } + + bool check_fetchvar_batch_align() { + int batch_size_align = fetchvar_batch_size(0); + + for (int fetchvar_index = 0; fetchvar_index < _batch_out.size(); + ++fetchvar_index) { + if (fetchvar_batch_size(fetchvar_index) != batch_size_align) { + return 0; + } + } + + return 1; + } + + size_t fetchvar_batch_size() { + if (check_fetchvar_batch_align()) { + return fetchvar_batch_size(0); + } + return 0; + } + void notify_tasks() { - if (_batch_out.size() != _batch_in.size()) { - LOG(ERROR) << "batch size not consistency: " << _batch_out.size() - << " != " << _batch_in.size(); + if (_taskmeta_vector.size() <= 0) { + LOG(ERROR) << "_taskmeta_vector.size() <=0, error."; + return; + } + if (_realNumber_batch_in[0] != fetchvar_batch_size()) { + LOG(ERROR) << "_batch_out`s batch != _batch_in`s batch, error."; return; } - for (size_t ti = 0, bi = 0; ti < _tasks.size(); ++ti) { - TaskT* task = _tasks[ti].task; - size_t begin = _tasks[ti].begin; - size_t end = _tasks[ti].end; + int fetchvar_num = _batch_out.size(); + if (_batch_out_offset.size() == 0) { + _batch_out_offset.resize(fetchvar_num, 0); + } + + for (size_t ti = 0; ti < _taskmeta_vector.size(); ++ti) { + TaskT* task = _taskmeta_vector[ti].task; + size_t begin = _taskmeta_vector[ti].begin; + size_t end = _taskmeta_vector[ti].end; size_t add = end - begin; - for (size_t oi = begin; oi < end; ++oi, ++bi) { - if (bi >= _batch_in.size()) { - LOG(ERROR) << "batch index overflow: " << bi << " > " - << _batch_in.size(); + for (int index = 0; index < fetchvar_num; ++index) { + // the task->outVectorT_ptr is null before core->run(). + // first time we should copy from _batch_out + // so we need init. 
+ size_t fetchvar_bytesize_index = fetchvar_bytesize(index); + if (task->outVectorT_ptr->size() <= index) { + paddle::PaddleTensor tensor_out; + tensor_out.name = _batch_out[index].name; + tensor_out.dtype = paddle::PaddleDType(_batch_out[index].dtype); + tensor_out.shape = _batch_out[index].shape; + tensor_out.shape[0] = task->batch_size(); + tensor_out.lod = _batch_out[index].lod; + // resize all batch memory at one time + size_t databuf_size = task->batch_size() * fetchvar_bytesize_index; + tensor_out.data.Resize(databuf_size); + task->outVectorT_ptr->push_back(tensor_out); + } + + paddle::PaddleTensor& fetchVarTensor = (*task->outVectorT_ptr)[index]; + + void* dst_ptr = + fetchVarTensor.data.data() + fetchvar_bytesize_index * begin; + size_t length = fetchvar_bytesize_index * add; + if (_batch_out_offset[index] + length > + fetchvar_batch_size() * fetchvar_bytesize(index)) { + LOG(ERROR) << "_batch_out is less than taskmeta, error."; return; } - (*task->out)[oi] = _batch_out[bi]; + void* source_ptr = + _batch_out[index].data.data() + _batch_out_offset[index]; + + memcpy(dst_ptr, source_ptr, length); + _batch_out_offset[index] += length; } size_t index = task->index.fetch_add(add); - if ((index + add) >= task->in->size()) { + if ((index + add) >= task->batch_size()) { char c = 0; while (write(task->write_fd, &c, 1) != 1 && errno == EINTR) { } @@ -167,22 +497,33 @@ class BatchTasks { } } - const typename TaskT::InArrayT& in() const { return _batch_in; } + const typename TaskT::InVectorT& in() const { return _batch_in; } - typename TaskT::OutArrayT& out() { return _batch_out; } + typename TaskT::OutVectorT& out() { return _batch_out; } - size_t task_size() { return _tasks.size(); } + size_t task_size() { return _taskmeta_vector.size(); } private: - std::vector _tasks; - typename TaskT::InArrayT _batch_in; - typename TaskT::OutArrayT _batch_out; + std::vector _taskmeta_vector; + typename TaskT::InVectorT _batch_in; + std::vector _batch_in_offset; + std::vector _realNumber_batch_in; + typename TaskT::OutVectorT _batch_out; + std::vector _batch_out_offset; + std::vector _realNumber_batch_out; size_t _rem_size; size_t _batch_size; bool _batch_align; }; // BSF task handle +// TaskHandler is the handle of Task. +// `read_fd` is used for receive signal in brpc Thread. +// 'write_fd' is used for write signal in bsf Thread. +// when TaskMeta is done, bsf Thread will write to 'write_fd'. +// brpc Thread is keeping reading 'read_fd' in a while loop. +// brpc Thread will receive signal when TaskMeta is done. +// so `read_fd` and 'write_fd' is used for communicate in different Thread. template struct TaskHandler { int read_fd; @@ -205,12 +546,11 @@ struct TaskHandler { } }; +// TaskExecutor is a Thread pool. template class TaskExecutor; -template -class TaskManager; - +// ThreadContext is used for start a bsf Thread. template struct ThreadContext { TaskExecutor* executor; @@ -231,14 +571,24 @@ struct ThreadContext { } }; +// TaskExecutor is a Thread pool. +// Each Model corresponding to a Model. +// TaskT is actually a Request preprocessed by ReaderOp. +// TaskT will be divided as TaskMeta which will be +// put into _task_queue in brpc-Thread by schedule(). +// TaskHander will be returned to brpc-Thread. +// start() function will create `thread_num` bsf Threads. +// every bsf Thread check the _task_queue and take TaskMeta from it. +// when a Task`s all TaskMeta is done, TaskHander will be noticed. 
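The read_fd/write_fd handshake referenced in the TaskHandler and TaskExecutor comments above is plain POSIX pipe signalling: the bsf worker writes one byte when the last TaskMeta of a Task completes, and the brpc thread blocks on read() until that byte arrives. A minimal standalone sketch of that pattern (not the actual classes) is:

```cpp
#include <unistd.h>

#include <cerrno>
#include <cstdio>

int main() {
  int fds[2];
  if (pipe(fds) != 0) return 1;  // fds[0] = read_fd, fds[1] = write_fd

  // bsf worker side: signal completion, retrying on EINTR as notify_tasks() does.
  char c = 0;
  while (write(fds[1], &c, 1) != 1 && errno == EINTR) {
  }

  // brpc side: TaskManager::wait() style blocking read on read_fd.
  char got = 0;
  while (read(fds[0], &got, 1) != 1 && errno == EINTR) {
  }
  std::printf("task finished\n");

  close(fds[0]);
  close(fds[1]);
  return 0;
}
```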
template class TaskExecutor { public: typedef typename TaskT::InType InType; typedef typename TaskT::OutType OutType; - typedef typename TaskT::InArrayT InArrayT; - typedef typename TaskT::OutArrayT OutArrayT; + typedef typename TaskT::InVectorT InVectorT; + typedef typename TaskT::OutVectorT OutVectorT; typedef std::vector TaskArrayT; + typedef baidu::paddle_serving::predictor::MempoolWrapper MempoolWrapper; TaskExecutor() : _stop(false), @@ -258,9 +608,11 @@ class TaskExecutor { THREAD_COND_DESTROY(&_cond); } - static TaskExecutor* instance() { - static TaskExecutor singleton; - return &singleton; + // cause vector.resize will use copy or move construct. + TaskExecutor(TaskExecutor&& other) noexcept { + if (this != &other) { + TaskExecutor(); + } } void set_batch_size(size_t batch_size) { _batch_size = batch_size; } @@ -277,8 +629,7 @@ class TaskExecutor { _thread_reset_fn = reset_fn; } - void set_thread_callback_fn( - boost::function cb) { + void set_thread_callback_fn(boost::function cb) { _fn = cb; } @@ -287,15 +638,21 @@ class TaskExecutor { static void* thread_entry(void* args); - private: - TaskExecutor(TaskExecutor const& other); - TaskExecutor* operator=(TaskExecutor const& other); - int work(ThreadContext* context); - TaskHandler schedule(const InArrayT&, OutArrayT&); + TaskHandler schedule(const void*, void*); - bool fetch_batch(BatchTasks& batch); // NOLINT + bool move_task_to_batch(BatchTasks& batch); // NOLINT + + private: + TaskExecutor(TaskExecutor const& other) = delete; + + TaskExecutor& operator=(TaskExecutor const& other) = delete; + /* + TaskExecutor(TaskExecutor && other) = delete; + + TaskExecutor& operator=(TaskExecutor && other) = delete; + */ bool _stop; @@ -303,43 +660,76 @@ class TaskExecutor { THREAD_MUTEX_T _mut; THREAD_COND_T _cond; - std::deque _task_queue; + std::list _task_queue; boost::function _thread_init_fn; boost::function _thread_reset_fn; void** _user_thread_contexts; std::vector*> _thread_contexts; - friend class TaskManager; size_t _batch_size; bool _batch_align; - boost::function _fn; + boost::function _fn; }; +// TaskExecutorVector is a SingleTon class. +// Each Model corresponding to a TaskExecutor. +// So we need several TaskExecutor when there are more than 1 Model. +template +class TaskExecutorVector { + public: + static TaskExecutorVector& instance() { + static TaskExecutorVector singleton; + return singleton; + } + + void resize(int size) { _vector_executor.resize(size); } + + TaskExecutor& operator[](int index) { + if (_vector_executor.size() <= index || index <= -1) { + LOG(ERROR) << "_vector_executor.size() <= index or <= -1"; + throw "_vector_executor.size() <= index or <= -1"; + } + return _vector_executor[index]; + } + + private: + TaskExecutorVector() = default; + TaskExecutorVector(const TaskExecutorVector& other) = delete; + TaskExecutorVector& operator=(const TaskExecutorVector& other) = + delete; + TaskExecutorVector(TaskExecutorVector&& other) = delete; + TaskExecutorVector& operator=(TaskExecutorVector&& other) = delete; + std::vector> _vector_executor; +}; + +// TaskManager is actually a wrapper of Request in bsf. +// TaskManager`s schedule() change Request to be TaskT. +// and divided TaskT into several TaskMeta to put into the TaskExecutor`s +// task_queue. +// wait() is a while loop to receive signal when a whole Task is done. 
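A usage sketch of the schedule()/wait() flow described above, mirroring what ReloadableInferEngine::infer() does further down in this patch. The template arguments are assumed to be `paddle::PaddleTensor` for both feed and fetch (as the general-purpose engine uses), and `run_one_request` is an invented wrapper for illustration:

```cpp
#include <cstdint>

#include "core/predictor/framework/bsf.h"
#include "paddle_inference_api.h"

int run_one_request(uint32_t model_index, const void* in, void* out) {
  im::bsf::TaskManager<paddle::PaddleTensor, paddle::PaddleTensor> task_manager(model_index);
  if (!task_manager.schedule(in, out)) {  // enqueue into TaskExecutorVector[model_index]
    return -1;
  }
  task_manager.wait();  // blocks on read_fd until every TaskMeta of this task is done
  return 0;
}
```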
template class TaskManager { public: typedef Task TaskT; - typedef typename TaskT::InArrayT InArrayT; - typedef typename TaskT::OutArrayT OutArrayT; - - explicit TaskManager(TaskExecutor& exe, size_t batch_size) // NOLINT - : _executor(exe) {} + typedef typename TaskT::InVectorT InVectorT; + typedef typename TaskT::OutVectorT OutVectorT; - TaskManager() : _executor(*TaskExecutor::instance()) {} + explicit TaskManager(uint32_t index) // NOLINT + : _model_index(index) {} ~TaskManager() { wait(); } - bool schedule(const InArrayT& in, OutArrayT& out); // NOLINT + bool schedule(const void* in, void* out); // NOLINT void wait(); inline void clear() { wait(); } private: - TaskExecutor& _executor; TaskHandler _task_owned; + uint32_t _model_index; }; // class TaskManager class AutoMutex { @@ -357,5 +747,5 @@ class AutoMutex { } // namespace bsf } // namespace im -#include "core/predictor/framework/bsf-inl-tensor.h" +// #include "core/predictor/framework/bsf-inl-tensor.h" #include "core/predictor/framework/bsf-inl.h" diff --git a/core/predictor/framework/infer.cpp b/core/predictor/framework/infer.cpp index e11861426fe3c1c1cea39811d66bb4feffdd8b9e..5149a4852570298d16183709f6c2d457e1cc524f 100644 --- a/core/predictor/framework/infer.cpp +++ b/core/predictor/framework/infer.cpp @@ -56,15 +56,23 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, } // init bsf framework - im::bsf::TaskExecutor::instance()->set_thread_init_fn( - boost::bind(&InferEngine::thrd_initialize_impl, this)); - im::bsf::TaskExecutor::instance()->set_thread_reset_fn( - boost::bind(&InferEngine::thrd_clear_impl, this)); - im::bsf::TaskExecutor::instance()->set_thread_callback_fn( - boost::bind(&InferEngine::task_infer_impl, this, _1, _2)); - im::bsf::TaskExecutor::instance()->set_batch_size(_infer_batch_size); - im::bsf::TaskExecutor::instance()->set_batch_align(_infer_batch_align); - if (im::bsf::TaskExecutor::instance()->start(_infer_thread_num) != 0) { + im::bsf::TaskExecutorVector::instance()[_model_index] + .set_thread_init_fn( + boost::bind(&InferEngine::thrd_initialize_impl, this)); + im::bsf::TaskExecutorVector::instance()[_model_index] + .set_thread_init_fn( + boost::bind(&InferEngine::thrd_initialize_impl, this)); + im::bsf::TaskExecutorVector::instance()[_model_index] + .set_thread_reset_fn(boost::bind(&InferEngine::thrd_clear_impl, this)); + im::bsf::TaskExecutorVector::instance()[_model_index] + .set_thread_callback_fn( + boost::bind(&InferEngine::task_infer_impl, this, _1, _2)); + im::bsf::TaskExecutorVector::instance()[_model_index].set_batch_size( + _infer_batch_size); + im::bsf::TaskExecutorVector::instance()[_model_index].set_batch_align( + _infer_batch_align); + if (im::bsf::TaskExecutorVector::instance()[_model_index].start( + _infer_thread_num) != 0) { LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num; return -1; } @@ -75,6 +83,11 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf, return 0; } +// Multiple threads will enter this method of the same object +// One Model corresponds to One ReloadableInferEngine object. +// ReloadableInferEngine object is Process object. +// One ReloadableInferEngine object can have several ModelData +// ModelData is Thread object. 
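The "ReloadableInferEngine is a process object / ModelData is a thread object" remark below comes down to pthread thread-local storage: each serving or bsf thread creates its own ModelData and stashes it under a pthread key, the same way thrd_initialize_impl() uses THREAD_SETSPECIFIC. A minimal standalone sketch of that pattern (FakeModelData and g_key are invented names) might look like:

```cpp
#include <pthread.h>

#include <cstdio>

struct FakeModelData { int predictor_id; };  // stand-in for ModelData<EngineCore>

static pthread_key_t g_key;

void* worker(void* arg) {
  FakeModelData* md = new FakeModelData{*static_cast<int*>(arg)};
  pthread_setspecific(g_key, md);  // like THREAD_SETSPECIFIC(_skey, md)
  auto* mine = static_cast<FakeModelData*>(pthread_getspecific(g_key));
  std::printf("thread owns predictor %d\n", mine->predictor_id);
  delete mine;
  return nullptr;
}

int main() {
  pthread_key_create(&g_key, nullptr);  // like THREAD_KEY_CREATE(&_skey, NULL)
  int ids[2] = {0, 1};
  pthread_t t[2];
  for (int i = 0; i < 2; ++i) pthread_create(&t[i], nullptr, worker, &ids[i]);
  for (int i = 0; i < 2; ++i) pthread_join(t[i], nullptr);
  pthread_key_delete(g_key);
  return 0;
}
```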
int ReloadableInferEngine::infer(const void* in, void* out, uint32_t batch_size) { @@ -82,9 +95,10 @@ int ReloadableInferEngine::infer(const void* in, return infer_impl(in, out, batch_size); } - im::bsf::TaskManager task_manager; - task_manager.schedule(*(reinterpret_cast(in)), - *(reinterpret_cast(out))); + im::bsf::TaskManager task_manager( + _model_index); + + task_manager.schedule(in, out); task_manager.wait(); return 0; } @@ -110,7 +124,7 @@ int ReloadableInferEngine::proc_finalize() { } if (_infer_thread_num > 0) { - im::bsf::TaskExecutor::instance()->stop(); + im::bsf::TaskExecutorVector::instance()[_model_index].stop(); } return 0; } @@ -191,6 +205,7 @@ int VersionedInferEngine::proc_initialize(const configure::EngineDesc& conf, std::string engine_type = conf.type(); InferEngine* engine = StaticInferFactory::instance().generate_object(engine_type); + engine->set_model_index(_model_index); if (!engine) { LOG(ERROR) << "Failed generate engine with type:" << engine_type; return -1; @@ -362,23 +377,30 @@ int VersionedInferEngine::infer_impl(const void* in, uint32_t batch_size) { return -1; } -int VersionedInferEngine::task_infer_impl(const BatchTensor& in, - BatchTensor& out) { // NOLINT +int VersionedInferEngine::task_infer_impl(const void* in, + void* out) { // NOLINT return -1; } -int InferManager::proc_initialize(const char* path, const char* file) { +int InferManager::proc_initialize(const char* path, + const char* file, + std::shared_ptr engine_index_ptr) { ModelToolkitConf model_toolkit_conf; if (configure::read_proto_conf(path, file, &model_toolkit_conf) != 0) { LOG(ERROR) << "failed load infer config, path: " << path << "/" << file; return -1; } - size_t engine_num = model_toolkit_conf.engines_size(); - for (size_t ei = 0; ei < engine_num; ++ei) { + uint32_t engine_num = model_toolkit_conf.engines_size(); + im::bsf::TaskExecutorVector::instance().resize(*engine_index_ptr + + engine_num); + for (uint32_t ei = 0; ei < engine_num; ++ei) { LOG(INFO) << "model_toolkit_conf.engines(" << ei << ").name: " << model_toolkit_conf.engines(ei).name(); std::string engine_name = model_toolkit_conf.engines(ei).name(); VersionedInferEngine* engine = new (std::nothrow) VersionedInferEngine(); + int temp_engine_index_ptr = *engine_index_ptr; + engine->set_model_index(temp_engine_index_ptr); + *engine_index_ptr = temp_engine_index_ptr + 1; if (!engine) { LOG(ERROR) << "Failed generate versioned engine: " << engine_name; return -1; diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h old mode 100755 new mode 100644 index 6113dc8eff60814af62ad145a334db666629f080..93be13c684874b8b5a6686f3aeddd2942037d84c --- a/core/predictor/framework/infer.h +++ b/core/predictor/framework/infer.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -25,6 +27,7 @@ #include "core/predictor/framework/bsf.h" #include "core/predictor/framework/factory.h" #include "core/predictor/framework/infer_data.h" +#include "core/predictor/framework/memory.h" #include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { @@ -71,7 +74,7 @@ class InferEngine { virtual int infer(const void* in, void* out, uint32_t batch_size = -1) { return infer_impl(in, out, batch_size); } - + virtual void set_model_index(uint32_t index) { _model_index = index; } virtual int reload() = 0; virtual uint64_t version() const = 0; @@ -86,12 +89,13 @@ class InferEngine { virtual int infer_impl(const void* in, void* out, uint32_t batch_size = -1) = 0; - virtual 
int task_infer_impl(const BatchTensor& in, - BatchTensor& out) = 0; // NOLINT + virtual int task_infer_impl(const void* in, void* out) = 0; // NOLINT + protected: + uint32_t _model_index; // end: framework inner call }; - +typedef im::bsf::Task TaskT; class ReloadableInferEngine : public InferEngine { public: virtual ~ReloadableInferEngine() {} @@ -104,7 +108,6 @@ class ReloadableInferEngine : public InferEngine { }; virtual int load(const configure::EngineDesc& conf) = 0; - typedef im::bsf::Task TaskT; int proc_initialize_impl(const configure::EngineDesc& conf, bool version); @@ -179,6 +182,8 @@ struct ModelData { delete cores[1]; } + void* get() { return cores[current_idx]->get(); } + EngineCore* cores[2]; uint32_t current_idx; }; @@ -191,14 +196,20 @@ class DBReloadableInferEngine : public ReloadableInferEngine { int proc_initialize(const configure::EngineDesc& conf, bool version) { THREAD_KEY_CREATE(&_skey, NULL); THREAD_MUTEX_INIT(&_mutex, NULL); + gpu_index = 0; return ReloadableInferEngine::proc_initialize(conf, version); } + // 进程初始化会调用load,但由于未执行线程初始化,所以_reload_vec为空,不再继续执行。 + // 热加载的话会调用load,由于线程已经初始化,_reload_vec不为空,所以继续执行load_data操作加载数据。 + // 线程初始化会执行load_data操作加载数据,然后将engine加入_reload_vec中。 + // 每个模型只有一个CloneDBReloadableInferEngine对象。 + // 但一个CloneDBReloadableInferEngine对象,可以包含N个EngineCore。 virtual int load(const configure::EngineDesc& conf) { if (_reload_vec.empty()) { return 0; } - + gpu_index = 0; for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) { if (load_data(_reload_vec[ti], conf) != 0) { LOG(ERROR) << "Failed reload engine model: " << ti; @@ -210,7 +221,8 @@ class DBReloadableInferEngine : public ReloadableInferEngine { return 0; } - int load_data(ModelData* md, const configure::EngineDesc& conf) { + virtual int load_data(ModelData* md, + const configure::EngineDesc& conf) { uint32_t next_idx = (md->current_idx + 1) % 2; if (md->cores[next_idx]) { delete md->cores[next_idx]; @@ -219,28 +231,29 @@ class DBReloadableInferEngine : public ReloadableInferEngine { md->cores[next_idx] = new (std::nothrow) EngineCore; // params.dump(); - if (!md->cores[next_idx] || md->cores[next_idx]->create(conf) != 0) { + size_t gpu_ids_num = conf.gpu_ids_size(); + im::bsf::AutoMutex lock(_mutex); + int gpu_id = -1; + if (gpu_ids_num > 0) { + gpu_id = conf.gpu_ids(gpu_index % gpu_ids_num); + } + if (!md->cores[next_idx] || + md->cores[next_idx]->create(conf, gpu_id) != 0) { LOG(ERROR) << "Failed create model, path: " << conf.model_dir(); return -1; } + gpu_index++; md->current_idx = next_idx; return 0; } virtual int thrd_initialize_impl() { - // memory pool to be inited in non-serving-threads - if (MempoolWrapper::instance().thread_initialize() != 0) { - LOG(ERROR) << "Failed thread initialize mempool"; - return -1; - } - ModelData* md = new (std::nothrow) ModelData; if (!md || load_data(md, _conf) != 0) { LOG(ERROR) << "Failed create thread data from " << _conf.model_dir(); return -1; } - LOG(ERROR) << "THREAD_SETSPECIFIC _skey = md"; THREAD_SETSPECIFIC(_skey, md); im::bsf::AutoMutex lock(_mutex); _reload_vec.push_back(md); @@ -248,11 +261,33 @@ class DBReloadableInferEngine : public ReloadableInferEngine { } int thrd_clear_impl() { - // for non-serving-threads - if (MempoolWrapper::instance().thread_clear() != 0) { - LOG(ERROR) << "Failed thread clear mempool"; - return -1; - } + // actually, there are 2 kinds of multi-thread. + // 1. brpc thread 2. bsf Task thread + // each request is in 1-single brpc thread. 
+ // IF (bsf Task thread is not used) + // every single brpc thread corresponds to all the DBReloadableInferEngines. + // each request runs all models in 1-single brpc thread. + // every single brpc thread will create or clone N predictor. + // N = the number of Model. + // so if there are 2 models, and --thread 10. + // each brpc thread will create predictor of Model-1 and Model-2. + // there are totally 10 predictors of Model-1 and 10 predictors of Model-2 + // cause there are 10 brpc threads. + + // IF bsf Task thread is used。 + // there will be a ThreadPool called bsf TaskExecutor. + // TaskExecutorVector is the vector of TaskExecutor. + // the number of TaskExecutor equals to the number of Model. + // 1 TaskExecutor corresponding to 1 Model. + // 1 TaskExecutor have N bsf threads. + // 1 bsf thread corresponds to 1 predictor of + // the Model corresponding to the TaskExecutor. + // brpc thread only put the data into the task_queue(which is in + // TaskExecutor) + // EngineCore->infer() is running in bsf Task thread. + + // MempoolWrapper::instance() is actually a Thread-Local Mempool. + // so it belongs to a single Thread. return 0; } @@ -278,6 +313,7 @@ class DBReloadableInferEngine : public ReloadableInferEngine { THREAD_KEY_T _skey; THREAD_MUTEX_T _mutex; std::vector*> _reload_vec; + int gpu_index = 0; }; // 多个EngineCore共用同一份模型数据 @@ -287,88 +323,76 @@ class CloneDBReloadableInferEngine public: virtual ~CloneDBReloadableInferEngine() {} - virtual int proc_initialize(const configure::EngineDesc& conf, bool version) { - _pd = new (std::nothrow) ModelData; - if (!_pd) { - LOG(ERROR) << "Failed to allocate for ProcData"; - return -1; - } - return DBReloadableInferEngine::proc_initialize(conf, version); - } + // 进程初始化会调用load,但由于未执行线程初始化,所以_reload_vec为空,不再继续执行。 + // 热加载的话会调用load,由于线程已经初始化,_reload_vec不为空,所以继续执行load_data操作加载数据。 + // 线程初始化会执行load_data操作加载数据,然后将engine加入_reload_vec中。 + // 每个模型只有一个CloneDBReloadableInferEngine对象。 + // 但一个CloneDBReloadableInferEngine对象,可以包含N个EngineCore。 - virtual int load(const configure::EngineDesc& conf) { - // 加载进程级模型数据 - if (!_pd || - DBReloadableInferEngine::load_data(_pd, conf) != 0) { - LOG(ERROR) << "Failed to create common model from [" << conf.model_dir() - << "]."; - return -1; + virtual int load_data(ModelData* md, + const configure::EngineDesc& conf) { + uint32_t next_idx = (md->current_idx + 1) % 2; + if (md->cores[next_idx]) { + delete md->cores[next_idx]; } - LOG(WARNING) << "Succ load common model[" << _pd->cores[_pd->current_idx] - << "], path[" << conf.model_dir() << "]."; + md->cores[next_idx] = new (std::nothrow) EngineCore; - if (DBReloadableInferEngine::_reload_vec.empty()) { - return 0; + // params.dump(); + // gpu_ids_num > 0 is always true. + // if use CPU, gpu_ids = [-1]. + // if gpu_ids_num = 0, which means no gpuid is given. + // so we should set gpu_ids_num = 1, and gpu_id = -1. + // so that we can create at least 1 predictor. + size_t gpu_ids_num = conf.gpu_ids_size(); + im::bsf::AutoMutex lock(DBReloadableInferEngine::_mutex); + int gpu_id = -1; + if (gpu_ids_num > 0) { + gpu_id = conf.gpu_ids(DBReloadableInferEngine::gpu_index % + gpu_ids_num); + } else { + gpu_ids_num = 1; } - - for (uint32_t ti = 0; - ti < DBReloadableInferEngine::_reload_vec.size(); - ++ti) { - if (load_data(DBReloadableInferEngine::_reload_vec[ti], - _pd->cores[_pd->current_idx]) != 0) { - LOG(ERROR) << "Failed reload engine model: " << ti; + // gpu_index will be set to be 0, when load() or proc_initial() is called. 
+ // gpu_index < gpu_ids_num, means there are predictors still not create + // on some GPU card. + // so we need to create the predictor. + // gpu_index >= gpu_ids_num, means each GPU card has already create one. + // so we need to clone the predictor. + if (DBReloadableInferEngine::gpu_index < gpu_ids_num) { + if (!md->cores[next_idx] || + md->cores[next_idx]->create(conf, gpu_id) != 0) { + LOG(ERROR) << "Failed create model, path: " << conf.model_dir(); return -1; } + DBReloadableInferEngine::gpu_index++; + md->current_idx = next_idx; + if (_cloneTemplate.size() < + DBReloadableInferEngine::gpu_index) { + _cloneTemplate.push_back(md); + } else { + _cloneTemplate[DBReloadableInferEngine::gpu_index - 1] = md; + } + } else { + int template_index = DBReloadableInferEngine::gpu_index % + _cloneTemplate.size(); + if (!md->cores[next_idx] || + md->cores[next_idx]->clone(_cloneTemplate[template_index]->get()) != + 0) { + LOG(ERROR) << "Failed clone model from core"; + return -1; + } + DBReloadableInferEngine::gpu_index++; + md->current_idx = next_idx; + LOG(WARNING) << "core clone model succ, cur_idx[" << md->current_idx + << "]."; } - LOG(WARNING) << "Succ load clone model, path[" << conf.model_dir() << "]"; - return 0; - } - - // 加载线程级对象,多个线程级对象共用pd_core的模型数据 - int load_data(ModelData* td, EngineCore* pd_core) { - uint32_t next_idx = (td->current_idx + 1) % 2; - if (td->cores[next_idx]) { - delete td->cores[next_idx]; - } - - td->cores[next_idx] = new (std::nothrow) EngineCore; - if (!td->cores[next_idx] || - td->cores[next_idx]->clone(pd_core->get()) != 0) { - LOG(ERROR) << "Failed clone model from pd_core[ " << pd_core << "], idx[" - << next_idx << "]"; - return -1; - } - td->current_idx = next_idx; - LOG(WARNING) << "td_core[" << td->cores[td->current_idx] - << "] clone model from pd_core[" << pd_core - << "] succ, cur_idx[" << td->current_idx << "]."; - return 0; - } - - virtual int thrd_initialize_impl() { - // memory pool to be inited in non-serving-threads - if (MempoolWrapper::instance().thread_initialize() != 0) { - LOG(ERROR) << "Failed thread initialize mempool"; - return -1; - } - - ModelData* md = new (std::nothrow) ModelData; - if (!md || load_data(md, _pd->cores[_pd->current_idx]) != 0) { - LOG(ERROR) << "Failed clone thread data, origin_core[" - << _pd->cores[_pd->current_idx] << "]."; - return -1; - } - - THREAD_SETSPECIFIC(DBReloadableInferEngine::_skey, md); - im::bsf::AutoMutex lock(DBReloadableInferEngine::_mutex); - DBReloadableInferEngine::_reload_vec.push_back(md); return 0; } protected: - ModelData* - _pd; // 进程级EngineCore,多个线程级EngineCore共用该对象的模型数据 + // 模板EngineCore,如果已创建,则多个线程级EngineCore共用该对象的模型数据 + std::vector*> _cloneTemplate; }; template @@ -505,8 +529,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine { return 0; } - int task_infer_impl(const BatchTensor& in, BatchTensor& out) { // NOLINT - return infer_impl(&in, &out); + int task_infer_impl(const void* in, void* out) { // NOLINT + return infer_impl(in, out); } }; @@ -559,7 +583,7 @@ class VersionedInferEngine : public InferEngine { int infer_impl(const void* in, void* out, uint32_t batch_size = -1); - int task_infer_impl(const BatchTensor& in, BatchTensor& out); + int task_infer_impl(const void* in, void* out); private: boost::unordered_map _versions; @@ -572,7 +596,9 @@ class InferManager { return ins; } - int proc_initialize(const char* path, const char* file); + int proc_initialize(const char* path, + const char* file, + std::shared_ptr engine_index_ptr); int thrd_initialize(); diff --git 
a/core/predictor/framework/infer_data.h b/core/predictor/framework/infer_data.h old mode 100755 new mode 100644 diff --git a/core/predictor/framework/memory.h b/core/predictor/framework/memory.h old mode 100755 new mode 100644 diff --git a/core/predictor/framework/resource.cpp b/core/predictor/framework/resource.cpp index 37c8092a7c206ae91ac783b15f3aadce780f0132..1da9783888fa379b653eaa46311c10f3d6c6ec66 100644 --- a/core/predictor/framework/resource.cpp +++ b/core/predictor/framework/resource.cpp @@ -135,12 +135,14 @@ int Resource::initialize(const std::string& path, const std::string& file) { if (FLAGS_enable_model_toolkit) { size_t model_toolkit_num = resource_conf.model_toolkit_path_size(); + std::shared_ptr engine_index_ptr(new int(0)); for (size_t mi = 0; mi < model_toolkit_num; ++mi) { std::string model_toolkit_path = resource_conf.model_toolkit_path(mi); std::string model_toolkit_file = resource_conf.model_toolkit_file(mi); - if (InferManager::instance().proc_initialize( - model_toolkit_path.c_str(), model_toolkit_file.c_str()) != 0) { + if (InferManager::instance().proc_initialize(model_toolkit_path.c_str(), + model_toolkit_file.c_str(), + engine_index_ptr) != 0) { LOG(ERROR) << "failed proc initialize modeltoolkit, config: " << model_toolkit_path << "/" << model_toolkit_file; return -1; diff --git a/core/predictor/framework/resource.h b/core/predictor/framework/resource.h index e144120e5a67bc2a43433cb3857331e9d1a465cf..d8a114dab581b71182c1a510db16aa0d2e818b0a 100644 --- a/core/predictor/framework/resource.h +++ b/core/predictor/framework/resource.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "core/cube/cube-api/include/cube_api.h" #include "core/predictor/common/inner_common.h" diff --git a/core/predictor/framework/server.cpp b/core/predictor/framework/server.cpp index 25b4079509b5346277609648a44f8d361187708d..8ced6f1e9936059ada169633e21690d13bc48ae3 100644 --- a/core/predictor/framework/server.cpp +++ b/core/predictor/framework/server.cpp @@ -91,6 +91,7 @@ int ServerManager::start_and_wait() { } } + // rpc multi-thread start from here. if (_server.Start(FLAGS_port, &_options) != 0) { LOG(ERROR) << "Failed to start Paddle Inference Server"; return -1; diff --git a/core/predictor/framework/service.cpp b/core/predictor/framework/service.cpp old mode 100755 new mode 100644 diff --git a/core/predictor/mempool/mempool.cpp b/core/predictor/mempool/mempool.cpp index 88936687e47e9f9e3350dcf94b6eb38a93f9dc28..0deab022607a0af7c1984cc9863d40e82f9bc0ea 100644 --- a/core/predictor/mempool/mempool.cpp +++ b/core/predictor/mempool/mempool.cpp @@ -24,7 +24,7 @@ namespace fugue { namespace memory { void Region::init() { - _big_mem_capacity = 64 * 1024 * 1024; // 64MB + _big_mem_capacity = 128 * 1024 * 1024; // 128MB _big_mem_start = new char[_big_mem_capacity]; } diff --git a/core/predictor/mempool/mempool.h b/core/predictor/mempool/mempool.h index a10e8f97a59a148ace77c575cd2c70d2dac79603..a4143d4b52460f80f3e4471fadd12e39d2e19319 100644 --- a/core/predictor/mempool/mempool.h +++ b/core/predictor/mempool/mempool.h @@ -129,7 +129,7 @@ class FreeList { to get the class Pointer for example T is the member of class Node, T data, 'data' is the name. - T* value is the member(pointer type) class Node + T* value is the member(pointer type) of class Node so we can get the Node* by calling container_of(value, Node, data) */ Node* node = container_of(value, Node, data); @@ -261,7 +261,11 @@ struct BlockReference { // because BlockFreeList is a threal-safe Singleton. 
// so we don`t release Block, it is global memory. -// total number is 32*1024 +// total number is 256*1024. +// the MAX_BLOCK_COUNT of Region(one thread one Region) is 1024. +// so BlockFreeList allow 256 Region(means 256 thread). +// the memory used by BlockFreeListType is sizeof(void*)*256*1024. +// Block(2MB) memory is created only when get() is called. class BlockFreeList { public: static const int MAX_BLOCK_COUNT = 256 * 1024; @@ -341,9 +345,10 @@ class Region { 2 * 1024 * 1024; // 2MB,means when you need less than 2M, get memory from Block. - // 64MB,means when you need less than 64MB, get memory from BigMemory instead + // 128MB,means when you need less than 128MB, get memory from BigMemory + // instead // of BigNode - static const int BIGNODE_MEM_THRESHOLD = (64 * 1024 * 1024 + 1); + static const int BIGNODE_MEM_THRESHOLD = (128 * 1024 * 1024 + 1); static const int COUNTER_SIZE = BIGNODE_MEM_THRESHOLD / BIG_MEM_THRESHOLD + 1; // this is not used @@ -374,7 +379,8 @@ class Mempool { void* malloc(size_t size) { size = _align(size); // It does not enter the if statement the first time. - // Because the block has not been used up, it will enter. + // The if statement may enter after the block is created. + // If the block has not been used up, it will enter. if (size <= _free_size) { void* p = _free_cursor; _free_size -= size; @@ -392,7 +398,7 @@ class Mempool { return; } - // memory in Block,update the pointer. + // memory in _block,update the pointer. if (_free_cursor - size == static_cast(p)) { // for example, you need to release -(8+1)bytes // you can only release -8bytes,cause -(8+2)byte is used by other. @@ -424,9 +430,8 @@ class Mempool { } // 可能返回的是单独Region中malloc的内存。 - // 也可能是Block,例如new_size=1M, old_data原本的指针头就在1.2M处,old_size - // = - // 0.5M + // 也可能是Block,例如new_size=1M, old_data原本的指针头就在1.2M处 + // old_size = 0.5M // 此时,_free_size = 0.3M,new_size<2M,但是required = 1-0.5 >0.3 // 分配出来的就是Block,但是该Block没有并很完美的利用完全。 void* p = this->malloc_from_region(new_size); diff --git a/core/predictor/op/op.cpp b/core/predictor/op/op.cpp old mode 100755 new mode 100644 diff --git a/core/predictor/proto/framework.proto b/core/predictor/proto/framework.proto old mode 100755 new mode 100644 diff --git a/core/predictor/src/pdserving.cpp b/core/predictor/src/pdserving.cpp old mode 100755 new mode 100644 index e88d9b3b2aaa03ccbb7f903485bdffecfa6f7222..6fbf01c8b15532c120b5d6428a1d5199d5e476cd --- a/core/predictor/src/pdserving.cpp +++ b/core/predictor/src/pdserving.cpp @@ -68,13 +68,14 @@ static bvar::PassiveStatus s_predictor_revision( DEFINE_bool(V, false, "print version, bool"); DEFINE_bool(g, false, "user defined gflag path"); DECLARE_string(flagfile); - +/* namespace bthread { extern pthread_mutex_t g_task_control_mutex; } pthread_mutex_t g_worker_start_fn_mutex = PTHREAD_MUTEX_INITIALIZER; - +*/ void pthread_worker_start_fn() { + /* while (pthread_mutex_lock(&g_worker_start_fn_mutex) != 0) { } @@ -83,15 +84,18 @@ void pthread_worker_start_fn() { if (lock_status == EBUSY || lock_status == EAGAIN) { pthread_mutex_unlock(&bthread::g_task_control_mutex); } + */ Resource::instance().thread_initialize(); // Try to avoid deadlock in bthread + /* if (lock_status == EBUSY || lock_status == EAGAIN) { while (pthread_mutex_lock(&bthread::g_task_control_mutex) != 0) { } } pthread_mutex_unlock(&g_worker_start_fn_mutex); + */ } static void g_change_server_port() { @@ -126,7 +130,7 @@ int main(int argc, char** argv) { return 0; } - //google::ParseCommandLineFlags(&argc, &argv, true); + // 
google::ParseCommandLineFlags(&argc, &argv, true); g_change_server_port(); @@ -202,7 +206,7 @@ int main(int argc, char** argv) { } VLOG(2) << "Succ call pthread worker start function"; - //this is not used by any code segment,which can be cancelled. + // this is not used by any code segment,which can be cancelled. if (Resource::instance().general_model_initialize(FLAGS_resource_path, FLAGS_resource_file) != 0) { LOG(ERROR) << "Failed to initialize general model conf: " diff --git a/core/predictor/tools/CMakeLists.txt b/core/predictor/tools/CMakeLists.txt index 73e0d2a4b3a36681fbddd0b8789b394e89e792ff..c15ada04307c0fb546dc2cb7864d542fe8f2994f 100644 --- a/core/predictor/tools/CMakeLists.txt +++ b/core/predictor/tools/CMakeLists.txt @@ -2,3 +2,16 @@ set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIS LIST(APPEND seq_gen_src ${PROTO_SRCS}) add_executable(seq_generator ${seq_gen_src}) target_link_libraries(seq_generator protobuf -lpthread) + + +set(seq_reader_src ${CMAKE_CURRENT_LIST_DIR}/seq_reader.cpp ${CMAKE_CURRENT_LIST_DIR}/../../cube/cube-builder/src/seqfile_reader.cpp) +add_executable(seq_reader ${seq_reader_src}) +add_dependencies(seq_reader brpc) +install(TARGETS seq_reader + RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin + ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib + LIBRARY DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/so + ) + +install(TARGETS seq_reader RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool) +install(TARGETS seq_generator RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool) diff --git a/core/predictor/tools/ocrtools/clipper.cpp b/core/predictor/tools/ocrtools/clipper.cpp old mode 100755 new mode 100644 diff --git a/core/predictor/tools/ocrtools/clipper.h b/core/predictor/tools/ocrtools/clipper.h old mode 100755 new mode 100644 diff --git a/core/predictor/tools/ocrtools/postprocess_op.cpp b/core/predictor/tools/ocrtools/postprocess_op.cpp old mode 100755 new mode 100644 diff --git a/core/predictor/tools/ocrtools/postprocess_op.h b/core/predictor/tools/ocrtools/postprocess_op.h old mode 100755 new mode 100644 diff --git a/core/predictor/tools/ocrtools/preprocess_op.cpp b/core/predictor/tools/ocrtools/preprocess_op.cpp old mode 100755 new mode 100644 diff --git a/core/predictor/tools/ocrtools/preprocess_op.h b/core/predictor/tools/ocrtools/preprocess_op.h old mode 100755 new mode 100644 diff --git a/core/predictor/tools/ocrtools/utility.cpp b/core/predictor/tools/ocrtools/utility.cpp old mode 100755 new mode 100644 diff --git a/core/predictor/tools/ocrtools/utility.h b/core/predictor/tools/ocrtools/utility.h old mode 100755 new mode 100644 diff --git a/core/predictor/tools/seq_reader.cpp b/core/predictor/tools/seq_reader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3bd271c03b8f172ad0eec067d54a928de0404bb3 --- /dev/null +++ b/core/predictor/tools/seq_reader.cpp @@ -0,0 +1,91 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include "core/cube/cube-builder/include/cube-builder/seqfile_reader.h" +std::string string_to_hex(const std::string& input) { + static const char* const lut = "0123456789ABCDEF"; + size_t len = input.length(); + + std::string output; + output.reserve(2 * len); + for (size_t i = 0; i < len; ++i) { + const unsigned char c = input[i]; + output.push_back(lut[c >> 4]); + output.push_back(lut[c & 15]); + } + return output; +} + +void printSeq(std::string file, int limit) { + SequenceFileRecordReader reader(file.c_str()); + + if (reader.open() != 0) { + std::cerr << "open file failed! " << file; + return; + } + if (reader.read_header() != 0) { + std::cerr << "read header error! " << file; + reader.close(); + return; + } + + Record record(reader.get_header()); + int total_count = 0; + + while (reader.next(&record) == 0) { + uint64_t key = + *reinterpret_cast(const_cast(record.key.data())); + + total_count++; + int64_t value_length = record.record_len - record.key_len; + std::cout << "key: " << key << " , value: " << string_to_hex(record.value.c_str()) << std::endl; + if (total_count >= limit) { + break; + } + } + + if (reader.close() != 0) { + std::cerr << "close file failed! " << file; + return; + } +} + +int main(int argc, char **argv) { + if (argc != 3 && argc != 2) { + std::cout << "Seq Reader Usage:" << std::endl; + std::cout << "get all keys: ./seq_reader $FILENAME " << std::endl; + std::cout << "get some keys: ./seq_reader $FILENAME $KEY_NUM" << std::endl; + return -1; + } + if (argc == 3 || argc == 2) { + const char* filename_str = argv[1]; + std::cout << "cstr filename is " << filename_str << std::endl; + std::string filename = filename_str; + std::cout << "filename is " << filename << std::endl; + if (argc == 3) { + const char* key_num_str = argv[2]; + int key_limit = std::stoi(key_num_str); + printSeq(filename, key_limit); + } else { + printSeq(filename, INT_MAX); + } + } + return 0; +} diff --git a/core/sdk-cpp/proto/general_model_service.proto b/core/sdk-cpp/proto/general_model_service.proto old mode 100644 new mode 100755 index 9988b298bdd22210fbe3127b9e4b57c89077f3ff..92032ab77e88a515c48db312e20b8acb13c9cddc --- a/core/sdk-cpp/proto/general_model_service.proto +++ b/core/sdk-cpp/proto/general_model_service.proto @@ -20,21 +20,20 @@ package baidu.paddle_serving.predictor.general_model; option cc_generic_services = true; message Tensor { - repeated bytes data = 1; + repeated string data = 1; repeated int32 int_data = 2; repeated int64 int64_data = 3; repeated float float_data = 4; - optional int32 elem_type = 5; - repeated int32 shape = 6; - repeated int32 lod = 7; // only for fetch tensor currently + optional int32 elem_type = + 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string + repeated int32 shape = 6; // shape should include batch + repeated int32 lod = 7; // only for fetch tensor currently + optional string name = 8; // get from the Model prototxt + optional string alias_name = 9; // get from the Model prototxt }; -message FeedInst { repeated Tensor tensor_array = 1; }; - -message FetchInst { repeated Tensor tensor_array = 1; }; - message Request { - repeated FeedInst insts = 1; + repeated Tensor tensor = 1; repeated string fetch_var_names = 2; optional bool profile_server = 3 [ default = false ]; required uint64 log_id = 4 [ default = 0 ]; @@ -46,7 +45,7 @@ message Response { }; message 
ModelOutput { - repeated FetchInst insts = 1; + repeated Tensor tensor = 1; optional string engine_name = 2; } diff --git a/doc/BENCHMARKING_GPU.md b/doc/BENCHMARKING_GPU.md old mode 100755 new mode 100644 diff --git a/doc/COMPILE.md b/doc/COMPILE.md old mode 100755 new mode 100644 index 1e27b32e69a4579b14b2d91da3032fec6c52b82d..ea7f53b2ed1704777b611a58c3a8d971d48eb312 --- a/doc/COMPILE.md +++ b/doc/COMPILE.md @@ -87,6 +87,7 @@ go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2 go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2 go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3 go get -u google.golang.org/grpc@v1.33.0 +go env -w GO111MODULE=auto ``` diff --git a/doc/COMPILE_CN.md b/doc/COMPILE_CN.md old mode 100755 new mode 100644 index 33d5dfa9786d034c88002daf379a29d2b394ee07..89178cee78746013915fc416b212b5a49f6762c2 --- a/doc/COMPILE_CN.md +++ b/doc/COMPILE_CN.md @@ -86,6 +86,7 @@ go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2 go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2 go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3 go get -u google.golang.org/grpc@v1.33.0 +go env -w GO111MODULE=auto ``` diff --git a/doc/CUBE_TEST_CN.md b/doc/CUBE_TEST_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..c9e8c23ca3be43390ffd959d83c456cf47722056 --- /dev/null +++ b/doc/CUBE_TEST_CN.md @@ -0,0 +1,61 @@ +## 如果获得稀疏参数索引Cube所需的模型输入 + +### 背景知识 + +推荐系统需要大规模稀疏参数索引来帮助分布式部署,可在`python/example/criteo_ctr_with_cube`或是[PaddleRec](https://github.com/paddlepaddle/paddlerec)了解推荐模型。 + +稀疏参数索引的模型格式是SequenceFile,源自Hadoop生态的键值对格式文件。 + +为了方便调试,我们给出了从特定格式的可读文本文件到SequenceFile格式文件的转换工具,以及SequenceFile格式文件与可阅读文字的转换。 + +用户在调试Cube服务功能时,可以自定义KV对生成SequenceFile格式文件来进行调试。 +用户在验证Cube的配送正确性时,可以转换SequenceFile格式文件至可读文字来进行比对验证。 + +### 预备知识 + +- 需要会编译Paddle Serving,参见[编译文档](./COMPILE.md) + +### 用法 + +在编译结束后的安装文件,可以得到 seq_reader 和 kv_to_seqfile.py。 + +#### 生成SequenceFile + +在`output/tool/`下,修改`output/tool/source/file.txt`,该文件每一行对应一个键值对,用冒号`:`区分key和value部分。 + +例如: +``` +1676869128226002114:48241 37064 91 -539 114 51 -122 269 229 -134 -282 +1657749292782759014:167 40 98 27 117 10 -29 15 74 67 -54 +``` +执行 +``` +python kv_to_seqfile.py +``` +即可生成`data`文件夹,我们看下它的结构 + +``` +. 
+├── 20210805095422 +│   └── base +│   └── feature +└── donefile + └── base.txt +``` +其中`20210805095422/base/feature` 就是SequenceFile格式文件,donefile保存在`donefile/base.txt`。 + +#### 查看SequenceFile + +我们使用`seq_reader`工具来解读SequenceFile格式文件。 +``` +./seq_reader 20210805095422/base/feature 10 # 阅读开头的10个KV对 +./seq_reader 20210805095422/base/feature # 阅读所有KV对 +``` + +结果 +``` +key: 1676869128226002114 , value: 343832343109333730363409093931092D35333909313134093531092D3132320932363909323239092D313334092D323832 +key: 1657749292782759014 , value: 3136370934300909393809323709313137093130092D3239093135093734093637092D3534 +``` + +其中value 我们目前都以16进制的形式打印。 diff --git a/doc/FAQ.md b/doc/FAQ.md index 2b173f8b310f117265120dcdf4c0c8161a5563b3..cdad1a3dda5339aa1fac55a223a5e3a38f33d031 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -242,6 +242,9 @@ InvalidArgumentError: Device id must be less than GPU count, but received id is: **A:** 支持离线部署,需要把一些相关的[依赖包](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md)提前准备安装好 +#### Q: Docker中启动server IP地址 127.0.0.1 与 0.0.0.0 差异 +**A:** 您必须将容器的主进程设置为绑定到特殊的 0.0.0.0 “所有接口”地址,否则它将无法从容器外部访问。在Docker中 127.0.0.1 代表“这个容器”,而不是“这台机器”。如果您从容器建立到 127.0.0.1 的出站连接,它将返回到同一个容器;如果您将服务器绑定到 127.0.0.1,接收不到来自外部的连接。 + ## 预测问题 #### Q: 使用GPU第一次预测时特别慢,如何调整RPC服务的等待时间避免超时? @@ -321,6 +324,15 @@ GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999 **A:** Logid默认为0(后续应该有自动生成Logid的计划,当前版本0.4.0),Client端通过在predict函数中指定log_id参数传递 +#### Q: C++Server出现问题如何调试和定位 + +**A:** 推荐您使用gdb进行定位和调试,如果您使用docker,在启动容器时候,需要加上docker run --privileged参数,开启特权模式,这样才能在docker容器中使用gdb定位和调试 + +如果您C++端出现coredump,一般而言会生成一个core文件,若没有,则应开启生成core文件选项,使用ulimit -c unlimited命令。 + +使用gdb调试core文件的方法为:gdb <可执行文件> ,进入后输入bt指令,一般即可显示出错在哪一行。 + +注意:可执行文件路径是C++ bin文件的路径,而不是python命令,一般为类似下面的这种/usr/local/lib/python3.6/site-packages/paddle_serving_server/serving-gpu-102-0.6.2/serving ## 性能优化 diff --git a/doc/HTTP_SERVICE_CN.md b/doc/HTTP_SERVICE_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..a839039bac48e5c23c8c8d1571953365315b7bd8 --- /dev/null +++ b/doc/HTTP_SERVICE_CN.md @@ -0,0 +1,150 @@ +# HTTP方式访问Server + +Paddle Serving服务端目前提供了支持Http直接访问的功能,本文档显示了详细信息。 + +## 基本原理 + +BRPC-Server端支持通过Http的方式被访问,各种语言都有实现Http请求的一些库,所以Java/Python/Go等BRPC支持不太完善的语言,可以通过Http的方式直接访问服务端进行预测。 + +### Http方式 +基本流程和原理:客户端需要将数据按照Proto约定的格式(请参阅[`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto))封装在Http请求的请求体中。 +BRPC-Server会尝试去JSON字符串中再去反序列化出Proto格式的数据,从而进行后续的处理。 + +### Http+protobuf方式 +各种语言都提供了对ProtoBuf的支持,如果您对此比较熟悉,您也可以先将数据使用ProtoBuf序列化,再将序列化后的数据放入Http请求数据体中,然后指定Content-Type: application/proto,从而使用http/h2+protobuf二进制串访问服务。 +实测随着数据量的增大,使用JSON方式的Http的数据量和反序列化的耗时会大幅度增加,推荐当您的数据量较大时,使用Http+protobuf方式,目前已经在Java和Python的Client端提供了支持。 + +**理论上讲,序列化/反序列化的性能从高到底排序为:protobuf > http/h2+protobuf > http** + + +## 示例 + +我们将以python/examples/fit_a_line为例,讲解如何通过Http访问Server端。 + +### 获取模型 + +```shell +sh get_data.sh +``` + +## 开启服务端 + +```shell +python3.6 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 +``` +服务端无须做任何改造,即可支持BRPC和HTTP两种方式。 + + +## 客户端访问 + + +### HttpClient方式发送Http请求(Python/Java) + +为了方便用户快速的使用Http方式请求Server端预测服务,我们已经将常用的Http请求的数据体封装、压缩、请求加密等功能封装为一个HttpClient类提供给用户,方便用户使用。 + +使用HttpClient最简单只需要三步,1、创建一个HttpClient对象。2、加载Client端的prototxt配置文件(本例中为python/examples/fit_a_line/目录下的uci_housing_client/serving_client_conf.prototxt),3、调用Predict函数,通过Http方式请求预测服务。 + 
+此外,您可以根据自己的需要配置Server端IP、Port、服务名称(此服务名称需要与[`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto)文件中的Service服务名和rpc方法名对应,即`GeneralModelService`字段和`inference`字段),设置Request数据体压缩,设置Response支持压缩传输,模型加密预测(需要配置Server端使用模型加密)、设置响应超时时间等功能。 + +Python的HttpClient使用示例见[`python/examples/fit_a_line/test_httpclient.py`](../python/examples/fit_a_line/test_httpclient.py),接口详见[`python/paddle_serving_client/httpclient.py`](../python/paddle_serving_client/httpclient.py)。 + +Java的HttpClient使用示例见[`java/examples/src/main/java/PaddleServingClientExample.java`](../java/examples/src/main/java/PaddleServingClientExample.java)接口详见[`java/src/main/java/io/paddle/serving/client/HttpClient.java`](../java/src/main/java/io/paddle/serving/client/HttpClient.java)。 + +如果不能满足您的需求,您也可以在此基础上添加一些功能。 + +如需支持https或者自定义Response的Status Code等,则需要对C++端brpc-Server进行一定的二次开发,请参考https://github.com/apache/incubator-brpc/blob/master/docs/cn/http_service.md,后续如果需求很大,我们也会将这部分功能加入到Server中,尽情期待。 + + +### curl方式发送Http请求(基本原理) + +```shell +curl -XPOST http://0.0.0.0:9393/GeneralModelService/inference -d ' {"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"name":"x","alias_name":"x","shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}' +``` +其中`127.0.0.1:9393`为IP和Port,根据您服务端启动的IP和Port自行设定。 + +`GeneralModelService`字段和`inference`字段分别为Proto文件中的Service服务名和rpc方法名,详见[`core/general-server/proto/general_model_service.proto`](../core/general-server/proto/general_model_service.proto) + +-d后面的是请求的数据体,json中一定要包含上述proto中的required字段,否则转化会失败,对应请求会被拒绝。 + +需要注意的是,数据中的shape字段为模型实际需要的shape信息,包含batch维度在内,可能与proto文件中的shape不一致。 + +#### message + +对应rapidjson Object, 以花括号包围,其中的元素会被递归地解析。 + +```protobuf +// protobuf +message Foo { + required string field1 = 1; + required int32 field2 = 2; +} +message Bar { + required Foo foo = 1; + optional bool flag = 2; + required string name = 3; +} + +// rapidjson +{"foo":{"field1":"hello", "field2":3},"name":"Tom" } +``` + +#### repeated field + +对应rapidjson Array, 以方括号包围,其中的元素会被递归地解析,和message不同,每个元素的类型相同。 + +```protobuf +// protobuf +repeated int32 numbers = 1; + +// rapidjson +{"numbers" : [12, 17, 1, 24] } +``` +#### elem_type + +表示数据类型,0 means int64, 1 means float32, 2 means int32, 3 means bytes(string) + +#### fetch_var_names + +表示返回结果中需要的数据名称,请参考模型文件serving_client_conf.prototxt中的`fetch_var`字段下的`alias_name`。 + +### Http压缩 + +支持gzip压缩,但gzip并不是一个压缩解压速度非常快的方法,当数据量较小时候,使用gzip压缩反而会得不偿失,推荐至少数据大于512字节时才考虑使用gzip压缩,实测结果是当数据量小于50K时,压缩的收益都不大。 + +#### Client请求的数据体压缩 + +以上面的fit_a_line为例,仍使用上文的请求数据体,但只作为示例演示用法,实际此时使用压缩得不偿失。 + +```shell +echo ' {"tensor":[{"float_data":[0.0137,-0.1136,0.2553,-0.0692,0.0582,-0.0727,-0.1583,-0.0584,0.6283,0.4919,0.1856,0.0795,-0.0332],"elem_type":1,"shape":[1,13]}],"fetch_var_names":["price"],"log_id":0}' | gzip -c > data.txt.gz +``` + +```shell +curl --data-binary @data.txt.gz -H'Content-Encoding: gzip' -XPOST http://127.0.0.1:9393/GeneralModelService/inference +``` + +**注意:当请求数据体压缩时,需要指定请求头中Content-Encoding: gzip** + +#### Server端Response压缩 + +当Http请求头中设置了Accept-encoding: gzip时,Server端会尝试用gzip压缩Response的数据,“尝试“指的是压缩有可能不发生,条件有: + +- 请求中没有设置Accept-encoding: gzip。 + +- body尺寸小于-http_body_compress_threshold指定的字节数,默认是512。gzip并不是一个很快的压缩算法,当body较小时,压缩增加的延时可能比网络传输省下的还多。当包较小时不做压缩可能是个更好的选项。 + +这时server总是会返回不压缩的结果。 + +如果使用curl,通常推荐使用--compressed参数来设置Response压缩,--compressed参数会自动地在http请求中设置Accept-encoding: gzip,并在收到压缩后的Response后自动解压,对于用户而言,整个压缩/解压过程就像透明的一样。 +```shell 
+curl --data-binary @data.txt.gz -H'Content-Encoding: gzip' --compressed -XPOST http://127.0.0.1:9393/GeneralModelService/inference +``` + +若您只是在Http请求头中通过-H'Accept-encoding: gzip'设置了接收压缩的信息,收到的将是压缩后的Response,此时,您需要手动解压。 + +也就是说,--compressed = -H'Content-Encoding: gzip' + 自动解压,所以推荐您使用--compressed,以下仅作为单独设置请求头+手动解压的原理性示例。 + +当您想要验证返回值是否真的压缩时,您可以只添加请求头-H'Content-Encoding: gzip',而不解压,可以看到返回信息是压缩后的数据(一般而言是看不懂的压缩码)。 +```shell +curl --data-binary @data.txt.gz -H'Content-Encoding: gzip' -H'Accept-encoding: gzip' -XPOST http://127.0.0.1:9393/GeneralModelService/inference | gunzip +``` diff --git a/doc/architecture.png b/doc/architecture.png old mode 100755 new mode 100644 diff --git a/doc/client-side-proxy.png b/doc/client-side-proxy.png old mode 100755 new mode 100644 diff --git a/doc/deprecated/CTR_PREDICTION.md b/doc/deprecated/CTR_PREDICTION.md old mode 100755 new mode 100644 diff --git a/doc/framework.png b/doc/framework.png old mode 100755 new mode 100644 diff --git a/doc/gpu-local-qps-batchsize.png b/doc/gpu-local-qps-batchsize.png old mode 100755 new mode 100644 diff --git a/doc/gpu-local-qps-concurrency.png b/doc/gpu-local-qps-concurrency.png old mode 100755 new mode 100644 diff --git a/doc/gpu-local-time-batchsize.png b/doc/gpu-local-time-batchsize.png old mode 100755 new mode 100644 diff --git a/doc/gpu-local-time-concurrency.png b/doc/gpu-local-time-concurrency.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-multi-concurrency-qps-batchsize-concurrency-client1.png b/doc/gpu-serving-multi-card-multi-concurrency-qps-batchsize-concurrency-client1.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-multi-concurrency-qps-batchsize-concurrency-client2.png b/doc/gpu-serving-multi-card-multi-concurrency-qps-batchsize-concurrency-client2.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-multi-concurrency-time-batchsize-concurrency-client1.png b/doc/gpu-serving-multi-card-multi-concurrency-time-batchsize-concurrency-client1.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-multi-concurrency-time-batchsize-concurrency-client2.png b/doc/gpu-serving-multi-card-multi-concurrency-time-batchsize-concurrency-client2.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-single-concurrency-qps-batchsize-client1.png b/doc/gpu-serving-multi-card-single-concurrency-qps-batchsize-client1.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-single-concurrency-qps-batchsize-client2.png b/doc/gpu-serving-multi-card-single-concurrency-qps-batchsize-client2.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-single-concurrency-time-batchsize-client1.png b/doc/gpu-serving-multi-card-single-concurrency-time-batchsize-client1.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-multi-card-single-concurrency-time-batchsize-client2.png b/doc/gpu-serving-multi-card-single-concurrency-time-batchsize-client2.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-single-card-qps-batchsize.png b/doc/gpu-serving-single-card-qps-batchsize.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-single-card-qps-concurrency.png b/doc/gpu-serving-single-card-qps-concurrency.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-single-card-time-batchsize.png b/doc/gpu-serving-single-card-time-batchsize.png old mode 100755 new mode 100644 diff --git a/doc/gpu-serving-single-card-time-concurrency.png 
b/doc/gpu-serving-single-card-time-concurrency.png old mode 100755 new mode 100644 diff --git a/doc/multi-service.png b/doc/multi-service.png old mode 100755 new mode 100644 diff --git a/doc/multi-variants.png b/doc/multi-variants.png old mode 100755 new mode 100644 diff --git a/doc/predict-service.png b/doc/predict-service.png old mode 100755 new mode 100644 diff --git a/doc/pruned-ctr-network.png b/doc/pruned-ctr-network.png old mode 100755 new mode 100644 diff --git a/doc/qps-threads-bow.png b/doc/qps-threads-bow.png old mode 100755 new mode 100644 diff --git a/doc/qps-threads-cnn.png b/doc/qps-threads-cnn.png old mode 100755 new mode 100644 diff --git a/doc/qps-threads-lstm.png b/doc/qps-threads-lstm.png old mode 100755 new mode 100644 diff --git a/doc/server-side.png b/doc/server-side.png old mode 100755 new mode 100644 diff --git a/doc/serving-timings.png b/doc/serving-timings.png old mode 100755 new mode 100644 diff --git a/go/client_app/acc.go b/go/client_app/acc.go deleted file mode 100644 index d93f47360976c5dd22be2a9919fe278c283aea2c..0000000000000000000000000000000000000000 --- a/go/client_app/acc.go +++ /dev/null @@ -1,50 +0,0 @@ -package main - -import ( - "io" - "os" - "fmt" - "bufio" - "strings" - "strconv" -) - -func main() { - score_file := os.Args[1] - fi, err := os.Open(score_file) - if err != nil { - fmt.Print(err) - } - - defer fi.Close() - br := bufio.NewReader(fi) - - total := int(0) - acc := int(0) - for { - line, err := br.ReadString('\n') - if err == io.EOF { - break - } - - line = strings.Trim(line, "\n") - s := strings.Split(line, "\t") - prob_str := strings.Trim(s[0], " ") - label_str := strings.Trim(s[1], " ") - prob, err := strconv.ParseFloat(prob_str, 32) - if err != nil { - panic(err) - } - label, err := strconv.ParseFloat(label_str, 32) - if err != nil { - panic(err) - } - if (prob - 0.5) * (label - 0.5) > 0 { - acc++ - } - total++ - } - fmt.Println("total num: ", total) - fmt.Println("acc num: ", acc) - fmt.Println("acc: ", float32(acc) / float32(total)) -} \ No newline at end of file diff --git a/go/client_app/imdb_client.go b/go/client_app/imdb_client.go deleted file mode 100644 index aef823ed2c5209217d4f60f93d19006e67dca35d..0000000000000000000000000000000000000000 --- a/go/client_app/imdb_client.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "io" - "fmt" - "strings" - "bufio" - "strconv" - "os" - serving_client "github.com/PaddlePaddle/Serving/go/serving_client" -) - -func main() { - var config_file_path string - config_file_path = os.Args[1] - handle := serving_client.LoadModelConfig(config_file_path) - handle = serving_client.Connect("127.0.0.1", "9292", handle) - - test_file_path := os.Args[2] - fi, err := os.Open(test_file_path) - if err != nil { - fmt.Print(err) - } - - defer fi.Close() - br := bufio.NewReader(fi) - - fetch := []string{"cost", "acc", "prediction"} - - var result map[string][]float32 - - for { - line, err := br.ReadString('\n') - if err == io.EOF { - break - } - - line = strings.Trim(line, "\n") - - var words = []int64{} - - s := strings.Split(line, " ") - value, err := strconv.Atoi(s[0]) - var feed_int_map map[string][]int64 - - for _, v := range s[1:value + 1] { - int_v, _ := strconv.Atoi(v) - words = append(words, int64(int_v)) - } - - label, err := strconv.Atoi(s[len(s)-1]) - - if err != nil { - panic(err) - } - - feed_int_map = map[string][]int64{} - feed_int_map["words"] = words - feed_int_map["label"] = []int64{int64(label)} - - result = serving_client.Predict(handle, - feed_int_map, fetch) - fmt.Println(result["prediction"][1], "\t", int64(label)) - } -} \ No newline at end of file diff --git a/go/proto/general_model_config.pb.go b/go/proto/general_model_config.pb.go deleted file mode 100644 index 40d993ac1aa81d9180e51f1c6da464a3df251ff2..0000000000000000000000000000000000000000 --- a/go/proto/general_model_config.pb.go +++ /dev/null @@ -1,237 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// source: general_model_config.proto - -package baidu_paddle_serving_configure - -import ( - fmt "fmt" - proto "github.com/golang/protobuf/proto" - math "math" -) - -// Reference imports to suppress errors if they are not otherwise used. -var _ = proto.Marshal -var _ = fmt.Errorf -var _ = math.Inf - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the proto package it is being compiled against. -// A compilation error at this line likely means your copy of the -// proto package needs to be updated. 
-const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package - -type FeedVar struct { - Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"` - AliasName *string `protobuf:"bytes,2,opt,name=alias_name,json=aliasName" json:"alias_name,omitempty"` - IsLodTensor *bool `protobuf:"varint,3,opt,name=is_lod_tensor,json=isLodTensor,def=0" json:"is_lod_tensor,omitempty"` - FeedType *int32 `protobuf:"varint,4,opt,name=feed_type,json=feedType,def=0" json:"feed_type,omitempty"` - Shape []int32 `protobuf:"varint,5,rep,name=shape" json:"shape,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *FeedVar) Reset() { *m = FeedVar{} } -func (m *FeedVar) String() string { return proto.CompactTextString(m) } -func (*FeedVar) ProtoMessage() {} -func (*FeedVar) Descriptor() ([]byte, []int) { - return fileDescriptor_efa52beffa29d37a, []int{0} -} - -func (m *FeedVar) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_FeedVar.Unmarshal(m, b) -} -func (m *FeedVar) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_FeedVar.Marshal(b, m, deterministic) -} -func (m *FeedVar) XXX_Merge(src proto.Message) { - xxx_messageInfo_FeedVar.Merge(m, src) -} -func (m *FeedVar) XXX_Size() int { - return xxx_messageInfo_FeedVar.Size(m) -} -func (m *FeedVar) XXX_DiscardUnknown() { - xxx_messageInfo_FeedVar.DiscardUnknown(m) -} - -var xxx_messageInfo_FeedVar proto.InternalMessageInfo - -const Default_FeedVar_IsLodTensor bool = false -const Default_FeedVar_FeedType int32 = 0 - -func (m *FeedVar) GetName() string { - if m != nil && m.Name != nil { - return *m.Name - } - return "" -} - -func (m *FeedVar) GetAliasName() string { - if m != nil && m.AliasName != nil { - return *m.AliasName - } - return "" -} - -func (m *FeedVar) GetIsLodTensor() bool { - if m != nil && m.IsLodTensor != nil { - return *m.IsLodTensor - } - return Default_FeedVar_IsLodTensor -} - -func (m *FeedVar) GetFeedType() int32 { - if m != nil && m.FeedType != nil { - return *m.FeedType - } - return Default_FeedVar_FeedType -} - -func (m *FeedVar) GetShape() []int32 { - if m != nil { - return m.Shape - } - return nil -} - -type FetchVar struct { - Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"` - AliasName *string `protobuf:"bytes,2,opt,name=alias_name,json=aliasName" json:"alias_name,omitempty"` - IsLodTensor *bool `protobuf:"varint,3,opt,name=is_lod_tensor,json=isLodTensor,def=0" json:"is_lod_tensor,omitempty"` - Shape []int32 `protobuf:"varint,4,rep,name=shape" json:"shape,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *FetchVar) Reset() { *m = FetchVar{} } -func (m *FetchVar) String() string { return proto.CompactTextString(m) } -func (*FetchVar) ProtoMessage() {} -func (*FetchVar) Descriptor() ([]byte, []int) { - return fileDescriptor_efa52beffa29d37a, []int{1} -} - -func (m *FetchVar) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_FetchVar.Unmarshal(m, b) -} -func (m *FetchVar) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_FetchVar.Marshal(b, m, deterministic) -} -func (m *FetchVar) XXX_Merge(src proto.Message) { - xxx_messageInfo_FetchVar.Merge(m, src) -} -func (m *FetchVar) XXX_Size() int { - return xxx_messageInfo_FetchVar.Size(m) -} -func (m *FetchVar) XXX_DiscardUnknown() { - xxx_messageInfo_FetchVar.DiscardUnknown(m) -} - -var 
xxx_messageInfo_FetchVar proto.InternalMessageInfo - -const Default_FetchVar_IsLodTensor bool = false - -func (m *FetchVar) GetName() string { - if m != nil && m.Name != nil { - return *m.Name - } - return "" -} - -func (m *FetchVar) GetAliasName() string { - if m != nil && m.AliasName != nil { - return *m.AliasName - } - return "" -} - -func (m *FetchVar) GetIsLodTensor() bool { - if m != nil && m.IsLodTensor != nil { - return *m.IsLodTensor - } - return Default_FetchVar_IsLodTensor -} - -func (m *FetchVar) GetShape() []int32 { - if m != nil { - return m.Shape - } - return nil -} - -type GeneralModelConfig struct { - FeedVar []*FeedVar `protobuf:"bytes,1,rep,name=feed_var,json=feedVar" json:"feed_var,omitempty"` - FetchVar []*FetchVar `protobuf:"bytes,2,rep,name=fetch_var,json=fetchVar" json:"fetch_var,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GeneralModelConfig) Reset() { *m = GeneralModelConfig{} } -func (m *GeneralModelConfig) String() string { return proto.CompactTextString(m) } -func (*GeneralModelConfig) ProtoMessage() {} -func (*GeneralModelConfig) Descriptor() ([]byte, []int) { - return fileDescriptor_efa52beffa29d37a, []int{2} -} - -func (m *GeneralModelConfig) XXX_Unmarshal(b []byte) error { - return xxx_messageInfo_GeneralModelConfig.Unmarshal(m, b) -} -func (m *GeneralModelConfig) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - return xxx_messageInfo_GeneralModelConfig.Marshal(b, m, deterministic) -} -func (m *GeneralModelConfig) XXX_Merge(src proto.Message) { - xxx_messageInfo_GeneralModelConfig.Merge(m, src) -} -func (m *GeneralModelConfig) XXX_Size() int { - return xxx_messageInfo_GeneralModelConfig.Size(m) -} -func (m *GeneralModelConfig) XXX_DiscardUnknown() { - xxx_messageInfo_GeneralModelConfig.DiscardUnknown(m) -} - -var xxx_messageInfo_GeneralModelConfig proto.InternalMessageInfo - -func (m *GeneralModelConfig) GetFeedVar() []*FeedVar { - if m != nil { - return m.FeedVar - } - return nil -} - -func (m *GeneralModelConfig) GetFetchVar() []*FetchVar { - if m != nil { - return m.FetchVar - } - return nil -} - -func init() { - proto.RegisterType((*FeedVar)(nil), "baidu.paddle_serving.configure.FeedVar") - proto.RegisterType((*FetchVar)(nil), "baidu.paddle_serving.configure.FetchVar") - proto.RegisterType((*GeneralModelConfig)(nil), "baidu.paddle_serving.configure.GeneralModelConfig") -} - -func init() { proto.RegisterFile("general_model_config.proto", fileDescriptor_efa52beffa29d37a) } - -var fileDescriptor_efa52beffa29d37a = []byte{ - // 283 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xb4, 0xd0, 0x31, 0x4b, 0xc4, 0x30, - 0x14, 0x07, 0x70, 0x72, 0x6d, 0xb9, 0xf6, 0x1d, 0x2e, 0xc1, 0xa1, 0x08, 0x1e, 0xe5, 0x16, 0xe3, - 0x52, 0xc4, 0xf1, 0x46, 0xc5, 0x73, 0x51, 0x87, 0x72, 0xb8, 0x86, 0xd8, 0xbc, 0xb6, 0x81, 0x5c, - 0x53, 0x92, 0xde, 0xc1, 0x2d, 0x7e, 0x13, 0xf1, 0xab, 0x4a, 0x93, 0x43, 0x9c, 0x74, 0x72, 0x7b, - 0x79, 0xff, 0xf0, 0xde, 0xe3, 0x07, 0x17, 0x2d, 0xf6, 0x68, 0x85, 0xe6, 0x3b, 0x23, 0x51, 0xf3, - 0xda, 0xf4, 0x8d, 0x6a, 0xcb, 0xc1, 0x9a, 0xd1, 0xd0, 0xe5, 0x9b, 0x50, 0x72, 0x5f, 0x0e, 0x42, - 0x4a, 0x8d, 0xdc, 0xa1, 0x3d, 0xa8, 0xbe, 0x2d, 0xc3, 0x97, 0xbd, 0xc5, 0xd5, 0x07, 0x81, 0xf9, - 0x06, 0x51, 0xbe, 0x0a, 0x4b, 0x29, 0xc4, 0xbd, 0xd8, 0x61, 0x4e, 0x0a, 0xc2, 0xb2, 0xca, 0xd7, - 0xf4, 0x12, 0x40, 0x68, 0x25, 0x1c, 0xf7, 0xc9, 0xcc, 0x27, 0x99, 0xef, 0xbc, 0x4c, 0xf1, 0x35, - 0x9c, 0x29, 
0xc7, 0xb5, 0x91, 0x7c, 0xc4, 0xde, 0x19, 0x9b, 0x47, 0x05, 0x61, 0xe9, 0x3a, 0x69, - 0x84, 0x76, 0x58, 0x2d, 0x94, 0x7b, 0x32, 0x72, 0xeb, 0x13, 0xba, 0x84, 0xac, 0x41, 0x94, 0x7c, - 0x3c, 0x0e, 0x98, 0xc7, 0x05, 0x61, 0xc9, 0x9a, 0xdc, 0x54, 0xe9, 0xd4, 0xdb, 0x1e, 0x07, 0xa4, - 0xe7, 0x90, 0xb8, 0x4e, 0x0c, 0x98, 0x27, 0x45, 0xc4, 0x92, 0x2a, 0x3c, 0x56, 0xef, 0x90, 0x6e, - 0x70, 0xac, 0xbb, 0xff, 0xbf, 0xef, 0x7b, 0x7f, 0xfc, 0x73, 0xff, 0x27, 0x01, 0xfa, 0x18, 0x78, - 0x9f, 0x27, 0xdd, 0x7b, 0x2f, 0x47, 0xef, 0xc0, 0x1f, 0xce, 0x0f, 0xc2, 0xe6, 0xa4, 0x88, 0xd8, - 0xe2, 0xf6, 0xaa, 0xfc, 0x5d, 0xba, 0x3c, 0x29, 0x57, 0xf3, 0xe6, 0xc4, 0xfd, 0x30, 0x81, 0x8c, - 0x75, 0xe7, 0x87, 0xcc, 0xfc, 0x10, 0xf6, 0xf7, 0x90, 0x60, 0x31, 0xb9, 0x85, 0xea, 0x2b, 0x00, - 0x00, 0xff, 0xff, 0x08, 0x27, 0x9c, 0x1a, 0xfe, 0x01, 0x00, 0x00, -} diff --git a/go/serving_client/serving_client_api.go b/go/serving_client/serving_client_api.go deleted file mode 100644 index 299e4bcfd5f5387e54978cf51e06c2aacbd50943..0000000000000000000000000000000000000000 --- a/go/serving_client/serving_client_api.go +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package serving_client - -import ( - "bytes" - "encoding/json" - "io/ioutil" - "log" - "net/http" - pb "github.com/PaddlePaddle/Serving/go/proto" - "github.com/golang/protobuf/proto" -) - -type Tensor struct { - Data []byte `json:"data"` - FloatData []float32 `json:"float_data"` - IntData []int `json:"int_data"` - Int64Data []int64 `json:"int64_data"` - ElemType int `json:"elem_type"` - Shape []int `json:"shape"` -} - -type FeedInst struct { - TensorArray []Tensor `json:"tensor_array"` -} - -type FetchInst struct { - TensorArray []Tensor `json:"tensor_array"` -} - -type Request struct { - Insts []FeedInst `json:"insts"` - FetchVarNames []string `json:"fetch_var_names"` - ProfileServer bool `json:"profile_server"` -} - -type Response struct { - Insts []FetchInst `json:"insts"` - ProfileTime []int64 `json:"profile_time"` -} - -type Handle struct { - Url string - Port string - FeedAliasNameMap map[string]string - FeedShapeMap map[string][]int - FeedNameMap map[string]int - FeedAliasNames []string - FetchNameMap map[string]int - FetchAliasNameMap map[string]string -} - -func LoadModelConfig(config string) Handle { - in, err := ioutil.ReadFile(config) - if err != nil { - log.Fatalln("Failed to read general model: ", err) - } - general_model_config := &pb.GeneralModelConfig{} - if err := proto.Unmarshal(in, general_model_config); err != nil { - log.Fatalln("Failed to parse GeneralModelConfig: ", err) - } - log.Println("read protobuf succeed") - handle := Handle{} - handle.FeedNameMap = map[string]int{} - handle.FeedAliasNameMap = map[string]string{} - handle.FeedShapeMap = map[string][]int{} - handle.FetchNameMap = map[string]int{} - handle.FetchAliasNameMap = map[string]string{} - handle.FeedAliasNames = []string{} - - for i, v := range general_model_config.FeedVar 
{ - handle.FeedNameMap[*v.Name] = i - tmp_array := []int{} - for _, vv := range v.Shape { - tmp_array = append(tmp_array, int(vv)) - } - handle.FeedShapeMap[*v.Name] = tmp_array - handle.FeedAliasNameMap[*v.AliasName] = *v.Name - handle.FeedAliasNames = append(handle.FeedAliasNames, *v.AliasName) - } - - for i, v := range general_model_config.FetchVar { - handle.FetchNameMap[*v.Name] = i - handle.FetchAliasNameMap[*v.AliasName] = *v.Name - } - - return handle -} - -func Connect(url string, port string, handle Handle) Handle { - handle.Url = url - handle.Port = port - return handle -} - -func Predict(handle Handle, int_feed_map map[string][]int64, fetch []string) map[string][]float32 { - contentType := "application/json;charset=utf-8" - - var tensor_array []Tensor - var inst FeedInst - tensor_array = []Tensor{} - inst = FeedInst{} - - for i := 0; i < len(handle.FeedAliasNames); i++ { - key_i := handle.FeedAliasNames[i] - var tmp Tensor - tmp.IntData = []int{} - tmp.Shape = []int{} - tmp.Int64Data = int_feed_map[key_i] - tmp.ElemType = 0 - tmp.Shape = handle.FeedShapeMap[key_i] - tensor_array = append(tensor_array, tmp) - } - - inst.TensorArray = tensor_array - - var profile_server bool - profile_server = false - - req := &Request{ - Insts: []FeedInst{inst}, - FetchVarNames: fetch, - ProfileServer: profile_server} - - b, err := json.Marshal(req) - - body := bytes.NewBuffer(b) - - var post_address bytes.Buffer - post_address.WriteString("http://") - post_address.WriteString(handle.Url) - post_address.WriteString(":") - post_address.WriteString(handle.Port) - post_address.WriteString("/GeneralModelService/inference") - - resp, err := http.Post(post_address.String(), contentType, body) - if err != nil { - log.Println("Post failed:", err) - } - - defer resp.Body.Close() - - content, err := ioutil.ReadAll(resp.Body) - if err != nil { - log.Println("Read failed:", err) - } - - response_json := Response{} - err = json.Unmarshal([]byte(content), &response_json) - - var result map[string][]float32 - result = map[string][]float32{} - for i, v := range fetch { - result[v] = response_json.Insts[0].TensorArray[i].FloatData - } - - return result -} diff --git a/java/README_CN.md b/java/README_CN.md old mode 100644 new mode 100755 index cbee3f006912677d410e364d762666fe670a301f..ef53ac9b1b020940679db9fecbfe1d33111b79f1 --- a/java/README_CN.md +++ b/java/README_CN.md @@ -2,20 +2,20 @@ ([English](./README.md)|简体中文) -### 开发环境 +## 开发环境 为了方便用户使用java进行开发,我们提供了编译好的Serving工程放置在java镜像当中,获取镜像并进入开发环境的方式是 ``` -docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-java -docker run --rm -dit --name java_serving registry.baidubce.com/paddlepaddle/serving:0.5.0-java +docker pull registry.baidubce.com/paddlepaddle/serving:0.6.0-java +docker run --rm -dit --name java_serving registry.baidubce.com/paddlepaddle/serving:0.6.0-java docker exec -it java_serving bash cd Serving/java ``` Serving文件夹是镜像生成时的develop分支工程目录,需要git pull 到最新版本,或者git checkout 到想要的分支。 -### 安装客户端依赖 +## 安装客户端依赖 由于依赖库数量庞大,因此镜像已经在生成时编译过一次,用户执行以下操作即可 @@ -27,7 +27,34 @@ mvn compile mvn install ``` -### 启动服务端(Pipeline方式) +## 请求BRPC-Server + +### 服务端启动 + +以fit_a_line模型为例,服务端启动与常规BRPC-Server端启动命令一样。 + +``` +cd ../../python/examples/fit_a_line +sh get_data.sh +python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 +``` + +### 客户端预测 +客户端目前支持多种请求方式,目前支持HTTP(数据为JSON格式)、HTTP(数据为PROTO格式)、GRPC + 
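+
+这三种方式在代码层面通过`Client`的开关方法进行选择,下面是一个片段示意(省略import等,方法名取自本仓库新增的`Client.java`,配置文件路径为假设值,仅供参考):
+
+```
+Client client = new Client();
+client.setIP("127.0.0.1");
+client.setPort("9393");
+client.loadClientConfig("serving_client_conf.prototxt"); // 客户端配置文件路径,请按实际情况替换
+// 默认即为HTTP(数据为PROTO格式)
+// client.set_http_proto(false);     // 切换为HTTP(数据为JSON格式)
+// client.set_use_grpc_client(true); // 切换为GRPC方式
+```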
+推荐您使用HTTP(数据为PROTO格式),此时数据体为PROTO格式,传输的数据量小,速度快,目前已经帮用户实现了HTTP/GRPC的数据体(JSON/PROTO)的封装函数,详见[Client.java](./src/main/java/io/paddle/serving/client/Client.java) +``` +cd ../../../java/examples/target +java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample http_proto +``` +**注意 ``为客户端配置文件,一般是名为serving_client_conf.prototxt的文件。** + +更多示例详见[PaddleServingClientExample.java](./examples/src/main/java/PaddleServingClientExample.java) + + +## 请求Pipeline-Server + +### 服务端启动 对于input data type = string类型,以IMDB model ensemble模型为例,服务端启动 @@ -39,14 +66,14 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow. python test_pipeline_server.py &>pipeline.log & ``` -客户端预测(同步) +### 客户端预测(同步) ``` cd ../../../java/examples/target java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict ``` -客户端预测(异步) +### 客户端预测(异步) ``` cd ../../../java/examples/target @@ -54,7 +81,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli ``` -对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动 +### 对于input data type = INDArray类型,以Simple Pipeline WebService中的uci_housing_model模型为例,服务端启动 ``` cd ../../python/examples/pipeline/simple_web_service @@ -62,7 +89,7 @@ sh get_data.sh python web_service_java.py &>log.txt & ``` -客户端预测(同步) +### 客户端预测(同步) ``` cd ../../../java/examples/target @@ -71,7 +98,7 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli ### 注意事项 -1.在示例中,所有非Pipeline模型都需要使用`--use_multilang`来启动GRPC多编程语言支持,以及端口号都是9393,如果需要别的端口,需要在java文件里修改。 +1.在示例中,端口号都是9393,ip默认设置为了127.0.0.1表示本机,注意ip和port需要与Server端对应。 2.目前Serving已推出Pipeline模式(原理详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),面向Java的Pipeline Serving Client已发布。 @@ -84,5 +111,3 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli 第一种是GPU Serving和Java Client在运行在同一个GPU镜像中,需要用户在启动GPU镜像后,把在java镜像中编译完成后的文件(位于/Serving/java目录下)拷贝到GPU镜像中的/Serving/java目录下。 第二种是GPU Serving和Java Client分别在各自的docker镜像中(或具备编译开发环境的不同主机上)部署,此时仅需注意Java Client端与GPU Serving端的ip和port需要对应,详见上述注意事项中的第3项。 - - diff --git a/java/examples/src/main/java/PaddleServingClientExample.java b/java/examples/src/main/java/PaddleServingClientExample.java old mode 100644 new mode 100755 index 5f5e3ff655e7450d12f562229ae4cb2481ab4a54..153cc4afdd172b524383a78b8d1340ecebf1cc44 --- a/java/examples/src/main/java/PaddleServingClientExample.java +++ b/java/examples/src/main/java/PaddleServingClientExample.java @@ -11,39 +11,131 @@ import org.nd4j.linalg.factory.Nd4j; import java.util.*; public class PaddleServingClientExample { - boolean fit_a_line() { + boolean http_proto(String model_config_path) { float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, 0.0582f, -0.0727f, -0.1583f, -0.0584f, 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; INDArray npdata = Nd4j.createFromArray(data); long[] batch_shape = {1,13}; INDArray batch_npdata = npdata.reshape(batch_shape); - HashMap feed_data - = new HashMap() {{ + HashMap feed_data + = new HashMap() {{ put("x", batch_npdata); }}; List fetch = Arrays.asList("price"); Client client = new Client(); - String target = "localhost:9393"; - boolean succ = client.connect(target); - if (succ != true) { - System.out.println("connect failed."); - return false; - } + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.loadClientConfig(model_config_path); + String result = client.predict(feed_data, fetch, true, 0); + + 
System.out.println(result); + return true; + } - Map fetch_map = client.predict(feed_data, fetch); - if (fetch_map == null) { - return false; - } + boolean http_json(String model_config_path) { + float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, + 0.0582f, -0.0727f, -0.1583f, -0.0584f, + 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; + INDArray npdata = Nd4j.createFromArray(data); + long[] batch_shape = {1,13}; + INDArray batch_npdata = npdata.reshape(batch_shape); + HashMap feed_data + = new HashMap() {{ + put("x", batch_npdata); + }}; + List fetch = Arrays.asList("price"); + + Client client = new Client(); + //注意:跨docker,需要设置--net-host或直接访问另一个docker的ip + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.set_http_proto(false); + client.loadClientConfig(model_config_path); + String result = client.predict(feed_data, fetch, true, 0); + + System.out.println(result); + return true; + } + + boolean grpc(String model_config_path) { + float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, + 0.0582f, -0.0727f, -0.1583f, -0.0584f, + 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; + INDArray npdata = Nd4j.createFromArray(data); + long[] batch_shape = {1,13}; + INDArray batch_npdata = npdata.reshape(batch_shape); + HashMap feed_data + = new HashMap() {{ + put("x", batch_npdata); + }}; + List fetch = Arrays.asList("price"); + + Client client = new Client(); + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.loadClientConfig(model_config_path); + client.set_use_grpc_client(true); + String result = client.predict(feed_data, fetch, true, 0); + + System.out.println(result); + return true; + } - for (Map.Entry e : fetch_map.entrySet()) { - System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); + boolean encrypt(String model_config_path,String keyFilePath) { + float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, + 0.0582f, -0.0727f, -0.1583f, -0.0584f, + 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; + INDArray npdata = Nd4j.createFromArray(data); + long[] batch_shape = {1,13}; + INDArray batch_npdata = npdata.reshape(batch_shape); + HashMap feed_data + = new HashMap() {{ + put("x", batch_npdata); + }}; + List fetch = Arrays.asList("price"); + + Client client = new Client(); + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.loadClientConfig(model_config_path); + client.use_key(keyFilePath); + try { + Thread.sleep(1000*3); // 休眠3秒,等待Server启动 + } catch (Exception e) { } + String result = client.predict(feed_data, fetch, true, 0); + + System.out.println(result); return true; } - boolean yolov4(String filename) { + boolean compress(String model_config_path) { + float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, + 0.0582f, -0.0727f, -0.1583f, -0.0584f, + 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; + INDArray npdata = Nd4j.createFromArray(data); + long[] batch_shape = {500,13}; + INDArray batch_npdata = npdata.broadcast(batch_shape); + HashMap feed_data + = new HashMap() {{ + put("x", batch_npdata); + }}; + List fetch = Arrays.asList("price"); + + Client client = new Client(); + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.loadClientConfig(model_config_path); + client.set_request_compress(true); + client.set_response_compress(true); + String result = client.predict(feed_data, fetch, true, 0); + System.out.println(result); + return true; + } + + boolean yolov4(String model_config_path,String filename) { // https://deeplearning4j.konduit.ai/ int height = 608; int width = 608; @@ -77,171 +169,44 @@ public class 
PaddleServingClientExample { INDArray im_size = Nd4j.createFromArray(new int[]{height, width}); long[] batch_size_shape = {1,2}; INDArray batch_im_size = im_size.reshape(batch_size_shape); - HashMap feed_data - = new HashMap() {{ + HashMap feed_data + = new HashMap() {{ put("image", batch_image); put("im_size", batch_im_size); }}; List fetch = Arrays.asList("save_infer_model/scale_0.tmp_0"); - Client client = new Client(); - String target = "localhost:9393"; - boolean succ = client.connect(target); - if (succ != true) { - System.out.println("connect failed."); - return false; - } - succ = client.setRpcTimeoutMs(20000); // cpu - if (succ != true) { - System.out.println("set timeout failed."); - return false; - } - - Map fetch_map = client.predict(feed_data, fetch); - if (fetch_map == null) { - return false; - } - - for (Map.Entry e : fetch_map.entrySet()) { - System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); - } + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.loadClientConfig(model_config_path); + String result = client.predict(feed_data, fetch, true, 0); + System.out.println(result); return true; } - boolean batch_predict() { - float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, - 0.0582f, -0.0727f, -0.1583f, -0.0584f, - 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; - INDArray npdata = Nd4j.createFromArray(data); - HashMap feed_data - = new HashMap() {{ - put("x", npdata); - }}; - List> feed_batch - = new ArrayList>() {{ - add(feed_data); - add(feed_data); - }}; - List fetch = Arrays.asList("price"); - - Client client = new Client(); - String target = "localhost:9393"; - boolean succ = client.connect(target); - if (succ != true) { - System.out.println("connect failed."); - return false; - } - - Map fetch_map = client.predict(feed_batch, fetch); - if (fetch_map == null) { - return false; - } - - for (Map.Entry e : fetch_map.entrySet()) { - System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); - } - return true; - } - - boolean asyn_predict() { - float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, - 0.0582f, -0.0727f, -0.1583f, -0.0584f, - 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f}; - INDArray npdata = Nd4j.createFromArray(data); - HashMap feed_data - = new HashMap() {{ - put("x", npdata); - }}; - List fetch = Arrays.asList("price"); - - Client client = new Client(); - String target = "localhost:9393"; - boolean succ = client.connect(target); - if (succ != true) { - System.out.println("connect failed."); - return false; - } - - PredictFuture future = client.asyn_predict(feed_data, fetch); - Map fetch_map = future.get(); - if (fetch_map == null) { - System.out.println("Get future reslut failed"); - return false; - } - - for (Map.Entry e : fetch_map.entrySet()) { - System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); - } - return true; - } - - boolean model_ensemble() { - long[] data = {8, 233, 52, 601}; - INDArray npdata = Nd4j.createFromArray(data); - HashMap feed_data - = new HashMap() {{ - put("words", npdata); - }}; - List fetch = Arrays.asList("prediction"); - - Client client = new Client(); - String target = "localhost:9393"; - boolean succ = client.connect(target); - if (succ != true) { - System.out.println("connect failed."); - return false; - } - - Map> fetch_map - = client.ensemble_predict(feed_data, fetch); - if (fetch_map == null) { - return false; - } - - for (Map.Entry> entry : fetch_map.entrySet()) { - System.out.println("Model = " + entry.getKey()); - HashMap tt = entry.getValue(); - for 
(Map.Entry e : tt.entrySet()) { - System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); - } - } - return true; - } - - boolean bert() { + boolean bert(String model_config_path) { float[] input_mask = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; long[] position_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; long[] input_ids = {101, 6843, 3241, 749, 8024, 7662, 2533, 1391, 2533, 2523, 7676, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; long[] segment_ids = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - HashMap feed_data - = new HashMap() {{ + HashMap feed_data + = new HashMap() {{ put("input_mask", Nd4j.createFromArray(input_mask)); put("position_ids", Nd4j.createFromArray(position_ids)); put("input_ids", Nd4j.createFromArray(input_ids)); put("segment_ids", Nd4j.createFromArray(segment_ids)); }}; List fetch = Arrays.asList("pooled_output"); - Client client = new Client(); - String target = "localhost:9393"; - boolean succ = client.connect(target); - if (succ != true) { - System.out.println("connect failed."); - return false; - } - - Map fetch_map = client.predict(feed_data, fetch); - if (fetch_map == null) { - return false; - } - - for (Map.Entry e : fetch_map.entrySet()) { - System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); - } + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.loadClientConfig(model_config_path); + String result = client.predict(feed_data, fetch, true, 0); + System.out.println(result); return true; } - boolean cube_local() { + boolean cube_local(String model_config_path) { long[] embedding_14 = {250644}; long[] embedding_2 = {890346}; long[] embedding_10 = {3939}; @@ -271,8 +236,8 @@ public class PaddleServingClientExample { long[] embedding_19 = {537425}; long[] embedding_0 = {737395}; - HashMap feed_data - = new HashMap() {{ + 
HashMap feed_data + = new HashMap() {{ put("embedding_14.tmp_0", Nd4j.createFromArray(embedding_14)); put("embedding_2.tmp_0", Nd4j.createFromArray(embedding_2)); put("embedding_10.tmp_0", Nd4j.createFromArray(embedding_10)); @@ -302,23 +267,12 @@ public class PaddleServingClientExample { put("embedding_0.tmp_0", Nd4j.createFromArray(embedding_0)); }}; List fetch = Arrays.asList("prob"); - Client client = new Client(); - String target = "localhost:9393"; - boolean succ = client.connect(target); - if (succ != true) { - System.out.println("connect failed."); - return false; - } - - Map fetch_map = client.predict(feed_data, fetch); - if (fetch_map == null) { - return false; - } - - for (Map.Entry e : fetch_map.entrySet()) { - System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue()); - } + client.setIP("127.0.0.1"); + client.setPort("9393"); + client.loadClientConfig(model_config_path); + String result = client.predict(feed_data, fetch, true, 0); + System.out.println(result); return true; } @@ -328,33 +282,37 @@ public class PaddleServingClientExample { PaddleServingClientExample e = new PaddleServingClientExample(); boolean succ = false; - if (args.length < 1) { - System.out.println("Usage: java -cp PaddleServingClientExample ."); - System.out.println(": fit_a_line bert model_ensemble asyn_predict batch_predict cube_local cube_quant yolov4"); + if (args.length < 2) { + System.out.println("Usage: java -cp PaddleServingClientExample ."); + System.out.println(": http_proto http_json grpc bert cube_local yolov4 encrypt"); return; } String testType = args[0]; System.out.format("[Example] %s\n", testType); - if ("fit_a_line".equals(testType)) { - succ = e.fit_a_line(); + if ("http_proto".equals(testType)) { + succ = e.http_proto(args[1]); + } else if ("http_json".equals(testType)) { + succ = e.http_json(args[1]); + } else if ("grpc".equals(testType)) { + succ = e.grpc(args[1]); + } else if ("compress".equals(testType)) { + succ = e.compress(args[1]); } else if ("bert".equals(testType)) { - succ = e.bert(); - } else if ("model_ensemble".equals(testType)) { - succ = e.model_ensemble(); - } else if ("asyn_predict".equals(testType)) { - succ = e.asyn_predict(); - } else if ("batch_predict".equals(testType)) { - succ = e.batch_predict(); + succ = e.bert(args[1]); } else if ("cube_local".equals(testType)) { - succ = e.cube_local(); - } else if ("cube_quant".equals(testType)) { - succ = e.cube_local(); + succ = e.cube_local(args[1]); } else if ("yolov4".equals(testType)) { - if (args.length < 2) { - System.out.println("Usage: java -cp PaddleServingClientExample yolov4 ."); + if (args.length < 3) { + System.out.println("Usage: java -cp PaddleServingClientExample yolov4 ."); + return; + } + succ = e.yolov4(args[1],args[2]); + } else if ("encrypt".equals(testType)) { + if (args.length < 3) { + System.out.println("Usage: java -cp PaddleServingClientExample encrypt ."); return; } - succ = e.yolov4(args[1]); + succ = e.encrypt(args[1],args[2]); } else { System.out.format("test-type(%s) not match.\n", testType); return; diff --git a/java/examples/src/main/java/PipelineClientExample.java b/java/examples/src/main/java/PipelineClientExample.java old mode 100755 new mode 100644 diff --git a/java/examples/src/main/java/StaticPipelineClient.java b/java/examples/src/main/java/StaticPipelineClient.java old mode 100755 new mode 100644 diff --git a/java/pom.xml b/java/pom.xml index cd8ce00051d03e6f14c4d524725a17ba71be2511..3072da68908844f8bbcfde63b25f9a811dbf8e82 100644 --- a/java/pom.xml +++ b/java/pom.xml 
@@ -145,6 +145,11 @@ json 20190722 + + org.apache.httpcomponents + httpclient + 4.5.12 + org.slf4j slf4j-api diff --git a/java/src/main/java/io/paddle/serving/client/Client.java b/java/src/main/java/io/paddle/serving/client/Client.java old mode 100644 new mode 100755 index aae7e6f8f50d4ca2baca877f2e51c8e71eb64af8..63e861ba6199c7a56129c4d3b0cb03a77d26f6b7 --- a/java/src/main/java/io/paddle/serving/client/Client.java +++ b/java/src/main/java/io/paddle/serving/client/Client.java @@ -1,27 +1,68 @@ package io.paddle.serving.client; - import java.util.*; import java.util.function.Function; +import java.util.stream.*; +import java.util.Arrays; +import java.util.Iterator; import java.lang.management.ManagementFactory; import java.lang.management.RuntimeMXBean; import java.util.stream.Collectors; -import java.util.List; -import java.util.ArrayList; +import java.util.stream.IntStream; +import java.util.stream.LongStream; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Map.Entry; +import java.nio.file.*; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.iter.NdIndexIterator; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.nativeblas.Nd4jCpu.boolean_and; + +import java.lang.reflect.*; + +import org.apache.http.HttpEntity; +import org.apache.http.NameValuePair; +import org.apache.http.client.ClientProtocolException; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.entity.StringEntity; +import org.apache.http.client.entity.GzipDecompressingEntity; +import org.apache.http.Header; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.message.BasicNameValuePair; +import org.apache.http.util.EntityUtils; +import org.hamcrest.core.IsInstanceOf; +import org.apache.http.entity.InputStreamEntity; +import org.apache.http.entity.ByteArrayEntity; + +import org.json.*; + +import io.paddle.serving.configure.*; +import baidu.paddle_serving.predictor.general_model.*; + +import org.apache.commons.lang3.ArrayUtils; + import io.grpc.ManagedChannel; import io.grpc.ManagedChannelBuilder; import io.grpc.StatusRuntimeException; import com.google.protobuf.ByteString; -import com.google.common.util.concurrent.ListenableFuture; -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.api.iter.NdIndexIterator; -import org.nd4j.linalg.factory.Nd4j; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + -import io.paddle.serving.grpc.*; -import io.paddle.serving.configure.*; -import io.paddle.serving.client.PredictFuture; +enum ElementType +{ + Int64_type, Float32_type, Int32_type, Bytes_type; +} class Profiler { int pid_; @@ -56,35 +97,62 @@ class Profiler { enable_ = flag; } } - public class Client { - private ManagedChannel channel_; - private MultiLangGeneralModelServiceGrpc.MultiLangGeneralModelServiceBlockingStub blockingStub_; - private MultiLangGeneralModelServiceGrpc.MultiLangGeneralModelServiceFutureStub futureStub_; - private double rpcTimeoutS_; + private int timeoutS_; private List feedNames_; + private Map feedRealNames_; private Map feedTypes_; private Map> feedShapes_; + private Map feedNameToIndex_; + private 
Map feedTypeToDataKey_; private List fetchNames_; private Map fetchTypes_; private Set lodTensorSet_; private Map feedTensorLen_; private Profiler profiler_; + private String ip; + private String serverPort; + private String port; + private String serviceName; + private boolean request_compress_flag; + private boolean response_compress_flag; + private String GLOG_v; + private boolean http_proto; + private boolean use_grpc_client; + private ManagedChannel channel_; + private GeneralModelServiceGrpc.GeneralModelServiceBlockingStub blockingStub_; - public Client() { - channel_ = null; - blockingStub_ = null; - futureStub_ = null; - rpcTimeoutS_ = 2; + public Client() { feedNames_ = null; + feedRealNames_ = null; feedTypes_ = null; feedShapes_ = null; fetchNames_ = null; fetchTypes_ = null; lodTensorSet_ = null; feedTensorLen_ = null; - + feedNameToIndex_ = null; + timeoutS_ = 200000; + ip = "127.0.0.1"; + port = "9393"; + serverPort = "9393"; + serviceName = "/GeneralModelService/inference"; + request_compress_flag = false; + response_compress_flag = false; + GLOG_v = System.getenv("GLOG_v"); + http_proto = true;//use the Proto in HTTP by default. + use_grpc_client = false; + + channel_ = null; + blockingStub_ = null; + + feedTypeToDataKey_ = new HashMap(); + feedTypeToDataKey_.put(0, "int64_data"); + feedTypeToDataKey_.put(1, "float_data"); + feedTypeToDataKey_.put(2, "int_data"); + feedTypeToDataKey_.put(3, "data"); + profiler_ = new Profiler(); boolean is_profile = false; String FLAGS_profile_client = System.getenv("FLAGS_profile_client"); @@ -93,85 +161,57 @@ public class Client { } profiler_.enable(is_profile); } - - public boolean setRpcTimeoutMs(int rpc_timeout) { - if (futureStub_ == null || blockingStub_ == null) { - System.out.println("set timeout must be set after connect."); - return false; - } - rpcTimeoutS_ = rpc_timeout / 1000.0; - SetTimeoutRequest timeout_req = SetTimeoutRequest.newBuilder() - .setTimeoutMs(rpc_timeout) - .build(); - SimpleResponse resp; - try { - resp = blockingStub_.setTimeout(timeout_req); - } catch (StatusRuntimeException e) { - System.out.format("Set RPC timeout failed: %s\n", e.toString()); - return false; - } - return resp.getErrCode() == 0; + + public void setTimeOut(int timeoutS_) { + this.timeoutS_ = timeoutS_; } - public boolean connect(String target) { - // TODO: target must be NameResolver-compliant URI - // https://grpc.github.io/grpc-java/javadoc/io/grpc/ManagedChannelBuilder.html - try { - channel_ = ManagedChannelBuilder.forTarget(target) - .defaultLoadBalancingPolicy("round_robin") - .maxInboundMessageSize(Integer.MAX_VALUE) - .usePlaintext() - .build(); - blockingStub_ = MultiLangGeneralModelServiceGrpc.newBlockingStub(channel_); - futureStub_ = MultiLangGeneralModelServiceGrpc.newFutureStub(channel_); - } catch (Exception e) { - System.out.format("Connect failed: %s\n", e.toString()); - return false; - } - GetClientConfigRequest get_client_config_req = GetClientConfigRequest.newBuilder().build(); - GetClientConfigResponse resp; - try { - resp = blockingStub_.getClientConfig(get_client_config_req); - } catch (Exception e) { - System.out.format("Get Client config failed: %s\n", e.toString()); - return false; - } - String model_config_str = resp.getClientConfigStr(); - _parseModelConfig(model_config_str); - return true; + public void setIP(String ip) { + this.ip = ip; + } + + public void setPort(String port) { + this.port = port; + this.serverPort = port; + } + + public void setServiceName(String serviceName){ + this.serviceName = 
serviceName; } - private void _parseModelConfig(String model_config_str) { + public void loadClientConfig(String model_config_path) { GeneralModelConfig.Builder model_conf_builder = GeneralModelConfig.newBuilder(); try { + byte[] data = Files.readAllBytes(Paths.get(model_config_path)); + String model_config_str = new String(data, "utf-8"); com.google.protobuf.TextFormat.getParser().merge(model_config_str, model_conf_builder); } catch (com.google.protobuf.TextFormat.ParseException e) { System.out.format("Parse client config failed: %s\n", e.toString()); + } catch (Exception e) { + System.out.format("Open client config failed: %s\n", e.toString()); } GeneralModelConfig model_conf = model_conf_builder.build(); feedNames_ = new ArrayList(); - fetchNames_ = new ArrayList(); + feedRealNames_ = new HashMap(); feedTypes_ = new HashMap(); feedShapes_ = new HashMap>(); - fetchTypes_ = new HashMap(); lodTensorSet_ = new HashSet(); feedTensorLen_ = new HashMap(); + feedNameToIndex_ = new HashMap(); - List feed_var_list = model_conf.getFeedVarList(); - for (FeedVar feed_var : feed_var_list) { - feedNames_.add(feed_var.getAliasName()); - } - List fetch_var_list = model_conf.getFetchVarList(); - for (FetchVar fetch_var : fetch_var_list) { - fetchNames_.add(fetch_var.getAliasName()); - } + fetchNames_ = new ArrayList(); + fetchTypes_ = new HashMap(); + List feed_var_list = model_conf.getFeedVarList(); for (int i = 0; i < feed_var_list.size(); ++i) { FeedVar feed_var = feed_var_list.get(i); String var_name = feed_var.getAliasName(); + feedNames_.add(var_name); + feedRealNames_.put(var_name, feed_var.getName()); feedTypes_.put(var_name, feed_var.getFeedType()); feedShapes_.put(var_name, feed_var.getShapeList()); + feedNameToIndex_.put(var_name, i); if (feed_var.getIsLodTensor()) { lodTensorSet_.add(var_name); } else { @@ -183,391 +223,553 @@ public class Client { } } + List fetch_var_list = model_conf.getFetchVarList(); for (int i = 0; i < fetch_var_list.size(); i++) { FetchVar fetch_var = fetch_var_list.get(i); String var_name = fetch_var.getAliasName(); + fetchNames_.add(var_name); fetchTypes_.put(var_name, fetch_var.getFetchType()); - if (fetch_var.getIsLodTensor()) { - lodTensorSet_.add(var_name); - } } } - private InferenceRequest _packInferenceRequest( - List> feed_batch, - Iterable fetch, - long log_id) throws IllegalArgumentException { - List feed_var_names = new ArrayList(); - feed_var_names.addAll(feed_batch.get(0).keySet()); - - InferenceRequest.Builder req_builder = InferenceRequest.newBuilder() - .addAllFeedVarNames(feed_var_names) - .addAllFetchVarNames(fetch) - .setIsPython(false) - .setLogId(log_id); - for (HashMap feed_data: feed_batch) { - FeedInst.Builder inst_builder = FeedInst.newBuilder(); - for (String name: feed_var_names) { - Tensor.Builder tensor_builder = Tensor.newBuilder(); - INDArray variable = feed_data.get(name); - long[] flattened_shape = {-1}; - INDArray flattened_list = variable.reshape(flattened_shape); - int v_type = feedTypes_.get(name); - NdIndexIterator iter = new NdIndexIterator(flattened_list.shape()); - if (v_type == 0) { // int64 - while (iter.hasNext()) { - long[] next_index = iter.next(); - long x = flattened_list.getLong(next_index); - tensor_builder.addInt64Data(x); - } - } else if (v_type == 1) { // float32 - while (iter.hasNext()) { - long[] next_index = iter.next(); - float x = flattened_list.getFloat(next_index); - tensor_builder.addFloatData(x); - } - } else if (v_type == 2) { // int32 - while (iter.hasNext()) { - long[] next_index = iter.next(); - // the 
interface of INDArray is strange: - // https://deeplearning4j.org/api/latest/org/nd4j/linalg/api/ndarray/INDArray.html - int[] int_next_index = new int[next_index.length]; - for(int i = 0; i < next_index.length; i++) { - int_next_index[i] = (int)next_index[i]; - } - int x = flattened_list.getInt(int_next_index); - tensor_builder.addIntData(x); - } - } else { - throw new IllegalArgumentException("error tensor value type."); - } - long[] longArray = variable.shape(); - int[] intArray = Arrays.stream(longArray).mapToInt(i -> (int) i).toArray(); - List indarrayShapeList = Arrays.stream(intArray).boxed().collect(Collectors.toList()); - //tensor_builder.addAllShape(feedShapes_.get(name)); - tensor_builder.addAllShape(indarrayShapeList); - inst_builder.addTensorArray(tensor_builder.build()); - } - req_builder.addInsts(inst_builder.build()); + public void use_key(String keyFilePath) { + String key_str = null; + String encrypt_url = "http://" + this.ip + ":" +this.port; + try { + byte[] data = Files.readAllBytes(Paths.get(keyFilePath)); + key_str = Base64.getEncoder().encodeToString(data); + } catch (Exception e) { + System.out.format("Open key file failed: %s\n", e.toString()); } - return req_builder.build(); - } - - private Map> - _unpackInferenceResponse( - InferenceResponse resp, - Iterable fetch, - Boolean need_variant_tag) throws IllegalArgumentException { - return Client._staticUnpackInferenceResponse( - resp, fetch, fetchTypes_, lodTensorSet_, need_variant_tag); - } - - private static Map> - _staticUnpackInferenceResponse( - InferenceResponse resp, - Iterable fetch, - Map fetchTypes, - Set lodTensorSet, - Boolean need_variant_tag) throws IllegalArgumentException { - if (resp.getErrCode() != 0) { - return null; + JSONObject jsonKey = new JSONObject(); + if( key_str != null) { + jsonKey.put("key", key_str); + }else{ + jsonKey.put("key", ""); } - String tag = resp.getTag(); - HashMap> multi_result_map - = new HashMap>(); - for (ModelOutput model_result: resp.getOutputsList()) { - String engine_name = model_result.getEngineName(); - FetchInst inst = model_result.getInsts(0); - HashMap result_map - = new HashMap(); - int index = 0; - for (String name: fetch) { - Tensor variable = inst.getTensorArray(index); - int v_type = fetchTypes.get(name); - INDArray data = null; - if (v_type == 0) { // int64 - List list = variable.getInt64DataList(); - long[] array = new long[list.size()]; - for (int i = 0; i < list.size(); i++) { - array[i] = list.get(i); - } - data = Nd4j.createFromArray(array); - } else if (v_type == 1) { // float32 - List list = variable.getFloatDataList(); - float[] array = new float[list.size()]; - for (int i = 0; i < list.size(); i++) { - array[i] = list.get(i); - } - data = Nd4j.createFromArray(array); - } else if (v_type == 2) { // int32 - List list = variable.getIntDataList(); - int[] array = new int[list.size()]; - for (int i = 0; i < list.size(); i++) { - array[i] = list.get(i); - } - data = Nd4j.createFromArray(array); - } else { - throw new IllegalArgumentException("error tensor value type."); - } - // shape - List shape_lsit = variable.getShapeList(); - int[] shape_array = new int[shape_lsit.size()]; - for (int i = 0; i < shape_lsit.size(); ++i) { - shape_array[i] = shape_lsit.get(i); - } - data = data.reshape(shape_array); - - // put data to result_map - result_map.put(name, data); - - // lod - if (lodTensorSet.contains(name)) { - List list = variable.getLodList(); - int[] array = new int[list.size()]; - for (int i = 0; i < list.size(); i++) { - array[i] = list.get(i); - } - 
result_map.put(name + ".lod", Nd4j.createFromArray(array)); - } - index += 1; - } - multi_result_map.put(engine_name, result_map); + String result = doPost(encrypt_url, jsonKey.toString()); + try { + JSONObject jsonObject = new JSONObject(result); + JSONArray jsonArray = jsonObject.getJSONArray("endpoint_list"); + this.serverPort = jsonArray.getString(0); + System.out.format("Real ServerPort is: %s\n", this.serverPort); + }catch (JSONException err) { + System.out.format("Parse serverPort failed: %s\n", err.toString()); } - - // TODO: tag(ABtest not support now) - return multi_result_map; - } - - public Map predict( - HashMap feed, - Iterable fetch) { - return predict(feed, fetch, false, 0); - } - - public Map predict( - HashMap feed, - Iterable fetch, - long log_id) { - return predict(feed, fetch, false, log_id); } - public Map> ensemble_predict( - HashMap feed, - Iterable fetch) { - return ensemble_predict(feed, fetch, false, 0); + public void set_request_compress(boolean request_compress_flag) { + this.request_compress_flag = request_compress_flag; } - public Map> ensemble_predict( - HashMap feed, - Iterable fetch, - long log_id) { - return ensemble_predict(feed, fetch, false, log_id); + public void set_response_compress(boolean response_compress_flag) { + this.response_compress_flag = response_compress_flag; } - public PredictFuture asyn_predict( - HashMap feed, - Iterable fetch) { - return asyn_predict(feed, fetch, false, 0); + public void set_http_proto(boolean http_proto){ + this.http_proto = http_proto; } - - public PredictFuture asyn_predict( - HashMap feed, - Iterable fetch, - long log_id) { - return asyn_predict(feed, fetch, false, log_id); - } - - public Map predict( - HashMap feed, - Iterable fetch, - Boolean need_variant_tag) { - return predict(feed, fetch, need_variant_tag, 0); - } - - public Map predict( - HashMap feed, - Iterable fetch, - Boolean need_variant_tag, - long log_id) { - List> feed_batch - = new ArrayList>(); - feed_batch.add(feed); - return predict(feed_batch, fetch, need_variant_tag, log_id); + public void set_use_grpc_client(boolean use_grpc_client){ + this.use_grpc_client = use_grpc_client; } - public Map> ensemble_predict( - HashMap feed, - Iterable fetch, - Boolean need_variant_tag) { - return ensemble_predict(feed, fetch, need_variant_tag, 0); + public byte[] compress(Object obj) { + if (obj == null) { + return null; + } + ByteArrayOutputStream out = new ByteArrayOutputStream(); + GZIPOutputStream gzip; + try { + gzip = new GZIPOutputStream(out); + if(obj instanceof String){ + gzip.write(((String)obj).getBytes("UTF-8")); + }else{ + gzip.write((byte[])obj); + } + gzip.close(); + } catch (Exception e) { + e.printStackTrace(); + } + return out.toByteArray(); } - public Map> ensemble_predict( - HashMap feed, - Iterable fetch, - Boolean need_variant_tag, - long log_id) { - List> feed_batch - = new ArrayList>(); - feed_batch.add(feed); - return ensemble_predict(feed_batch, fetch, need_variant_tag, log_id); - } - - public PredictFuture asyn_predict( - HashMap feed, - Iterable fetch, - Boolean need_variant_tag) { - return asyn_predict(feed, fetch, need_variant_tag, 0); + // 帮助用户封装Http请求的接口,用户只需要传递FeedData,Lod,Fetchlist即可。 + // 根据Proto组装Json的过程由这个函数来完成,且接口与Python基本一致. + // 共提供了四组重载的接口,支持用户最少传入feedData和fetch,还可传lod和batchFlag. 
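// A minimal usage sketch of the overloads documented above (illustration only; it
// assumes a Client whose feed/fetch configuration has already been loaded, and the
// feed name "words" and fetch name "pooled_output" are placeholders rather than
// names guaranteed by this file).
static String predictSketch(Client client) {
    Map<String, Object> feedData = new HashMap<>();
    feedData.put("words", Nd4j.zeros(128, 1));            // INDArray feeds are flattened by the client
    List<String> fetch = Arrays.asList("pooled_output");
    // shortest overload: feed + fetch + log_id (lod omitted, batchFlag defaults to false)
    String r1 = client.predict(feedData, fetch, 1);
    // fullest overload: feed + lod + fetch + batchFlag + log_id
    Map<String, Object> feedLod = new HashMap<>();
    feedLod.put("words", new int[] {0, 128});             // placeholder lod values
    return (r1 == null) ? null : client.predict(feedData, feedLod, fetch, true, 2);
}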
+ public String predict(Map feedData, + List fetch, + int log_id) { + + return predict(feedData,null,fetch,false,log_id); } - public PredictFuture asyn_predict( - HashMap feed, - Iterable fetch, - Boolean need_variant_tag, - long log_id) { - List> feed_batch - = new ArrayList>(); - feed_batch.add(feed); - return asyn_predict(feed_batch, fetch, need_variant_tag, log_id); + public String predict(Map feedData, + List fetch, + boolean batchFlag, + int log_id) { + + return predict(feedData,null,fetch,batchFlag,log_id); } - public Map predict( - List> feed_batch, - Iterable fetch) { - return predict(feed_batch, fetch, false, 0); + public String predict(Map feedData, + Map feedLod, + List fetch, + int log_id) { + + return predict(feedData,feedLod,fetch,false,log_id); } - public Map predict( - List> feed_batch, - Iterable fetch, - long log_id) { - return predict(feed_batch, fetch, false, log_id); - } - - public Map> ensemble_predict( - List> feed_batch, - Iterable fetch) { - return ensemble_predict(feed_batch, fetch, false, 0); + public String predict(Map feedData, + Map feedLod, + List fetch, + boolean batchFlag, + int log_id) { + if(this.use_grpc_client){ + return grpc_predict(feedData, feedLod, fetch, batchFlag, log_id); + } + return http_predict(feedData, feedLod, fetch, batchFlag, log_id); } - public Map> ensemble_predict( - List> feed_batch, - Iterable fetch, - long log_id) { - return ensemble_predict(feed_batch, fetch, false, log_id); - } + public String grpc_predict(Map feedData, + Map feedLod, + List fetch, + boolean batchFlag, + int log_id) { + String result = null; + try { + String server_url = this.ip + ":" + this.serverPort; + channel_ = ManagedChannelBuilder.forTarget(server_url) + .defaultLoadBalancingPolicy("round_robin") + .maxInboundMessageSize(Integer.MAX_VALUE) + .usePlaintext() + .build(); + blockingStub_ = GeneralModelServiceGrpc.newBlockingStub(channel_); + Request request = process_proto_data(feedData, feedLod, fetch, batchFlag, log_id); + Response resp = blockingStub_.inference(request); + result = resp.toString(); + } catch (Exception e) { + System.out.format("grpc_predict failed: %s\n", e.toString()); + return null; + } + return result; + } + + public String http_predict(Map feedData, + Map feedLod, + List fetch, + boolean batchFlag, + int log_id) { + String server_url = "http://" + this.ip + ":" + this.serverPort + this.serviceName; + // 处理fetchList + String result = null; + if(this.http_proto){ + Request request = process_proto_data(feedData, feedLod, fetch, batchFlag, log_id); + result = doPost(server_url, request.toByteArray()); + + }else{ + JSONObject jsonRequest = process_json_data(feedData,feedLod,fetch,batchFlag,log_id); + result = doPost(server_url, jsonRequest.toString()); + } + return result; + } + + public String doPost(String url, Object postData) { + CloseableHttpClient httpClient = null; + CloseableHttpResponse httpResponse = null; + String result = ""; + // 创建httpClient实例 + httpClient = HttpClients.createDefault(); + // 创建httpPost远程连接实例 + HttpPost httpPost = new HttpPost(url); + // 配置请求参数实例 + RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(timeoutS_)// 设置连接主机服务超时时间 + .setConnectionRequestTimeout(timeoutS_)// 设置连接请求超时时间 + .setSocketTimeout(timeoutS_)// 设置读取数据连接超时时间 + .build(); + // 为httpPost实例设置配置 + httpPost.setConfig(requestConfig); + if(this.http_proto){ + httpPost.setHeader("Content-Type", "application/proto"); + }else{ + httpPost.setHeader("Content-Type", "application/json"); + } + + // 设置请求头 + if(response_compress_flag){ + 
httpPost.addHeader("Accept-encoding", "gzip"); + if(GLOG_v != null){ + System.out.format("------- Accept-encoding gzip: \n"); + } + } + + try { + if(postData instanceof String){ + if(request_compress_flag && ((String)postData).length()>1024){ + try{ + byte[] gzipEncrypt = compress(postData); + httpPost.setEntity(new InputStreamEntity(new ByteArrayInputStream(gzipEncrypt), gzipEncrypt.length)); + httpPost.addHeader("Content-Encoding", "gzip"); + } catch (Exception e) { + e.printStackTrace(); + } + }else{ + httpPost.setEntity(new StringEntity((String)postData, "UTF-8")); + } - public PredictFuture asyn_predict( - List> feed_batch, - Iterable fetch) { - return asyn_predict(feed_batch, fetch, false, 0); + }else{ + if(request_compress_flag && ((byte[])postData).length>1024){ + try{ + byte[] gzipEncrypt = compress(postData); + httpPost.setEntity(new InputStreamEntity(new ByteArrayInputStream(gzipEncrypt), gzipEncrypt.length)); + httpPost.addHeader("Content-Encoding", "gzip"); + } catch (Exception e) { + e.printStackTrace(); + } + }else{ + httpPost.setEntity(new ByteArrayEntity((byte[])postData)); + //httpPost.setEntity(new InputStreamEntity(new ByteArrayInputStream((byte[])postData), ((byte[])postData).length)); + } + } + + // httpClient对象执行post请求,并返回响应参数对象 + httpResponse = httpClient.execute(httpPost); + // 从响应对象中获取响应内容 + HttpEntity entity = httpResponse.getEntity(); + Header header = entity.getContentEncoding(); + if(GLOG_v != null){ + System.out.format("------- response header: %s\n", header); + } + if(header != null && header.getValue().equalsIgnoreCase("gzip")){ //判断返回内容是否为gzip压缩格式 + GzipDecompressingEntity gzipEntity = new GzipDecompressingEntity(entity); + result = EntityUtils.toString(gzipEntity); + if(GLOG_v != null){ + System.out.format("------- degzip response: %s\n", result); + } + }else{ + if(this.http_proto){ + + Response resp = Response.parseFrom(EntityUtils.toByteArray(entity)); + result = resp.toString(); + }else{ + result = EntityUtils.toString(entity); + } + } + } catch (ClientProtocolException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } finally { + // 关闭资源 + if (null != httpResponse) { + try { + httpResponse.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + if (null != httpClient) { + try { + httpClient.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + return result; } - public PredictFuture asyn_predict( - List> feed_batch, - Iterable fetch, - long log_id) { - return asyn_predict(feed_batch, fetch, false, log_id); - } + public List recursiveExtract(Object stuff) { - public Map predict( - List> feed_batch, - Iterable fetch, - Boolean need_variant_tag) { - return predict(feed_batch, fetch, need_variant_tag, 0); + List mylist = new ArrayList(); + + if(stuff instanceof Iterable) { + for(Object o : (Iterable< ? 
>)stuff) { + mylist.addAll(recursiveExtract(o)); + } + } else if(stuff instanceof Map) { + for(Object o : ((Map) stuff).values()) { + mylist.addAll(recursiveExtract(o)); + } + } else { + mylist.add(stuff); + } + + return mylist; } - public Map predict( - List> feed_batch, - Iterable fetch, - Boolean need_variant_tag, - long log_id) { - try { - profiler_.record("java_prepro_0"); - InferenceRequest req = _packInferenceRequest( - feed_batch, fetch, log_id); - profiler_.record("java_prepro_1"); - - profiler_.record("java_client_infer_0"); - InferenceResponse resp = blockingStub_.inference(req); - profiler_.record("java_client_infer_1"); - - profiler_.record("java_postpro_0"); - Map> ensemble_result - = _unpackInferenceResponse(resp, fetch, need_variant_tag); - List>> list - = new ArrayList>>( - ensemble_result.entrySet()); - if (list.size() != 1) { - System.out.format("Failed to predict: please use ensemble_predict impl.\n"); - return null; + public JSONObject process_json_data(Map feedData, + Map feedLod, + List fetch, + boolean batchFlag, + int log_id){ + + // 处理Tensor + JSONArray jsonTensorArray = new JSONArray(); + try{ + if (null != feedData && feedData.size() > 0) { + // 通过map集成entrySet方法获取entity + Set> entrySet = feedData.entrySet(); + // 循环遍历,获取迭代器 + Iterator> iterator = entrySet.iterator(); + while (iterator.hasNext()) { + JSONObject jsonTensor = new JSONObject(); + Entry mapEntry = iterator.next(); + Object objectValue = mapEntry.getValue(); + String feed_alias_name = mapEntry.getKey(); + String feed_real_name = feedRealNames_.get(feed_alias_name); + List shape = new ArrayList(feedShapes_.get(feed_alias_name)); + int element_type = feedTypes_.get(feed_alias_name); + + jsonTensor.put("alias_name", feed_alias_name); + jsonTensor.put("name", feed_real_name); + jsonTensor.put("elem_type", element_type); + + // 处理数据与shape + String protoDataKey = feedTypeToDataKey_.get(element_type); + // 如果是INDArray类型,先转为一维. 
+ // 此时shape为INDArray的shape + if(objectValue instanceof INDArray){ + INDArray tempIndArray = (INDArray)objectValue; + long[] indarrayShape = tempIndArray.shape(); + shape.clear(); + for(long dim:indarrayShape){ + shape.add((int)dim); + } + if(element_type == ElementType.Int64_type.ordinal()){ + objectValue = tempIndArray.data().asLong(); + }else if(element_type == ElementType.Int32_type.ordinal()){ + objectValue = tempIndArray.data().asInt(); + }else if(element_type == ElementType.Float32_type.ordinal()){ + objectValue = tempIndArray.data().asFloat(); + }else{ + throw new Exception("INDArray 类型不支持"); + } + }else if(objectValue.getClass().isArray()){ + // 如果是数组类型,则无须处理,直接使用即可。 + // 且数组无法嵌套,此时batch无法从数据中获取 + // 默认batch维度为1,或者feedVar的shape信息中已包含batch + }else if(objectValue instanceof List){ + // 如果为list,可能存在嵌套,此时需要展平 + // 如果batchFlag为True,则认为是嵌套list + // 此时取最外层为batch的维度 + if (batchFlag) { + List list = new ArrayList<>(); + list = new ArrayList<>((Collection)objectValue); + // 在index=0处,加上batch + shape.add(0, list.size()); + } + objectValue = recursiveExtract(objectValue); + }else{ + // 此时认为是传入的单个String或者Int等 + // 此时无法获取batch信息,故对shape不处理 + // 由于Proto中为Repeated,需要把数据包装成list + if(objectValue instanceof String){ + if(feedTypes_.get(protoDataKey)!= ElementType.Bytes_type.ordinal()){ + throw new Exception("feedvar is not string-type,feed can`t be a single string."); + } + }else{ + if(feedTypes_.get(protoDataKey)== ElementType.Bytes_type.ordinal()){ + throw new Exception("feedvar is string-type,feed, feed can`t be a single int or others."); + } + } + List list = new ArrayList<>(); + list.add(objectValue); + objectValue = list; + } + jsonTensor.put(protoDataKey,objectValue); + if(!batchFlag){ + // 在index=0处,加上batch=1 + shape.add(0, 1); + } + jsonTensor.put("shape", shape); + + // 处理lod信息,支持INDArray Array Iterable + Object feedLodValue = null; + if(feedLod != null){ + feedLodValue = feedLod.get(feed_alias_name); + if(feedLodValue != null) { + if(feedLodValue instanceof INDArray){ + INDArray tempIndArray = (INDArray)feedLodValue; + feedLodValue = tempIndArray.data().asInt(); + }else if(feedLodValue.getClass().isArray()){ + // 如果是数组类型,则无须处理,直接使用即可。 + }else if(feedLodValue instanceof Iterable){ + // 如果为list,可能存在嵌套,此时需要展平 + feedLodValue = recursiveExtract(feedLodValue); + }else{ + throw new Exception("Lod must be INDArray or Array or Iterable."); + } + jsonTensor.put("lod", feedLodValue); + } + } + jsonTensorArray.put(jsonTensor); + } } - profiler_.record("java_postpro_1"); - profiler_.printProfile(); - - return list.get(0).getValue(); - } catch (StatusRuntimeException e) { - System.out.format("Failed to predict: %s\n", e.toString()); - return null; + }catch (Exception e) { + e.printStackTrace(); } - } - public Map> ensemble_predict( - List> feed_batch, - Iterable fetch, - Boolean need_variant_tag) { - return ensemble_predict(feed_batch, fetch, need_variant_tag, 0); - } - - public Map> ensemble_predict( - List> feed_batch, - Iterable fetch, - Boolean need_variant_tag, - long log_id) { - try { - profiler_.record("java_prepro_0"); - InferenceRequest req = _packInferenceRequest( - feed_batch, fetch, log_id); - profiler_.record("java_prepro_1"); - - profiler_.record("java_client_infer_0"); - InferenceResponse resp = blockingStub_.inference(req); - profiler_.record("java_client_infer_1"); - - profiler_.record("java_postpro_0"); - Map> ensemble_result - = _unpackInferenceResponse(resp, fetch, need_variant_tag); - profiler_.record("java_postpro_1"); - profiler_.printProfile(); - - return ensemble_result; - } 
catch (StatusRuntimeException e) { - System.out.format("Failed to predict: %s\n", e.toString()); - return null; + JSONArray jsonFetchList = new JSONArray(fetch); + /* + Iterator fetchIterator = fetch.iterator(); + while (fetchIterator.hasNext()) { + jsonFetchList.put(fetchIterator.next()); + } + */ + JSONObject jsonRequest = new JSONObject(); + jsonRequest.put("log_id",log_id); + jsonRequest.put("fetch_var_names", jsonFetchList); + jsonRequest.put("tensor",jsonTensorArray); + if(GLOG_v != null){ + System.out.format("------- Final jsonRequest: %s\n", jsonRequest.toString()); + } + return jsonRequest; + } + + public Request process_proto_data(Map feedData, + Map feedLod, + List fetch, + boolean batchFlag, + int log_id){ + // 处理Tensor + Request.Builder request_builder = Request.newBuilder().addAllFetchVarNames(fetch).setLogId(log_id); + try{ + if (null != feedData && feedData.size() > 0) { + // 通过map集成entrySet方法获取entity + Set> entrySet = feedData.entrySet(); + // 循环遍历,获取迭代器 + Iterator> iterator = entrySet.iterator(); + while (iterator.hasNext()) { + Tensor.Builder tensor_builder = Tensor.newBuilder(); + Entry mapEntry = iterator.next(); + Object objectValue = mapEntry.getValue(); + String feed_alias_name = mapEntry.getKey(); + String feed_real_name = feedRealNames_.get(feed_alias_name); + List shape = new ArrayList(feedShapes_.get(feed_alias_name)); + int element_type = feedTypes_.get(feed_alias_name); + + tensor_builder.setAliasName(feed_alias_name); + tensor_builder.setName(feed_real_name); + tensor_builder.setElemType(element_type); + + // 处理数据与shape + // 如果是INDArray类型,先转为一维. + // 此时shape为INDArray的shape + if(objectValue instanceof INDArray){ + INDArray tempIndArray = (INDArray)objectValue; + long[] indarrayShape = tempIndArray.shape(); + shape.clear(); + for(long dim:indarrayShape){ + shape.add((int)dim); + } + if(element_type == ElementType.Int64_type.ordinal()){ + + List iter = Arrays.stream(tempIndArray.data().asLong()).boxed().collect(Collectors.toList()); + tensor_builder.addAllInt64Data(iter); + + }else if(element_type == ElementType.Int32_type.ordinal()){ + + List iter = Arrays.stream(tempIndArray.data().asInt()).boxed().collect(Collectors.toList()); + tensor_builder.addAllIntData(iter); + + }else if(element_type == ElementType.Float32_type.ordinal()){ + List iter = Arrays.asList(ArrayUtils.toObject(tempIndArray.data().asFloat())); + tensor_builder.addAllFloatData(iter); + + }else{ + // 看接口是String还是Bytes + throw new Exception("INDArray 类型不支持"); + } + }else if(objectValue.getClass().isArray()){ + // 如果是数组类型,则无须处理,直接使用即可。 + // 且数组无法嵌套,此时batch无法从数据中获取 + // 默认batch维度为1,或者feedVar的shape信息中已包含batch + if(element_type == ElementType.Int64_type.ordinal()){ + List iter = Arrays.stream((long[])objectValue).boxed().collect(Collectors.toList()); + tensor_builder.addAllInt64Data(iter); + }else if(element_type == ElementType.Int32_type.ordinal()){ + List iter = Arrays.stream((int[])objectValue).boxed().collect(Collectors.toList()); + tensor_builder.addAllIntData(iter); + }else if(element_type == ElementType.Float32_type.ordinal()){ + List iter = Arrays.asList(ArrayUtils.toObject((float[])objectValue)); + tensor_builder.addAllFloatData(iter); + }else{ + List iter = Arrays.asList((String[])objectValue); + tensor_builder.addAllData(iter); + } + }else if(objectValue instanceof List){ + // 如果为list,可能存在嵌套,此时需要展平 + // 如果batchFlag为True,则认为是嵌套list + // 此时取最外层为batch的维度 + if (batchFlag) { + List list = new ArrayList<>(); + list = new ArrayList<>((Collection)objectValue); + // 在index=0处,加上batch + shape.add(0, 
list.size()); + } + if(element_type == ElementType.Int64_type.ordinal()){ + tensor_builder.addAllInt64Data((List)(List)recursiveExtract(objectValue)); + }else if(element_type == ElementType.Int32_type.ordinal()){ + tensor_builder.addAllIntData((List)(List)recursiveExtract(objectValue)); + }else if(element_type == ElementType.Float32_type.ordinal()){ + tensor_builder.addAllFloatData((List)(List)recursiveExtract(objectValue)); + }else{ + // 看接口是String还是Bytes + tensor_builder.addAllData((List)(List)recursiveExtract(objectValue)); + } + }else{ + // 此时认为是传入的单个String或者Int等 + // 此时无法获取batch信息,故对shape不处理 + // 由于Proto中为Repeated,需要把数据包装成list + List tempList = new ArrayList<>(); + tempList.add(objectValue); + if(element_type == ElementType.Int64_type.ordinal()){ + tensor_builder.addAllInt64Data((List)(List)tempList); + }else if(element_type == ElementType.Int32_type.ordinal()){ + tensor_builder.addAllIntData((List)(List)tempList); + }else if(element_type == ElementType.Float32_type.ordinal()){ + tensor_builder.addAllFloatData((List)(List)tempList); + }else{ + // 看接口是String还是Bytes + tensor_builder.addAllData((List)(List)tempList); + } + } + if(!batchFlag){ + // 在index=0处,加上batch=1 + shape.add(0, 1); + } + tensor_builder.addAllShape(shape); + + // 处理lod信息,支持INDArray Array Iterable + Object feedLodValue = null; + if(feedLod != null){ + feedLodValue = feedLod.get(feed_alias_name); + if(feedLodValue != null) { + if(feedLodValue instanceof INDArray){ + INDArray tempIndArray = (INDArray)feedLodValue; + List iter = Arrays.stream(tempIndArray.data().asInt()).boxed().collect(Collectors.toList()); + tensor_builder.addAllLod(iter); + }else if(feedLodValue.getClass().isArray()){ + // 如果是数组类型,则无须处理,直接使用即可。 + List iter = Arrays.stream((int[])feedLodValue).boxed().collect(Collectors.toList()); + tensor_builder.addAllLod(iter); + }else if(feedLodValue instanceof Iterable){ + // 如果为list,可能存在嵌套,此时需要展平 + tensor_builder.addAllLod((List)(List)recursiveExtract(feedLodValue)); + }else{ + throw new Exception("Lod must be INDArray or Array or Iterable."); + } + } + } + request_builder.addTensor(tensor_builder.build()); + } + } + }catch (Exception e) { + e.printStackTrace(); } + return request_builder.build(); } - public PredictFuture asyn_predict( - List> feed_batch, - Iterable fetch, - Boolean need_variant_tag) { - return asyn_predict(feed_batch, fetch, need_variant_tag, 0); - } - public PredictFuture asyn_predict( - List> feed_batch, - Iterable fetch, - Boolean need_variant_tag, - long log_id) { - InferenceRequest req = _packInferenceRequest( - feed_batch, fetch, log_id); - ListenableFuture future = futureStub_.inference(req); - PredictFuture predict_future = new PredictFuture(future, - (InferenceResponse resp) -> { - return Client._staticUnpackInferenceResponse( - resp, fetch, fetchTypes_, lodTensorSet_, need_variant_tag); - } - ); - return predict_future; - } } + diff --git a/java/src/main/java/io/paddle/serving/client/PipelineClient.java b/java/src/main/java/io/paddle/serving/client/PipelineClient.java old mode 100755 new mode 100644 diff --git a/java/src/main/java/io/paddle/serving/client/PredictFuture.java b/java/src/main/java/io/paddle/serving/client/PredictFuture.java deleted file mode 100644 index 28156d965e76db889358be00ab8c05381e0f89d8..0000000000000000000000000000000000000000 --- a/java/src/main/java/io/paddle/serving/client/PredictFuture.java +++ /dev/null @@ -1,54 +0,0 @@ -package io.paddle.serving.client; - -import java.util.*; -import java.util.function.Function; -import io.grpc.StatusRuntimeException; 
-import com.google.common.util.concurrent.ListenableFuture; -import org.nd4j.linalg.api.ndarray.INDArray; - -import io.paddle.serving.client.Client; -import io.paddle.serving.grpc.*; - -public class PredictFuture { - private ListenableFuture callFuture_; - private Function>> callBackFunc_; - - PredictFuture(ListenableFuture call_future, - Function>> call_back_func) { - callFuture_ = call_future; - callBackFunc_ = call_back_func; - } - - public Map get() { - InferenceResponse resp = null; - try { - resp = callFuture_.get(); - } catch (Exception e) { - System.out.format("predict failed: %s\n", e.toString()); - return null; - } - Map> ensemble_result - = callBackFunc_.apply(resp); - List>> list - = new ArrayList>>( - ensemble_result.entrySet()); - if (list.size() != 1) { - System.out.format("predict failed: please use get_ensemble impl.\n"); - return null; - } - return list.get(0).getValue(); - } - - public Map> ensemble_get() { - InferenceResponse resp = null; - try { - resp = callFuture_.get(); - } catch (Exception e) { - System.out.format("predict failed: %s\n", e.toString()); - return null; - } - return callBackFunc_.apply(resp); - } -} diff --git a/java/src/main/proto/general_model_service.proto b/java/src/main/proto/general_model_service.proto new file mode 100644 index 0000000000000000000000000000000000000000..89ac489f8ae3b90b74c94a3f9f3c82711086cd64 --- /dev/null +++ b/java/src/main/proto/general_model_service.proto @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto2"; +package baidu.paddle_serving.predictor.general_model; +option java_multiple_files = true; + +message Tensor { + repeated string data = 1; + repeated int32 int_data = 2; + repeated int64 int64_data = 3; + repeated float float_data = 4; + optional int32 elem_type = + 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string + repeated int32 shape = 6; // shape should include batch + repeated int32 lod = 7; // only for fetch tensor currently + optional string name = 8; // get from the Model prototxt + optional string alias_name = 9; // get from the Model prototxt +}; + +message Request { + repeated Tensor tensor = 1; + repeated string fetch_var_names = 2; + optional bool profile_server = 3 [ default = false ]; + required uint64 log_id = 4 [ default = 0 ]; +}; + +message Response { + repeated ModelOutput outputs = 1; + repeated int64 profile_time = 2; +}; + +message ModelOutput { + repeated Tensor tensor = 1; + optional string engine_name = 2; +} + +service GeneralModelService { + rpc inference(Request) returns (Response) {} + rpc debug(Request) returns (Response) {} +}; diff --git a/java/src/main/proto/multi_lang_general_model_service.proto b/java/src/main/proto/multi_lang_general_model_service.proto deleted file mode 100644 index 18fbcf760647e1694e738c0832fe45f4f7d9934f..0000000000000000000000000000000000000000 --- a/java/src/main/proto/multi_lang_general_model_service.proto +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -syntax = "proto2"; - -package baidu.paddle_serving.multi_lang; - -option java_multiple_files = true; -option java_package = "io.paddle.serving.grpc"; -option java_outer_classname = "ServingProto"; - -message Tensor { - optional bytes data = 1; - repeated int32 int_data = 2; - repeated int64 int64_data = 3; - repeated float float_data = 4; - optional int32 elem_type = 5; - repeated int32 shape = 6; - repeated int32 lod = 7; // only for fetch tensor currently -}; - -message FeedInst { repeated Tensor tensor_array = 1; }; - -message FetchInst { repeated Tensor tensor_array = 1; }; - -message InferenceRequest { - repeated FeedInst insts = 1; - repeated string feed_var_names = 2; - repeated string fetch_var_names = 3; - required bool is_python = 4 [ default = false ]; - required uint64 log_id = 5 [ default = 0 ]; -}; - -message InferenceResponse { - repeated ModelOutput outputs = 1; - optional string tag = 2; - required int32 err_code = 3; -}; - -message ModelOutput { - repeated FetchInst insts = 1; - optional string engine_name = 2; -} - -message SetTimeoutRequest { required int32 timeout_ms = 1; } - -message SimpleResponse { required int32 err_code = 1; } - -message GetClientConfigRequest {} - -message GetClientConfigResponse { required string client_config_str = 1; } - -service MultiLangGeneralModelService { - rpc Inference(InferenceRequest) returns (InferenceResponse) {} - rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {} - rpc GetClientConfig(GetClientConfigRequest) - returns (GetClientConfigResponse) {} -}; diff --git a/paddle_inference/paddle/include/paddle_engine.h b/paddle_inference/paddle/include/paddle_engine.h old mode 100755 new mode 100644 index 262a0378bef5caacbfdf5a3d2b46ed6ce598cb10..7eedc89c0ae045f51944b08a3a806a9439c02e2a --- a/paddle_inference/paddle/include/paddle_engine.h +++ b/paddle_inference/paddle/include/paddle_engine.h @@ -14,11 +14,13 @@ #pragma once +#include #include #include #include #include #include +#include #include #include "core/configure/include/configure_parser.h" #include "core/configure/inferencer_configure.pb.h" @@ -68,6 +70,30 @@ PrecisionType GetPrecision(const std::string& precision_data) { return PrecisionType::kFloat32; } +const std::string getFileBySuffix( + const std::string& path, const std::vector& suffixVector) { + DIR* dp = nullptr; + std::string fileName = ""; + struct dirent* dirp = nullptr; + if ((dp = opendir(path.c_str())) == nullptr) { + return fileName; + } + while ((dirp = readdir(dp)) != nullptr) { + if (dirp->d_type == DT_REG) { + for (int idx = 0; idx < suffixVector.size(); ++idx) { + if (std::string(dirp->d_name).find(suffixVector[idx]) != + std::string::npos) { + fileName = static_cast(dirp->d_name); + break; + } + } + } + if (fileName.length() != 0) break; + } + closedir(dp); + return fileName; +} + // Engine Base class EngineCore { public: @@ -96,7 +122,7 @@ class EngineCore { return true; } - virtual int create(const configure::EngineDesc& conf) = 0; + virtual int create(const configure::EngineDesc& conf, int gpu_id) = 0; virtual int clone(void* predictor) { if (predictor == NULL) { @@ -121,7 +147,7 @@ class EngineCore { // Paddle Inference Engine class PaddleInferenceEngine : public EngineCore { public: - int create(const configure::EngineDesc& engine_conf) { + int create(const configure::EngineDesc& engine_conf, int gpu_id) { std::string model_path = engine_conf.model_dir(); if (access(model_path.c_str(), F_OK) == -1) { LOG(ERROR) << "create paddle predictor failed, path not exits: " @@ -130,9 +156,21 @@ 
class PaddleInferenceEngine : public EngineCore { } Config config; - // todo, auto config(zhangjun) - if (engine_conf.has_encrypted_model() && engine_conf.encrypted_model()) { + std::vector suffixParaVector = {".pdiparams", "__params__"}; + std::vector suffixModelVector = {".pdmodel", "__model__"}; + std::string paraFileName = getFileBySuffix(model_path, suffixParaVector); + std::string modelFileName = getFileBySuffix(model_path, suffixModelVector); + + std::string encryParaPath = model_path + "/encrypt_model"; + std::string encryModelPath = model_path + "/encrypt_params"; + std::string encryKeyPath = model_path + "/key"; + + // encrypt model + if (access(encryParaPath.c_str(), F_OK) != -1 && + access(encryModelPath.c_str(), F_OK) != -1 && + access(encryKeyPath.c_str(), F_OK) != -1) { // decrypt model + std::string model_buffer, params_buffer, key_buffer; predictor::ReadBinaryFile(model_path + "/encrypt_model", &model_buffer); predictor::ReadBinaryFile(model_path + "/encrypt_params", ¶ms_buffer); @@ -146,23 +184,22 @@ class PaddleInferenceEngine : public EngineCore { real_model_buffer.size(), &real_params_buffer[0], real_params_buffer.size()); - } else if (engine_conf.has_combined_model()) { - if (!engine_conf.combined_model()) { - config.SetModel(model_path); - } else { - config.SetParamsFile(model_path + "/__params__"); - config.SetProgFile(model_path + "/__model__"); - } + } else if (paraFileName.length() != 0 && modelFileName.length() != 0) { + config.SetParamsFile(model_path + "/" + paraFileName); + config.SetProgFile(model_path + "/" + modelFileName); } else { - config.SetParamsFile(model_path + "/__params__"); - config.SetProgFile(model_path + "/__model__"); + config.SetModel(model_path); } config.SwitchSpecifyInputNames(true); config.SetCpuMathLibraryNumThreads(1); if (engine_conf.has_use_gpu() && engine_conf.use_gpu()) { // 2000MB GPU memory - config.EnableUseGpu(2000, FLAGS_gpuid); + config.EnableUseGpu(50, gpu_id); + if (engine_conf.has_gpu_multi_stream() && + engine_conf.gpu_multi_stream()) { + config.EnableGpuMultiStream(); + } } precision_type = GetPrecision(FLAGS_precision); @@ -174,8 +211,13 @@ class PaddleInferenceEngine : public EngineCore { } if (engine_conf.has_use_trt() && engine_conf.use_trt()) { + config.SwitchIrOptim(true); if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) { - config.EnableUseGpu(2000, FLAGS_gpuid); + config.EnableUseGpu(50, gpu_id); + if (engine_conf.has_gpu_multi_stream() && + engine_conf.gpu_multi_stream()) { + config.EnableGpuMultiStream(); + } } config.EnableTensorRtEngine(1 << 20, max_batch, @@ -203,7 +245,7 @@ class PaddleInferenceEngine : public EngineCore { if (precision_type == PrecisionType::kInt8) { config.EnableMkldnnQuantizer(); auto quantizer_config = config.mkldnn_quantizer_config(); - // TODO: warmup data + // TODO(somebody): warmup data // quantizer_config -> SetWarmupData(); // quantizer_config -> SetWarmupBatchSize(); // quantizer_config -> SetEnabledOpTypes(4); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index de1fe2843bd32a9cc9e2aa567c0ddddd7457c67c..589420ad45ae7f347c8e7b9b25c5cc0034830263 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -81,7 +81,6 @@ if (SERVER) if(WITH_LITE) set(VERSION_SUFFIX 2) endif() - add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r diff --git a/python/examples/bert/README.md b/python/examples/bert/README.md old mode 100644 new mode 100755 index 7bada93876f8f043b0046b83c3dc3707129079a7..5d3242837f6d8be08f321d68890587e4bba725e8 --- 
a/python/examples/bert/README.md
+++ b/python/examples/bert/README.md
@@ -1,4 +1,4 @@
-## Bert as service
+## Bert as service
([简体中文](./README_CN.md)|English)
@@ -42,48 +42,36 @@ sh get_data.sh
```
this script will download Chinese Dictionary File vocab.txt and Chinese Sample Data data-c.txt
-### RPC Inference Service
+### Inference Service(Support BRPC-Client、GRPC-Client、Http-Client)
start cpu inference service,Run
```
-python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service
+python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #cpu inference service
```
Or,start gpu inference service,Run
```
-python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
+python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
```
-### RPC Inference
+### BRPC-Client Inference
before prediction we should install paddle_serving_app. This module provides data preprocessing for BERT model.
```
-pip install paddle_serving_app
+pip3 install paddle_serving_app
```
Run
```
-head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
+head data-c.txt | python3 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
```
the client reads data from data-c.txt and send prediction request, the prediction is given by word vector. (Due to massive data in the word vector, we do not print it).
-### HTTP Inference Service
-start cpu HTTP inference service,Run
-```
- python bert_web_service.py bert_seq128_model/ 9292 #launch cpu inference service
+#### GRPC-Client/HTTP-Client
+Run
```
+head data-c.txt | python3 bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt
-Or,start gpu HTTP inference service,Run
-```
- export CUDA_VISIBLE_DEVICES=0,1
-```
-set environmental variable to specify which gpus are used, the command above means gpu 0 and gpu 1 is used.
``` - python bert_web_service_gpu.py bert_seq128_model/ 9292 #launch gpu inference service -``` -### HTTP Inference -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction -``` ## Benchmark ``` shell diff --git a/python/examples/bert/README_CN.md b/python/examples/bert/README_CN.md old mode 100644 new mode 100755 index a03b577493fc763c43d1ce96766d4e9eb260565e..42bc3ffab0ad51e304b11a78634b5a90415d1ace --- a/python/examples/bert/README_CN.md +++ b/python/examples/bert/README_CN.md @@ -40,15 +40,15 @@ sh get_data.sh ``` 脚本将下载中文词典vocab.txt和中文样例数据data-c.txt -### 启动RPC预测服务 +### 启动预测服务(支持BRPC-Client、GRPC-Client、HTTP-Client三种方式访问) 启动cpu预测服务,执行 ``` -python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #启动cpu预测服务 +python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #启动cpu预测服务 ``` 或者,启动gpu预测服务,执行 ``` -python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 +python3 -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 ``` @@ -56,37 +56,22 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --g 执行预测前需要安装paddle_serving_app,模块中提供了BERT模型的数据预处理方法。 ``` -pip install paddle_serving_app +pip3 install paddle_serving_app ``` + +#### BRPC-Client 执行 ``` -head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt +head data-c.txt | python3 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt ``` 启动client读取data-c.txt中的数据进行预测,预测结果为文本的向量表示(由于数据较多,脚本中没有将输出进行打印),server端的地址在脚本中修改。 - - -### 启动HTTP预测服务 -启动cpu HTTP预测服务,执行 -``` -python bert_web_service.py bert_seq128_model/ 9292 #启动CPU预测服务 - -``` - -或者,启动gpu HTTP预测服务,执行 -``` - export CUDA_VISIBLE_DEVICES=0,1 -``` -通过环境变量指定gpu预测服务使用的gpu,示例中指定索引为0和1的两块gpu -``` -python bert_web_service_gpu.py bert_seq128_model/ 9292 #启动gpu预测服务 +#### GRPC-Client/HTTP-Client +执行 ``` +head data-c.txt | python3 bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt -### 执行预测 - -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:9292/bert/prediction ``` ## 性能测试 diff --git a/python/examples/bert/benchmark.py b/python/examples/bert/benchmark.py old mode 100755 new mode 100644 diff --git a/python/examples/bert/benchmark.sh b/python/examples/bert/benchmark.sh old mode 100755 new mode 100644 diff --git a/python/examples/bert/benchmark_with_profile.sh b/python/examples/bert/benchmark_with_profile.sh old mode 100755 new mode 100644 diff --git a/python/examples/bert/bert_httpclient.py b/python/examples/bert/bert_httpclient.py new file mode 100644 index 0000000000000000000000000000000000000000..560f90910f5e5f92b7e15306d88f1c6e6477e9b3 --- /dev/null +++ b/python/examples/bert/bert_httpclient.py @@ -0,0 +1,59 @@ +# coding:utf-8 +# pylint: disable=doc-string-missing +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from paddle_serving_client import HttpClient +from paddle_serving_client.utils import benchmark_args +from paddle_serving_app.reader import ChineseBertReader +import numpy as np +args = benchmark_args() + +reader = ChineseBertReader({"max_seq_len": 128}) +fetch = ["pooled_output"] + +client = HttpClient(ip='127.0.0.1', port='9292') +client.load_client_config(args.model) +#client.set_ip('127.0.0.1') +#client.set_port('9292') +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) +as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) +''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) + +for line in sys.stdin: + feed_dict = reader.process(line) + for key in feed_dict.keys(): + feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1)) + #print(feed_dict) + result = client.predict(feed=feed_dict, fetch=fetch, batch=False) +print(result) diff --git a/python/examples/bert/bert_web_service.py b/python/examples/bert/bert_web_service.py deleted file mode 100644 index 7cd34fb99e0ecebbf2f6bec47e9c9d163ac3a44c..0000000000000000000000000000000000000000 --- a/python/examples/bert/bert_web_service.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader import ChineseBertReader -import sys -import os -import numpy as np - - -class BertService(WebService): - def load(self): - self.reader = ChineseBertReader({ - "vocab_file": "vocab.txt", - "max_seq_len": 128 - }) - - def preprocess(self, feed=[], fetch=[]): - feed_res = [] - is_batch = False - for ins in feed: - feed_dict = self.reader.process(ins["words"].encode("utf-8")) - for key in feed_dict.keys(): - feed_dict[key] = np.array(feed_dict[key]).reshape( - (len(feed_dict[key]), 1)) - feed_res.append(feed_dict) - return feed_res, fetch, is_batch - - -bert_service = BertService(name="bert") -bert_service.load() -bert_service.load_model_config(sys.argv[1]) -bert_service.prepare_server( - workdir="workdir", port=int(sys.argv[2]), device="cpu") -bert_service.run_rpc_service() -bert_service.run_web_service() diff --git a/python/examples/bert/bert_web_service_gpu.py b/python/examples/bert/bert_web_service_gpu.py deleted file mode 100644 index fb332bca3b16ee6a2c8c25dc7ab8f1b70998e874..0000000000000000000000000000000000000000 --- a/python/examples/bert/bert_web_service_gpu.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader import ChineseBertReader -import sys -import os -import numpy as np - - -class BertService(WebService): - def load(self): - self.reader = ChineseBertReader({ - "vocab_file": "vocab.txt", - "max_seq_len": 128 - }) - - def preprocess(self, feed=[], fetch=[]): - feed_res = [] - is_batch = False - for ins in feed: - feed_dict = self.reader.process(ins["words"].encode("utf-8")) - for key in feed_dict.keys(): - feed_dict[key] = np.array(feed_dict[key]).reshape( - (len(feed_dict[key]), 1)) - feed_res.append(feed_dict) - return feed_res, fetch, is_batch - - -bert_service = BertService(name="bert") -bert_service.load() -bert_service.load_model_config(sys.argv[1]) -bert_service.prepare_server( - workdir="workdir", port=int(sys.argv[2]), device="gpu") -bert_service.run_rpc_service() -bert_service.run_web_service() diff --git a/python/examples/blazeface/README.md b/python/examples/blazeface/README.md index 6f9d3c5adab5f3275989479078cb4329d14589fd..29e3026b4d972e141eabcc1a180d7a5cdb804a52 100644 --- a/python/examples/blazeface/README.md +++ b/python/examples/blazeface/README.md @@ -2,7 +2,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model blazeface +python3 -m paddle_serving_app.package --get_model blazeface tar -xf blazeface.tar.gz ``` @@ -11,13 +11,13 @@ tar -xf blazeface.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model serving_server --port 9494 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 ``` ### Client Prediction ``` -python test_client.py serving_client/serving_client_conf.prototxt test.jpg +python3 test_client.py serving_client/serving_client_conf.prototxt test.jpg ``` the result is in `output` folder, including a json file and image file with bounding boxes. diff --git a/python/examples/cascade_rcnn/README.md b/python/examples/cascade_rcnn/README.md index f8aa79e8bf97da5dd998ac6d340c0abd398931c0..0f831a400a04db1c5c38c76fd911fee4831f8779 100644 --- a/python/examples/cascade_rcnn/README.md +++ b/python/examples/cascade_rcnn/README.md @@ -10,12 +10,12 @@ If you want to have more detection models, please refer to [Paddle Detection Mod ### Start the service ``` -python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 ``` ### Perform prediction ``` -python test_client.py +python3 test_client.py ``` Image with bounding boxes and json result would be saved in `output` folder. 
diff --git a/python/examples/cascade_rcnn/README_CN.md b/python/examples/cascade_rcnn/README_CN.md index 99606de41812cb591a46e443c8a2f72c30ba19e0..0cc65ed681416de3bacd0edb1a0226b085c24faa 100644 --- a/python/examples/cascade_rcnn/README_CN.md +++ b/python/examples/cascade_rcnn/README_CN.md @@ -10,12 +10,12 @@ sh get_data.sh ### 启动服务 ``` -python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0 ``` ### 执行预测 ``` -python test_client.py +python3 test_client.py ``` 客户端已经为图片做好了后处理,在`output`文件夹下存放各个框的json格式信息还有后处理结果图片。 diff --git a/python/examples/criteo_ctr/README.md b/python/examples/criteo_ctr/README.md index 46be4d0ae9d3167bc107ec45b0000520920d6dea..6c1d79e7362a0240a49a9f0243f3de3340119ce3 100644 --- a/python/examples/criteo_ctr/README.md +++ b/python/examples/criteo_ctr/README.md @@ -19,13 +19,13 @@ the directories like `ctr_serving_model` and `ctr_client_conf` will appear. ### Start RPC Inference Service ``` -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0 +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0 ``` ### RPC Infer ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 ``` the latency will display in the end. diff --git a/python/examples/criteo_ctr/README_CN.md b/python/examples/criteo_ctr/README_CN.md index c7d6255e0b21aa447c5decc823a9bbb5bdb4ad65..c5b1da76055e64bd08bcf2a00dffe537bc931ee9 100644 --- a/python/examples/criteo_ctr/README_CN.md +++ b/python/examples/criteo_ctr/README_CN.md @@ -19,13 +19,13 @@ mv models/ctr_serving_model . 
### 启动RPC预测服务 ``` -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务 -python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务 +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务 +python3 -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务 ``` ### 执行预测 ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 ``` 预测完毕会输出预测过程的耗时。 diff --git a/python/examples/criteo_ctr_with_cube/README.md b/python/examples/criteo_ctr_with_cube/README.md index 493b3d72c1fff9275c2a99cfee45efd4bef1af4c..de5c3269228a8d7ef619a8c46f2252208e53b982 100755 --- a/python/examples/criteo_ctr_with_cube/README.md +++ b/python/examples/criteo_ctr_with_cube/README.md @@ -32,13 +32,13 @@ Here, the sparse parameter is loaded by cube sparse parameter indexing service C ### Start RPC Predictor, the number of serving thread is 4(configurable in test_server.py) ``` -python test_server.py ctr_serving_model_kv +python3 test_server.py ctr_serving_model_kv ``` ### Run Prediction ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data ``` ### Benchmark diff --git a/python/examples/criteo_ctr_with_cube/README_CN.md b/python/examples/criteo_ctr_with_cube/README_CN.md index 7a0eb43c203aafeb38b64d249954cdabf7bf7a38..15d61160317f866aae25a4d777d76e14725424d3 100644 --- a/python/examples/criteo_ctr_with_cube/README_CN.md +++ b/python/examples/criteo_ctr_with_cube/README_CN.md @@ -30,13 +30,13 @@ sh cube_prepare.sh & ### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置) ``` -python test_server.py ctr_serving_model_kv +python3 test_server.py ctr_serving_model_kv ``` ### 执行预测 ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data +python3 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data ``` ### Benchmark diff --git a/python/examples/criteo_ctr_with_cube/cube_prepare.sh b/python/examples/criteo_ctr_with_cube/cube_prepare.sh new file mode 100755 index 0000000000000000000000000000000000000000..773baba4d91b02b244e766cd8ebf899cc740dbbc --- /dev/null +++ b/python/examples/criteo_ctr_with_cube/cube_prepare.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing +#! 
/bin/bash + +mkdir -p cube_model +mkdir -p cube/data +./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false +cd cube && ./cube diff --git a/python/examples/deeplabv3/README.md b/python/examples/deeplabv3/README.md index 28bec77bb500e42919734433617ea2df1b9e95c0..08022618fcec5220667ca19bfb803cba36519c7b 100644 --- a/python/examples/deeplabv3/README.md +++ b/python/examples/deeplabv3/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model deeplabv3 +python3 -m paddle_serving_app.package --get_model deeplabv3 tar -xzvf deeplabv3.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf deeplabv3.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 ``` ### Client Prediction ``` -python deeplabv3_client.py +python3 deeplabv3_client.py ``` diff --git a/python/examples/deeplabv3/README_CN.md b/python/examples/deeplabv3/README_CN.md index 6de3c420833d31f871ad79122e1d77aee4208e35..16f11daba354349f1b73f8bba00cac8ff5c88864 100644 --- a/python/examples/deeplabv3/README_CN.md +++ b/python/examples/deeplabv3/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model deeplabv3 +python3 -m paddle_serving_app.package --get_model deeplabv3 tar -xzvf deeplabv3.tar.gz ``` @@ -12,10 +12,10 @@ tar -xzvf deeplabv3.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494 ``` ### 客户端预测 ``` -python deeplabv3_client.py +python3 deeplabv3_client.py diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md index f25bd27766cfa64e20cc28c731f0a17e1f6dd826..ebb8b9d87307c82543f3a5de977bb997ddeb79e1 100644 --- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md +++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md @@ -10,13 +10,14 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf faster_rcnn_hrnetv2p_w18_1x.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` -This model support TensorRT, if you want a faster inference, please use `--use_trt`. +This model support TensorRT, if you want a faster inference, please use `--use_trt`. But you need to do some extra work. 
+Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 ### Prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md index 2c9048e10f32698cad0ded98643f804b91c810fc..5be20d68ae3ecbcb45440659eb917a7b5b0d0ee1 100644 --- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md +++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md @@ -11,11 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf faster_rcnn_hrnetv2p_w18_1x.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` -该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 +该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape. +请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md index a755b33cb88b144a1f872338acc33742f50d9b62..d56aa416b9e54114646f9271c27f6afde7d41259 100644 --- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md +++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md @@ -10,15 +10,16 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf faster_rcnn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` -This model support TensorRT, if you want a faster inference, please use `--use_trt`. +This model support TensorRT, if you want a faster inference, please use `--use_trt`. But you need to do some extra work. +Please reference to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 ### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` ## 3. Result analysis diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md index 47f0aca10fb38f287a6640d37684fa5cf0a16436..f8475daf029ae2230432871237281970052fe3e3 100644 --- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md +++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md @@ -11,13 +11,14 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf faster_rcnn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` -该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 +该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项,但此时需要额外设置子图的TRT变长最大最小最优shape. 
+请参考https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` ## 3. 结果分析 diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/test_client.py b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/test_client.py old mode 100755 new mode 100644 diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md index d0cdb1582584cb7e0e95d00231c2c8a5fb33d464..5612b754ae9610ed351a4becfec6b47bdcb57c8d 100644 --- a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md +++ b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md @@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf fcos_dcn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. ### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md index 56c2505c8c7ee2be7627a2f6fd9e108868428805..d9737261632c64172684fea0d60c566f242e95e6 100644 --- a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md +++ b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md @@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf fcos_dcn_r50_fpn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/test_client.py b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/test_client.py old mode 100755 new mode 100644 diff --git a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md index 8c3d5142ad2a88dc151478965e41def5075e4b2f..8060e087107e54bc401849fd576497e9fc9cd421 100644 --- a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md +++ b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md @@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf ppyolo_r50vd_dcn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. 
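For example, with TensorRT enabled the start command above becomes:
```
python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 --use_trt
```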
### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md index 1aebb8db9a0a3b3523d233a70ff42afe4f40a610..3071db7b124fd998d15901be7a78a67018d0de0f 100644 --- a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md +++ b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md @@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf ppyolo_r50vd_dcn_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/test_client.py b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/test_client.py old mode 100755 new mode 100644 diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/README.md b/python/examples/detection/ssd_vgg16_300_240e_voc/README.md index 062941bfb8deff3a09c938e9c43cd2b710cbb0e5..60a22fdb5d3c1486827376d935c4f39de1b2c387 100644 --- a/python/examples/detection/ssd_vgg16_300_240e_voc/README.md +++ b/python/examples/detection/ssd_vgg16_300_240e_voc/README.md @@ -10,11 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf ssd_vgg16_300_240e_voc.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. 
### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md b/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md index 32c19b5159a497e52df1c5fd01a87fd43f7d67e4..a2e0d187a5e896f796dec4ed0dbdcb3af4ed5334 100644 --- a/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md +++ b/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md @@ -11,12 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf ssd_vgg16_300_240e_voc.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` diff --git a/python/examples/detection/ssd_vgg16_300_240e_voc/test_client.py b/python/examples/detection/ssd_vgg16_300_240e_voc/test_client.py old mode 100755 new mode 100644 diff --git a/python/examples/detection/ttfnet_darknet53_1x_coco/README.md b/python/examples/detection/ttfnet_darknet53_1x_coco/README.md index 58c538b7cdc5ff7975b57d292b1d8b0c7d5dd2b7..d6ffb912c45d94a85cc6a546f3bce6c690e1f2fe 100644 --- a/python/examples/detection/ttfnet_darknet53_1x_coco/README.md +++ b/python/examples/detection/ttfnet_darknet53_1x_coco/README.md @@ -10,12 +10,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf ttfnet_darknet53_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. 
### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md b/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md index 641086cd2eba4b274325bca47791a60c6a5ec97f..7a1d3d1abb81e389542a6e5dfd65befa59c402a5 100644 --- a/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md +++ b/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md @@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf ttfnet_darknet53_1x_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/ttfnet_darknet53_1x_coco/test_client.py b/python/examples/detection/ttfnet_darknet53_1x_coco/test_client.py old mode 100755 new mode 100644 diff --git a/python/examples/detection/yolov3_darknet53_270e_coco/README.md b/python/examples/detection/yolov3_darknet53_270e_coco/README.md index 6357c3030a5936b4ec9105860dd63144bfd8098e..32670748db42336053d01e61bf087d00c03c7e06 100644 --- a/python/examples/detection/yolov3_darknet53_270e_coco/README.md +++ b/python/examples/detection/yolov3_darknet53_270e_coco/README.md @@ -10,13 +10,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf yolov3_darknet53_270e_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` This model support TensorRT, if you want a faster inference, please use `--use_trt`. ### Perform prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md b/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md index 166d562e79a91bbc59cd7dc15b7e5667f4e0cb27..4185e0fe4963113ed0f9c0ea865705fd33226d1b 100644 --- a/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md +++ b/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md @@ -11,13 +11,12 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### 启动服务 ``` tar xf yolov3_darknet53_270e_coco.tar -python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0 ``` 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 ### 执行预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` - diff --git a/python/examples/encryption/README.md b/python/examples/encryption/README.md index a08b8b84241fb699992d1a718f2bfbf986d8d180..3120422ebfaa2a88851eda18c42e7740fe29e884 100644 --- a/python/examples/encryption/README.md +++ b/python/examples/encryption/README.md @@ -12,9 +12,9 @@ sh get_data.sh ## Encrypt Model -The `paddlepaddle` package is used in this example, you may need to download the corresponding package(`pip install paddlepaddle`). +The `paddlepaddle` package is used in this example, you may need to download the corresponding package(`pip3 install paddlepaddle`). 
-[python encrypt.py](./encrypt.py) +[python3 encrypt.py](./encrypt.py) [//file]:#encrypt.py ``` python @@ -35,14 +35,14 @@ client-side configuration file are stored in the `encrypt_client` directory. ## Start Encryption Service CPU Service ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model ``` GPU Service ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0 +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 ``` ## Prediction ``` -python test_client.py encrypt_client/serving_client_conf.prototxt +python3 test_client.py encrypt_client/serving_client_conf.prototxt ``` diff --git a/python/examples/encryption/README_CN.md b/python/examples/encryption/README_CN.md index f950796ec14dadfd7bf6744d94aba4959c838e7f..ad82d49b61cb70093a9423ad83dbc30663b6d4f1 100644 --- a/python/examples/encryption/README_CN.md +++ b/python/examples/encryption/README_CN.md @@ -11,9 +11,9 @@ sh get_data.sh ``` ## 模型加密 -本示例中使用了`paddlepaddle`包中的模块,需要进行下载(`pip install paddlepaddle`)。 +本示例中使用了`paddlepaddle`包中的模块,需要进行下载(`pip3 install paddlepaddle`)。 -运行[python encrypt.py](./encrypt.py)进行模型加密 +运行[python3 encrypt.py](./encrypt.py)进行模型加密 [//file]:#encrypt.py ``` python @@ -36,14 +36,14 @@ def serving_encryption(): ## 启动加密预测服务 CPU预测服务 ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model ``` GPU预测服务 ``` -python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0 +python3 -m paddle_serving_server.serve --model encrypt_server/ --port 9393 --use_encryption_model --gpu_ids 0 ``` ## 预测 ``` -python test_client.py encrypt_client/ +python3 test_client.py encrypt_client/serving_client_conf.prototxt ``` diff --git a/python/examples/encryption/test_client.py b/python/examples/encryption/test_client.py index 4d211a562733d2a2b1e653a7684fdcd6cf0285d1..33816e741c9a6ffeda0685d5fd3bda6774a5f186 100644 --- a/python/examples/encryption/test_client.py +++ b/python/examples/encryption/test_client.py @@ -19,7 +19,8 @@ import sys client = Client() client.load_client_config(sys.argv[1]) client.use_key("./key") -client.connect(["127.0.0.1:9300"], encryption=True) +client.connect(["0.0.0.0:9393"], encryption=True) +fetch_list = client.get_fetch_names() import paddle test_reader = paddle.batch( @@ -28,5 +29,5 @@ test_reader = paddle.batch( batch_size=1) for data in test_reader(): - fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"]) - print("{} {}".format(fetch_map["price"][0], data[0][1][0])) + fetch_map = client.predict(feed={"x": data[0][0]}, fetch=fetch_list) + print(fetch_map) diff --git a/python/examples/fit_a_line/README.md b/python/examples/fit_a_line/README.md index 77583ce596727d5d0335696fab10960550352ccb..9586cd670240eb43e4a706ff89ea435b7a8c6d1c 100644 --- a/python/examples/fit_a_line/README.md +++ b/python/examples/fit_a_line/README.md @@ -15,33 +15,24 @@ sh get_data.sh ### Start server ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 ``` -### Client prediction +## Client prediction -The `paddlepaddle` package is used in 
`test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`). +### RPC Client +The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip3 install paddlepaddle`). ``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` +### Http Client - -## HTTP service - -### Start server - -Start a web service with default web service hosting modules: ``` shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci +python3 test_httpclient.py uci_housing_client/serving_client_conf.prototxt ``` -### Client prediction - -``` shell -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction -``` ## Benchmark ``` shell diff --git a/python/examples/fit_a_line/README_CN.md b/python/examples/fit_a_line/README_CN.md old mode 100644 new mode 100755 index d58eb4fbf15045ef2e9d873b2c8517f86cbca0de..d1cace5e2c5b5cee2195deaa1667af68e5f1f067 --- a/python/examples/fit_a_line/README_CN.md +++ b/python/examples/fit_a_line/README_CN.md @@ -9,41 +9,29 @@ sh get_data.sh ``` - -## RPC服务 - -### 开启服务端 +## 开启服务端(支持BRPC-Client/GRPC Client/Http-Client) ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 -``` - -### 客户端预测 - -`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 - -``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 ``` +## 客户端预测 +### BRPC-Client -## HTTP服务 - -### 开启服务端 - -通过下面的一行代码开启默认web服务: +`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip3 install paddlepaddle`)。 ``` shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` -### 客户端预测 +### GRPC-Client/Http-Client ``` shell -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9393/uci/prediction +python3 test_httpclient.py uci_housing_client/serving_client_conf.prototxt ``` + ## 性能测试 ``` shell bash benchmark.sh uci_housing_model uci_housing_client diff --git a/python/examples/fit_a_line/benchmark.sh b/python/examples/fit_a_line/benchmark.sh old mode 100755 new mode 100644 diff --git a/python/examples/fit_a_line/test_client.py b/python/examples/fit_a_line/test_client.py old mode 100644 new mode 100755 index b698f1a13c337e15c6e0a525f172fdd2b6142c70..d18ece66686520e25a0b9ebbd2d8b29354f4da16 --- a/python/examples/fit_a_line/test_client.py +++ b/python/examples/fit_a_line/test_client.py @@ -20,7 +20,7 @@ import numpy as np client = Client() client.load_client_config(sys.argv[1]) client.connect(["127.0.0.1:9393"]) - +fetch_list = client.get_fetch_names() import paddle test_reader = paddle.batch( paddle.reader.shuffle( @@ -31,6 +31,5 @@ for data in test_reader(): new_data = np.zeros((1, 13)).astype("float32") new_data[0] = data[0][0] fetch_map = client.predict( - feed={"x": new_data}, fetch=["price"], batch=True) - print("{} {}".format(fetch_map["price"][0], data[0][1][0])) + feed={"x": 
new_data}, fetch=fetch_list, batch=True) print(fetch_map) diff --git a/python/examples/fit_a_line/test_httpclient.py b/python/examples/fit_a_line/test_httpclient.py new file mode 100755 index 0000000000000000000000000000000000000000..f36a6d221943065544155dc364799946bc86cf78 --- /dev/null +++ b/python/examples/fit_a_line/test_httpclient.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +from paddle_serving_client.httpclient import HttpClient +import sys +import numpy as np +import time + +client = HttpClient() +client.load_client_config(sys.argv[1]) +#client.set_ip('127.0.0.1') +#client.set_port('9393') +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) +as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) +''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) + +import paddle +test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.test(), buf_size=500), + batch_size=1) +fetch_list = client.get_fetch_names() +for data in test_reader(): + new_data = np.zeros((1, 13)).astype("float32") + new_data[0] = data[0][0] + fetch_map = client.predict( + feed={"x": new_data}, fetch=fetch_list, batch=True) + print(fetch_map) + break diff --git a/python/examples/grpc_impl_example/fit_a_line/README_CN.md b/python/examples/grpc_impl_example/fit_a_line/README_CN.md deleted file mode 100644 index 728932b21498b119b46756bc5a67ed38c8db358d..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/README_CN.md +++ /dev/null @@ -1,45 +0,0 @@ -# 线性回归预测服务示例 - -## 获取数据 - -```shell -sh get_data.sh -``` - -## 开启 gRPC 服务端 - -``` shell -python test_server.py uci_housing_model/ -``` - -也可以通过下面的一行代码开启默认 gRPC 服务: - -```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang -``` - -## 客户端预测 - -### 同步预测 - -``` shell -python test_sync_client.py -``` - -### 异步预测 - -``` shell -python test_asyn_client.py -``` - -### Batch 预测 - -``` shell -python test_batch_client.py -``` - -### 预测超时 - -``` shell -python test_timeout_client.py -``` diff --git a/python/examples/grpc_impl_example/fit_a_line/get_data.sh b/python/examples/grpc_impl_example/fit_a_line/get_data.sh deleted file mode 100644 index 84a3966a0ef323cef4b146d8e9489c70a7a8ae35..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/get_data.sh +++ /dev/null @@ -1,2 +0,0 @@ -wget 
--no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz -tar -xzf uci_housing.tar.gz diff --git a/python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py b/python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py deleted file mode 100644 index e9562359031fb32372d76dbbac25229d15ac0265..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=doc-string-missing - -from paddle_serving_client import MultiLangClient as Client -import functools -import time -import threading -import grpc -import numpy as np -client = Client() -client.connect(["127.0.0.1:9393"]) - -complete_task_count = [0] -lock = threading.Lock() - - -def call_back(call_future): - try: - fetch_map = call_future.result() - print(fetch_map) - except grpc.RpcError as e: - print(e.code()) - finally: - with lock: - complete_task_count[0] += 1 - - -x = [ - 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, - 0.4919, 0.1856, 0.0795, -0.0332 -] -task_count = 0 -for i in range(3): - new_data = np.array(x).astype("float32").reshape((1, 13)) - future = client.predict( - feed={"x": new_data}, fetch=["price"], batch=False, asyn=True) - task_count += 1 - future.add_done_callback(functools.partial(call_back)) - -while complete_task_count[0] != task_count: - time.sleep(0.1) diff --git a/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py b/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py deleted file mode 100644 index 41494e71c3c26d3f9af36b83ef50509e8bd071f0..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing -from paddle_serving_client import MultiLangClient as Client -import numpy as np -client = Client() -client.connect(["127.0.0.1:9393"]) - -batch_size = 2 -x = [ - 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, - 0.4919, 0.1856, 0.0795, -0.0332 -] - -for i in range(3): - new_data = np.array(x).astype("float32").reshape((1, 1, 13)) - batch_data = np.concatenate([new_data, new_data, new_data], axis=0) - print(batch_data.shape) - fetch_map = client.predict( - feed={"x": batch_data}, fetch=["price"], batch=True) - - if fetch_map["serving_status_code"] == 0: - print(fetch_map) - else: - print(fetch_map["serving_status_code"]) diff --git a/python/examples/grpc_impl_example/fit_a_line/test_server.py b/python/examples/grpc_impl_example/fit_a_line/test_server.py deleted file mode 100644 index 6acc7bfe2e6d00621f32f1f7f437691fc15d20fc..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/test_server.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=doc-string-missing - -import os -import sys -from paddle_serving_server import OpMaker -from paddle_serving_server import OpSeqMaker -from paddle_serving_server import MultiLangServer as Server - -op_maker = OpMaker() -read_op = op_maker.create('general_reader') -general_infer_op = op_maker.create('general_infer') -response_op = op_maker.create('general_response') - -op_seq_maker = OpSeqMaker() -op_seq_maker.add_op(read_op) -op_seq_maker.add_op(general_infer_op) -op_seq_maker.add_op(response_op) - -server = Server() -server.set_op_sequence(op_seq_maker.get_op_sequence()) -server.load_model_config(sys.argv[1]) -server.prepare_server(workdir="work_dir1", port=9393, device="cpu") -server.run_server() diff --git a/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py b/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py deleted file mode 100644 index 62361d9994ca1c532a2f07384c8b089d3b0fad65..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing - -import os -import sys -from paddle_serving_server import OpMaker -from paddle_serving_server import OpSeqMaker -from paddle_serving_server import MultiLangServer as Server - -op_maker = OpMaker() -read_op = op_maker.create('general_reader') -general_infer_op = op_maker.create('general_infer') -response_op = op_maker.create('general_response') - -op_seq_maker = OpSeqMaker() -op_seq_maker.add_op(read_op) -op_seq_maker.add_op(general_infer_op) -op_seq_maker.add_op(response_op) - -server = Server() -server.set_op_sequence(op_seq_maker.get_op_sequence()) -server.load_model_config(sys.argv[1]) -server.set_gpuid(0) -server.prepare_server(workdir="work_dir1", port=9393, device="cpu") -server.run_server() diff --git a/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py b/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py deleted file mode 100644 index 879bc1ce6790de67041dcb077d2be49a437f2b14..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=doc-string-missing - -from paddle_serving_client import MultiLangClient as Client -import numpy as np -client = Client() -client.connect(["127.0.0.1:9393"]) -""" -for data in test_reader(): - new_data = np.zeros((1, 1, 13)).astype("float32") - new_data[0] = data[0][0] - fetch_map = client.predict( - feed={"x": new_data}, fetch=["price"], batch=True) - print("{} {}".format(fetch_map["price"][0], data[0][1][0])) - print(fetch_map) -""" - -x = [ - 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, - 0.4919, 0.1856, 0.0795, -0.0332 -] -for i in range(3): - new_data = np.array(x).astype("float32").reshape((1, 13)) - fetch_map = client.predict( - feed={"x": new_data}, fetch=["price"], batch=False) - if fetch_map["serving_status_code"] == 0: - print(fetch_map) - else: - print(fetch_map["serving_status_code"]) diff --git a/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py b/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py deleted file mode 100644 index 3e9dcc907fff5c51ff76864cfa406bdbf3f3e082..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=doc-string-missing - -from paddle_serving_client import MultiLangClient as Client -import grpc -import numpy as np -client = Client() -client.connect(["127.0.0.1:9393"]) -client.set_rpc_timeout_ms(40) - -x = [ - 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, - 0.4919, 0.1856, 0.0795, -0.0332 -] -for i in range(3): - new_data = np.array(x).astype("float32").reshape((1, 13)) - fetch_map = client.predict( - feed={"x": new_data}, fetch=["price"], batch=False) - if fetch_map["serving_status_code"] == 0: - print(fetch_map) - elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED: - print('timeout') - else: - print(fetch_map["serving_status_code"]) diff --git a/python/examples/grpc_impl_example/imdb/README.md b/python/examples/grpc_impl_example/imdb/README.md deleted file mode 100644 index 73636f3caa9aeea375f56714f57e26a6f31e990c..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/imdb/README.md +++ /dev/null @@ -1,25 +0,0 @@ -## IMDB comment sentiment inference service - -([简体中文](./README_CN.md)|English) - -### Get model files and sample data - -``` -sh get_data.sh -``` -the package downloaded contains cnn, lstm and bow model config along with their test_data and train_data. - -### Start RPC inference service - -``` -python -m paddle_serving_server.serve --model imdb_cnn_model/ --thread 10 --port 9393 --use_multilang -``` -### RPC Infer - -The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`). - -``` -head test_data/part-0 | python test_client.py -``` - -it will get predict results of the first 10 test cases. diff --git a/python/examples/grpc_impl_example/imdb/README_CN.md b/python/examples/grpc_impl_example/imdb/README_CN.md deleted file mode 100644 index 327b1c5541ad53f14b8518037de39e572c31e67c..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/imdb/README_CN.md +++ /dev/null @@ -1,24 +0,0 @@ -## IMDB评论情绪预测服务 - -(简体中文|[English](./README.md)) - -### 获取模型文件和样例数据 - -``` -sh get_data.sh -``` -脚本会下载和解压出cnn、lstm和bow三种模型的配置文文件以及test_data和train_data。 - -### 启动RPC预测服务 - -``` -python -m paddle_serving_server.serve --model imdb_cnn_model/ --thread 10 --port 9393 --use_multilang -``` -### 执行预测 - -`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 - -``` -head test_data/part-0 | python test_client.py -``` -预测test_data/part-0的前十个样例。 diff --git a/python/examples/grpc_impl_example/imdb/get_data.sh b/python/examples/grpc_impl_example/imdb/get_data.sh deleted file mode 100644 index 81d8d5d3b018f133c41e211d1501cf3cd9a3d8a4..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/imdb/get_data.sh +++ /dev/null @@ -1,4 +0,0 @@ -wget --no-check-certificate https://fleet.bj.bcebos.com/text_classification_data.tar.gz -wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imdb-demo/imdb_model.tar.gz -tar -zxvf text_classification_data.tar.gz -tar -zxvf imdb_model.tar.gz diff --git a/python/examples/grpc_impl_example/imdb/imdb_reader.py b/python/examples/grpc_impl_example/imdb/imdb_reader.py deleted file mode 100644 index a4ef3e163a50b0dc244ac2653df1e38d7f91699b..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/imdb/imdb_reader.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=doc-string-missing - -import sys -import os -import paddle -import re -import paddle.fluid.incubate.data_generator as dg - -py_version = sys.version_info[0] - - -class IMDBDataset(dg.MultiSlotDataGenerator): - def load_resource(self, dictfile): - self._vocab = {} - wid = 0 - if py_version == 2: - with open(dictfile) as f: - for line in f: - self._vocab[line.strip()] = wid - wid += 1 - else: - with open(dictfile, encoding="utf-8") as f: - for line in f: - self._vocab[line.strip()] = wid - wid += 1 - self._unk_id = len(self._vocab) - self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))') - self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0]) - - def get_words_only(self, line): - sent = line.lower().replace("
", " ").strip() - words = [x for x in self._pattern.split(sent) if x and x != " "] - feas = [ - self._vocab[x] if x in self._vocab else self._unk_id for x in words - ] - return feas - - def get_words_and_label(self, line): - send = '|'.join(line.split('|')[:-1]).lower().replace("
", - " ").strip() - label = [int(line.split('|')[-1])] - - words = [x for x in self._pattern.split(send) if x and x != " "] - feas = [ - self._vocab[x] if x in self._vocab else self._unk_id for x in words - ] - return feas, label - - def infer_reader(self, infer_filelist, batch, buf_size): - def local_iter(): - for fname in infer_filelist: - with open(fname, "r") as fin: - for line in fin: - feas, label = self.get_words_and_label(line) - yield feas, label - - import paddle - batch_iter = paddle.batch( - paddle.reader.shuffle( - local_iter, buf_size=buf_size), - batch_size=batch) - return batch_iter - - def generate_sample(self, line): - def memory_iter(): - for i in range(1000): - yield self.return_value - - def data_iter(): - feas, label = self.get_words_and_label(line) - yield ("words", feas), ("label", label) - - return data_iter - - -if __name__ == "__main__": - imdb = IMDBDataset() - imdb.load_resource("imdb.vocab") - imdb.run_from_stdin() diff --git a/python/examples/grpc_impl_example/yolov4/000000570688.jpg b/python/examples/grpc_impl_example/yolov4/000000570688.jpg deleted file mode 100644 index cb304bd56c4010c08611a30dcca58ea9140cea54..0000000000000000000000000000000000000000 Binary files a/python/examples/grpc_impl_example/yolov4/000000570688.jpg and /dev/null differ diff --git a/python/examples/grpc_impl_example/yolov4/README.md b/python/examples/grpc_impl_example/yolov4/README.md deleted file mode 100644 index b468a7f6aa174a477e7ef5aeb8a17aaacf09242d..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/yolov4/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Yolov4 Detection Service - -([简体中文](README_CN.md)|English) - -## Get Model - -``` -python -m paddle_serving_app.package --get_model yolov4 -tar -xzvf yolov4.tar.gz -``` - -## Start RPC Service - -``` -python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang -``` - -## Prediction - -``` -python test_client.py 000000570688.jpg -``` -After the prediction is completed, a json file to save the prediction result and a picture with the detection result box will be generated in the `./outpu folder. 
diff --git a/python/examples/grpc_impl_example/yolov4/README_CN.md b/python/examples/grpc_impl_example/yolov4/README_CN.md deleted file mode 100644 index 991d2ee22aadf16a554a269b2bf5892df45907d2..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/yolov4/README_CN.md +++ /dev/null @@ -1,24 +0,0 @@ -# Yolov4 检测服务 - -(简体中文|[English](README.md)) - -## 获取模型 - -``` -python -m paddle_serving_app.package --get_model yolov4 -tar -xzvf yolov4.tar.gz -``` - -## 启动RPC服务 - -``` -python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang -``` - -## 预测 - -``` -python test_client.py 000000570688.jpg -``` - -预测完成会在`./output`文件夹下生成保存预测结果的json文件以及标出检测结果框的图片。 diff --git a/python/examples/grpc_impl_example/yolov4/label_list.txt b/python/examples/grpc_impl_example/yolov4/label_list.txt deleted file mode 100644 index 941cb4e1392266f6a6c09b1fdc5f79503b2e5df6..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/yolov4/label_list.txt +++ /dev/null @@ -1,80 +0,0 @@ -person -bicycle -car -motorcycle -airplane -bus -train -truck -boat -traffic light -fire hydrant -stop sign -parking meter -bench -bird -cat -dog -horse -sheep -cow -elephant -bear -zebra -giraffe -backpack -umbrella -handbag -tie -suitcase -frisbee -skis -snowboard -sports ball -kite -baseball bat -baseball glove -skateboard -surfboard -tennis racket -bottle -wine glass -cup -fork -knife -spoon -bowl -banana -apple -sandwich -orange -broccoli -carrot -hot dog -pizza -donut -cake -chair -couch -potted plant -bed -dining table -toilet -tv -laptop -mouse -remote -keyboard -cell phone -microwave -oven -toaster -sink -refrigerator -book -clock -vase -scissors -teddy bear -hair drier -toothbrush diff --git a/python/examples/grpc_impl_example/yolov4/test_client.py b/python/examples/grpc_impl_example/yolov4/test_client.py deleted file mode 100644 index 520d8bec5a830450f644a2e3fbf143c744419441..0000000000000000000000000000000000000000 --- a/python/examples/grpc_impl_example/yolov4/test_client.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import numpy as np -from paddle_serving_client import MultiLangClient as Client -from paddle_serving_app.reader import * -import cv2 - -preprocess = Sequential([ - File2Image(), BGR2RGB(), Resize( - (608, 608), interpolation=cv2.INTER_LINEAR), Div(255.0), Transpose( - (2, 0, 1)) -]) - -postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608]) -client = Client() -client.connect(['127.0.0.1:9393']) -client.set_rpc_timeout_ms(15000) - -im = preprocess(sys.argv[1]) -fetch_map = client.predict( - feed={ - "image": im, - "im_size": np.array(list(im.shape[1:])), - }, - fetch=["save_infer_model/scale_0.tmp_0"], - batch=False) -print(fetch_map) -fetch_map.pop("serving_status_code") -fetch_map["image"] = sys.argv[1] -postprocess(fetch_map) diff --git a/python/examples/imagenet/README.md b/python/examples/imagenet/README.md old mode 100644 new mode 100755 index ad8b12b5bb8bf5669a34cf88637f34e640ca0a65..eaff522a5ae31eab08786489cbce0fa83f85e91d --- a/python/examples/imagenet/README.md +++ b/python/examples/imagenet/README.md @@ -12,38 +12,30 @@ sh get_model.sh ### Install preprocess module ``` -pip install paddle_serving_app +pip3 install paddle_serving_app ``` -### HTTP Service - -launch server side -``` -python resnet50_web_service.py ResNet50_vd_model cpu 9696 #cpu inference service -``` -``` -python resnet50_web_service.py ResNet50_vd_model gpu 9696 #gpu inference service -``` +### Inference Service(Support BRPC-Client/GRPC-Client/Http-Client) -client send inference request +launch server side ``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:9696/image/prediction +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu inference service ``` -### RPC Service - -launch server side ``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu inference service +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service ``` +### BRPC-Client +client send inference request ``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service +python3 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` +*the port of server side in this example is 9696 +### GRPC-Client/Http-Client client send inference request ``` -python resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt +python3 resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` -*the port of server side in this example is 9696 diff --git a/python/examples/imagenet/README_CN.md b/python/examples/imagenet/README_CN.md old mode 100644 new mode 100755 index 8650d51a6b41ac3ad68d49e3a7c966f0c0425ad1..642bee3d0cbab98a48f2f09284ea887751752667 --- a/python/examples/imagenet/README_CN.md +++ b/python/examples/imagenet/README_CN.md @@ -12,38 +12,30 @@ sh get_model.sh ### 安装数据预处理模块 ``` -pip install paddle_serving_app +pip3 install paddle_serving_app ``` -### HTTP服务 +### 启动服务端(支持BRPC-Client、GRPC-Client、Http-Client) 启动server端 ``` -python resnet50_web_service.py ResNet50_vd_model cpu 9696 #cpu预测服务 +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu预测服务 ``` -``` -python resnet50_web_service.py ResNet50_vd_model gpu 9696 #gpu预测服务 -``` - -发送HTTP POST请求 ``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": 
"https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:9696/image/prediction +python3 -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务 ``` -### RPC服务 - -启动server端 +### BRPC-Client预测 +client端进行预测 ``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu预测服务 +python3 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` +*server端示例中服务端口为9696端口 -``` -python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务 -``` +### GRPC-Client/Http-Client预测 client端进行预测 ``` -python resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt +python3 resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt ``` -*server端示例中服务端口为9696端口 diff --git a/python/examples/imagenet/resnet50_http_client.py b/python/examples/imagenet/resnet50_http_client.py new file mode 100644 index 0000000000000000000000000000000000000000..48efbb88c2c3525a0d38d238c6f093f518f5e0b8 --- /dev/null +++ b/python/examples/imagenet/resnet50_http_client.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from paddle_serving_client import HttpClient +from paddle_serving_app.reader import Sequential, URL2Image, Resize +from paddle_serving_app.reader import CenterCrop, RGB2BGR, Transpose, Div, Normalize +import time + +client = HttpClient(ip='127.0.0.1', port='9696') +client.load_client_config(sys.argv[1]) +#client.set_ip('127.0.0.1') +#client.set_port('9292') +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) +as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) 
+''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) + +label_dict = {} +label_idx = 0 +with open("imagenet.label") as fin: + for line in fin: + label_dict[label_idx] = line.strip() + label_idx += 1 + +seq = Sequential([ + URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)), + Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True) +]) + +start = time.time() +image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg" +for i in range(10): + img = seq(image_file) + fetch_map = client.predict( + feed={"image": img}, fetch=["score"], batch=False) + print(fetch_map) + +end = time.time() +print(end - start) diff --git a/python/examples/imagenet/resnet50_web_service.py b/python/examples/imagenet/resnet50_web_service.py deleted file mode 100644 index ca111615deb9d240f9d8b042f1f7edb599a1b775..0000000000000000000000000000000000000000 --- a/python/examples/imagenet/resnet50_web_service.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import sys -from paddle_serving_client import Client -import numpy as np -from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage - -if len(sys.argv) != 4: - print("python resnet50_web_service.py model device port") - sys.exit(-1) - -device = sys.argv[2] - -if device == "cpu": - from paddle_serving_server.web_service import WebService -else: - from paddle_serving_server.web_service import WebService - - -class ImageService(WebService): - def init_imagenet_setting(self): - self.seq = Sequential([ - URL2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose( - (2, 0, 1)), Div(255), Normalize([0.485, 0.456, 0.406], - [0.229, 0.224, 0.225], True) - ]) - self.label_dict = {} - label_idx = 0 - with open("imagenet.label") as fin: - for line in fin: - self.label_dict[label_idx] = line.strip() - label_idx += 1 - - def preprocess(self, feed=[], fetch=[]): - feed_batch = [] - is_batch = True - for ins in feed: - if "image" not in ins: - raise ("feed data error!") - img = self.seq(ins["image"]) - feed_batch.append({"image": img[np.newaxis, :]}) - return feed_batch, fetch, is_batch - - def postprocess(self, feed=[], fetch=[], fetch_map={}): - score_list = fetch_map["score"] - result = {"label": [], "prob": []} - for score in score_list: - score = score.tolist() - max_score = max(score) - result["label"].append(self.label_dict[score.index(max_score)] - .strip().replace(",", "")) - result["prob"].append(max_score) - return result - - -image_service = ImageService(name="image") -image_service.load_model_config(sys.argv[1]) -image_service.init_imagenet_setting() -if device == "gpu": - image_service.set_gpus("0") -image_service.prepare_server( - workdir="workdir", port=int(sys.argv[3]), device=device) -image_service.run_rpc_service() -image_service.run_web_service() diff --git a/python/examples/imdb/README.md b/python/examples/imdb/README.md old mode 100644 new mode 100755 index e2b9a74c98e8993f19b14888f3e21343f526b81d..573ac47db37d23406e66fb1605ac60ea58189ffa --- a/python/examples/imdb/README.md +++ b/python/examples/imdb/README.md @@ -9,24 +9,20 @@ sh get_data.sh ``` the package downloaded contains cnn, lstm and bow model config along with their test_data and train_data. -### Start RPC inference service +### Start inference service(Support BRPC-Client/GRPC-Client/Http-Client) ``` -python -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 +python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 ``` -### RPC Infer +### BRPC-Client Infer ``` -head test_data/part-0 | python test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab +head test_data/part-0 | python3 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` it will get predict results of the first 10 test cases. 
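For reference, a minimal sketch of this BRPC flow (it mirrors the HTTP client below and assumes the same `words` / `words.lod` feed layout; the shipped `test_client.py` may differ in details):

```python
# Hedged sketch of an IMDB BRPC client: argv[1] is the client config,
# argv[2] is the imdb.vocab dict file, sentences are read from stdin.
import sys
import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader.imdb_reader import IMDBDataset

client = Client()
client.load_client_config(sys.argv[1])
client.connect(["127.0.0.1:9292"])

imdb_dataset = IMDBDataset()
imdb_dataset.load_resource(sys.argv[2])

for line in sys.stdin:
    # convert the raw review text into word ids with the training-time reader
    word_ids, label = imdb_dataset.get_words_and_label(line)
    word_len = len(word_ids)
    feed = {
        "words": np.array(word_ids).reshape(word_len, 1),
        "words.lod": [0, word_len]
    }
    fetch_map = client.predict(feed=feed, fetch=["prediction"], batch=True)
    print(fetch_map)
```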
-### Start HTTP inference service -``` -python text_classify_service.py imdb_cnn_model/ workdir/ 9292 imdb.vocab -``` -### HTTP Infer +### GRPC-Client/Http-Client Infer ``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:9292/imdb/prediction +head test_data/part-0 | python3 test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` diff --git a/python/examples/imdb/README_CN.md b/python/examples/imdb/README_CN.md old mode 100644 new mode 100755 index a669e29e94f6c6cce238473a8fc33405e29e8471..a1fecc8af35dcd2f5a38f47480b9b80b3cf96054 --- a/python/examples/imdb/README_CN.md +++ b/python/examples/imdb/README_CN.md @@ -9,23 +9,18 @@ sh get_data.sh ``` 脚本会下载和解压出cnn、lstm和bow三种模型的配置文文件以及test_data和train_data。 -### 启动RPC预测服务 +### 启动预测服务(支持BRPC-Client/GRPC-Client/Http-Client) ``` -python -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 +python3 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 ``` -### 执行预测 +### BRPC-Client预测 ``` -head test_data/part-0 | python test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab +head test_data/part-0 | python3 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` 预测test_data/part-0的前十个样例。 -### 启动HTTP预测服务 +### BRPC-Client预测 ``` -python text_classify_service.py imdb_cnn_model/ workdir/ 9292 imdb.vocab -``` -### 执行预测 - -``` -curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:9292/imdb/prediction +head test_data/part-0 | python3 test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab ``` diff --git a/python/examples/imdb/imdb_web_service_demo.sh b/python/examples/imdb/imdb_web_service_demo.sh deleted file mode 100644 index 05d1b729c64359025119e443ed601c902a87ae4d..0000000000000000000000000000000000000000 --- a/python/examples/imdb/imdb_web_service_demo.sh +++ /dev/null @@ -1,5 +0,0 @@ -wget https://paddle-serving.bj.bcebos.com/imdb-demo/imdb_service.tar.gz -tar -xzf imdb_service.tar.gz -wget --no-check-certificate https://fleet.bj.bcebos.com/text_classification_data.tar.gz -tar -zxvf text_classification_data.tar.gz -python text_classify_service.py serving_server_model/ workdir imdb.vocab diff --git a/python/examples/grpc_impl_example/imdb/test_client.py b/python/examples/imdb/test_http_client.py old mode 100644 new mode 100755 similarity index 56% rename from python/examples/grpc_impl_example/imdb/test_client.py rename to python/examples/imdb/test_http_client.py index bddc4d501d346c4cfbb33d743d53e2e0eb3b6b10..5f1f164218f3ed57647b1841d25d71e410c83a57 --- a/python/examples/grpc_impl_example/imdb/test_client.py +++ b/python/examples/imdb/test_http_client.py @@ -12,19 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. # pylint: disable=doc-string-missing -from paddle_serving_client import MultiLangClient as Client +from paddle_serving_client import HttpClient from paddle_serving_app.reader.imdb_reader import IMDBDataset import sys import numpy as np -client = Client() -client.connect(["127.0.0.1:9393"]) +client = HttpClient(ip='127.0.0.1', port='9292') +client.load_client_config(sys.argv[1]) +#client.set_ip('127.0.0.1') +#client.set_port('9292') +''' +if you want use GRPC-client, set_use_grpc_client(True) +or you can directly use client.grpc_client_predict(...) 
+as for HTTP-client,set_use_grpc_client(False)(which is default) +or you can directly use client.http_client_predict(...) +''' +#client.set_use_grpc_client(True) +''' +if you want to enable Encrypt Module,uncommenting the following line +''' +#client.use_key("./key") +''' +if you want to compress,uncommenting the following line +''' +#client.set_response_compress(True) +#client.set_request_compress(True) +''' +we recommend use Proto data format in HTTP-body, set True(which is default) +if you want use JSON data format in HTTP-body, set False +''' +#client.set_http_proto(True) # you can define any english sentence or dataset here # This example reuses imdb reader in training, you # can define your own data preprocessing easily. imdb_dataset = IMDBDataset() -imdb_dataset.load_resource('imdb.vocab') +imdb_dataset.load_resource(sys.argv[2]) for line in sys.stdin: word_ids, label = imdb_dataset.get_words_and_label(line) @@ -33,10 +56,7 @@ for line in sys.stdin: "words": np.array(word_ids).reshape(word_len, 1), "words.lod": [0, word_len] } + #print(feed) fetch = ["prediction"] fetch_map = client.predict(feed=feed, fetch=fetch, batch=True) - if fetch_map["serving_status_code"] == 0: - print(fetch_map) - else: - print(fetch_map["serving_status_code"]) - #print("{} {}".format(fetch_map["prediction"][0], label[0])) + print(fetch_map) diff --git a/python/examples/imdb/text_classify_service.py b/python/examples/imdb/text_classify_service.py deleted file mode 100755 index ca1e26002baf0284f282add235706080f7902c33..0000000000000000000000000000000000000000 --- a/python/examples/imdb/text_classify_service.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=doc-string-missing - -from paddle_serving_server.web_service import WebService -from paddle_serving_app.reader.imdb_reader import IMDBDataset -import sys -import numpy as np - - -class IMDBService(WebService): - def prepare_dict(self, args={}): - if len(args) == 0: - exit(-1) - self.dataset = IMDBDataset() - self.dataset.load_resource(args["dict_file_path"]) - - def preprocess(self, feed={}, fetch=[]): - feed_batch = [] - words_lod = [0] - is_batch = True - for ins in feed: - words = self.dataset.get_words_only(ins["words"]) - words = np.array(words).reshape(len(words), 1) - words_lod.append(words_lod[-1] + len(words)) - feed_batch.append(words) - feed = {"words": np.concatenate(feed_batch), "words.lod": words_lod} - return feed, fetch, is_batch - - -imdb_service = IMDBService(name="imdb") -imdb_service.load_model_config(sys.argv[1]) -imdb_service.prepare_server( - workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu") -imdb_service.prepare_dict({"dict_file_path": sys.argv[4]}) -imdb_service.run_rpc_service() -imdb_service.run_web_service() diff --git a/python/examples/lac/README.md b/python/examples/lac/README.md old mode 100644 new mode 100755 index 8d7adfb583f8e8e1fde0681a73f2bba65452fa87..108d5051b50b2b639e28c023364d36ec9a0a0a44 --- a/python/examples/lac/README.md +++ b/python/examples/lac/README.md @@ -4,28 +4,23 @@ ### Get Model ``` -python -m paddle_serving_app.package --get_model lac +python3 -m paddle_serving_app.package --get_model lac tar -xzvf lac.tar.gz ``` -#### Start RPC inference service +#### Start inference service(Support BRPC-Client/GRPC-Client/Http-Client) ``` -python -m paddle_serving_server.serve --model lac_model/ --port 9292 +python3 -m paddle_serving_server.serve --model lac_model/ --port 9292 ``` -### RPC Infer +### BRPC Infer ``` -echo "我爱北京天安门" | python lac_client.py lac_client/serving_client_conf.prototxt +echo "我爱北京天安门" | python3 lac_client.py lac_client/serving_client_conf.prototxt ``` It will get the segmentation result. 
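The `lac_client.py` invoked in the BRPC step above is not included in this patch. As a rough sketch only, a minimal BRPC client for this service could look like the code below; it reuses the feed layout that `lac_http_client.py` (later in this diff) sends and assumes the standard `paddle_serving_client.Client` API, with the server from the previous step listening on port 9292.

```
# Hypothetical minimal BRPC client for the LAC service (not the actual lac_client.py).
import sys
import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import LACReader

client = Client()
client.load_client_config("lac_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

reader = LACReader()
for line in sys.stdin:
    feed_data = reader.process(line)  # convert the sentence into word ids
    if len(feed_data) <= 0:
        continue
    fetch_map = client.predict(
        feed={
            "words": np.array(feed_data).reshape(len(feed_data), 1),
            "words.lod": [0, len(feed_data)]
        },
        fetch=["crf_decode"],
        batch=True)
    print(fetch_map)  # raw crf_decode tags; LACReader.parse_result can map them to segments
```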
-### Start HTTP inference service
+### GRPC/Http Infer
 ```
-python lac_web_service.py lac_model/ lac_workdir 9292
-```
-### HTTP Infer
-
-```
-curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction
+echo "我爱北京天安门" | python3 lac_http_client.py lac_client/serving_client_conf.prototxt
 ```
diff --git a/python/examples/lac/README_CN.md b/python/examples/lac/README_CN.md
old mode 100644
new mode 100755
index 2379aa8ed69c026c6afd94b8b791774882eaf567..5634128c80c23126836677f4cb434df68dde9056
--- a/python/examples/lac/README_CN.md
+++ b/python/examples/lac/README_CN.md
@@ -4,28 +4,23 @@
 ### 获取模型
 ```
-python -m paddle_serving_app.package --get_model lac
+python3 -m paddle_serving_app.package --get_model lac
 tar -xzvf lac.tar.gz
 ```
-#### 开启RPC预测服务
+#### 开启预测服务(支持BRPC-Client/GRPC-Client/Http-Client)
 ```
-python -m paddle_serving_server.serve --model lac_model/ --port 9292
+python3 -m paddle_serving_server.serve --model lac_model/ --port 9292
 ```
-### 执行RPC预测
+### 执行BRPC预测
 ```
-echo "我爱北京天安门" | python lac_client.py lac_client/serving_client_conf.prototxt
+echo "我爱北京天安门" | python3 lac_client.py lac_client/serving_client_conf.prototxt
 ```
 我们就能得到分词结果
-### 开启HTTP预测服务
+### 执行GRPC/Http预测
 ```
-python lac_web_service.py lac_model/ lac_workdir 9292
-```
-### 执行HTTP预测
-
-```
-curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:9292/lac/prediction
+echo "我爱北京天安门" | python3 lac_http_client.py lac_client/serving_client_conf.prototxt
 ```
diff --git a/python/examples/lac/lac_http_client.py b/python/examples/lac/lac_http_client.py
old mode 100644
new mode 100755
index 852d785f368e95bb16bfd5804e3153b022945f59..e894addca3e8b5aebc83804c9e171900bc7e6b65
--- a/python/examples/lac/lac_http_client.py
+++ b/python/examples/lac/lac_http_client.py
@@ -1,3 +1,4 @@
+# encoding=utf-8
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,17 +12,56 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#coding=utf-8
-import requests
-import json
-import time
+# pylint: disable=doc-string-missing
 
-if __name__ == "__main__":
-    server = "http://127.0.0.1:9280/lac/prediction"
-    fin = open("jieba_test.txt", "r")
-    start = time.time()
-    for line in fin:
-        req_data = {"words": line.strip(), "fetch": ["crf_decode"]}
-        r = requests.post(server, json=req_data)
-    end = time.time()
-    print(end - start)
+from paddle_serving_client import HttpClient
+from paddle_serving_app.reader import LACReader
+import sys
+import os
+import io
+import numpy as np
+
+client = HttpClient(ip='127.0.0.1', port='9292')
+client.load_client_config(sys.argv[1])
+#client.set_ip('127.0.0.1')
+#client.set_port('9292')
+'''
+if you want to use the GRPC client, call set_use_grpc_client(True)
+or directly call client.grpc_client_predict(...);
+for the HTTP client, call set_use_grpc_client(False) (the default)
+or directly call client.http_client_predict(...)
+'''
+#client.set_use_grpc_client(True)
+'''
+if you want to enable the encryption module, uncomment the following line
+'''
+#client.use_key("./key")
+'''
+if you want to enable compression, uncomment the following lines
+'''
+#client.set_response_compress(True)
+#client.set_request_compress(True)
+'''
+we recommend the Proto data format in the HTTP body, set True (the default);
+if you want the JSON data format in the HTTP body, set False
+'''
+#client.set_http_proto(True)
+
+reader = LACReader()
+for line in sys.stdin:
+    if len(line) <= 0:
+        continue
+    feed_data = reader.process(line)
+    if len(feed_data) <= 0:
+        continue
+    print(feed_data)
+    #fetch_map = client.predict(feed={"words": np.array(feed_data).reshape(len(feed_data), 1), "words.lod": [0, len(feed_data)]}, fetch=["crf_decode"], batch=True)
+    fetch_map = client.predict(
+        feed={
+            "words": np.array(feed_data + feed_data).reshape(
+                len(feed_data) * 2, 1),
+            "words.lod": [0, len(feed_data), 2 * len(feed_data)]
+        },
+        fetch=["crf_decode"],
+        batch=True)
+    print(fetch_map)
diff --git a/python/examples/lac/lac_web_service.py b/python/examples/lac/lac_web_service.py
deleted file mode 100644
index cf37f66294bd154324f2c7cacd1a35571b6c6350..0000000000000000000000000000000000000000
--- a/python/examples/lac/lac_web_service.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from paddle_serving_server.web_service import WebService -import sys -from paddle_serving_app.reader import LACReader -import numpy as np - - -class LACService(WebService): - def load_reader(self): - self.reader = LACReader() - - def preprocess(self, feed={}, fetch=[]): - feed_batch = [] - fetch = ["crf_decode"] - lod_info = [0] - is_batch = True - for ins in feed: - if "words" not in ins: - raise ("feed data error!") - feed_data = self.reader.process(ins["words"]) - feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1)) - lod_info.append(lod_info[-1] + len(feed_data)) - feed_dict = { - "words": np.concatenate( - feed_batch, axis=0), - "words.lod": lod_info - } - return feed_dict, fetch, is_batch - - def postprocess(self, feed={}, fetch=[], fetch_map={}): - batch_ret = [] - for idx, ins in enumerate(feed): - begin = fetch_map['crf_decode.lod'][idx] - end = fetch_map['crf_decode.lod'][idx + 1] - segs = self.reader.parse_result(ins["words"], - fetch_map["crf_decode"][begin:end]) - batch_ret.append({"word_seg": "|".join(segs)}) - return batch_ret - - -lac_service = LACService(name="lac") -lac_service.load_model_config(sys.argv[1]) -lac_service.load_reader() -lac_service.prepare_server( - workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu") -lac_service.run_rpc_service() -lac_service.run_web_service() diff --git a/python/examples/low_precision/resnet50/README.md b/python/examples/low_precision/resnet50/README.md index 9e1ff16c676b067437183e6e19446e8a526feed5..b4ae2552c3dcd1c30c67b5731d81095e05ca9a86 100644 --- a/python/examples/low_precision/resnet50/README.md +++ b/python/examples/low_precision/resnet50/README.md @@ -11,15 +11,15 @@ Firstly, download the [Resnet50 int8 model](https://paddle-inference-dist.bj.bce wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz tar zxvf ResNet50_quant.tar.gz -python -m paddle_serving_client.convert --dirname ResNet50_quant +python3 -m paddle_serving_client.convert --dirname ResNet50_quant ``` Start RPC service, specify the GPU id and precision mode ``` -python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 +python3 -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 ``` Request the serving service with Client ``` -python resnet50_client.py +python3 resnet50_client.py ``` ## Reference diff --git a/python/examples/low_precision/resnet50/README_CN.md b/python/examples/low_precision/resnet50/README_CN.md index 1c1a3be1de1690e9736d994016ac05cfba12bcab..648b64dd2b0a5089ce8539c42c0222862e89d8f3 100644 --- a/python/examples/low_precision/resnet50/README_CN.md +++ b/python/examples/low_precision/resnet50/README_CN.md @@ -10,15 +10,15 @@ wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz tar zxvf ResNet50_quant.tar.gz -python -m paddle_serving_client.convert --dirname ResNet50_quant +python3 -m paddle_serving_client.convert --dirname ResNet50_quant ``` 启动rpc服务, 设定所选GPU id、部署模型精度 ``` -python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 +python3 -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 ``` 使用client进行请求 ``` -python resnet50_client.py +python3 resnet50_client.py ``` ## 参考文档 diff --git a/python/examples/mobilenet/README.md b/python/examples/mobilenet/README.md index 
4a808026af0ca5cc1920a292c3f85c82962a3f41..1a16b749220bdf8e6db0dd8950fc505620cbc8fc 100644 --- a/python/examples/mobilenet/README.md +++ b/python/examples/mobilenet/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model mobilenet_v2_imagenet +python3 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet tar -xzvf mobilenet_v2_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 ``` ### Client Prediction ``` -python mobilenet_tutorial.py +python3 mobilenet_tutorial.py ``` diff --git a/python/examples/mobilenet/README_CN.md b/python/examples/mobilenet/README_CN.md index d4f91837ec5e03c4ef32041580e5d6b30039480e..68474e5d80afdec183cb5bac0e9ebfc13a7f9ac6 100644 --- a/python/examples/mobilenet/README_CN.md +++ b/python/examples/mobilenet/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model mobilenet_v2_imagenet +python3 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet tar -xzvf mobilenet_v2_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 ``` ### 客户端预测 ``` -python mobilenet_tutorial.py +python3 mobilenet_tutorial.py ``` diff --git a/python/examples/ocr/README.md b/python/examples/ocr/README.md old mode 100755 new mode 100644 index 605b4abe7c12cf2a9b0d8d0d02a2fe9c04b76723..630f01d999943b9948e153430b30d80fbabd0549 --- a/python/examples/ocr/README.md +++ b/python/examples/ocr/README.md @@ -4,9 +4,9 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model ocr_rec +python3 -m paddle_serving_app.package --get_model ocr_rec tar -xzvf ocr_rec.tar.gz -python -m paddle_serving_app.package --get_model ocr_det +python3 -m paddle_serving_app.package --get_model ocr_det tar -xzvf ocr_det.tar.gz ``` @@ -23,16 +23,16 @@ tar xf test_imgs.tar ``` #choose one of cpu/gpu commands as following #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 -python ocr_web_server.py cpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 +python3 ocr_web_server.py cpu #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0 -python ocr_web_server.py gpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 +python3 ocr_web_server.py gpu ``` ### Client Prediction ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` If you want a faster web service, please try Web LocalPredictor Service @@ -40,14 +40,14 @@ If you want a faster web service, please try Web LocalPredictor Service ``` #choose one of cpu/gpu commands as following #for cpu user -python ocr_debugger_server.py cpu +python3 ocr_debugger_server.py cpu #for gpu user -python ocr_debugger_server.py gpu +python3 ocr_debugger_server.py gpu ``` ## Web LocalPredictor Client Prediction ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` ## Benchmark @@ -69,34 +69,34 @@ if you are going to detect images not recognize it or directly recognize the wor ### Det Server ``` -python det_web_server.py cpu #for cpu user -python det_web_server.py gpu #for 
gpu user +python3 det_web_server.py cpu #for cpu user +python3 det_web_server.py gpu #for gpu user #or -python det_debugger_server.py cpu #for cpu user -python det_debugger_server.py gpu #for gpu user +python3 det_debugger_server.py cpu #for cpu user +python3 det_debugger_server.py gpu #for gpu user ``` ### Det Client ``` # also use ocr_web_client.py -python ocr_web_client.py +python3 ocr_web_client.py ``` ### Rec Server ``` -python rec_web_server.py cpu #for cpu user -python rec_web_server.py gpu #for gpu user +python3 rec_web_server.py cpu #for cpu user +python3 rec_web_server.py gpu #for gpu user #or -python rec_debugger_server.py cpu #for cpu user -python rec_debugger_server.py gpu #for gpu user +python3 rec_debugger_server.py cpu #for cpu user +python3 rec_debugger_server.py gpu #for gpu user ``` ### Rec Client ``` -python rec_web_client.py +python3 rec_web_client.py ``` ## C++ OCR Service @@ -109,9 +109,9 @@ Select a startup mode according to CPU / GPU device After the -- model parameter, the folder path of multiple model files is passed in to start the prediction service of multiple model concatenation. ``` #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 ``` ### Client Prediction @@ -123,5 +123,5 @@ for this case, `feed_type` should be 3(which means the data type is string),`sha By passing in multiple client folder paths, the client can be started for multi model prediction. ``` -python ocr_cpp_client.py ocr_det_client ocr_rec_client +python3 ocr_cpp_client.py ocr_det_client ocr_rec_client ``` diff --git a/python/examples/ocr/README_CN.md b/python/examples/ocr/README_CN.md old mode 100755 new mode 100644 index ad7ddcee21dd2f514d2ab8f63a732ee93349abac..421a4b930507abd3d36ef6db737f85a060647ced --- a/python/examples/ocr/README_CN.md +++ b/python/examples/ocr/README_CN.md @@ -4,9 +4,9 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model ocr_rec +python3 -m paddle_serving_app.package --get_model ocr_rec tar -xzvf ocr_rec.tar.gz -python -m paddle_serving_app.package --get_model ocr_det +python3 -m paddle_serving_app.package --get_model ocr_det tar -xzvf ocr_det.tar.gz ``` ## 获取数据集(可选) @@ -22,16 +22,16 @@ tar xf test_imgs.tar ``` #根据CPU/GPU设备选择一种启动方式 #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 -python ocr_web_server.py cpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 +python3 ocr_web_server.py cpu #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0 -python ocr_web_server.py gpu +python3 -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_ids 0 +python3 ocr_web_server.py gpu ``` ### 启动客户端 ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` 如果用户需要更快的执行速度,请尝试LocalPredictor版Web服务 @@ -39,14 +39,14 @@ python ocr_web_client.py ``` #根据CPU/GPU设备选择一种启动方式 #for cpu user -python ocr_debugger_server.py cpu +python3 ocr_debugger_server.py cpu #for gpu user -python ocr_debugger_server.py gpu +python3 ocr_debugger_server.py gpu ``` ## 启动客户端 ``` -python ocr_web_client.py +python3 ocr_web_client.py ``` ## 性能指标 @@ -69,34 +69,34 @@ GPU: Nvidia Tesla V100单卡 ### 启动检测服务 ``` -python det_web_server.py cpu #for cpu 
user -python det_web_server.py gpu #for gpu user +python3 det_web_server.py cpu #for cpu user +python3 det_web_server.py gpu #for gpu user #or -python det_debugger_server.py cpu #for cpu user -python det_debugger_server.py gpu #for gpu user +python3 det_debugger_server.py cpu #for cpu user +python3 det_debugger_server.py gpu #for gpu user ``` ### 检测服务客户端 ``` # also use ocr_web_client.py -python ocr_web_client.py +python3 ocr_web_client.py ``` ### 启动识别服务 ``` -python rec_web_server.py cpu #for cpu user -python rec_web_server.py gpu #for gpu user +python3 rec_web_server.py cpu #for cpu user +python3 rec_web_server.py gpu #for gpu user #or -python rec_debugger_server.py cpu #for cpu user -python rec_debugger_server.py gpu #for gpu user +python3 rec_debugger_server.py cpu #for cpu user +python3 rec_debugger_server.py gpu #for gpu user ``` ### 识别服务客户端 ``` -python rec_web_client.py +python3 rec_web_client.py ``` ## C++ OCR Service服务 @@ -108,9 +108,9 @@ python rec_web_client.py 通过--model后,指定多个模型文件的文件夹路径来启动多模型串联的预测服务。 ``` #for cpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 #for gpu user -python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 +python3 -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_ids 0 ``` ### 启动客户端 @@ -122,5 +122,5 @@ python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 通过在客户端启动后加入多个client模型的client配置文件夹路径,启动client进行预测。 ``` -python ocr_cpp_client.py ocr_det_client ocr_rec_client +python3 ocr_cpp_client.py ocr_det_client ocr_rec_client ``` diff --git a/python/examples/ocr/det_debugger_server.py b/python/examples/ocr/det_debugger_server.py index 8c8305012368d1307ce75983233f6d3af8e35a76..5b40fe9372a56b2b663c1bfeff02619a8ec9730b 100644 --- a/python/examples/ocr/det_debugger_server.py +++ b/python/examples/ocr/det_debugger_server.py @@ -71,8 +71,7 @@ ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_det_model") if sys.argv[1] == 'gpu': ocr_service.set_gpus("0") - ocr_service.prepare_server( - workdir="workdir", port=9292, device="gpu", gpuid=0) + ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu") elif sys.argv[1] == 'cpu': ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.init_det() diff --git a/python/examples/ocr/det_web_server.py b/python/examples/ocr/det_web_server.py index c72dc6af964694ee2d364add100d52930576b798..d38686e5a86c4f2df45db7f495a8c08a72270919 100644 --- a/python/examples/ocr/det_web_server.py +++ b/python/examples/ocr/det_web_server.py @@ -70,8 +70,7 @@ ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_det_model") if sys.argv[1] == 'gpu': ocr_service.set_gpus("0") - ocr_service.prepare_server( - workdir="workdir", port=9292, device="gpu", gpuid=0) + ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu") elif sys.argv[1] == 'cpu': ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.init_det() diff --git a/python/examples/ocr/ocr_cpp_client.py b/python/examples/ocr/ocr_cpp_client.py old mode 100755 new mode 100644 diff --git a/python/examples/ocr/ocr_web_server.py b/python/examples/ocr/ocr_web_server.py index 56cacc0e38fb3aac3d64d2b0e79adfae600e01dc..58fc850c94a5e8d2f37ae5d03f14b60d343a2203 100644 --- a/python/examples/ocr/ocr_web_server.py +++ b/python/examples/ocr/ocr_web_server.py @@ -95,8 +95,7 @@ 
ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_rec_model") if sys.argv[1] == 'gpu': ocr_service.set_gpus("0") - ocr_service.prepare_server( - workdir="workdir", port=9292, device="gpu", gpuid=0) + ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu") elif sys.argv[1] == 'cpu': ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.init_det_client( diff --git a/python/examples/ocr/rec_debugger_server.py b/python/examples/ocr/rec_debugger_server.py index d7d663fe3ba7dd2f157c5e856f55aa82d1159e0a..f84463238af859a00983f515e405686c00fdf9fa 100644 --- a/python/examples/ocr/rec_debugger_server.py +++ b/python/examples/ocr/rec_debugger_server.py @@ -71,8 +71,7 @@ ocr_service.load_model_config("ocr_rec_model") if sys.argv[1] == 'gpu': ocr_service.set_gpus("0") ocr_service.init_rec() - ocr_service.prepare_server( - workdir="workdir", port=9292, device="gpu", gpuid=0) + ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu") elif sys.argv[1] == 'cpu': ocr_service.init_rec() ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") diff --git a/python/examples/ocr/rec_web_server.py b/python/examples/ocr/rec_web_server.py index 61669fddf34fe361460c6d18049905e7e3400db0..2db6e398d3a025e739761fabd50c5bb8a6609f07 100644 --- a/python/examples/ocr/rec_web_server.py +++ b/python/examples/ocr/rec_web_server.py @@ -73,8 +73,7 @@ ocr_service.load_model_config("ocr_rec_model") ocr_service.init_rec() if sys.argv[1] == 'gpu': ocr_service.set_gpus("0") - ocr_service.prepare_server( - workdir="workdir", port=9292, device="gpu", gpuid=0) + ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu") elif sys.argv[1] == 'cpu': ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.run_rpc_service() diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/README.md b/python/examples/pipeline/PaddleClas/DarkNet53/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/README.md +++ b/python/examples/pipeline/PaddleClas/DarkNet53/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md b/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md +++ b/python/examples/pipeline/PaddleClas/DarkNet53/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py b/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py index d643b90f5b7ac6ef6892bb83e7dfb20b650df49b..71b5219441a536789e02e4549c84a5cd550bc70f 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py +++ b/python/examples/pipeline/PaddleClas/DarkNet53/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- 
a/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/DarkNet53/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/DarkNet53/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/DarkNet53/resnet50_web_service.py index c16966e3c4d9481b03344bc51abcd2a2090e5bb7..4fadfd7959ab548c2c88994a36604b2abb7db6d2 100644 --- a/python/examples/pipeline/PaddleClas/DarkNet53/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/DarkNet53/resnet50_web_service.py @@ -46,7 +46,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py b/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py index 6469657ac297c09f57f08c9cbafb806f62214fea..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/HRNet_W18_C/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/README.md b/python/examples/pipeline/PaddleClas/MobileNetV1/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/README.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md b/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py b/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py index 6469657ac297c09f57f08c9cbafb806f62214fea..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV1/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/README.md b/python/examples/pipeline/PaddleClas/MobileNetV2/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/README.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md b/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py b/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py index 6469657ac297c09f57f08c9cbafb806f62214fea..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV2/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- 
a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py index 6469657ac297c09f57f08c9cbafb806f62214fea..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/MobileNetV3_large_x1_0/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/benchmark.py @@ -1,3 +1,17 
@@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py index 
34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py index 6469657ac297c09f57f08c9cbafb806f62214fea..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNeXt101_vd_64x4d/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 
2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 
100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py index 6469657ac297c09f57f08c9cbafb806f62214fea..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- 
a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py index 668b119273df6ab351e5234badb98b41bef87c1e..43dac2a27c64d79f85f73011755c418cc6a59f1e 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_FPGM/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["save_infer_model/scale_0.tmp_1"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- 
a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py index 359ea1ed817c7117f64e68fd8a984aa0e7bf5f60..569b15bcfa61a1a1732de303e2980e9b4387c9a0 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_KL/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"inputs": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["save_infer_model/scale_0.tmp_0"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- 
a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py index 2c35ab72255fe2fabd9c83d7a3bd152b744bdd8e..debc1753cc9174dd79bf3a0072681b352c8be17b 100644 --- a/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet50_vd_PACT/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"inputs": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["save_infer_model/scale_0.tmp_1"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md index 5b909301d9e114019ae8c6ac2bbfcec3cb188b33..1297abfb7a649e3eced26ea4c08848e0a51fbdbf 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README.md @@ -4,17 +4,17 @@ This document will takes Imagenet service as an example to introduce how to use ## Get model ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md index cc2fcdd7514fc197ec892826ec56b76906150578..d547b289281cb13a3abb49343b6b77230a2f3d2c 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/README_CN.md @@ -4,18 +4,17 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git 
a/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py b/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py index 98babc4acddb9a548afeafed1dfee16a88244714..562d159da3ce96233f7f9d2019fbb3061022dc06 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18000/imagenet/prediction" + url = "http://127.0.0.1:18000/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = 
int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..82a570244cecc51061a38b64c25602f8dfbe931d 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/pipeline_rpc_client.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ResNet_V2_50/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ResNet_V2_50/resnet50_web_service.py index 8b3815f7d7d1397ffd7618048a43a21b5b3123e0..6a7213b7abd0ddf892f64e81f96601205e5b249c 100644 --- a/python/examples/pipeline/PaddleClas/ResNet_V2_50/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ResNet_V2_50/resnet50_web_service.py @@ -46,7 +46,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["score"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md index d0fa99e6d72f10d3d2b5907285528b68685128e0..6fbe0c4cf3a635670341d5aee4cee8bcbdc59a88 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README.md @@ -10,10 +10,10 @@ sh get_model.sh ## Start server ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## RPC test ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md index 335c96b2144b17e20d6007f376dec4416fb10aa5..c204c3c662825ed26001cf6d444d94f0bab508f7 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/README_CN.md @@ -10,11 +10,10 @@ sh get_model.sh ## 启动服务 ``` -python resnet50_web_service.py &>log.txt & +python3 resnet50_web_service.py &>log.txt & ``` ## 测试 ``` -python pipeline_rpc_client.py +python3 pipeline_rpc_client.py ``` - diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py index 2433b0132728dc96627254f9231949a74a551c28..90a3ff9bdda545a01427a26146edcbdf8332da30 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import base64 @@ -5,13 +19,13 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -24,6 +38,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -34,15 +49,17 @@ def gen_yml(device, gpu_id): config["op"]["imagenet"]["local_service_conf"]["devices"] = gpu_id else: config["op"]["imagenet"]["local_service_conf"]["device_type"] = 0 - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18080/imagenet/prediction" + url = "http://127.0.0.1:18080/imagenet/prediction" start = time.time() with open(os.path.join(".", "daisy.jpg"), 'rb') as file: @@ -68,6 +85,7 @@ def run_http(idx, batch_size): end = time.time() return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -87,6 +105,7 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): client = PipelineClient() client.connect(['127.0.0.1:18080']) @@ -107,11 +126,12 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] if device == "gpu": @@ -120,7 +140,7 @@ if __name__ == "__main__": gpu_id = None gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -131,4 +151,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py index 34a08f4b5d1ec2861c3101685b434453d61156de..a816eb8eed49d922d5caf729dfd089fc28936853 100644 --- 
a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/pipeline_rpc_client.py @@ -11,10 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient +from paddle_serving_server.pipeline import PipelineClient import numpy as np import requests import json diff --git a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py index 6469657ac297c09f57f08c9cbafb806f62214fea..c246e45db331925e47b8d026f4801c5acf5f2ae7 100644 --- a/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py +++ b/python/examples/pipeline/PaddleClas/ShuffleNetV2_x1_0/resnet50_web_service.py @@ -13,10 +13,8 @@ # limitations under the License. import sys from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage -try: - from paddle_serving_server_gpu.web_service import WebService, Op -except ImportError: - from paddle_serving_server.web_service import WebService, Op + +from paddle_serving_server.web_service import WebService, Op import logging import numpy as np import base64, cv2 @@ -49,7 +47,7 @@ class ImagenetOp(Op): input_imgs = np.concatenate(imgs, axis=0) return {"image": input_imgs}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): score_list = fetch_dict["prediction"] result = {"label": [], "prob": []} for score in score_list: diff --git a/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md b/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md index 4d242be2f3f7550c3bb64053a5689894a6b2c76c..a56ecbef06d82eef59510a1242de7f19c0915d55 100644 --- a/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md +++ b/python/examples/pipeline/PaddleDetection/faster_rcnn/README.md @@ -8,11 +8,11 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf faster_rcnn_r50_fpn_1x_coco.tar -python web_service.py +python3 web_service.py ``` ### Perform prediction ``` -python pipeline_http_client.py +python3 pipeline_http_client.py ``` diff --git a/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py b/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py index f0a55614c1390b1d4f73bd015b1ce21b85e4ba55..8a25952cdda2e09f0f74794cf8a2226880f29040 100644 --- a/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py +++ b/python/examples/pipeline/PaddleDetection/faster_rcnn/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import yaml @@ -6,17 +20,17 @@ import time import json import cv2 import base64 -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -29,6 +43,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -36,10 +51,11 @@ def gen_yml(device, gpu_id): config["dag"]["tracer"] = {"interval_s": 30} if device == "gpu": config["op"]["faster_rcnn"]["local_service_conf"]["device_type"] = 1 - config["op"]["faster_rcnn"]["local_service_conf"]["devices"] = gpu_id - with open("config2.yml", "w") as fout: + config["op"]["faster_rcnn"]["local_service_conf"]["devices"] = gpu_id + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) url = "http://127.0.0.1:18082/faster_rcnn/prediction" @@ -65,6 +81,7 @@ def run_http(idx, batch_size): break return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): pass + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gpu_id = sys.argv[5] gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -110,4 +130,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py b/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py index 691d647befa8e1c583a53121e89ab5f2859f64b7..fa026000e399cf0246df4afa2a37005d40d53d70 100644 --- a/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py +++ b/python/examples/pipeline/PaddleDetection/faster_rcnn/web_service.py @@ -19,6 +19,7 @@ import cv2 from paddle_serving_app.reader import * import base64 + class FasterRCNNOp(Op): def init_op(self): self.img_preprocess = Sequential([ @@ -38,22 +39,30 @@ class FasterRCNNOp(Op): im = cv2.imdecode(data, cv2.IMREAD_COLOR) im = self.img_preprocess(im) imgs.append({ - "image": im[np.newaxis,:], - "im_shape": np.array(list(im.shape[1:])).reshape(-1)[np.newaxis,:], - "scale_factor": np.array([1.0, 1.0]).reshape(-1)[np.newaxis,:], + "image": im[np.newaxis, :], + "im_shape": + 
np.array(list(im.shape[1:])).reshape(-1)[np.newaxis, :], + "scale_factor": np.array([1.0, 1.0]).reshape(-1)[np.newaxis, :], }) feed_dict = { - "image": np.concatenate([x["image"] for x in imgs], axis=0), - "im_shape": np.concatenate([x["im_shape"] for x in imgs], axis=0), - "scale_factor": np.concatenate([x["scale_factor"] for x in imgs], axis=0) + "image": np.concatenate( + [x["image"] for x in imgs], axis=0), + "im_shape": np.concatenate( + [x["im_shape"] for x in imgs], axis=0), + "scale_factor": np.concatenate( + [x["scale_factor"] for x in imgs], axis=0) } #for key in feed_dict.keys(): # print(key, feed_dict[key].shape) return feed_dict, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): #print(fetch_dict) - res_dict = {"bbox_result": str(self.img_postprocess(fetch_dict, visualize=False))} + res_dict = { + "bbox_result": + str(self.img_postprocess( + fetch_dict, visualize=False)) + } return res_dict, None, "" diff --git a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md index a37ca74056fb9dcd4a609f87f914e1ac71df070d..73087efca7abc75d9ed7d6178d962911b9a2b1cb 100644 --- a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md +++ b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/README.md @@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf ppyolo_mbv3_large_coco.tar -python web_service.py +python3 web_service.py ``` ### Perform prediction ``` -python pipeline_http_client.py +python3 pipeline_http_client.py ``` - diff --git a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py index a23f64314ef448f2617f92ab40f94f75cc6e707f..45853c065b013754d0d591686a9a03ad0aeb6a3d 100644 --- a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py +++ b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import yaml @@ -6,17 +20,17 @@ import time import json import cv2 import base64 -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -29,6 +43,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -37,9 +52,10 @@ def gen_yml(device, gpu_id): if device == "gpu": config["op"]["ppyolo_mbv3"]["local_service_conf"]["device_type"] = 1 config["op"]["ppyolo_mbv3"]["local_service_conf"]["devices"] = gpu_id - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) url = "http://127.0.0.1:18082/ppyolo_mbv3/prediction" @@ -65,6 +81,7 @@ def run_http(idx, batch_size): break return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): pass + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gpu_id = sys.argv[5] gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -110,4 +130,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/web_service.py b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/web_service.py index 8611b0671862a887efd1705b3c1a922db906581d..1cfa0aee793d1a6fa22f109284c426b1e7676e0b 100644 --- a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/web_service.py +++ b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/web_service.py @@ -19,6 +19,7 @@ import cv2 from paddle_serving_app.reader import * import base64 + class PPYoloMbvOp(Op): def init_op(self): self.img_preprocess = Sequential([ @@ -38,23 +39,31 @@ class PPYoloMbvOp(Op): im = cv2.imdecode(data, cv2.IMREAD_COLOR) im = self.img_preprocess(im) imgs.append({ - "image": im[np.newaxis,:], - "im_shape": np.array(list(im.shape[1:])).reshape(-1)[np.newaxis,:], - "scale_factor": np.array([1.0, 1.0]).reshape(-1)[np.newaxis,:], + "image": im[np.newaxis, :], + "im_shape": + np.array(list(im.shape[1:])).reshape(-1)[np.newaxis, :], + "scale_factor": np.array([1.0, 1.0]).reshape(-1)[np.newaxis, :], }) feed_dict = { - "image": np.concatenate([x["image"] for x in imgs], axis=0), - "im_shape": np.concatenate([x["im_shape"] 
for x in imgs], axis=0), - "scale_factor": np.concatenate([x["scale_factor"] for x in imgs], axis=0) + "image": np.concatenate( + [x["image"] for x in imgs], axis=0), + "im_shape": np.concatenate( + [x["im_shape"] for x in imgs], axis=0), + "scale_factor": np.concatenate( + [x["scale_factor"] for x in imgs], axis=0) } for key in feed_dict.keys(): print(key, feed_dict[key].shape) return feed_dict, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): #print(fetch_dict) - res_dict = {"bbox_result": str(self.img_postprocess(fetch_dict, visualize=False))} + res_dict = { + "bbox_result": + str(self.img_postprocess( + fetch_dict, visualize=False)) + } return res_dict, None, "" diff --git a/python/examples/pipeline/PaddleDetection/yolov3/README.md b/python/examples/pipeline/PaddleDetection/yolov3/README.md index 1a1431a2a90d404c23728e5515c00aebce0fa4a7..8340f1060d0be6b100575ecbcb0270db0a6227f4 100644 --- a/python/examples/pipeline/PaddleDetection/yolov3/README.md +++ b/python/examples/pipeline/PaddleDetection/yolov3/README.md @@ -10,11 +10,10 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ### Start the service ``` tar xf yolov3_darknet53_270e_coco.tar -python web_service.py +python3 web_service.py ``` ### Perform prediction ``` -python pipeline_http_client.py +python3 pipeline_http_client.py ``` - diff --git a/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py b/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py index ae9c5a8fb25f56eebe3c3893a4a4d251f21e5b61..62732613dbfc6ab0b119609a547ea36c18b11ede 100644 --- a/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py +++ b/python/examples/pipeline/PaddleDetection/yolov3/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
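The benchmark scripts above all follow the same `gen_yml` pattern: read `config.yml`, enable the tracer, switch the op's `local_service_conf` to GPU (`device_type = 1`, `devices = <gpu_id>`), and write the result to `config2.yml`. Below is a standalone sketch of that rewrite, assuming a `config.yml` with the same layout; it uses `yaml.safe_load` where the scripts call bare `yaml.load`.

```python
# Standalone sketch of the gen_yml() pattern used by the benchmark scripts
# above: load config.yml, enable the tracer, switch one op to GPU, and write
# config2.yml. "yolov3" is the op name from the example; adjust as needed.
import yaml


def gen_gpu_config(op_name="yolov3", gpu_id="0"):
    with open("config.yml", "r") as fin:
        config = yaml.safe_load(fin)  # the scripts use yaml.load(fin)
    config["dag"]["tracer"] = {"interval_s": 30}
    config["op"][op_name]["local_service_conf"]["device_type"] = 1  # 1 = GPU
    config["op"][op_name]["local_service_conf"]["devices"] = gpu_id
    with open("config2.yml", "w") as fout:
        yaml.dump(config, fout, default_flow_style=False)


if __name__ == "__main__":
    gen_gpu_config()
```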
+ import sys import os import yaml @@ -6,17 +20,17 @@ import time import json import cv2 import base64 -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def cv2_to_base64(image): return base64.b64encode(image).decode('utf8') + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -29,6 +43,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device, gpu_id): fin = open("config.yml", "r") config = yaml.load(fin) @@ -36,10 +51,11 @@ def gen_yml(device, gpu_id): config["dag"]["tracer"] = {"interval_s": 30} if device == "gpu": config["op"]["yolov3"]["local_service_conf"]["device_type"] = 1 - config["op"]["yolov3"]["local_service_conf"]["devices"] = gpu_id - with open("config2.yml", "w") as fout: + config["op"]["yolov3"]["local_service_conf"]["devices"] = gpu_id + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) url = "http://127.0.0.1:18082/yolov3/prediction" @@ -65,6 +81,7 @@ def run_http(idx, batch_size): break return [[end - start], latency_list, [total_num]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() start = time.time() @@ -84,22 +101,25 @@ def multithread_http(thread, batch_size): total_cost)) show_latency(result[1]) + def run_rpc(thread, batch_size): pass + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gpu_id = sys.argv[5] gen_yml(device, gpu_id) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -110,4 +130,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/PaddleDetection/yolov3/web_service.py b/python/examples/pipeline/PaddleDetection/yolov3/web_service.py index d28c22b9cc1060af29b6b31140911fc848bdec28..fa55f78067118184ae5b5541c1bc1fe36db617a0 100644 --- a/python/examples/pipeline/PaddleDetection/yolov3/web_service.py +++ b/python/examples/pipeline/PaddleDetection/yolov3/web_service.py @@ -19,6 +19,7 @@ import cv2 from paddle_serving_app.reader import * import base64 + class Yolov3Op(Op): def init_op(self): self.img_preprocess = Sequential([ @@ -38,22 +39,30 @@ class Yolov3Op(Op): im = cv2.imdecode(data, cv2.IMREAD_COLOR) im = self.img_preprocess(im) imgs.append({ - "image": im[np.newaxis,:], - "im_shape": np.array(list(im.shape[1:])).reshape(-1)[np.newaxis,:], - "scale_factor": np.array([1.0, 1.0]).reshape(-1)[np.newaxis,:], + "image": im[np.newaxis, :], + "im_shape": + np.array(list(im.shape[1:])).reshape(-1)[np.newaxis, :], + "scale_factor": np.array([1.0, 1.0]).reshape(-1)[np.newaxis, :], }) feed_dict = { - "image": 
np.concatenate([x["image"] for x in imgs], axis=0), - "im_shape": np.concatenate([x["im_shape"] for x in imgs], axis=0), - "scale_factor": np.concatenate([x["scale_factor"] for x in imgs], axis=0) + "image": np.concatenate( + [x["image"] for x in imgs], axis=0), + "im_shape": np.concatenate( + [x["im_shape"] for x in imgs], axis=0), + "scale_factor": np.concatenate( + [x["scale_factor"] for x in imgs], axis=0) } #for key in feed_dict.keys(): # print(key, feed_dict[key].shape) return feed_dict, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): #print(fetch_dict) - res_dict = {"bbox_result": str(self.img_postprocess(fetch_dict, visualize=False))} + res_dict = { + "bbox_result": + str(self.img_postprocess( + fetch_dict, visualize=False)) + } return res_dict, None, "" diff --git a/python/examples/pipeline/bert/README.md b/python/examples/pipeline/bert/README.md index 6074aa2b80dbe96c69726b7b8049e28db853445a..c396b77c9d2b9198d0474540872cb1c4dcdce5b1 100644 --- a/python/examples/pipeline/bert/README.md +++ b/python/examples/pipeline/bert/README.md @@ -4,7 +4,7 @@ This document will takes Imagenet service as an example to introduce how to use ## Get model ``` -sh get_model.sh +sh get_data.sh ``` ## Start server diff --git a/python/examples/pipeline/bert/README_CN.md b/python/examples/pipeline/bert/README_CN.md index ace7b76fe717c8a0922bf41aa5615b3b5da945a1..841abdadf5a3848fcf1e042d8e73c051610eefaa 100644 --- a/python/examples/pipeline/bert/README_CN.md +++ b/python/examples/pipeline/bert/README_CN.md @@ -4,7 +4,7 @@ ## 获取模型 ``` -sh get_model.sh +sh get_data.sh ``` ## 启动服务 diff --git a/python/examples/pipeline/bert/benchmark.py b/python/examples/pipeline/bert/benchmark.py index 3dece4914d6a606753c2b91db2a6d759e0ec6897..5abc646bffffff118ab24414e3a50f06668729d9 100644 --- a/python/examples/pipeline/bert/benchmark.py +++ b/python/examples/pipeline/bert/benchmark.py @@ -1,13 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import os import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency @@ -38,6 +50,8 @@ from paddle_serving_client.utils import benchmark_args, show_latency 2021-03-16 10:26:01,840 chl0(In: ['@DAGExecutor'], Out: ['bert']) size[0/0] 2021-03-16 10:26:01,841 chl1(In: ['bert'], Out: ['@DAGExecutor']) size[0/0] ''' + + def parse_benchmark(filein, fileout): with open(filein, "r") as fin: res = yaml.load(fin) @@ -50,6 +64,7 @@ def parse_benchmark(filein, fileout): with open(fileout, "w") as fout: yaml.dump(res, fout, default_flow_style=False) + def gen_yml(device): fin = open("config.yml", "r") config = yaml.load(fin) @@ -57,13 +72,14 @@ def gen_yml(device): config["dag"]["tracer"] = {"interval_s": 10} if device == "gpu": config["op"]["bert"]["local_service_conf"]["device_type"] = 1 - config["op"]["bert"]["local_service_conf"]["devices"] = "2" - with open("config2.yml", "w") as fout: + config["op"]["bert"]["local_service_conf"]["devices"] = "2" + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18082/bert/prediction" + url = "http://127.0.0.1:18082/bert/prediction" start = time.time() with open("data-c.txt", 'r') as fin: start = time.time() @@ -84,9 +100,11 @@ def run_http(idx, batch_size): end = time.time() return [[end - start]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_http , thread, batch_size) + result = multi_thread_runner.run(run_http, thread, batch_size) + def run_rpc(thread, batch_size): client = PipelineClient() @@ -110,16 +128,17 @@ def run_rpc(thread, batch_size): def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) device = sys.argv[4] gen_yml(device) elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": @@ -130,4 +149,3 @@ if __name__ == "__main__": filein = sys.argv[2] fileout = sys.argv[3] parse_benchmark(filein, fileout) - diff --git a/python/examples/pipeline/bert/web_service.py b/python/examples/pipeline/bert/web_service.py index 7f5128f95d772a8d108e5ab3a92314eee103235d..46495a850886c6bc9f33a117a1485ec0d2ea6d9a 100644 --- a/python/examples/pipeline/bert/web_service.py +++ b/python/examples/pipeline/bert/web_service.py @@ -43,9 +43,11 @@ class BertOp(Op): print(key, feed_dict[key].shape) return feed_dict, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): - fetch_dict["pooled_output"] = str(fetch_dict["pooled_output"]) - return fetch_dict, None, "" + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): + new_dict = {} + new_dict["pooled_output"] = str(fetch_dict["pooled_output"]) + new_dict["sequence_output"] = 
str(fetch_dict["sequence_output"]) + return new_dict, None, "" class BertService(WebService): diff --git a/python/examples/pipeline/imagenet/resnet50_web_service.py b/python/examples/pipeline/imagenet/resnet50_web_service.py index 53a0b6d9c5d7290b709df9c5ba7a314d29bbd08d..a4d37ed600a8eb90836b83f33f0cbe32e35d5008 100644 --- a/python/examples/pipeline/imagenet/resnet50_web_service.py +++ b/python/examples/pipeline/imagenet/resnet50_web_service.py @@ -42,7 +42,7 @@ class ImagenetOp(Op): img = self.seq(im) return {"image": img[np.newaxis, :].copy()}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): print(fetch_dict) score_list = fetch_dict["score"] result = {"label": [], "prob": []} diff --git a/python/examples/pipeline/ocr/benchmark.py b/python/examples/pipeline/ocr/benchmark.py index 79ecead3801cc48714812a7a8732e8b7a2367989..1e39176436b0be11093031ddfc4727ee68671c62 100644 --- a/python/examples/pipeline/ocr/benchmark.py +++ b/python/examples/pipeline/ocr/benchmark.py @@ -19,10 +19,8 @@ import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency diff --git a/python/examples/pipeline/ocr/web_service.py b/python/examples/pipeline/ocr/web_service.py index 6724415886497e43595672b840f6ed9c7362f2ee..c19d481113a0563bbea92b5038968ae9d18e0ab5 100644 --- a/python/examples/pipeline/ocr/web_service.py +++ b/python/examples/pipeline/ocr/web_service.py @@ -54,7 +54,7 @@ class DetOp(Op): imgs.append(det_img[np.newaxis, :].copy()) return {"image": np.concatenate(imgs, axis=0)}, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): # print(fetch_dict) det_out = fetch_dict["concat_1.tmp_0"] ratio_list = [ @@ -149,7 +149,7 @@ class RecOp(Op): return feed_list, False, None, "" - def postprocess(self, input_dicts, fetch_data, log_id): + def postprocess(self, input_dicts, fetch_data, data_id, log_id): res_list = [] if isinstance(fetch_data, dict): if len(fetch_data) > 0: diff --git a/python/examples/pipeline/simple_web_service/README.md b/python/examples/pipeline/simple_web_service/README.md index f52f7a85d1c9da98572def013e8d83c5aca2419c..ce2fc841b92b27e1b310353d2b8ef31ae48a2aeb 100644 --- a/python/examples/pipeline/simple_web_service/README.md +++ b/python/examples/pipeline/simple_web_service/README.md @@ -10,7 +10,7 @@ sh get_data.sh ## Start server ``` -python web_service.py &>log.txt & +python3 web_service.py &>log.txt & ``` ## Http test diff --git a/python/examples/pipeline/simple_web_service/README_CN.md b/python/examples/pipeline/simple_web_service/README_CN.md index 8b07942c19c566f5638e4497eb7c4d5a9fc1f2b2..b7007d366e058af40e0383fb05f8cfcbca6e19d2 100644 --- a/python/examples/pipeline/simple_web_service/README_CN.md +++ b/python/examples/pipeline/simple_web_service/README_CN.md @@ -10,7 +10,7 @@ sh get_data.sh ## 启动服务 ``` -python web_service.py &>log.txt & +python3 web_service.py &>log.txt & ``` ## 测试 diff --git a/python/examples/pipeline/simple_web_service/benchmark.py b/python/examples/pipeline/simple_web_service/benchmark.py index 
f5041fab1c3d7f91f0b4b61a9a63fad168753dc6..c2c612dd2740d7c97da4289a0913270b03611e7a 100644 --- a/python/examples/pipeline/simple_web_service/benchmark.py +++ b/python/examples/pipeline/simple_web_service/benchmark.py @@ -1,28 +1,42 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import yaml import requests import time import json -try: - from paddle_serving_server_gpu.pipeline import PipelineClient -except ImportError: - from paddle_serving_server.pipeline import PipelineClient + +from paddle_serving_server.pipeline import PipelineClient import numpy as np from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import benchmark_args, show_latency + def gen_yml(): fin = open("config.yml", "r") config = yaml.load(fin) fin.close() config["dag"]["tracer"] = {"interval_s": 5} - with open("config2.yml", "w") as fout: + with open("config2.yml", "w") as fout: yaml.dump(config, fout, default_flow_style=False) + def run_http(idx, batch_size): print("start thread ({})".format(idx)) - url = "http://127.0.0.1:18082/uci/prediction" + url = "http://127.0.0.1:18082/uci/prediction" start = time.time() value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332" all_value = ";".join([value for i in range(batch_size)]) @@ -33,9 +47,11 @@ def run_http(idx, batch_size): end = time.time() return [[end - start]] + def multithread_http(thread, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_http , thread, batch_size) + result = multi_thread_runner.run(run_http, thread, batch_size) + def run_rpc(thread, batch_size): client = PipelineClient() @@ -44,25 +60,26 @@ def run_rpc(thread, batch_size): all_value = ";".join([value for i in range(batch_size)]) data = {"key": "x", "value": all_value} for i in range(1000): - ret = client.predict(feed_dict={data["key"]: data["value"]}, fetch=["res"]) + ret = client.predict( + feed_dict={data["key"]: data["value"]}, fetch=["res"]) print(ret) + def multithread_rpc(thraed, batch_size): multi_thread_runner = MultiThreadRunner() - result = multi_thread_runner.run(run_rpc , thread, batch_size) + result = multi_thread_runner.run(run_rpc, thread, batch_size) + if __name__ == "__main__": if sys.argv[1] == "yaml": - mode = sys.argv[2] # brpc/ local predictor + mode = sys.argv[2] # brpc/ local predictor thread = int(sys.argv[3]) gen_yml() elif sys.argv[1] == "run": - mode = sys.argv[2] # http/ rpc + mode = sys.argv[2] # http/ rpc thread = int(sys.argv[3]) batch_size = int(sys.argv[4]) if mode == "http": multithread_http(thread, batch_size) elif mode == "rpc": multithread_rpc(thread, batch_size) - - diff --git a/python/examples/pipeline/simple_web_service/web_service.py b/python/examples/pipeline/simple_web_service/web_service.py index ea3109cf998ab81ecf68f556c0254fe35b3f4091..5f999f94f9da10809c0128a45c115d90f05f0f41 100644 --- 
a/python/examples/pipeline/simple_web_service/web_service.py +++ b/python/examples/pipeline/simple_web_service/web_service.py @@ -40,9 +40,10 @@ class UciOp(Op): proc_dict = {} return input_dict, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): - _LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format( - log_id, fetch_dict)) + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): + _LOGGER.info( + "UciOp::postprocess >>> data_id:{}, log_id:{}, fetch_dict:{}". + format(data_id, log_id, fetch_dict)) fetch_dict["price"] = str(fetch_dict["price"]) return fetch_dict, None, "" diff --git a/python/examples/pipeline/simple_web_service/web_service_java.py b/python/examples/pipeline/simple_web_service/web_service_java.py index da944a1df2a3265f930eb458c11709dd6b9402ee..c4ddfb2b1b3c57b4975cac3dc048e1310aa10772 100644 --- a/python/examples/pipeline/simple_web_service/web_service_java.py +++ b/python/examples/pipeline/simple_web_service/web_service_java.py @@ -41,9 +41,10 @@ class UciOp(Op): return input_dict, False, None, "" - def postprocess(self, input_dicts, fetch_dict, log_id): - _LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format( - log_id, fetch_dict)) + def postprocess(self, input_dicts, fetch_dict, data_id, log_id): + _LOGGER.info( + "UciOp::postprocess >>> data_id:{}, log_id:{}, fetch_dict:{}". + format(data_id, log_id, fetch_dict)) fetch_dict["price"] = str(fetch_dict["price"][0][0]) return fetch_dict, None, "" diff --git a/python/examples/resnet_v2_50/README.md b/python/examples/resnet_v2_50/README.md index 0279918b664dfc5d5d922e8d7ba6bc6aaa15106a..12144b0ea9836c9eb647fa6482db244f1030354b 100644 --- a/python/examples/resnet_v2_50/README.md +++ b/python/examples/resnet_v2_50/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 ``` ### Client Prediction ``` -python resnet50_v2_tutorial.py +python3 resnet50_v2_tutorial.py ``` diff --git a/python/examples/resnet_v2_50/README_CN.md b/python/examples/resnet_v2_50/README_CN.md index c67e4f7c3e06c8fe0f3266ed51fc7d6db813ae7b..fee0e01f3cbac29052e4ae931027574ab6f778a0 100644 --- a/python/examples/resnet_v2_50/README_CN.md +++ b/python/examples/resnet_v2_50/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 ``` ### 客户端预测 ``` -python resnet50_v2_tutorial.py +python3 resnet50_v2_tutorial.py ``` diff --git a/python/examples/senta/README.md b/python/examples/senta/README.md index 8929a9312c17264800f299f77afb583221006068..9a159133eeb20832c1870bb949136a59ae461901 100644 --- a/python/examples/senta/README.md +++ b/python/examples/senta/README.md @@ -3,16 +3,16 @@ ## Get Model ``` -python -m 
paddle_serving_app.package --get_model senta_bilstm -python -m paddle_serving_app.package --get_model lac +python3 -m paddle_serving_app.package --get_model senta_bilstm +python3 -m paddle_serving_app.package --get_model lac tar -xzvf senta_bilstm.tar.gz tar -xzvf lac.tar.gz ``` ## Start HTTP Service ``` -python -m paddle_serving_server.serve --model lac_model --port 9300 -python senta_web_service.py +python3 -m paddle_serving_server.serve --model lac_model --port 9300 +python3 senta_web_service.py ``` In the Chinese sentiment classification task, the Chinese word segmentation needs to be done through [LAC task] (../lac). In this demo, the LAC task is placed in the preprocessing part of the HTTP prediction service of the sentiment classification task. diff --git a/python/examples/senta/README_CN.md b/python/examples/senta/README_CN.md index e5624dc975e6bc00de219f68cbf74dea7cac8360..a09fd117767cbdd01847d6cdef06992caf4a9715 100644 --- a/python/examples/senta/README_CN.md +++ b/python/examples/senta/README_CN.md @@ -3,16 +3,16 @@ ## 获取模型文件 ``` -python -m paddle_serving_app.package --get_model senta_bilstm -python -m paddle_serving_app.package --get_model lac +python3 -m paddle_serving_app.package --get_model senta_bilstm +python3 -m paddle_serving_app.package --get_model lac tar -xzvf lac.tar.gz tar -xzvf senta_bilstm.tar.gz ``` ## 启动HTTP服务 ``` -python -m paddle_serving_server.serve --model lac_model --port 9300 -python senta_web_service.py +python3 -m paddle_serving_server.serve --model lac_model --port 9300 +python3 senta_web_service.py ``` 中文情感分类任务中需要先通过[LAC任务](../lac)进行中文分词。 示例中将LAC任务放在情感分类任务的HTTP预测服务的预处理部分。 diff --git a/python/examples/unet_for_image_seg/README.md b/python/examples/unet_for_image_seg/README.md index 170dc133aea41a6f31696c2161d8e60ccfb4a621..59004712bd76f5388d6e57947f70ce22562f8dbe 100644 --- a/python/examples/unet_for_image_seg/README.md +++ b/python/examples/unet_for_image_seg/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model unet +python3 -m paddle_serving_app.package --get_model unet tar -xzvf unet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 ``` ### Client Prediction ``` -python seg_client.py +python3 seg_client.py ``` diff --git a/python/examples/unet_for_image_seg/README_CN.md b/python/examples/unet_for_image_seg/README_CN.md index eed1313eb938be67b80331e498b01a9749cb5dc6..53c2f1893a879d5585cea0b77103fc1461086784 100644 --- a/python/examples/unet_for_image_seg/README_CN.md +++ b/python/examples/unet_for_image_seg/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model unet +python3 -m paddle_serving_app.package --get_model unet tar -xzvf unet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf unet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 +python3 -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494 ``` ### 客户端预测 ``` -python seg_client.py +python3 seg_client.py ``` diff --git a/python/examples/util/README.md b/python/examples/util/README.md index 64cb44a0a84d243810be409e2efd3870c8a4f75c..678ca388df5106e57f146a9758e3ef8da485e270 100644 --- a/python/examples/util/README.md +++ b/python/examples/util/README.md @@ -13,14 +13,14 @@ In order to show the time consuming of each stage more intuitively, a script is When using, first 
save the output of the client to a file, taking `profile` as an example. ``` -python show_profile.py profile ${thread_num} +python3 show_profile.py profile ${thread_num} ``` Here the `thread_num` parameter is the number of processes when the client is running, and the script will calculate the average time spent in each phase according to this parameter. The script calculates the time spent in each stage, divides by the number of threads to average, and prints to standard output. ``` -python timeline_trace.py profile trace +python3 timeline_trace.py profile trace ``` The script converts the time-dot information in the log into a json format and saves it to a trace file. The trace file can be visualized through the tracing function of the Chrome browser. diff --git a/python/examples/util/README_CN.md b/python/examples/util/README_CN.md index 43acef8073148b7a4978ed5c02fa5fa05258f6a0..aaca0ae21dd1af33a3fb708efd0b2113525e5141 100644 --- a/python/examples/util/README_CN.md +++ b/python/examples/util/README_CN.md @@ -13,14 +13,14 @@ export FLAGS_profile_server=1 #开启server端各阶段时间打点 使用时先将client的输出保存到文件,以profile为例。 ``` -python show_profile.py profile ${thread_num} +python3 show_profile.py profile ${thread_num} ``` 这里thread_num参数为client运行时的进程数,脚本将按照这个参数来计算各阶段的平均耗时。 脚本将计算各阶段的耗时,并除以线程数做平均,打印到标准输出。 ``` -python timeline_trace.py profile trace +python3 timeline_trace.py profile trace ``` 脚本将日志中的时间打点信息转换成json格式保存到trace文件,trace文件可以通过chrome浏览器的tracing功能进行可视化。 diff --git a/python/examples/util/show_profile.py b/python/examples/util/show_profile.py old mode 100755 new mode 100644 diff --git a/python/examples/xpu/fit_a_line_xpu/README.md b/python/examples/xpu/fit_a_line_xpu/README.md index e54dc69f1042a6031e9f5a1570d67c5696817191..b74ddd38613ba30444fb97a34cbab1c154882574 100644 --- a/python/examples/xpu/fit_a_line_xpu/README.md +++ b/python/examples/xpu/fit_a_line_xpu/README.md @@ -13,15 +13,15 @@ sh get_data.sh ### Start server You can use the following code to start the RPC service ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim ``` ### Client prediction -The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip install paddlepaddle`). +The `paddlepaddle` package is used in `test_client.py`, and you may need to download the corresponding package(`pip3 install paddlepaddle`). 
``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` ## HTTP service @@ -30,7 +30,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt Start a web service with default web service hosting modules: ``` shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci ``` ### Client prediction diff --git a/python/examples/xpu/fit_a_line_xpu/README_CN.md b/python/examples/xpu/fit_a_line_xpu/README_CN.md index e19a17afb643db84129d20979b5822931ee335d7..60adac1c17a0a232a37a0235999a687b48dcbc7a 100644 --- a/python/examples/xpu/fit_a_line_xpu/README_CN.md +++ b/python/examples/xpu/fit_a_line_xpu/README_CN.md @@ -15,21 +15,21 @@ sh get_data.sh ### 开启服务端 ``` shell -python test_server.py uci_housing_model/ +python3 test_server.py uci_housing_model/ ``` 也可以通过下面的一行代码开启默认RPC服务: ```shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim ``` ### 客户端预测 -`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip install paddlepaddle`)。 +`test_client.py`中使用了`paddlepaddle`包,需要进行下载(`pip3 install paddlepaddle`)。 ``` shell -python test_client.py uci_housing_client/serving_client_conf.prototxt +python3 test_client.py uci_housing_client/serving_client_conf.prototxt ``` ## HTTP服务 @@ -39,7 +39,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt 通过下面的一行代码开启默认web服务: ``` shell -python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci +python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim --name uci ``` ### 客户端预测 diff --git a/python/examples/xpu/resnet_v2_50_xpu/README.md b/python/examples/xpu/resnet_v2_50_xpu/README.md index ba19b6d7e442346fbc4ee890c34f6fa6c5b55bf7..76b04d614bd4513e806d9a139c38d66b8bce6569 100644 --- a/python/examples/xpu/resnet_v2_50_xpu/README.md +++ b/python/examples/xpu/resnet_v2_50_xpu/README.md @@ -3,7 +3,7 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### Start Service ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim ``` ### Client Prediction ``` -python resnet50_client.py +python3 resnet50_client.py ``` diff --git a/python/examples/xpu/resnet_v2_50_xpu/README_CN.md b/python/examples/xpu/resnet_v2_50_xpu/README_CN.md index 007c90e4a498dc576982fc26a2814918ec1a7b91..652c4f672fd82b494a2240f327463e50dca8829c 100644 --- a/python/examples/xpu/resnet_v2_50_xpu/README_CN.md +++ b/python/examples/xpu/resnet_v2_50_xpu/README_CN.md @@ -3,7 +3,7 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model resnet_v2_50_imagenet +python3 -m paddle_serving_app.package --get_model resnet_v2_50_imagenet tar -xzvf 
resnet_v2_50_imagenet.tar.gz ``` @@ -12,11 +12,11 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ### 启动服务端 ``` -python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim +python3 -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim ``` ### 客户端预测 ``` -python resnet50_client.py +python3 resnet50_client.py ``` diff --git a/python/examples/xpu/vgg19/README.md b/python/examples/xpu/vgg19/README.md index 338a80562df3a74033c839cf42ab66e87982595c..d8520684f55a9caf88818905f4cc309f55304fe0 100644 --- a/python/examples/xpu/vgg19/README.md +++ b/python/examples/xpu/vgg19/README.md @@ -26,5 +26,5 @@ python3 -m paddle_serving_server.serve --model serving_server --port 7702 --use_ ### Client Prediction ``` -python vgg19_client.py +python3 vgg19_client.py ``` diff --git a/python/examples/yolov4/README.md b/python/examples/yolov4/README.md index fb1bc7622da88cc827b64cfc37336a4de3331831..0c7cfa7c0ffb4938456aa908015aff2daf367727 100644 --- a/python/examples/yolov4/README.md +++ b/python/examples/yolov4/README.md @@ -5,19 +5,19 @@ ## Get Model ``` -python -m paddle_serving_app.package --get_model yolov4 +python3 -m paddle_serving_app.package --get_model yolov4 tar -xzvf yolov4.tar.gz ``` ## Start RPC Service ``` -python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 ``` ## Prediction ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` After the prediction is completed, a json file to save the prediction result and a picture with the detection result box will be generated in the `./outpu folder. diff --git a/python/examples/yolov4/README_CN.md b/python/examples/yolov4/README_CN.md index 72923c5af51d2584ae151cbc15ba62efb48adced..1c773033418b9d072a7096a91d47b665b465c322 100644 --- a/python/examples/yolov4/README_CN.md +++ b/python/examples/yolov4/README_CN.md @@ -5,20 +5,20 @@ ## 获取模型 ``` -python -m paddle_serving_app.package --get_model yolov4 +python3 -m paddle_serving_app.package --get_model yolov4 tar -xzvf yolov4.tar.gz ``` ## 启动RPC服务 ``` -python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 +python3 -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 ``` ## 预测 ``` -python test_client.py 000000570688.jpg +python3 test_client.py 000000570688.jpg ``` 预测完成会在`./output`文件夹下生成保存预测结果的json文件以及标出检测结果框的图片。 diff --git a/python/paddle_serving_app/local_predict.py b/python/paddle_serving_app/local_predict.py index 382d231789cb094787f1dbe4c7dcad85e8c50007..afe4ba62d69850482e82ba97d43ac747e0f69aaf 100644 --- a/python/paddle_serving_app/local_predict.py +++ b/python/paddle_serving_app/local_predict.py @@ -22,6 +22,7 @@ import argparse from .proto import general_model_config_pb2 as m_config import paddle.inference as paddle_infer import logging +import glob logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("LocalPredictor") @@ -51,6 +52,23 @@ class LocalPredictor(object): self.fetch_names_to_idx_ = {} self.fetch_names_to_type_ = {} + def search_suffix_files(self, model_path, target_suffix): + """ + Find all files with the suffix xxx in the specified directory. + + Args: + model_path: model directory, not None. + target_suffix: filenames with target suffix, not None. e.g: *.pdmodel + + Returns: + file_list, None, [] or [path, ] . 
+ """ + if model_path is None or target_suffix is None: + return None + + file_list = glob.glob(os.path.join(model_path, target_suffix)) + return file_list + def load_model_config(self, model_path, use_gpu=False, @@ -91,16 +109,36 @@ class LocalPredictor(object): mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default. use_feed_fetch_ops: use feed/fetch ops, False default. """ + gpu_id = int(gpu_id) client_config = "{}/serving_server_conf.prototxt".format(model_path) model_conf = m_config.GeneralModelConfig() f = open(client_config, 'r') model_conf = google.protobuf.text_format.Merge( str(f.read()), model_conf) + + # Init paddle_infer config + # Paddle's model files and parameter files have multiple naming rules: + # 1) __model__, __params__ + # 2) *.pdmodel, *.pdiparams + # 3) __model__, conv2d_1.w_0, conv2d_2.w_0, fc_1.w_0, conv2d_1.b_0, ... + pdmodel_file_list = self.search_suffix_files(model_path, "*.pdmodel") + pdiparams_file_list = self.search_suffix_files(model_path, + "*.pdiparams") if os.path.exists(os.path.join(model_path, "__params__")): + # case 1) initializing config = paddle_infer.Config( os.path.join(model_path, "__model__"), os.path.join(model_path, "__params__")) + elif pdmodel_file_list and len( + pdmodel_file_list) > 0 and pdiparams_file_list and len( + pdiparams_file_list) > 0: + # case 2) initializing + logger.info("pdmodel_file_list:{}, pdiparams_file_list:{}".format( + pdmodel_file_list, pdiparams_file_list)) + config = paddle_infer.Config(pdmodel_file_list[0], + pdiparams_file_list[0]) else: + # case 3) initializing. config = paddle_infer.Config(model_path) logger.info( @@ -126,7 +164,8 @@ class LocalPredictor(object): for i, var in enumerate(model_conf.fetch_var): self.fetch_names_to_idx_[var.alias_name] = i - self.fetch_names_to_type_[var.alias_name] = var.fetch_type + self.fetch_types_[var.alias_name] = var.fetch_type + self.fetch_names_to_type_[var.alias_name] = var.shape # set precision of inference. precision_type = paddle_infer.PrecisionType.Float32 @@ -199,8 +238,9 @@ class LocalPredictor(object): Run model inference by Paddle Inference API. Args: - feed: feed var - fetch: fetch var + feed: feed var list, None is not allowed. + fetch: fetch var list, None allowed. when it is None, all fetch + vars are returned. Otherwise, return fetch specified result. batch: batch data or not, False default.If batch is False, a new dimension is added to header of the shape[np.newaxis]. 
log_id: for logging @@ -208,16 +248,8 @@ class LocalPredictor(object): Returns: fetch_map: dict """ - if feed is None or fetch is None: - raise ValueError("You should specify feed and fetch for prediction.\ - log_id:{}".format(log_id)) - fetch_list = [] - if isinstance(fetch, str): - fetch_list = [fetch] - elif isinstance(fetch, list): - fetch_list = fetch - else: - raise ValueError("Fetch only accepts string and list of string.\ + if feed is None: + raise ValueError("You should specify feed vars for prediction.\ log_id:{}".format(log_id)) feed_batch = [] @@ -229,18 +261,20 @@ class LocalPredictor(object): raise ValueError("Feed only accepts dict and list of dict.\ log_id:{}".format(log_id)) - fetch_names = [] + fetch_list = [] + if fetch is not None: + if isinstance(fetch, str): + fetch_list = [fetch] + elif isinstance(fetch, list): + fetch_list = fetch + # Filter invalid fetch names + fetch_names = [] for key in fetch_list: if key in self.fetch_names_: fetch_names.append(key) - if len(fetch_names) == 0: - raise ValueError( - "Fetch names should not be empty or out of saved fetch list.\ - log_id:{}".format(log_id)) - - # Assemble the input data of paddle predictor + # Assemble the input data of paddle predictor, and filter invalid inputs. input_names = self.predictor.get_input_names() for name in input_names: if isinstance(feed[name], list): @@ -252,8 +286,27 @@ class LocalPredictor(object): feed[name] = feed[name].astype("float32") elif self.feed_types_[name] == 2: feed[name] = feed[name].astype("int32") + elif self.feed_types_[name] == 3: + feed[name] = feed[name].astype("float64") + elif self.feed_types_[name] == 4: + feed[name] = feed[name].astype("int16") + elif self.feed_types_[name] == 5: + feed[name] = feed[name].astype("float16") + elif self.feed_types_[name] == 6: + feed[name] = feed[name].astype("uint16") + elif self.feed_types_[name] == 7: + feed[name] = feed[name].astype("uint8") + elif self.feed_types_[name] == 8: + feed[name] = feed[name].astype("int8") + elif self.feed_types_[name] == 9: + feed[name] = feed[name].astype("bool") + elif self.feed_types_[name] == 10: + feed[name] = feed[name].astype("complex64") + elif self.feed_types_[name] == 11: + feed[name] = feed[name].astype("complex128") else: raise ValueError("local predictor receives wrong data type") + input_tensor_handle = self.predictor.get_input_handle(name) if "{}.lod".format(name) in feed: input_tensor_handle.set_lod([feed["{}.lod".format(name)]]) @@ -261,11 +314,15 @@ class LocalPredictor(object): input_tensor_handle.copy_from_cpu(feed[name][np.newaxis, :]) else: input_tensor_handle.copy_from_cpu(feed[name]) + + # set output tensor handlers output_tensor_handles = [] + output_name_to_index_dict = {} output_names = self.predictor.get_output_names() - for output_name in output_names: + for i, output_name in enumerate(output_names): output_tensor_handle = self.predictor.get_output_handle(output_name) output_tensor_handles.append(output_tensor_handle) + output_name_to_index_dict[output_name] = i # Run inference self.predictor.run() @@ -275,10 +332,43 @@ class LocalPredictor(object): for output_tensor_handle in output_tensor_handles: output = output_tensor_handle.copy_to_cpu() outputs.append(output) + outputs_len = len(outputs) + + # Copy fetch vars. If fetch is None, it will copy all results from output_tensor_handles. + # Otherwise, it will copy the fields specified from output_tensor_handles. 
fetch_map = {} - for i, name in enumerate(fetch): - fetch_map[name] = outputs[i] - if len(output_tensor_handles[i].lod()) > 0: - fetch_map[name + ".lod"] = np.array(output_tensor_handles[i] - .lod()[0]).astype('int32') + if fetch is None: + for i, name in enumerate(output_names): + fetch_map[name] = outputs[i] + if len(output_tensor_handles[i].lod()) > 0: + fetch_map[name + ".lod"] = np.array(output_tensor_handles[ + i].lod()[0]).astype('int32') + else: + # Because the save_inference_model interface will increase the scale op + # in the network, the name of fetch_var is different from that in prototxt. + # Therefore, it is compatible with v0.6.x and the previous model save format, + # and here is compatible with the results that do not match. + fetch_match_num = 0 + for i, name in enumerate(fetch): + output_index = output_name_to_index_dict.get(name) + if output_index is None: + continue + + fetch_map[name] = outputs[output_index] + fetch_match_num += 1 + if len(output_tensor_handles[output_index].lod()) > 0: + fetch_map[name + ".lod"] = np.array(output_tensor_handles[ + output_index].lod()[0]).astype('int32') + + # Compatible with v0.6.x and lower versions model saving formats. + if fetch_match_num == 0: + logger.debug("fetch match num is 0. Retrain the model please!") + for i, name in enumerate(fetch): + if i >= outputs_len: + break + fetch_map[name] = outputs[i] + if len(output_tensor_handles[i].lod()) > 0: + fetch_map[name + ".lod"] = np.array( + output_tensor_handles[i].lod()[0]).astype('int32') + return fetch_map diff --git a/python/paddle_serving_client/__init__.py b/python/paddle_serving_client/__init__.py old mode 100755 new mode 100644 index 29d8aa5ccd03a06929b81adaafd1f6f96608d320..61274f60219422f49e6d3ed55e13f1c72a64db01 --- a/python/paddle_serving_client/__init__.py +++ b/python/paddle_serving_client/__init__.py @@ -16,5 +16,6 @@ from . import version from . 
import client from .client import * +from .httpclient import * __version__ = version.version_tag diff --git a/python/paddle_serving_client/client.py b/python/paddle_serving_client/client.py index 0ccdbb1435fbca1a9e576d1a5ad01517d4ed352d..bd6a1b5245da259ec7f8abc0a89cf3e955391bee 100755 --- a/python/paddle_serving_client/client.py +++ b/python/paddle_serving_client/client.py @@ -25,11 +25,8 @@ import base64 import time import sys -import grpc -from .proto import multi_lang_general_model_service_pb2 sys.path.append( os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto')) -from .proto import multi_lang_general_model_service_pb2_grpc #param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64 #param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32 @@ -79,7 +76,7 @@ class SDKConfig(object): self.tag_list = [] self.cluster_list = [] self.variant_weight_list = [] - self.rpc_timeout_ms = 20000 + self.rpc_timeout_ms = 200000 self.load_balance_strategy = "la" def add_server_variant(self, tag, cluster, variant_weight): @@ -142,7 +139,7 @@ class Client(object): self.profile_ = _Profiler() self.all_numpy_input = True self.has_numpy_input = False - self.rpc_timeout_ms = 20000 + self.rpc_timeout_ms = 200000 from .serving_client import PredictorRes self.predictorres_constructor = PredictorRes @@ -292,132 +289,129 @@ class Client(object): log_id=0): self.profile_.record('py_prepro_0') - if feed is None or fetch is None: - raise ValueError("You should specify feed and fetch for prediction") + if feed is None: + raise ValueError("You should specify feed for prediction") fetch_list = [] if isinstance(fetch, str): fetch_list = [fetch] elif isinstance(fetch, list): fetch_list = fetch + elif fetch == None: + pass else: - raise ValueError("Fetch only accepts string and list of string") + raise ValueError("Fetch only accepts string or list of string") feed_batch = [] if isinstance(feed, dict): feed_batch.append(feed) elif isinstance(feed, list): - feed_batch = feed + # feed = [dict] + if len(feed) == 1 and isinstance(feed[0], dict): + feed_batch = feed + else: + # if input is a list and the number of feed_var is 1. + # create a temp_dict { key = feed_var_name, value = list} + # put the temp_dict into the feed_batch. + if len(self.feed_names_) != 1: + raise ValueError( + "input is a list, but we got 0 or 2+ feed_var, don`t know how to divide the feed list" + ) + temp_dict = {} + temp_dict[self.feed_names_[0]] = feed + feed_batch.append(temp_dict) else: raise ValueError("Feed only accepts dict and list of dict") - int_slot_batch = [] + # batch_size must be 1, cause batch is already in Tensor. 
+ if len(feed_batch) != 1: + raise ValueError("len of feed_batch can only be 1.") + + int_slot = [] int_feed_names = [] int_shape = [] int_lod_slot_batch = [] - float_slot_batch = [] + + float_slot = [] float_feed_names = [] float_lod_slot_batch = [] float_shape = [] - string_slot_batch = [] + + string_slot = [] string_feed_names = [] string_lod_slot_batch = [] string_shape = [] fetch_names = [] - counter = 0 - batch_size = len(feed_batch) for key in fetch_list: if key in self.fetch_names_: fetch_names.append(key) - if len(fetch_names) == 0: - raise ValueError( - "Fetch names should not be empty or out of saved fetch list.") - return {} - - for i, feed_i in enumerate(feed_batch): - int_slot = [] - int_lod_slot = [] - float_slot = [] - float_lod_slot = [] - string_slot = [] - string_lod_slot = [] - for key in feed_i: - if ".lod" not in key and key not in self.feed_names_: - raise ValueError("Wrong feed name: {}.".format(key)) - if ".lod" in key: - continue - #if not isinstance(feed_i[key], np.ndarray): - self.shape_check(feed_i, key) - if self.feed_types_[key] in int_type: - if i == 0: - int_feed_names.append(key) - shape_lst = [] - if batch == False: - feed_i[key] = np.expand_dims(feed_i[key], 0).repeat( - 1, axis=0) - if isinstance(feed_i[key], np.ndarray): - shape_lst.extend(list(feed_i[key].shape)) - int_shape.append(shape_lst) - else: - int_shape.append(self.feed_shapes_[key]) - if "{}.lod".format(key) in feed_i: - int_lod_slot_batch.append(feed_i["{}.lod".format( - key)]) - else: - int_lod_slot_batch.append([]) - - if isinstance(feed_i[key], np.ndarray): - int_slot.append(np.ascontiguousarray(feed_i[key])) - self.has_numpy_input = True - else: - int_slot.append(np.ascontiguousarray(feed_i[key])) - self.all_numpy_input = False - - elif self.feed_types_[key] in float_type: - if i == 0: - float_feed_names.append(key) - shape_lst = [] - if batch == False: - feed_i[key] = np.expand_dims(feed_i[key], 0).repeat( - 1, axis=0) - if isinstance(feed_i[key], np.ndarray): - shape_lst.extend(list(feed_i[key].shape)) - float_shape.append(shape_lst) - else: - float_shape.append(self.feed_shapes_[key]) - if "{}.lod".format(key) in feed_i: - float_lod_slot_batch.append(feed_i["{}.lod".format( - key)]) - else: - float_lod_slot_batch.append([]) - - if isinstance(feed_i[key], np.ndarray): - float_slot.append(np.ascontiguousarray(feed_i[key])) - self.has_numpy_input = True - else: - float_slot.append(np.ascontiguousarray(feed_i[key])) - self.all_numpy_input = False - #if input is string, feed is not numpy. 
- elif self.feed_types_[key] in string_type: - if i == 0: - string_feed_names.append(key) - string_shape.append(self.feed_shapes_[key]) - if "{}.lod".format(key) in feed_i: - string_lod_slot_batch.append(feed_i["{}.lod".format( - key)]) - else: - string_lod_slot_batch.append([]) - string_slot.append(feed_i[key]) + feed_dict = feed_batch[0] + for key in feed_dict: + if ".lod" not in key and key not in self.feed_names_: + raise ValueError("Wrong feed name: {}.".format(key)) + if ".lod" in key: + continue + + self.shape_check(feed_dict, key) + if self.feed_types_[key] in int_type: + int_feed_names.append(key) + shape_lst = [] + if batch == False: + feed_dict[key] = np.expand_dims(feed_dict[key], 0).repeat( + 1, axis=0) + if isinstance(feed_dict[key], np.ndarray): + shape_lst.extend(list(feed_dict[key].shape)) + int_shape.append(shape_lst) + else: + int_shape.append(self.feed_shapes_[key]) + if "{}.lod".format(key) in feed_dict: + int_lod_slot_batch.append(feed_dict["{}.lod".format(key)]) + else: + int_lod_slot_batch.append([]) + + if isinstance(feed_dict[key], np.ndarray): + int_slot.append(np.ascontiguousarray(feed_dict[key])) + self.has_numpy_input = True + else: + int_slot.append(np.ascontiguousarray(feed_dict[key])) + self.all_numpy_input = False + + elif self.feed_types_[key] in float_type: + float_feed_names.append(key) + shape_lst = [] + if batch == False: + feed_dict[key] = np.expand_dims(feed_dict[key], 0).repeat( + 1, axis=0) + if isinstance(feed_dict[key], np.ndarray): + shape_lst.extend(list(feed_dict[key].shape)) + float_shape.append(shape_lst) + else: + float_shape.append(self.feed_shapes_[key]) + if "{}.lod".format(key) in feed_dict: + float_lod_slot_batch.append(feed_dict["{}.lod".format(key)]) + else: + float_lod_slot_batch.append([]) + + if isinstance(feed_dict[key], np.ndarray): + float_slot.append(np.ascontiguousarray(feed_dict[key])) self.has_numpy_input = True - int_slot_batch.append(int_slot) - int_lod_slot_batch.append(int_lod_slot) - float_slot_batch.append(float_slot) - float_lod_slot_batch.append(float_lod_slot) - string_slot_batch.append(string_slot) - string_lod_slot_batch.append(string_lod_slot) + else: + float_slot.append(np.ascontiguousarray(feed_dict[key])) + self.all_numpy_input = False + #if input is string, feed is not numpy. 
+ elif self.feed_types_[key] in string_type: + string_feed_names.append(key) + string_shape.append(self.feed_shapes_[key]) + if "{}.lod".format(key) in feed_dict: + string_lod_slot_batch.append(feed_dict["{}.lod".format( + key)]) + else: + string_lod_slot_batch.append([]) + string_slot.append(feed_dict[key]) + self.has_numpy_input = True self.profile_.record('py_prepro_1') self.profile_.record('py_client_infer_0') @@ -425,11 +419,11 @@ class Client(object): result_batch_handle = self.predictorres_constructor() if self.all_numpy_input: res = self.client_handle_.numpy_predict( - float_slot_batch, float_feed_names, float_shape, - float_lod_slot_batch, int_slot_batch, int_feed_names, int_shape, - int_lod_slot_batch, string_slot_batch, string_feed_names, - string_shape, string_lod_slot_batch, fetch_names, - result_batch_handle, self.pid, log_id) + float_slot, float_feed_names, float_shape, float_lod_slot_batch, + int_slot, int_feed_names, int_shape, int_lod_slot_batch, + string_slot, string_feed_names, string_shape, + string_lod_slot_batch, fetch_names, result_batch_handle, + self.pid, log_id) elif self.has_numpy_input == False: raise ValueError( "Please make sure all of your inputs are numpy array") @@ -448,6 +442,8 @@ class Client(object): model_engine_names = result_batch_handle.get_engine_names() for mi, engine_name in enumerate(model_engine_names): result_map = {} + if len(fetch_names) == 0: + fetch_names = result_batch_handle.get_tensor_alias_names(mi) # result map needs to be a numpy array for i, name in enumerate(fetch_names): if self.fetch_names_to_type_[name] == int64_type: @@ -517,243 +513,3 @@ class Client(object): def release(self): self.client_handle_.destroy_predictor() self.client_handle_ = None - - -class MultiLangClient(object): - def __init__(self): - self.channel_ = None - self.stub_ = None - self.rpc_timeout_s_ = 2 - self.profile_ = _Profiler() - - def add_variant(self, tag, cluster, variant_weight): - # TODO - raise Exception("cannot support ABtest yet") - - def set_rpc_timeout_ms(self, rpc_timeout): - if self.stub_ is None: - raise Exception("set timeout must be set after connect.") - if not isinstance(rpc_timeout, int): - # for bclient - raise ValueError("rpc_timeout must be int type.") - self.rpc_timeout_s_ = rpc_timeout / 1000.0 - timeout_req = multi_lang_general_model_service_pb2.SetTimeoutRequest() - timeout_req.timeout_ms = rpc_timeout - resp = self.stub_.SetTimeout(timeout_req) - return resp.err_code == 0 - - def connect(self, endpoints): - # https://github.com/tensorflow/serving/issues/1382 - options = [('grpc.max_receive_message_length', 512 * 1024 * 1024), - ('grpc.max_send_message_length', 512 * 1024 * 1024), - ('grpc.lb_policy_name', 'round_robin')] - # TODO: weight round robin - g_endpoint = 'ipv4:{}'.format(','.join(endpoints)) - self.channel_ = grpc.insecure_channel(g_endpoint, options=options) - self.stub_ = multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelServiceStub( - self.channel_) - # get client model config - get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest( - ) - resp = self.stub_.GetClientConfig(get_client_config_req) - model_config_str = resp.client_config_str - self._parse_model_config(model_config_str) - - def _flatten_list(self, nested_list): - for item in nested_list: - if isinstance(item, (list, tuple)): - for sub_item in self._flatten_list(item): - yield sub_item - else: - yield item - - def _parse_model_config(self, model_config_str): - model_conf = m_config.GeneralModelConfig() - model_conf 
= google.protobuf.text_format.Merge(model_config_str, - model_conf) - self.feed_names_ = [var.alias_name for var in model_conf.feed_var] - self.feed_types_ = {} - self.feed_shapes_ = {} - self.lod_tensor_set_ = set() - for i, var in enumerate(model_conf.feed_var): - self.feed_types_[var.alias_name] = var.feed_type - self.feed_shapes_[var.alias_name] = var.shape - if var.is_lod_tensor: - self.lod_tensor_set_.add(var.alias_name) - self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var] - self.fetch_types_ = {} - for i, var in enumerate(model_conf.fetch_var): - self.fetch_types_[var.alias_name] = var.fetch_type - if var.is_lod_tensor: - self.lod_tensor_set_.add(var.alias_name) - - def _pack_inference_request(self, feed, fetch, is_python, log_id): - req = multi_lang_general_model_service_pb2.InferenceRequest() - req.fetch_var_names.extend(fetch) - req.is_python = is_python - req.log_id = log_id - feed_var_names = [] - for key in feed.keys(): - if '.lod' not in key: - feed_var_names.append(key) - req.feed_var_names.extend(feed_var_names) - inst = multi_lang_general_model_service_pb2.FeedInst() - for name in req.feed_var_names: - tensor = multi_lang_general_model_service_pb2.Tensor() - var = feed[name] - v_type = self.feed_types_[name] - if is_python: - data = None - if isinstance(var, list): - if v_type == 0: # int64 - data = np.array(var, dtype="int64") - elif v_type == 1: # float32 - data = np.array(var, dtype="float32") - elif v_type == 2: # int32 - data = np.array(var, dtype="int32") - else: - raise Exception("error tensor value type.") - elif isinstance(var, np.ndarray): - data = var - if v_type == 0: - if data.dtype != 'int64': - data = data.astype("int64") - elif v_type == 1: - if data.dtype != 'float32': - data = data.astype("float32") - elif v_type == 2: - if data.dtype != 'int32': - data = data.astype("int32") - else: - raise Exception("error tensor value type.") - else: - raise Exception("var must be list or ndarray.") - data = np.ascontiguousarray(data) - tensor.data = data.tobytes() - tensor.shape.extend(list(var.shape)) - if "{}.lod".format(name) in feed.keys(): - tensor.lod.extend(feed["{}.lod".format(name)]) - inst.tensor_array.append(tensor) - req.insts.append(inst) - return req - - def _unpack_inference_response(self, resp, fetch, is_python, - need_variant_tag): - if resp.err_code != 0: - return None - tag = resp.tag - multi_result_map = {} - for model_result in resp.outputs: - inst = model_result.insts[0] - result_map = {} - for i, name in enumerate(fetch): - var = inst.tensor_array[i] - v_type = self.fetch_types_[name] - if is_python: - if v_type == 0: # int64 - result_map[name] = np.frombuffer( - var.data, dtype="int64") - elif v_type == 1: # float32 - result_map[name] = np.frombuffer( - var.data, dtype="float32") - else: - raise Exception("error type.") - else: - if v_type == 0: # int64 - result_map[name] = np.array( - list(var.int64_data), dtype="int64") - elif v_type == 1: # float32 - result_map[name] = np.array( - list(var.float_data), dtype="float32") - else: - raise Exception("error type.") - result_map[name].shape = list(var.shape) - if name in self.lod_tensor_set_: - result_map["{}.lod".format(name)] = np.array(list(var.lod)) - multi_result_map[model_result.engine_name] = result_map - ret = None - if len(resp.outputs) == 1: - ret = list(multi_result_map.values())[0] - else: - ret = multi_result_map - - ret["serving_status_code"] = 0 - return ret if not need_variant_tag else [ret, tag] - - def _done_callback_func(self, fetch, is_python, 
need_variant_tag): - def unpack_resp(resp): - return self._unpack_inference_response(resp, fetch, is_python, - need_variant_tag) - - return unpack_resp - - def get_feed_names(self): - return self.feed_names_ - - def predict(self, - feed, - fetch, - batch=True, - need_variant_tag=False, - asyn=False, - is_python=True, - log_id=0): - if isinstance(feed, dict) is False: - raise ValueError("Type Error. grpc feed must be dict.") - if batch is False: - for key in feed: - if ".lod" not in key: - feed[key] = np.expand_dims(feed[key], 0).repeat(1, axis=0) - if not asyn: - try: - self.profile_.record('py_prepro_0') - req = self._pack_inference_request( - feed, fetch, is_python=is_python, log_id=log_id) - self.profile_.record('py_prepro_1') - - self.profile_.record('py_client_infer_0') - resp = self.stub_.Inference(req, timeout=self.rpc_timeout_s_) - self.profile_.record('py_client_infer_1') - - self.profile_.record('py_postpro_0') - ret = self._unpack_inference_response( - resp, - fetch, - is_python=is_python, - need_variant_tag=need_variant_tag) - self.profile_.record('py_postpro_1') - self.profile_.print_profile() - return ret - except grpc.RpcError as e: - return {"serving_status_code": e.code()} - else: - req = self._pack_inference_request( - feed, fetch, is_python=is_python, log_id=log_id) - call_future = self.stub_.Inference.future( - req, timeout=self.rpc_timeout_s_) - return MultiLangPredictFuture( - call_future, - self._done_callback_func( - fetch, - is_python=is_python, - need_variant_tag=need_variant_tag)) - - -class MultiLangPredictFuture(object): - def __init__(self, call_future, callback_func): - self.call_future_ = call_future - self.callback_func_ = callback_func - - def result(self): - try: - resp = self.call_future_.result() - except grpc.RpcError as e: - return {"serving_status_code": e.code()} - return self.callback_func_(resp) - - def add_done_callback(self, fn): - def __fn__(call_future): - assert call_future == self.call_future_ - fn(self) - - self.call_future_.add_done_callback(__fn__) diff --git a/python/paddle_serving_client/httpclient.py b/python/paddle_serving_client/httpclient.py new file mode 100755 index 0000000000000000000000000000000000000000..053ef9101d96c16ba9a81060f18b68cb8b4b2028 --- /dev/null +++ b/python/paddle_serving_client/httpclient.py @@ -0,0 +1,565 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
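The gRPC `MultiLangClient` and `MultiLangPredictFuture` removed above were the previous multi-language entry point. For reference, a hedged sketch of how that removed path was typically driven; the endpoint, feed/fetch names and the import location are assumptions for illustration, and this API no longer exists after this patch:

```python
# Illustrative only: usage of the MultiLangClient path deleted above.
# Endpoint, feed/fetch names and the import location are assumptions.
import numpy as np
from paddle_serving_client import MultiLangClient

client = MultiLangClient()
client.connect(["127.0.0.1:9393"])      # gRPC endpoint (assumed)
client.set_rpc_timeout_ms(5000)         # must be called after connect()

feed = {"x": np.random.rand(1, 13).astype("float32")}

# synchronous call
ret = client.predict(feed=feed, fetch=["price"], batch=True)

# asynchronous call: returns a MultiLangPredictFuture
future = client.predict(feed=feed, fetch=["price"], batch=True, asyn=True)
ret = future.result()                   # {"serving_status_code": code} on RPC error
```

The new `httpclient.py` added below takes over this role, defaulting to proto-in-HTTP-body with optional JSON and raw-gRPC modes.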
+ +import requests +import json +import numpy as np +import os +from .proto import general_model_config_pb2 as m_config +import google.protobuf.text_format +import gzip +from collections import Iterable +import base64 +import sys + +import grpc +from .proto import general_model_service_pb2 +sys.path.append( + os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto')) +from .proto import general_model_service_pb2_grpc +#param 'type'(which is in feed_var or fetch_var) = 0 means dataType is int64 +#param 'type'(which is in feed_var or fetch_var) = 1 means dataType is float32 +#param 'type'(which is in feed_var or fetch_var) = 2 means dataType is int32 +#param 'type'(which is in feed_var or fetch_var) = 3 means dataType is string(also called bytes in proto) +int64_type = 0 +float32_type = 1 +int32_type = 2 +bytes_type = 3 +# this is corresponding to the proto +proto_data_key_list = ["int64_data", "float_data", "int_data", "data"] + + +def list_flatten(items, ignore_types=(str, bytes)): + for x in items: + if isinstance(x, Iterable) and not isinstance(x, ignore_types): + yield from list_flatten(x) + else: + yield x + + +def data_bytes_number(datalist): + total_bytes_number = 0 + if isinstance(datalist, list): + if len(datalist) == 0: + return total_bytes_number + else: + for data in datalist: + if isinstance(data, str): + total_bytes_number = total_bytes_number + len(data) + else: + total_bytes_number = total_bytes_number + 4 * len(datalist) + break + else: + raise ValueError( + "In the Function data_bytes_number(), data must be list.") + return total_bytes_number + + +# 此文件名,暂时为httpclient.py,待后续测试后考虑是否替换client.py +# 默认使用http方式,默认使用Proto in HTTP-body +# 如果想使用JSON in HTTP-body, set_http_proto(False) +# Predict()是包装类http_client_predict/grpc_client_predict +# 可以直接调用需要的http_client_predict/grpc_client_predict +# 例如,如果想使用GRPC方式,set_use_grpc_client(True) +# 或者直接调用grpc_client_predict() +class HttpClient(object): + def __init__(self, + ip="127.0.0.1", + port="9393", + service_name="/GeneralModelService/inference"): + self.feed_names_ = [] + self.feed_real_names = [] + self.fetch_names_ = [] + self.feed_shapes_ = {} + self.feed_types_ = {} + self.feed_names_to_idx_ = {} + self.timeout_ms = 20000 + self.ip = ip + self.port = port + self.server_port = port + self.service_name = service_name + self.key = None + self.try_request_gzip = False + self.try_response_gzip = False + self.total_data_number = 0 + self.headers = {} + self.http_proto = True + self.headers["Content-Type"] = "application/proto" + self.max_body_size = 512 * 1024 * 1024 + self.use_grpc_client = False + self.url = None + + # 使用连接池能够不用反复建立连接 + self.requests_session = requests.session() + # 初始化grpc_stub + options = [('grpc.max_receive_message_length', self.max_body_size), + ('grpc.max_send_message_length', self.max_body_size)] + + endpoints = [self.ip + ":" + self.server_port] + g_endpoint = 'ipv4:{}'.format(','.join(endpoints)) + self.channel_ = grpc.insecure_channel(g_endpoint, options=options) + self.stub_ = general_model_service_pb2_grpc.GeneralModelServiceStub( + self.channel_) + + def load_client_config(self, model_config_path_list): + if isinstance(model_config_path_list, str): + model_config_path_list = [model_config_path_list] + elif isinstance(model_config_path_list, list): + pass + + file_path_list = [] + for single_model_config in model_config_path_list: + if os.path.isdir(single_model_config): + file_path_list.append("{}/serving_client_conf.prototxt".format( + single_model_config)) + elif 
os.path.isfile(single_model_config): + file_path_list.append(single_model_config) + model_conf = m_config.GeneralModelConfig() + f = open(file_path_list[0], 'r') + model_conf = google.protobuf.text_format.Merge( + str(f.read()), model_conf) + + # load configuraion here + # get feed vars, fetch vars + # get feed shapes, feed types + # map feed names to index + self.feed_names_ = [var.alias_name for var in model_conf.feed_var] + self.feed_real_names = [var.name for var in model_conf.feed_var] + self.feed_names_to_idx_ = {} #this is useful + self.lod_tensor_set = set() + self.feed_tensor_len = {} #this is only used for shape check + self.key = None + + for i, var in enumerate(model_conf.feed_var): + self.feed_names_to_idx_[var.alias_name] = i + self.feed_types_[var.alias_name] = var.feed_type + self.feed_shapes_[var.alias_name] = [dim for dim in var.shape] + + if var.is_lod_tensor: + self.lod_tensor_set.add(var.alias_name) + else: + counter = 1 + for dim in self.feed_shapes_[var.alias_name]: + counter *= dim + self.feed_tensor_len[var.alias_name] = counter + if len(file_path_list) > 1: + model_conf = m_config.GeneralModelConfig() + f = open(file_path_list[-1], 'r') + model_conf = google.protobuf.text_format.Merge( + str(f.read()), model_conf) + self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var] + self.fetch_names_to_type_ = {} + self.fetch_names_to_idx_ = {} + for i, var in enumerate(model_conf.fetch_var): + self.fetch_names_to_idx_[var.alias_name] = i + self.fetch_names_to_type_[var.alias_name] = var.fetch_type + if var.is_lod_tensor: + self.lod_tensor_set.add(var.alias_name) + return + + def set_max_body_size(self, max_body_size): + self.max_body_size = max_body_size + self.init_grpc_stub() + + def set_timeout_ms(self, timeout_ms): + if not isinstance(timeout_ms, int): + raise ValueError("timeout_ms must be int type.") + else: + self.timeout_ms = timeout_ms + + def set_max_retries(self, retry_times=3): + if not isinstance(retry_times, int): + raise ValueError("retry_times must be int type.") + else: + self.requests_session.mount( + 'http://', HTTPAdapter(max_retries=retry_times)) + + def set_ip(self, ip): + self.ip = ip + self.init_grpc_stub() + + def set_service_name(self, service_name): + self.service_name = service_name + + def set_port(self, port): + self.port = port + self.server_port = port + self.init_grpc_stub() + + def set_url(self, url): + if isinstance(url, str): + self.url = url + else: + print("url must be str") + + def add_http_headers(self, headers): + if isinstance(headers, dict): + self.headers.update(headers) + else: + print("headers must be a dict") + + def set_request_compress(self, try_request_gzip): + self.try_request_gzip = try_request_gzip + + def set_response_compress(self, try_response_gzip): + self.try_response_gzip = try_response_gzip + + def set_http_proto(self, http_proto): + self.http_proto = http_proto + if self.http_proto: + self.headers["Content-Type"] = "application/proto" + else: + self.headers["Content-Type"] = "application/json" + + def set_use_grpc_client(self, use_grpc_client): + self.use_grpc_client = use_grpc_client + + # use_key is the function of encryption. 
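With the constructor and the setters above, a minimal client session looks roughly like the sketch below. It assumes a serving endpoint on 127.0.0.1:9393 and a uci_housing-style client config whose feed variable is `x` and fetch variable is `price`; `predict()` is defined further down in this file and dispatches to the HTTP or gRPC path.

```python
# Minimal usage sketch for the new HttpClient (endpoint, config path and
# variable names are assumptions for illustration).
import numpy as np
from paddle_serving_client.httpclient import HttpClient

client = HttpClient(ip="127.0.0.1", port="9393")
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")

client.set_timeout_ms(10000)        # per-request timeout in milliseconds
client.set_http_proto(False)        # JSON in HTTP body; the default is proto
client.set_request_compress(True)   # gzip the request body once it exceeds 512 bytes
# client.set_use_grpc_client(True)  # or talk raw gRPC instead of HTTP

feed = {"x": np.random.rand(1, 13).astype("float32")}
result = client.predict(feed=feed, fetch=["price"], batch=True)
print(result)
```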
+ def use_key(self, key_filename): + with open(key_filename, "rb") as f: + self.key = f.read() + self.get_serving_port() + + def get_serving_port(self): + encrypt_url = "http://" + str(self.ip) + ":" + str(self.port) + if self.key is not None: + req = json.dumps({"key": base64.b64encode(self.key).decode()}) + else: + req = json.dumps({}) + with requests.post( + encrypt_url, data=req, timeout=self.timeout_ms / 1000) as r: + result = r.json() + if "endpoint_list" not in result: + raise ValueError("server not ready") + else: + self.server_port = str(result["endpoint_list"][0]) + print("rpc port is ", self.server_port) + + def get_feed_names(self): + return self.feed_names_ + + def get_fetch_names(self): + return self.fetch_names_ + + def get_legal_fetch(self, fetch): + + fetch_list = [] + if isinstance(fetch, str): + fetch_list = [fetch] + elif isinstance(fetch, (list, tuple)): + fetch_list = fetch + elif fetch == None: + pass + else: + raise ValueError("Fetch only accepts string/list/tuple of string") + + fetch_names = [] + for key in fetch_list: + if key in self.fetch_names_: + fetch_names.append(key) + return fetch_names + + def get_feedvar_dict(self, feed): + if feed is None: + raise ValueError("You should specify feed for prediction") + feed_dict = {} + if isinstance(feed, dict): + feed_dict = feed + elif isinstance(feed, (list, str, tuple)): + # feed = [dict] + if len(feed) == 1 and isinstance(feed[0], dict): + feed_dict = feed[0] + return feed_dict + # if input is a list or str or tuple, and the number of feed_var is 1. + # create a feed_dict { key = feed_var_name, value = list} + if len(self.feed_names_) == 1: + feed_dict[self.feed_names_[0]] = feed + elif len(self.feed_names_) > 1: + if isinstance(feed, str): + raise ValueError( + "input is a str, but we got 2+ feed_var, don`t know how to divide the string" + ) + # feed is a list or tuple + elif len(self.feed_names_) == len(feed): + for index in range(len(feed)): + feed_dict[self.feed_names_[index]] = feed[index] + else: + raise ValueError("len(feed) ≠ len(feed_var), error") + else: + raise ValueError("we got feed, but feed_var is None") + + else: + raise ValueError("Feed only accepts dict/str/list/tuple") + + return feed_dict + + def process_json_data(self, feed_dict, fetch_list, batch, log_id): + Request = {} + Request["fetch_var_names"] = fetch_list + Request["log_id"] = int(log_id) + Request["tensor"] = [] + for key in feed_dict: + if ".lod" not in key and key not in self.feed_names_: + raise ValueError("Wrong feed name: {}.".format(key)) + if ".lod" in key: + continue + + tensor_dict = self.process_tensor(key, feed_dict, batch) + data_key = tensor_dict["data_key"] + data_value = tensor_dict["data_value"] + + tensor = {} + tensor[data_key] = data_value + tensor["shape"] = tensor_dict["shape"] + tensor["elem_type"] = tensor_dict["elem_type"] + tensor["name"] = tensor_dict["name"] + tensor["alias_name"] = tensor_dict["alias_name"] + if "lod" in tensor_dict: + tensor["lod"] = tensor_dict["lod"] + Request["tensor"].append(tensor) + + # request + postData = json.dumps(Request) + return postData + + def process_proto_data(self, feed_dict, fetch_list, batch, log_id): + req = general_model_service_pb2.Request() + req.fetch_var_names.extend(fetch_list) + req.log_id = log_id + + for key in feed_dict: + tensor = general_model_service_pb2.Tensor() + if ".lod" not in key and key not in self.feed_names_: + raise ValueError("Wrong feed name: {}.".format(key)) + if ".lod" in key: + continue + + tensor_dict = self.process_tensor(key, feed_dict, 
batch) + + tensor.shape.extend(tensor_dict["shape"]) + tensor.name = tensor_dict["name"] + tensor.alias_name = tensor_dict["alias_name"] + tensor.elem_type = tensor_dict["elem_type"] + if "lod" in tensor_dict: + tensor.lod.extend(tensor_dict["lod"]) + + if tensor_dict["data_key"] == "int64_data": + tensor.int64_data.extend(tensor_dict["data_value"]) + elif tensor_dict["data_key"] == "float_data": + tensor.float_data.extend(tensor_dict["data_value"]) + elif tensor_dict["data_key"] == "int_data": + tensor.int_data.extend(tensor_dict["data_value"]) + elif tensor_dict["data_key"] == "data": + tensor.data.extend(tensor_dict["data_value"]) + else: + raise ValueError( + "tensor element_type must be one of [int64_data,float_data,int_data,data]." + ) + + req.tensor.append(tensor) + + return req + + def process_tensor(self, key, feed_dict, batch): + lod = [] + if "{}.lod".format(key) in feed_dict: + lod = feed_dict["{}.lod".format(key)] + shape = self.feed_shapes_[key].copy() + elem_type = self.feed_types_[key] + data_value = feed_dict[key] + data_key = proto_data_key_list[elem_type] + proto_index = self.feed_names_to_idx_[key] + name = self.feed_real_names[proto_index] + alias_name = key + + # feed_dict[key] 可以是np.ndarray + # 也可以是list或tuple + # 当np.ndarray需要处理为list + if isinstance(feed_dict[key], np.ndarray): + shape_lst = [] + # 0维numpy 需要在外层再加一个[] + if feed_dict[key].ndim == 0: + data_value = [feed_dict[key].tolist()] + shape_lst.append(1) + else: + shape_lst.extend(list(feed_dict[key].shape)) + shape = shape_lst + data_value = feed_dict[key].flatten().tolist() + # 当Batch为False,shape字段前插一个1,表示batch维 + # 当Batch为True,则直接使用numpy.shape作为batch维度 + if batch == False: + shape.insert(0, 1) + + # 当是list或tuple时,需要把多层嵌套展开 + elif isinstance(feed_dict[key], (list, tuple)): + # 当Batch为False,shape字段前插一个1,表示batch维 + # 当Batch为True, 由于list并不像numpy那样规整,所以 + # 无法获取shape,此时取第一维度作为Batch维度. + # 插入到feedVar.shape前面. + if batch == False: + shape.insert(0, 1) + else: + shape.insert(0, len(feed_dict[key])) + feed_dict[key] = [x for x in list_flatten(feed_dict[key])] + data_value = feed_dict[key] + else: + # 输入可能是单个的str或int值等 + # 此时先统一处理为一个list + # 由于输入比较特殊,shape保持原feedvar中不变 + data_value = [] + if isinstance(feed_dict[key], (str, bytes)): + if self.feed_types_[key] != bytes_type: + raise ValueError( + "feedvar is not string-type,feed can`t be a single string." + ) + if isinstance(feed_dict[key], bytes): + feed_dict[key] = feed_dict[key].decode() + else: + if self.feed_types_[key] == bytes_type: + raise ValueError( + "feedvar is string-type,feed can`t be a single int or others." 
+ ) + data_value.append(feed_dict[key]) + # 如果不压缩,那么不需要统计数据量。 + if self.try_request_gzip: + self.total_data_number = self.total_data_number + data_bytes_number( + data_value) + tensor_dict = {} + tensor_dict["data_key"] = data_key + tensor_dict["data_value"] = data_value + tensor_dict["shape"] = shape + tensor_dict["elem_type"] = elem_type + tensor_dict["name"] = name + tensor_dict["alias_name"] = alias_name + if len(lod) > 0: + tensor_dict["lod"] = lod + return tensor_dict + + # feed结构必须为dict、List、tuple、string + # feed中数据支持Numpy、list、tuple、以及基本类型 + # fetch默认是从模型的配置文件中获取全部的fetch_var + def predict(self, + feed=None, + fetch=None, + batch=False, + need_variant_tag=False, + log_id=0): + if self.use_grpc_client: + return self.grpc_client_predict(feed, fetch, batch, + need_variant_tag, log_id) + else: + return self.http_client_predict(feed, fetch, batch, + need_variant_tag, log_id) + + def http_client_predict(self, + feed=None, + fetch=None, + batch=False, + need_variant_tag=False, + log_id=0): + + feed_dict = self.get_feedvar_dict(feed) + fetch_list = self.get_legal_fetch(fetch) + postData = '' + + if self.http_proto == True: + postData = self.process_proto_data(feed_dict, fetch_list, batch, + log_id).SerializeToString() + + else: + postData = self.process_json_data(feed_dict, fetch_list, batch, + log_id) + + web_url = "http://" + self.ip + ":" + self.server_port + self.service_name + if self.url != None: + if "http" not in self.url: + self.url = "http://" + self.url + if "self.service_name" not in self.url: + self.url = self.url + self.service_name + web_url = self.url + # 当数据区长度大于512字节时才压缩. + self.headers.pop("Content-Encoding", "nokey") + try: + if self.try_request_gzip and self.total_data_number > 512: + + if self.http_proto: + postData = gzip.compress(postData) + else: + postData = gzip.compress(bytes(postData, 'utf-8')) + self.headers["Content-Encoding"] = "gzip" + if self.try_response_gzip: + self.headers["Accept-encoding"] = "gzip" + # 压缩异常,使用原始数据 + except: + print("compress error, we will use the no-compress data") + self.headers.pop("Content-Encoding", "nokey") + # requests支持自动识别解压 + try: + result = self.requests_session.post( + url=web_url, + headers=self.headers, + data=postData, + timeout=self.timeout_ms / 1000, + verify=False) + result.raise_for_status() + except: + print("http post error") + return None + else: + if result == None: + return None + if result.status_code == 200: + if result.headers["Content-Type"] == 'application/proto': + response = general_model_service_pb2.Response() + response.ParseFromString(result.content) + return response + else: + return result.json() + return result + + def grpc_client_predict(self, + feed=None, + fetch=None, + batch=False, + need_variant_tag=False, + log_id=0): + + feed_dict = self.get_feedvar_dict(feed) + fetch_list = self.get_legal_fetch(fetch) + + postData = self.process_proto_data(feed_dict, fetch_list, batch, log_id) + + try: + resp = self.stub_.inference( + postData, timeout=self.timeout_ms / 1000) + except: + print("Grpc inference error occur") + return None + else: + return resp + + def init_grpc_stub(self): + # https://github.com/tensorflow/serving/issues/1382 + options = [('grpc.max_receive_message_length', self.max_body_size), + ('grpc.max_send_message_length', self.max_body_size)] + + endpoints = [self.ip + ":" + self.server_port] + g_endpoint = 'ipv4:{}'.format(','.join(endpoints)) + self.channel_ = grpc.insecure_channel(g_endpoint, options=options) + self.stub_ = general_model_service_pb2_grpc.GeneralModelServiceStub( + 
self.channel_) + + def __del__(self): + self.requests_session.close() + self.channel_.close() diff --git a/python/paddle_serving_client/io/__init__.py b/python/paddle_serving_client/io/__init__.py index b7b0898a3b3b811c8f089c8409b6c5f94185660a..35b400bed60da8dab4b49cb660d4e6fcfe0f7f2c 100644 --- a/python/paddle_serving_client/io/__init__.py +++ b/python/paddle_serving_client/io/__init__.py @@ -31,6 +31,21 @@ import paddle.nn.functional as F import errno from paddle.jit import to_static +_PADDLE_DTYPE_2_NUMPY_DTYPE = { + core.VarDesc.VarType.BOOL: 'bool', + core.VarDesc.VarType.FP16: 'float16', + core.VarDesc.VarType.BF16: 'uint16', + core.VarDesc.VarType.FP32: 'float32', + core.VarDesc.VarType.FP64: 'float64', + core.VarDesc.VarType.INT8: 'int8', + core.VarDesc.VarType.INT16: 'int16', + core.VarDesc.VarType.INT32: 'int32', + core.VarDesc.VarType.INT64: 'int64', + core.VarDesc.VarType.UINT8: 'uint8', + core.VarDesc.VarType.COMPLEX64: 'complex64', + core.VarDesc.VarType.COMPLEX128: 'complex128', +} + def save_dygraph_model(serving_model_folder, client_config_folder, model): paddle.jit.save(model, "serving_tmp") @@ -52,18 +67,12 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model): } config = model_conf.GeneralModelConfig() - #int64 = 0; float32 = 1; int32 = 2; for key in feed_var_dict: feed_var = model_conf.FeedVar() feed_var.alias_name = key feed_var.name = feed_var_dict[key].name + feed_var.feed_type = var_type_conversion(feed_var_dict[key].dtype) feed_var.is_lod_tensor = feed_var_dict[key].lod_level >= 1 - if feed_var_dict[key].dtype == core.VarDesc.VarType.INT64: - feed_var.feed_type = 0 - if feed_var_dict[key].dtype == core.VarDesc.VarType.FP32: - feed_var.feed_type = 1 - if feed_var_dict[key].dtype == core.VarDesc.VarType.INT32: - feed_var.feed_type = 2 if feed_var.is_lod_tensor: feed_var.shape.extend([-1]) else: @@ -77,13 +86,8 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model): fetch_var = model_conf.FetchVar() fetch_var.alias_name = key fetch_var.name = fetch_var_dict[key].name + fetch_var.fetch_type = var_type_conversion(fetch_var_dict[key].dtype) fetch_var.is_lod_tensor = 1 - if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64: - fetch_var.fetch_type = 0 - if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32: - fetch_var.fetch_type = 1 - if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT32: - fetch_var.fetch_type = 2 if fetch_var.is_lod_tensor: fetch_var.shape.extend([-1]) else: @@ -119,6 +123,58 @@ def save_dygraph_model(serving_model_folder, client_config_folder, model): fout.write(config.SerializeToString()) +def var_type_conversion(dtype): + """ + Variable type conversion + Args: + dtype: type of core.VarDesc.VarType.xxxxx + (https://github.com/PaddlePaddle/Paddle/blob/release/2.1/python/paddle/framework/dtype.py) + + Returns: + (int)type value, -1 is type matching failed. 
+ int64 => 0; + float32 => 1; + int32 => 2; + float64 => 3; + int16 => 4; + float16 => 5; + bfloat16 => 6; + uint8 => 7; + int8 => 8; + bool => 9; + complex64 => 10, + complex128 => 11; + """ + type_val = -1 + if dtype == core.VarDesc.VarType.INT64: + type_val = 0 + elif dtype == core.VarDesc.VarType.FP32: + type_val = 1 + elif dtype == core.VarDesc.VarType.INT32: + type_val = 2 + elif dtype == core.VarDesc.VarType.FP64: + type_val = 3 + elif dtype == core.VarDesc.VarType.INT16: + type_val = 4 + elif dtype == core.VarDesc.VarType.FP16: + type_val = 5 + elif dtype == core.VarDesc.VarType.BF16: + type_val = 6 + elif dtype == core.VarDesc.VarType.UINT8: + type_val = 7 + elif dtype == core.VarDesc.VarType.INT8: + type_val = 8 + elif dtype == core.VarDesc.VarType.BOOL: + type_val = 9 + elif dtype == core.VarDesc.VarType.COMPLEX64: + type_val = 10 + elif dtype == core.VarDesc.VarType.COMPLEX128: + type_val = 11 + else: + type_val = -1 + return type_val + + def save_model(server_model_folder, client_config_folder, feed_var_dict, @@ -126,7 +182,9 @@ def save_model(server_model_folder, main_program=None, encryption=False, key_len=128, - encrypt_conf=None): + encrypt_conf=None, + model_filename=None, + params_filename=None): executor = Executor(place=CPUPlace()) feed_var_names = [feed_var_dict[x].name for x in feed_var_dict] @@ -136,15 +194,27 @@ def save_model(server_model_folder, target_vars.append(fetch_var_dict[key]) target_var_names.append(key) + if not os.path.exists(server_model_folder): + os.makedirs(server_model_folder) if not encryption: - save_inference_model( - server_model_folder, - feed_var_names, - target_vars, - executor, - model_filename="__model__", - params_filename="__params__", - main_program=main_program) + if not model_filename: + model_filename = "model.pdmodel" + if not params_filename: + params_filename = "params.pdiparams" + + new_model_path = os.path.join(server_model_folder, model_filename) + new_params_path = os.path.join(server_model_folder, params_filename) + + with open(new_model_path, "wb") as new_model_file: + new_model_file.write(main_program.desc.serialize_to_string()) + + paddle.static.save_vars( + executor=executor, + dirname=server_model_folder, + main_program=main_program, + vars=None, + predicate=paddle.static.io.is_persistable, + filename=params_filename) else: if encrypt_conf == None: aes_cipher = CipherFactory.create_cipher() @@ -164,18 +234,13 @@ def save_model(server_model_folder, config = model_conf.GeneralModelConfig() - #int64 = 0; float32 = 1; int32 = 2; for key in feed_var_dict: feed_var = model_conf.FeedVar() feed_var.alias_name = key feed_var.name = feed_var_dict[key].name + feed_var.feed_type = var_type_conversion(feed_var_dict[key].dtype) + feed_var.is_lod_tensor = feed_var_dict[key].lod_level >= 1 - if feed_var_dict[key].dtype == core.VarDesc.VarType.INT64: - feed_var.feed_type = 0 - if feed_var_dict[key].dtype == core.VarDesc.VarType.FP32: - feed_var.feed_type = 1 - if feed_var_dict[key].dtype == core.VarDesc.VarType.INT32: - feed_var.feed_type = 2 if feed_var.is_lod_tensor: feed_var.shape.extend([-1]) else: @@ -190,14 +255,10 @@ def save_model(server_model_folder, fetch_var = model_conf.FetchVar() fetch_var.alias_name = key fetch_var.name = fetch_var_dict[key].name - #fetch_var.is_lod_tensor = fetch_var_dict[key].lod_level >= 1 - fetch_var.is_lod_tensor = 1 - if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64: - fetch_var.fetch_type = 0 - if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32: - fetch_var.fetch_type = 1 - if 
fetch_var_dict[key].dtype == core.VarDesc.VarType.INT32: - fetch_var.fetch_type = 2 + fetch_var.fetch_type = var_type_conversion(fetch_var_dict[key].dtype) + + fetch_var.is_lod_tensor = fetch_var_dict[key].lod_level >= 1 + #fetch_var.is_lod_tensor = 1 if fetch_var.is_lod_tensor: fetch_var.shape.extend([-1]) else: @@ -247,7 +308,8 @@ def inference_model_to_serving(dirname, } fetch_dict = {x.name: x for x in fetch_targets} save_model(serving_server, serving_client, feed_dict, fetch_dict, - inference_program, encryption, key_len, encrypt_conf) + inference_program, encryption, key_len, encrypt_conf, + model_filename, params_filename) feed_names = feed_dict.keys() fetch_names = fetch_dict.keys() return feed_names, fetch_names diff --git a/python/paddle_serving_server/__init__.py b/python/paddle_serving_server/__init__.py old mode 100755 new mode 100644 index 80a3573a29335acfbf241ede81618d0ad980d7f3..75aa84d1082b46f32f3a07820706b312ac767487 --- a/python/paddle_serving_server/__init__.py +++ b/python/paddle_serving_server/__init__.py @@ -14,18 +14,16 @@ # pylint: disable=doc-string-missing from . import monitor -from . import rpc_service from . import serve from . import version -__all__ = ["version", "server", "serve", "monitor", "rpc_service", "dag"] +__all__ = ["version", "server", "serve", "monitor", "dag"] from paddle_serving_server import ( version, server, serve, monitor, - rpc_service, dag, ) from .dag import * diff --git a/python/paddle_serving_server/dag.py b/python/paddle_serving_server/dag.py old mode 100755 new mode 100644 diff --git a/python/paddle_serving_server/rpc_service.py b/python/paddle_serving_server/rpc_service.py deleted file mode 100755 index 163f7a5791d7c6549da0172ce7a5b925f0726b4e..0000000000000000000000000000000000000000 --- a/python/paddle_serving_server/rpc_service.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -import numpy as np -import google.protobuf.text_format - -from .proto import general_model_config_pb2 as m_config -from .proto import multi_lang_general_model_service_pb2 -sys.path.append( - os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto')) -from .proto import multi_lang_general_model_service_pb2_grpc - - -class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. 
- MultiLangGeneralModelServiceServicer): - def __init__(self, model_config_path_list, is_multi_model, endpoints): - self.is_multi_model_ = is_multi_model - self.model_config_path_list = model_config_path_list - self.endpoints_ = endpoints - self._init_bclient(self.model_config_path_list, self.endpoints_) - self._parse_model_config(self.model_config_path_list) - - def _init_bclient(self, model_config_path_list, endpoints, timeout_ms=None): - file_path_list = [] - for single_model_config in model_config_path_list: - if os.path.isdir(single_model_config): - file_path_list.append("{}/serving_server_conf.prototxt".format( - single_model_config)) - elif os.path.isfile(single_model_config): - file_path_list.append(single_model_config) - from paddle_serving_client import Client - self.bclient_ = Client() - if timeout_ms is not None: - self.bclient_.set_rpc_timeout_ms(timeout_ms) - self.bclient_.load_client_config(file_path_list) - self.bclient_.connect(endpoints) - - def _parse_model_config(self, model_config_path_list): - if isinstance(model_config_path_list, str): - model_config_path_list = [model_config_path_list] - elif isinstance(model_config_path_list, list): - pass - - file_path_list = [] - for single_model_config in model_config_path_list: - if os.path.isdir(single_model_config): - file_path_list.append("{}/serving_server_conf.prototxt".format( - single_model_config)) - elif os.path.isfile(single_model_config): - file_path_list.append(single_model_config) - model_conf = m_config.GeneralModelConfig() - f = open(file_path_list[0], 'r') - model_conf = google.protobuf.text_format.Merge( - str(f.read()), model_conf) - self.feed_names_ = [var.alias_name for var in model_conf.feed_var] - self.feed_types_ = {} - self.feed_shapes_ = {} - self.lod_tensor_set_ = set() - for i, var in enumerate(model_conf.feed_var): - self.feed_types_[var.alias_name] = var.feed_type - self.feed_shapes_[var.alias_name] = var.shape - if var.is_lod_tensor: - self.lod_tensor_set_.add(var.alias_name) - if len(file_path_list) > 1: - model_conf = m_config.GeneralModelConfig() - f = open(file_path_list[-1], 'r') - model_conf = google.protobuf.text_format.Merge( - str(f.read()), model_conf) - - self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var] - self.fetch_types_ = {} - for i, var in enumerate(model_conf.fetch_var): - self.fetch_types_[var.alias_name] = var.fetch_type - if var.is_lod_tensor: - self.lod_tensor_set_.add(var.alias_name) - - def _flatten_list(self, nested_list): - for item in nested_list: - if isinstance(item, (list, tuple)): - for sub_item in self._flatten_list(item): - yield sub_item - else: - yield item - - def _unpack_inference_request(self, request): - feed_names = list(request.feed_var_names) - fetch_names = list(request.fetch_var_names) - is_python = request.is_python - log_id = request.log_id - feed_batch = [] - for feed_inst in request.insts: - feed_dict = {} - for idx, name in enumerate(feed_names): - var = feed_inst.tensor_array[idx] - v_type = self.feed_types_[name] - data = None - if is_python: - if v_type == 0: # int64 - data = np.frombuffer(var.data, dtype="int64") - elif v_type == 1: # float32 - data = np.frombuffer(var.data, dtype="float32") - elif v_type == 2: # int32 - data = np.frombuffer(var.data, dtype="int32") - else: - raise Exception("error type.") - else: - if v_type == 0: # int64 - data = np.array(list(var.int64_data), dtype="int64") - elif v_type == 1: # float32 - data = np.array(list(var.float_data), dtype="float32") - elif v_type == 2: # int32 - data = 
np.array(list(var.int_data), dtype="int32") - else: - raise Exception("error type.") - data.shape = list(feed_inst.tensor_array[idx].shape) - feed_dict[name] = np.ascontiguousarray(data) - if len(var.lod) > 0: - feed_dict["{}.lod".format(name)] = var.lod - feed_batch.append(feed_dict) - return feed_batch, fetch_names, is_python, log_id - - def _pack_inference_response(self, ret, fetch_names, is_python): - resp = multi_lang_general_model_service_pb2.InferenceResponse() - if ret is None: - resp.err_code = 1 - return resp - results, tag = ret - resp.tag = tag - resp.err_code = 0 - - if not self.is_multi_model_: - results = {'general_infer_0': results} - for model_name, model_result in results.items(): - model_output = multi_lang_general_model_service_pb2.ModelOutput() - inst = multi_lang_general_model_service_pb2.FetchInst() - for idx, name in enumerate(fetch_names): - tensor = multi_lang_general_model_service_pb2.Tensor() - v_type = self.fetch_types_[name] - if is_python: - tensor.data = model_result[name].tobytes() - else: - if v_type == 0: # int64 - tensor.int64_data.extend(model_result[name].reshape(-1) - .tolist()) - elif v_type == 1: # float32 - tensor.float_data.extend(model_result[name].reshape(-1) - .tolist()) - elif v_type == 2: # int32 - tensor.int_data.extend(model_result[name].reshape(-1) - .tolist()) - else: - raise Exception("error type.") - tensor.shape.extend(list(model_result[name].shape)) - if "{}.lod".format(name) in model_result: - tensor.lod.extend(model_result["{}.lod".format(name)] - .tolist()) - inst.tensor_array.append(tensor) - model_output.insts.append(inst) - model_output.engine_name = model_name - resp.outputs.append(model_output) - return resp - - def SetTimeout(self, request, context): - # This porcess and Inference process cannot be operate at the same time. - # For performance reasons, do not add thread lock temporarily. - timeout_ms = request.timeout_ms - self._init_bclient(self.model_config_path_list, self.endpoints_, - timeout_ms) - resp = multi_lang_general_model_service_pb2.SimpleResponse() - resp.err_code = 0 - return resp - - def Inference(self, request, context): - feed_batch, fetch_names, is_python, log_id \ - = self._unpack_inference_request(request) - ret = self.bclient_.predict( - feed=feed_batch, - fetch=fetch_names, - batch=True, - need_variant_tag=True, - log_id=log_id) - return self._pack_inference_response(ret, fetch_names, is_python) - - def GetClientConfig(self, request, context): - #model_config_path_list is list right now. - #dict should be added when graphMaker is used. 
- resp = multi_lang_general_model_service_pb2.GetClientConfigResponse() - model_config_str = [] - for single_model_config in self.model_config_path_list: - if os.path.isdir(single_model_config): - with open("{}/serving_server_conf.prototxt".format( - single_model_config)) as f: - model_config_str.append(str(f.read())) - elif os.path.isfile(single_model_config): - with open(single_model_config) as f: - model_config_str.append(str(f.read())) - resp.client_config_str = model_config_str[0] - return resp diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index 101b7b4b460fd125541ed62f28b38d7512b5bda8..8531e83fc1bb3a330d276e0c0d72616a810eea72 100755 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -23,23 +23,109 @@ import json import base64 import time from multiprocessing import Process -from flask import Flask, request import sys if sys.version_info.major == 2: from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer elif sys.version_info.major == 3: from http.server import BaseHTTPRequestHandler, HTTPServer +from contextlib import closing +import socket + + +# web_service.py is still used by Pipeline. +def port_is_available(port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex(('127.0.0.1', port)) + if result != 0: + return True + else: + return False + + +def format_gpu_to_strlist(unformatted_gpus): + gpus_strlist = [] + if isinstance(unformatted_gpus, int): + gpus_strlist = [str(unformatted_gpus)] + elif isinstance(unformatted_gpus, list): + if unformatted_gpus == [""]: + gpus_strlist = ["-1"] + elif len(unformatted_gpus) == 0: + gpus_strlist = ["-1"] + else: + gpus_strlist = [str(x) for x in unformatted_gpus] + elif isinstance(unformatted_gpus, str): + if unformatted_gpus == "": + gpus_strlist = ["-1"] + else: + gpus_strlist = [unformatted_gpus] + elif unformatted_gpus == None: + gpus_strlist = ["-1"] + else: + raise ValueError("error input of set_gpus") + + # check cuda visible + if "CUDA_VISIBLE_DEVICES" in os.environ: + env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",") + for op_gpus_str in gpus_strlist: + op_gpu_list = op_gpus_str.split(",") + # op_gpu_list == ["-1"] means this op use CPU + # so don`t check cudavisible. 
+ if op_gpu_list == ["-1"]: + continue + for ids in op_gpu_list: + if ids not in env_gpus: + print("gpu_ids is not in CUDA_VISIBLE_DEVICES.") + exit(-1) + + # check gpuid is valid + for op_gpus_str in gpus_strlist: + op_gpu_list = op_gpus_str.split(",") + use_gpu = False + for ids in op_gpu_list: + if int(ids) < -1: + raise ValueError("The input of gpuid error.") + if int(ids) >= 0: + use_gpu = True + if int(ids) == -1 and use_gpu: + raise ValueError("You can not use CPU and GPU in one model.") + + return gpus_strlist + + +def is_gpu_mode(unformatted_gpus): + gpus_strlist = format_gpu_to_strlist(unformatted_gpus) + for op_gpus_str in gpus_strlist: + op_gpu_list = op_gpus_str.split(",") + for ids in op_gpu_list: + if int(ids) >= 0: + return True + return False + def serve_args(): parser = argparse.ArgumentParser("serve") parser.add_argument( - "--thread", type=int, default=2, help="Concurrency of server") + "--thread", + type=int, + default=4, + help="Concurrency of server,[4,1024]", + choices=range(4, 1025)) + parser.add_argument( + "--port", type=int, default=9393, help="Port of the starting gpu") + parser.add_argument( + "--device", type=str, default="cpu", help="Type of device") parser.add_argument( - "--port", type=int, default=9292, help="Port of the starting gpu") + "--gpu_ids", type=str, default="", nargs="+", help="gpu ids") parser.add_argument( - "--device", type=str, default="gpu", help="Type of device") - parser.add_argument("--gpu_ids", type=str, default="", help="gpu ids") + "--op_num", type=int, default=0, nargs="+", help="Number of each op") + parser.add_argument( + "--op_max_batch", + type=int, + default=32, + nargs="+", + help="Max batch of each op") parser.add_argument( "--model", type=str, default="", nargs="+", help="Model for serving") parser.add_argument( @@ -47,8 +133,6 @@ def serve_args(): type=str, default="workdir", help="Working dir of current service") - parser.add_argument( - "--name", type=str, default="None", help="Default service name") parser.add_argument( "--use_mkl", default=False, action="store_true", help="Use MKL") parser.add_argument( @@ -78,11 +162,6 @@ def serve_args(): default=False, action="store_true", help="Use encryption model") - parser.add_argument( - "--use_multilang", - default=False, - action="store_true", - help="Use Multi-language-service") parser.add_argument( "--use_trt", default=False, action="store_true", help="Use TensorRT") parser.add_argument( @@ -99,94 +178,27 @@ def serve_args(): type=str, default=None, help="container_id for authentication") + parser.add_argument( + "--gpu_multi_stream", + default=False, + action="store_true", + help="Use gpu_multi_stream") return parser.parse_args() -def start_standard_model(serving_port): # pylint: disable=doc-string-missing - args = serve_args() - thread_num = args.thread - model = args.model - port = serving_port - workdir = args.workdir - device = args.device - mem_optim = args.mem_optim_off is False - ir_optim = args.ir_optim - max_body_size = args.max_body_size - use_mkl = args.use_mkl - use_encryption_model = args.use_encryption_model - use_multilang = args.use_multilang - - if model == "": - print("You must specify your serving model") - exit(-1) - - for single_model_config in args.model: - if os.path.isdir(single_model_config): - pass - elif os.path.isfile(single_model_config): - raise ValueError("The input of --model should be a dir not file.") - - import paddle_serving_server as serving - op_maker = serving.OpMaker() - op_seq_maker = serving.OpSeqMaker() - - read_op = 
op_maker.create('general_reader') - op_seq_maker.add_op(read_op) - - for idx, single_model in enumerate(model): - infer_op_name = "general_infer" - #Temporary support for OCR model,it will be completely revised later - #If you want to use this, C++ server must compile with WITH_OPENCV option. - if len(model) == 2 and idx == 0 and model[0] == 'ocr_det_model': - infer_op_name = "general_detection" - general_infer_op = op_maker.create(infer_op_name) - op_seq_maker.add_op(general_infer_op) - - general_response_op = op_maker.create('general_response') - op_seq_maker.add_op(general_response_op) - - server = None - if use_multilang: - server = serving.MultiLangServer() - else: - server = serving.Server() - server.set_op_sequence(op_seq_maker.get_op_sequence()) - server.set_num_threads(thread_num) - server.set_memory_optimize(mem_optim) - server.set_ir_optimize(ir_optim) - server.use_mkl(use_mkl) - server.set_max_body_size(max_body_size) - server.set_port(port) - server.set_precision(args.precision) - server.set_use_calib(args.use_calib) - server.use_encryption_model(use_encryption_model) - if args.product_name != None: - server.set_product_name(args.product_name) - if args.container_id != None: - server.set_container_id(args.container_id) - - server.load_model_config(model) - server.prepare_server(workdir=workdir, port=port, device=device) - server.run_server() +def start_gpu_card_model(gpu_mode, port, args): # pylint: disable=doc-string-missing + device = "cpu" + if gpu_mode == True: + device = "gpu" -def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-string-missing - workdir = args.workdir - gpuid = int(gpuid) - device = "gpu" - if gpuid == -1: - device = "cpu" - elif gpuid >= 0: - port = port + index thread_num = args.thread model = args.model mem_optim = args.mem_optim_off is False ir_optim = args.ir_optim use_mkl = args.use_mkl max_body_size = args.max_body_size - use_multilang = args.use_multilang - if gpuid >= 0: - workdir = "{}_{}".format(args.workdir, gpuid) + workdir = "{}_{}".format(args.workdir, port) if model == "": print("You must specify your serving model") @@ -204,7 +216,11 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin op_seq_maker.add_op(read_op) for idx, single_model in enumerate(model): infer_op_name = "general_infer" - if len(model) == 2 and idx == 0: + # 目前由于ocr的节点Det模型依赖于opencv的第三方库 + # 只有使用ocr的时候,才会加入opencv的第三方库并编译GeneralDetectionOp + # 故此处做特殊处理,当不满足下述情况时,所添加的op默认为GeneralInferOp + # 以后可能考虑不用python脚本来生成配置 + if len(model) == 2 and idx == 0 and single_model == "ocr_det_model": infer_op_name = "general_detection" else: infer_op_name = "general_infer" @@ -214,10 +230,7 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin general_response_op = op_maker.create('general_response') op_seq_maker.add_op(general_response_op) - if use_multilang: - server = serving.MultiLangServer() - else: - server = serving.Server() + server = serving.Server() server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.use_mkl(use_mkl) @@ -226,8 +239,19 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin server.set_memory_optimize(mem_optim) server.set_ir_optimize(ir_optim) server.set_max_body_size(max_body_size) - if args.use_trt: + + if args.use_trt and device == "gpu": server.set_trt() + server.set_ir_optimize(True) + + if args.gpu_multi_stream and device == "gpu": + server.set_gpu_multi_stream() + + if args.op_num: + 
server.set_op_num(args.op_num) + + if args.op_max_batch: + server.set_op_max_batch(args.op_max_batch) if args.use_lite: server.set_lite() @@ -241,54 +265,27 @@ def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-strin if args.container_id != None: server.set_container_id(args.container_id) + if gpu_mode == True: + server.set_gpuid(args.gpu_ids) server.load_model_config(model) server.prepare_server( workdir=workdir, port=port, device=device, use_encryption_model=args.use_encryption_model) - if gpuid >= 0: - server.set_gpuid(gpuid) server.run_server() def start_multi_card(args, serving_port=None): # pylint: disable=doc-string-missing - gpus = "" + if serving_port == None: serving_port = args.port - if args.gpu_ids == "": - gpus = [] - else: - gpus = args.gpu_ids.split(",") - if "CUDA_VISIBLE_DEVICES" in os.environ: - env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",") - for ids in gpus: - if ids not in env_gpus: - print("gpu_ids is not in CUDA_VISIBLE_DEVICES.") - exit(-1) - else: - env_gpus = [] + if args.use_lite: print("run using paddle-lite.") - start_gpu_card_model(-1, -1, serving_port, args) - elif len(gpus) <= 0: - print("gpu_ids not set, going to run cpu service.") - start_gpu_card_model(-1, -1, serving_port, args) + start_gpu_card_model(False, serving_port, args) else: - gpu_processes = [] - for i, gpu_id in enumerate(gpus): - p = Process( - target=start_gpu_card_model, - args=( - i, - gpu_id, - serving_port, - args, )) - gpu_processes.append(p) - for p in gpu_processes: - p.start() - for p in gpu_processes: - p.join() + start_gpu_card_model(is_gpu_mode(args.gpu_ids), serving_port, args) class MainService(BaseHTTPRequestHandler): @@ -370,7 +367,9 @@ class MainService(BaseHTTPRequestHandler): if __name__ == "__main__": - + # args.device is not used at all. + # just keep the interface. + # so --device should not be recommended at the HomePage. 
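`format_gpu_to_strlist` normalizes every accepted form of `--gpu_ids` (int, string, list, empty, None) into a list of comma-separated id strings, one entry per model, and `is_gpu_mode` reports whether any entry names a real card. A quick sketch of the expected behaviour, assuming `CUDA_VISIBLE_DEVICES` is unset so the visibility check is skipped:

```python
# Expected normalization of the new gpu-id helpers (CUDA_VISIBLE_DEVICES unset).
from paddle_serving_server.serve import format_gpu_to_strlist, is_gpu_mode

assert format_gpu_to_strlist(0) == ["0"]                    # single int
assert format_gpu_to_strlist("") == ["-1"]                  # empty -> CPU
assert format_gpu_to_strlist(None) == ["-1"]                # unset -> CPU
assert format_gpu_to_strlist(["0,1", "2"]) == ["0,1", "2"]  # one entry per model

assert is_gpu_mode("") is False             # every id is -1 -> CPU mode
assert is_gpu_mode(["0,1", "-1"]) is True   # any id >= 0 -> GPU mode
# mixing -1 with a real card inside one entry (e.g. "0,-1") raises ValueError
```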
args = serve_args() for single_model_config in args.model: if os.path.isdir(single_model_config): @@ -378,54 +377,14 @@ if __name__ == "__main__": elif os.path.isfile(single_model_config): raise ValueError("The input of --model should be a dir not file.") - if args.name == "None": - from .web_service import port_is_available - if args.use_encryption_model: - p_flag = False - p = None - serving_port = 0 - server = HTTPServer(('localhost', int(args.port)), MainService) - print( - 'Starting encryption server, waiting for key from client, use to stop' - ) - server.serve_forever() - else: - start_multi_card(args) + if args.use_encryption_model: + p_flag = False + p = None + serving_port = 0 + server = HTTPServer(('0.0.0.0', int(args.port)), MainService) + print( + 'Starting encryption server, waiting for key from client, use to stop' + ) + server.serve_forever() else: - from .web_service import WebService - web_service = WebService(name=args.name) - web_service.load_model_config(args.model) - gpu_ids = args.gpu_ids - if gpu_ids == "": - if "CUDA_VISIBLE_DEVICES" in os.environ: - gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"] - if len(gpu_ids) > 0: - web_service.set_gpus(gpu_ids) - web_service.prepare_server( - workdir=args.workdir, - port=args.port, - device=args.device, - use_lite=args.use_lite, - use_xpu=args.use_xpu, - ir_optim=args.ir_optim, - thread_num=args.thread, - precision=args.precision, - use_calib=args.use_calib) - web_service.run_rpc_service() - - app_instance = Flask(__name__) - - @app_instance.before_first_request - def init(): - web_service._launch_web_service() - - service_name = "/" + web_service.name + "/prediction" - - @app_instance.route(service_name, methods=["POST"]) - def run(): - return web_service.get_prediction(request) - - app_instance.run(host="0.0.0.0", - port=web_service.port, - threaded=False, - processes=4) + start_multi_card(args) diff --git a/python/paddle_serving_server/server.py b/python/paddle_serving_server/server.py index 6d8077ad3a3a10c943201f9a945a2ef92b370df0..078f3702d125d5829cf1336b1b69493d471170fd 100755 --- a/python/paddle_serving_server/server.py +++ b/python/paddle_serving_server/server.py @@ -16,10 +16,9 @@ import os import tarfile import socket import paddle_serving_server as paddle_serving_server -from paddle_serving_server.rpc_service import MultiLangServerServiceServicer +from paddle_serving_server.serve import format_gpu_to_strlist from .proto import server_configure_pb2 as server_sdk from .proto import general_model_config_pb2 as m_config -from .proto import multi_lang_general_model_service_pb2_grpc import google.protobuf.text_format import time from .version import version_tag, version_suffix, device_type @@ -32,7 +31,6 @@ if sys.platform.startswith('win') is False: import shutil import platform import numpy as np -import grpc import sys import collections import subprocess @@ -41,6 +39,8 @@ from multiprocessing import Pool, Process from concurrent import futures +# The whole file is about to be discarded. +# We will use default config-file to start C++Server. 
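`serve.py` now feeds these options into a single `Server` instance instead of spawning one process per card. A hedged sketch of the equivalent programmatic setup for two chained models in asynchronous mode; the model directory names and concrete values are assumptions for illustration:

```python
# Programmatic counterpart of the CLI path above: two models, async mode,
# one gpu-id string per model. Directory names and values are illustrative.
import paddle_serving_server as serving

op_maker = serving.OpMaker()
op_seq_maker = serving.OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))    # model 1
op_seq_maker.add_op(op_maker.create('general_infer'))    # model 2
op_seq_maker.add_op(op_maker.create('general_response'))

server = serving.Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)            # brpc service threads
server.set_gpuid(["0", "1"])          # one gpu-id string per model
server.set_op_num([2, 4])             # async thread-pool size per model
server.set_op_max_batch([32, 32])     # max batch per model in async mode

server.load_model_config(["model_1_dir", "model_2_dir"])
server.prepare_server(workdir="workdir_9292", port=9292, device="gpu")
server.run_server()
```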
class Server(object): def __init__(self): """ @@ -81,8 +81,11 @@ class Server(object): self.use_local_bin = False self.mkl_flag = False self.device = "cpu" - self.gpuid = 0 + self.gpuid = [] + self.op_num = [0] + self.op_max_batch = [32] self.use_trt = False + self.gpu_multi_stream = False self.use_lite = False self.use_xpu = False self.model_config_paths = collections.OrderedDict() @@ -137,11 +140,13 @@ class Server(object): def set_ir_optimize(self, flag=False): self.ir_optimization = flag + # Multi-Server does not have this Function. def set_product_name(self, product_name=None): if product_name == None: raise ValueError("product_name can't be None.") self.product_name = product_name + # Multi-Server does not have this Function. def set_container_id(self, container_id): if container_id == None: raise ValueError("container_id can't be None.") @@ -163,12 +168,21 @@ class Server(object): def set_device(self, device="cpu"): self.device = device - def set_gpuid(self, gpuid=0): - self.gpuid = gpuid + def set_gpuid(self, gpuid): + self.gpuid = format_gpu_to_strlist(gpuid) + + def set_op_num(self, op_num): + self.op_num = op_num + + def set_op_max_batch(self, op_max_batch): + self.op_max_batch = op_max_batch def set_trt(self): self.use_trt = True + def set_gpu_multi_stream(self): + self.gpu_multi_stream = True + def set_lite(self): self.use_lite = True @@ -176,9 +190,33 @@ class Server(object): self.use_xpu = True def _prepare_engine(self, model_config_paths, device, use_encryption_model): + self.device = device if self.model_toolkit_conf == None: self.model_toolkit_conf = [] + # Generally, self.gpuid = str[] or []. + # when len(self.gpuid) means no gpuid is specified. + # if self.device == "gpu" or self.use_trt: + # we assume you forget to set gpuid, so set gpuid = ['0']; + if len(self.gpuid) == 0 or self.gpuid == ["-1"]: + if self.device == "gpu" or self.use_trt or self.gpu_multi_stream: + self.gpuid = ["0"] + self.device = "gpu" + else: + self.gpuid = ["-1"] + + if isinstance(self.op_num, int): + self.op_num = [self.op_num] + if len(self.op_num) == 0: + self.op_num.append(0) + + if isinstance(self.op_max_batch, int): + self.op_max_batch = [self.op_max_batch] + if len(self.op_max_batch) == 0: + self.op_max_batch.append(32) + + index = 0 + for engine_name, model_config_path in model_config_paths.items(): engine = server_sdk.EngineDesc() engine.name = engine_name @@ -186,18 +224,39 @@ class Server(object): engine.reloadable_meta = model_config_path + "/fluid_time_file" os.system("touch {}".format(engine.reloadable_meta)) engine.reloadable_type = "timestamp_ne" - engine.runtime_thread_num = 0 - engine.batch_infer_size = 0 - engine.enable_batch_align = 0 + engine.runtime_thread_num = self.op_num[index % len(self.op_num)] + engine.batch_infer_size = self.op_max_batch[index % + len(self.op_max_batch)] + + engine.enable_batch_align = 1 engine.model_dir = model_config_path engine.enable_memory_optimization = self.memory_optimization engine.enable_ir_optimization = self.ir_optimization engine.use_trt = self.use_trt + engine.gpu_multi_stream = self.gpu_multi_stream engine.use_lite = self.use_lite engine.use_xpu = self.use_xpu engine.use_gpu = False - if self.device == "gpu": + + if len(self.gpuid) == 0: + raise ValueError("CPU: self.gpuid = -1, GPU: must set it ") + op_gpu_list = self.gpuid[index % len(self.gpuid)].split(",") + for ids in op_gpu_list: + engine.gpu_ids.extend([int(ids)]) + + if self.device == "gpu" or self.use_trt or self.gpu_multi_stream: engine.use_gpu = True + # this is for Mixed use 
of GPU and CPU + # if model-1 use GPU and set the device="gpu" + # but gpuid[1] = "-1" which means use CPU in Model-2 + # so config about GPU should be False. + # op_gpu_list = gpuid[index].split(",") + # which is the gpuid for each engine. + if len(op_gpu_list) == 1: + if int(op_gpu_list[0]) == -1: + engine.use_gpu = False + engine.gpu_multi_stream = False + engine.use_trt = False if os.path.exists('{}/__params__'.format(model_config_path)): engine.combined_model = True @@ -208,6 +267,7 @@ class Server(object): engine.type = "PADDLE_INFER" self.model_toolkit_conf.append(server_sdk.ModelToolkitConf()) self.model_toolkit_conf[-1].engines.extend([engine]) + index = index + 1 def _prepare_infer_service(self, port): if self.infer_service_conf == None: @@ -332,7 +392,11 @@ class Server(object): self.mkl_flag = flag def check_avx(self): - p = subprocess.Popen(['cat /proc/cpuinfo | grep avx 2>/dev/null'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen( + ['cat /proc/cpuinfo | grep avx 2>/dev/null'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) out, err = p.communicate() if err == b'' and len(out) > 0: return True @@ -373,7 +437,6 @@ class Server(object): def download_bin(self): os.chdir(self.module_path) - need_download = False #acquire lock version_file = open("{}/version.py".format(self.module_path), "r") @@ -428,9 +491,17 @@ class Server(object): def prepare_server(self, workdir=None, port=9292, - device="cpu", + device=None, use_encryption_model=False, cube_conf=None): + # if `device` is not set, use self.device + # self.device may not be changed. + # or self.device may have changed by set_device. + if device == None: + device = self.device + # if `device` is set, let self.device = device. + else: + self.device = device if workdir == None: workdir = "./tmp" os.system("mkdir -p {}".format(workdir)) @@ -466,7 +537,7 @@ class Server(object): def port_is_available(self, port): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) + result = sock.connect_ex(('127.0.0.1', port)) if result != 0: return True else: @@ -484,237 +555,39 @@ class Server(object): else: print("Use local bin : {}".format(self.bin_path)) #self.check_cuda() - # Todo: merge CPU and GPU code, remove device to model_toolkit - if self.device == "cpu" or self.device == "arm": - command = "{} " \ - "-enable_model_toolkit " \ - "-inferservice_path {} " \ - "-inferservice_file {} " \ - "-max_concurrency {} " \ - "-num_threads {} " \ - "-port {} " \ - "-precision {} " \ - "-use_calib {} " \ - "-reload_interval_s {} " \ - "-resource_path {} " \ - "-resource_file {} " \ - "-workflow_path {} " \ - "-workflow_file {} " \ - "-bthread_concurrency {} " \ - "-max_body_size {} ".format( - self.bin_path, - self.workdir, - self.infer_service_fn, - self.max_concurrency, - self.num_threads, - self.port, - self.precision, - self.use_calib, - self.reload_interval_s, - self.workdir, - self.resource_fn, - self.workdir, - self.workflow_fn, - self.num_threads, - self.max_body_size) - else: - command = "{} " \ - "-enable_model_toolkit " \ - "-inferservice_path {} " \ - "-inferservice_file {} " \ - "-max_concurrency {} " \ - "-num_threads {} " \ - "-port {} " \ - "-precision {} " \ - "-use_calib {} " \ - "-reload_interval_s {} " \ - "-resource_path {} " \ - "-resource_file {} " \ - "-workflow_path {} " \ - "-workflow_file {} " \ - "-bthread_concurrency {} " \ - "-gpuid {} " \ - "-max_body_size {} ".format( - 
self.bin_path, - self.workdir, - self.infer_service_fn, - self.max_concurrency, - self.num_threads, - self.port, - self.precision, - self.use_calib, - self.reload_interval_s, - self.workdir, - self.resource_fn, - self.workdir, - self.workflow_fn, - self.num_threads, - self.gpuid, - self.max_body_size) + command = "{} " \ + "-enable_model_toolkit " \ + "-inferservice_path {} " \ + "-inferservice_file {} " \ + "-max_concurrency {} " \ + "-num_threads {} " \ + "-port {} " \ + "-precision {} " \ + "-use_calib {} " \ + "-reload_interval_s {} " \ + "-resource_path {} " \ + "-resource_file {} " \ + "-workflow_path {} " \ + "-workflow_file {} " \ + "-bthread_concurrency {} " \ + "-max_body_size {} ".format( + self.bin_path, + self.workdir, + self.infer_service_fn, + self.max_concurrency, + self.num_threads, + self.port, + self.precision, + self.use_calib, + self.reload_interval_s, + self.workdir, + self.resource_fn, + self.workdir, + self.workflow_fn, + self.num_threads, + self.max_body_size) + print("Going to Run Comand") print(command) os.system(command) - - -class MultiLangServer(object): - def __init__(self): - self.bserver_ = Server() - self.worker_num_ = 4 - self.body_size_ = 64 * 1024 * 1024 - self.concurrency_ = 100000 - self.is_multi_model_ = False # for model ensemble, which is not useful right now. - - def set_max_concurrency(self, concurrency): - self.concurrency_ = concurrency - self.bserver_.set_max_concurrency(concurrency) - - def set_device(self, device="cpu"): - self.device = device - - def set_num_threads(self, threads): - self.worker_num_ = threads - self.bserver_.set_num_threads(threads) - - def set_max_body_size(self, body_size): - self.bserver_.set_max_body_size(body_size) - if body_size >= self.body_size_: - self.body_size_ = body_size - else: - print( - "max_body_size is less than default value, will use default value in service." - ) - - def use_encryption_model(self, flag=False): - self.encryption_model = flag - - def set_port(self, port): - self.gport_ = port - - def set_precision(self, precision="fp32"): - self.precision = precision - - def set_use_calib(self, use_calib=False): - self.use_calib = use_calib - - def set_reload_interval(self, interval): - self.bserver_.set_reload_interval(interval) - - def set_op_sequence(self, op_seq): - self.bserver_.set_op_sequence(op_seq) - - def set_op_graph(self, op_graph): - self.bserver_.set_op_graph(op_graph) - - def use_mkl(self, flag): - self.bserver_.use_mkl(flag) - - def set_memory_optimize(self, flag=False): - self.bserver_.set_memory_optimize(flag) - - def set_ir_optimize(self, flag=False): - self.bserver_.set_ir_optimize(flag) - - def set_gpuid(self, gpuid=0): - self.bserver_.set_gpuid(gpuid) - - def load_model_config(self, - server_config_dir_paths, - client_config_path=None): - if isinstance(server_config_dir_paths, str): - server_config_dir_paths = [server_config_dir_paths] - elif isinstance(server_config_dir_paths, list): - pass - else: - raise Exception("The type of model_config_paths must be str or list" - ", not {}.".format(type(server_config_dir_paths))) - - for single_model_config in server_config_dir_paths: - if os.path.isdir(single_model_config): - pass - elif os.path.isfile(single_model_config): - raise ValueError( - "The input of --model should be a dir not file.") - - self.bserver_.load_model_config(server_config_dir_paths) - if client_config_path is None: - #now dict is not useful. 
- if isinstance(server_config_dir_paths, dict): - self.is_multi_model_ = True - client_config_path = [] - for server_config_path_items in list( - server_config_dir_paths.items()): - client_config_path.append(server_config_path_items[1]) - elif isinstance(server_config_dir_paths, list): - self.is_multi_model_ = False - client_config_path = server_config_dir_paths - else: - raise Exception( - "The type of model_config_paths must be str or list or " - "dict({op: model_path}), not {}.".format( - type(server_config_dir_paths))) - if isinstance(client_config_path, str): - client_config_path = [client_config_path] - elif isinstance(client_config_path, list): - pass - else: # dict is not support right now. - raise Exception( - "The type of client_config_path must be str or list or " - "dict({op: model_path}), not {}.".format( - type(client_config_path))) - if len(client_config_path) != len(server_config_dir_paths): - raise Warning( - "The len(client_config_path) is {}, != len(server_config_dir_paths) {}." - .format(len(client_config_path), len(server_config_dir_paths))) - self.bclient_config_path_list = client_config_path - - def prepare_server(self, - workdir=None, - port=9292, - device="cpu", - use_encryption_model=False, - cube_conf=None): - if not self._port_is_available(port): - raise SystemExit("Port {} is already used".format(port)) - default_port = 12000 - self.port_list_ = [] - for i in range(1000): - if default_port + i != port and self._port_is_available(default_port - + i): - self.port_list_.append(default_port + i) - break - self.bserver_.prepare_server( - workdir=workdir, - port=self.port_list_[0], - device=device, - use_encryption_model=use_encryption_model, - cube_conf=cube_conf) - self.set_port(port) - - def _launch_brpc_service(self, bserver): - bserver.run_server() - - def _port_is_available(self, port): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) - return result != 0 - - def run_server(self): - p_bserver = Process( - target=self._launch_brpc_service, args=(self.bserver_, )) - p_bserver.start() - options = [('grpc.max_send_message_length', self.body_size_), - ('grpc.max_receive_message_length', self.body_size_)] - server = grpc.server( - futures.ThreadPoolExecutor(max_workers=self.worker_num_), - options=options, - maximum_concurrent_rpcs=self.concurrency_) - multi_lang_general_model_service_pb2_grpc.add_MultiLangGeneralModelServiceServicer_to_server( - MultiLangServerServiceServicer( - self.bclient_config_path_list, self.is_multi_model_, - ["0.0.0.0:{}".format(self.port_list_[0])]), server) - server.add_insecure_port('[::]:{}'.format(self.gport_)) - server.start() - p_bserver.join() - server.wait_for_termination() diff --git a/python/paddle_serving_server/web_service.py b/python/paddle_serving_server/web_service.py index 3db3b7bc47f497e53993b51631910828a0af8ed3..52e394b2c07fd85b0da7be591c0b9a77669e19b6 100755 --- a/python/paddle_serving_server/web_service.py +++ b/python/paddle_serving_server/web_service.py @@ -14,6 +14,7 @@ #!flask/bin/python # pylint: disable=doc-string-missing +# Now, this is only for Pipeline. 
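+# The legacy WebService helpers below (set_gpus, default_rpc_service,
+# prepare_server) are kept only for backward compatibility and print a
+# deprecation notice; new services are expected to be built on the
+# paddle_serving_server.pipeline APIs.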
from flask import Flask, request, abort from contextlib import closing from multiprocessing import Pool, Process, Queue @@ -26,12 +27,13 @@ import numpy as np import os from paddle_serving_server import pipeline from paddle_serving_server.pipeline import Op +from paddle_serving_server.serve import format_gpu_to_strlist def port_is_available(port): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) + result = sock.connect_ex(('127.0.0.1', port)) if result != 0: return True else: @@ -44,7 +46,7 @@ class WebService(object): # pipeline self._server = pipeline.PipelineServer(self.name) - self.gpus = [] # deprecated + self.gpus = ["-1"] # deprecated self.rpc_service_list = [] # deprecated def get_pipeline_response(self, read_op): @@ -91,7 +93,7 @@ class WebService(object): f = open(file_path_list[0], 'r') model_conf = google.protobuf.text_format.Merge( str(f.read()), model_conf) - self.feed_vars = {var.name: var for var in model_conf.feed_var} + self.feed_vars = {var.alias_name: var for var in model_conf.feed_var} if len(file_path_list) > 1: model_conf = m_config.GeneralModelConfig() @@ -99,31 +101,58 @@ class WebService(object): model_conf = google.protobuf.text_format.Merge( str(f.read()), model_conf) - self.fetch_vars = {var.name: var for var in model_conf.fetch_var} + self.fetch_vars = {var.alias_name: var for var in model_conf.fetch_var} if client_config_path == None: self.client_config_path = file_path_list + # After this call, self.gpus is a list of gpu-id strings (or an empty list). def set_gpus(self, gpus): print("This API will be deprecated later. Please do not use it") - self.gpus = [int(x) for x in gpus.split(",")] + self.gpus = format_gpu_to_strlist(gpus) + +# This function can be called by the user directly +# or by create_rpc_config. +# The user may either call set_gpus beforehand or pass `gpus` here. +# If `gpus` is None, it was not set at all, +# so we fall back to self.gpus. +# Otherwise `gpus` takes precedence: +# if both set_gpus() and `gpus` are given, +# `gpus` is the one that is used. def default_rpc_service(self, - workdir="conf", + workdir, port=9292, - gpuid=0, - thread_num=2, + gpus=None, + thread_num=4, mem_optim=True, use_lite=False, use_xpu=False, ir_optim=False, precision="fp32", - use_calib=False): - device = "gpu" - if gpuid == -1: + use_calib=False, + use_trt=False, + gpu_multi_stream=False, + op_num=None, + op_max_batch=None): + + device = "cpu" + server = Server() + # Only when `gpus` is None, i.e. it was not passed at all, + # do we fall back to self.gpus.
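+        # A rough sketch of the normalization (assumed from how the ids are
+        # consumed by set_gpuid and the engine config code; one comma-separated
+        # id string per model):
+        #   format_gpu_to_strlist("0,1")        -> ["0,1"]       one model on cards 0 and 1
+        #   format_gpu_to_strlist(["0,1", "2"]) -> ["0,1", "2"]  one entry per model
+        #   ["-1"] or []                        -> run on CPU (checked just below)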
+ if gpus == None: + gpus = self.gpus + + gpus = format_gpu_to_strlist(gpus) + server.set_gpuid(gpus) + + if len(gpus) == 0 or gpus == ["-1"]: if use_lite: device = "arm" else: device = "cpu" + else: + device = "gpu" + op_maker = OpMaker() op_seq_maker = OpSeqMaker() @@ -142,7 +171,6 @@ class WebService(object): general_response_op = op_maker.create('general_response') op_seq_maker.add_op(general_response_op) - server = Server() server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.set_memory_optimize(mem_optim) @@ -151,6 +179,19 @@ class WebService(object): server.set_precision(precision) server.set_use_calib(use_calib) + if use_trt and device == "gpu": + server.set_trt() + server.set_ir_optimize(True) + + if gpu_multi_stream and device == "gpu": + server.set_gpu_multi_stream() + + if op_num: + server.set_op_num(op_num) + + if op_max_batch: + server.set_op_max_batch(op_max_batch) + if use_lite: server.set_lite() if use_xpu: @@ -158,86 +199,87 @@ class WebService(object): server.load_model_config(self.server_config_dir_paths ) #brpc Server support server_config_dir_paths - if gpuid >= 0: - server.set_gpuid(gpuid) + server.prepare_server(workdir=workdir, port=port, device=device) return server def _launch_rpc_service(self, service_idx): self.rpc_service_list[service_idx].run_server() + # if use this function, self.gpus must be set before. + # if not, we will use the default value, self.gpus = ["-1"]. + # so we always pass the `gpus` = self.gpus. def create_rpc_config(self): - if len(self.gpus) == 0: - # init cpu service - self.rpc_service_list.append( - self.default_rpc_service( - self.workdir, - self.port_list[0], - -1, - thread_num=self.thread_num, - mem_optim=self.mem_optim, - use_lite=self.use_lite, - use_xpu=self.use_xpu, - ir_optim=self.ir_optim, - precision=self.precision, - use_calib=self.use_calib)) - else: - for i, gpuid in enumerate(self.gpus): - self.rpc_service_list.append( - self.default_rpc_service( - "{}_{}".format(self.workdir, i), - self.port_list[i], - gpuid, - thread_num=self.thread_num, - mem_optim=self.mem_optim, - use_lite=self.use_lite, - use_xpu=self.use_xpu, - ir_optim=self.ir_optim, - precision=self.precision, - use_calib=self.use_calib)) + self.rpc_service_list.append( + self.default_rpc_service( + self.workdir, + self.port_list[0], + self.gpus, + thread_num=self.thread_num, + mem_optim=self.mem_optim, + use_lite=self.use_lite, + use_xpu=self.use_xpu, + ir_optim=self.ir_optim, + precision=self.precision, + use_calib=self.use_calib, + use_trt=self.use_trt, + gpu_multi_stream=self.gpu_multi_stream, + op_num=self.op_num, + op_max_batch=self.op_max_batch)) def prepare_server(self, - workdir="", + workdir, port=9393, - device="gpu", + device="cpu", precision="fp32", use_calib=False, use_lite=False, use_xpu=False, ir_optim=False, - gpuid=0, - thread_num=2, - mem_optim=True): + thread_num=4, + mem_optim=True, + use_trt=False, + gpu_multi_stream=False, + op_num=None, + op_max_batch=None, + gpuid=None): print("This API will be deprecated later. Please do not use it") self.workdir = workdir self.port = port self.thread_num = thread_num - self.device = device + # self.device is not used at all. + # device is set by gpuid. 
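+        # For example, prepare_server(..., gpuid="0,1") later sets
+        # self.gpus = format_gpu_to_strlist("0,1") (assumed to be ["0,1"]),
+        # while gpuid=None keeps whatever set_gpus() configured
+        # (default ["-1"], i.e. CPU).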
self.precision = precision self.use_calib = use_calib self.use_lite = use_lite self.use_xpu = use_xpu self.ir_optim = ir_optim self.mem_optim = mem_optim - self.gpuid = gpuid self.port_list = [] + self.use_trt = use_trt + self.gpu_multi_stream = gpu_multi_stream + self.op_num = op_num + self.op_max_batch = op_max_batch + + # if gpuid != None, we will use gpuid first. + # otherwise, keep the self.gpus unchanged. + # maybe self.gpus is set by the Function set_gpus. + if gpuid != None: + self.gpus = format_gpu_to_strlist(gpuid) + else: + pass + default_port = 12000 for i in range(1000): if port_is_available(default_port + i): self.port_list.append(default_port + i) - if len(self.port_list) > len(self.gpus): break def _launch_web_service(self): - gpu_num = len(self.gpus) self.client = Client() self.client.load_client_config(self.client_config_path) endpoints = "" - if gpu_num > 0: - for i in range(gpu_num): - endpoints += "127.0.0.1:{},".format(self.port_list[i]) - else: - endpoints = "127.0.0.1:{}".format(self.port_list[0]) + endpoints = "127.0.0.1:{}".format(self.port_list[0]) self.client.connect([endpoints]) def get_prediction(self, request): @@ -322,12 +364,13 @@ class WebService(object): if gpu: # if user forget to call function `set_gpus` to set self.gpus. # default self.gpus = [0]. - if len(self.gpus) == 0: - self.gpus.append(0) + if len(self.gpus) == 0 or self.gpus == ["-1"]: + self.gpus = ["0"] + # right now, local Predictor only support 1 card. + # no matter how many gpu_id is in gpus, we only use the first one. + gpu_id = (self.gpus[0].split(","))[0] self.client.load_model_config( - self.server_config_dir_paths[0], - use_gpu=True, - gpu_id=self.gpus[0]) + self.server_config_dir_paths[0], use_gpu=True, gpu_id=gpu_id) else: self.client.load_model_config( self.server_config_dir_paths[0], use_gpu=False) diff --git a/python/pipeline/dag.py b/python/pipeline/dag.py index 1f8f9cefeb178f57bd613f6b4a7e7a4e4a9f90c4..69ed7124f51948e643e204001c699f820bf288f4 100644 --- a/python/pipeline/dag.py +++ b/python/pipeline/dag.py @@ -176,7 +176,7 @@ class DAGExecutor(object): "in_channel must be Channel type, but get {}". 
format(type(in_channel))) os._exit(-1) - in_channel.add_producer(self.name) + self._in_channel = in_channel _LOGGER.info("[DAG] set in channel succ, name [{}]".format(self.name)) @@ -669,14 +669,14 @@ class DAG(object): out_degree_ops) dag_views = list(reversed(dag_views)) if not self._build_dag_each_worker: - _LOGGER.debug("================== DAG ====================") + _LOGGER.info("================== DAG ====================") for idx, view in enumerate(dag_views): - _LOGGER.debug("(VIEW {})".format(idx)) + _LOGGER.info("(VIEW {})".format(idx)) for op in view: - _LOGGER.debug(" [{}]".format(op.name)) + _LOGGER.info(" [{}]".format(op.name)) for out_op in out_degree_ops[op.name]: - _LOGGER.debug(" - {}".format(out_op.name)) - _LOGGER.debug("-------------------------------------------") + _LOGGER.info(" - {}".format(out_op.name)) + _LOGGER.info("-------------------------------------------") # create channels and virtual ops virtual_op_name_gen = NameGenerator("vir") @@ -719,6 +719,7 @@ class DAG(object): channel = self._gen_channel(channel_name_gen) channels.append(channel) op.add_input_channel(channel) + _LOGGER.info("op:{} add input channel.".format(op.name)) pred_ops = pred_op_of_next_view_op[op.name] if v_idx == 0: input_channel = channel @@ -726,6 +727,8 @@ class DAG(object): # if pred_op is virtual op, it will use ancestors as producers to channel for pred_op in pred_ops: pred_op.add_output_channel(channel) + _LOGGER.info("pred_op:{} add output channel".format( + pred_op.name)) processed_op.add(op.name) # find same input op to combine channel for other_op in actual_next_view[o_idx + 1:]: @@ -745,6 +748,7 @@ class DAG(object): output_channel = self._gen_channel(channel_name_gen) channels.append(output_channel) last_op.add_output_channel(output_channel) + _LOGGER.info("last op:{} add output channel".format(last_op.name)) pack_func, unpack_func = None, None pack_func = response_op.pack_response_package @@ -752,7 +756,11 @@ class DAG(object): actual_ops = virtual_ops for op in used_ops: if len(op.get_input_ops()) == 0: + #set special features of the request op. + #1.set unpack function. + #2.set output channel. unpack_func = op.unpack_request_package + op.add_output_channel(input_channel) continue actual_ops.append(op) diff --git a/python/pipeline/operator.py b/python/pipeline/operator.py index eab2c3a593274064c35a7af28d81fc4e21d80746..a563bd964115289e739854a3a64f0a0595097952 100644 --- a/python/pipeline/operator.py +++ b/python/pipeline/operator.py @@ -16,7 +16,7 @@ from time import time as _time import time import threading import multiprocessing -from paddle_serving_client import MultiLangClient, Client +from paddle_serving_client import Client from concurrent import futures import logging import func_timeout @@ -40,6 +40,7 @@ from .channel import (ThreadChannel, ProcessChannel, ChannelDataErrcode, from .util import NameGenerator from .profiler import UnsafeTimeProfiler as TimeProfiler from . 
import local_service_handler +from .pipeline_client import PipelineClient as PPClient _LOGGER = logging.getLogger(__name__) _op_name_gen = NameGenerator("Op") @@ -58,13 +59,15 @@ class Op(object): retry=0, batch_size=None, auto_batching_timeout=None, - local_service_handler=None): + local_service_handler=None, + jump_to_ops=[]): # In __init__, all the parameters are just saved and Op is not initialized if name is None: name = _op_name_gen.next() self.name = name # to identify the type of OP, it must be globally unique self.concurrency = concurrency # amount of concurrency self.set_input_ops(input_ops) + self.set_jump_to_ops(jump_to_ops) self._local_service_handler = local_service_handler self._server_endpoints = server_endpoints @@ -99,9 +102,7 @@ class Op(object): conf: config.yaml Returns: - None """ - # init op if self.concurrency is None: self.concurrency = conf["concurrency"] if self._retry is None: @@ -330,8 +331,8 @@ class Op(object): if self.client_type == 'brpc': client = Client() client.load_client_config(client_config) - elif self.client_type == 'grpc': - client = MultiLangClient() + elif self.client_type == 'pipeline_grpc': + client = PPClient() elif self.client_type == 'local_predictor': if self.local_predictor is None: raise ValueError("local predictor not yet created") @@ -371,6 +372,79 @@ class Op(object): os._exit(-1) self._input_ops.append(op) + def get_jump_to_ops(self): + return self._jump_to_ops + + def set_jump_to_ops(self, ops): + """ + Set the ops to jump to. After this, the op can send its channeldata directly to the output channels of those ops. + + Args: + ops: list of ops to be jumped to + + Returns: + None. + """ + if not isinstance(ops, list): + ops = [] if ops is None else [ops] + + self._jump_to_ops = [] + for op in ops: + if not isinstance(op, Op): + _LOGGER.critical( + self._log("Failed to set jump_to_ops: the jump op " + "must be Op type, not {}".format(type(op)))) + os._exit(-1) + self._jump_to_ops.append(op) + + def is_jump_op(self): + """ + Whether the op has any _jump_to_ops members. + + Args: + None + + Returns: + True or False + """ + return len(self._jump_to_ops) > 0 + + def check_jumping(self, input_data): + """ + Check whether the data should be sent to the jump ops. A WhileOp needs + to override this interface; the default implementation returns False. + + Args: + input_data: input data to be preprocessed + + Returns: + True: send data to the output channels of the jump ops. + False: send data to this op's own output channels. + """ + return False + + def get_output_channels_of_jump_ops(self): + """ + Get the output channels of the jump ops. + + Args: + None + + Returns: + list of channels + """ + channels = [] + if self.is_jump_op() is False: + return channels + for op in self._jump_to_ops: + _LOGGER.info("op:{} extend op._get_output_channels:{}".format( + op.name, op._get_output_channels())) + channels.extend(op._get_output_channels()) + + _LOGGER.info("get_output_channels_of_jump_ops, channels:{}".format( + channels)) + return channels + def add_input_channel(self, channel): """ Adding one input channel to the Op. Each op have many front op,
Each op have many front op, @@ -409,6 +483,7 @@ class Op(object): os._exit(-1) channel.add_producer(self.name) self._outputs.append(channel) + _LOGGER.info("op:{} add output_channel {}".format(self.name, channel)) def clean_output_channels(self): self._outputs = [] @@ -423,7 +498,7 @@ class Op(object): Args: input_dicts: input data to be preprocessed - data_id: inner unique id, 0 default + data_id: inner unique id, increase auto log_id: global unique id for RTT, 0 default Return: @@ -456,36 +531,80 @@ class Op(object): Returns: call_result: predict result """ - err, err_info = ChannelData.check_batch_npdata(feed_batch) - if err != 0: - _LOGGER.critical( - self._log("Failed to run process: {}. Please override " - "preprocess func.".format(err_info))) - os._exit(-1) + + call_result = None + err_code = ChannelDataErrcode.OK.value + err_info = "" + if self.client_type == "local_predictor": + err, err_info = ChannelData.check_batch_npdata(feed_batch) + if err != 0: + _LOGGER.error( + self._log("Failed to run process: {}. feed_batch must be \ + npdata in process for local_predictor mode." + .format(err_info))) + return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be npdata" + call_result = self.client.predict( feed=feed_batch[0], fetch=self._fetch_names, batch=True, log_id=typical_logid) - else: + + elif self.client_type == "brpc": + err, err_info = ChannelData.check_batch_npdata(feed_batch) + if err != 0: + _LOGGER.error( + self._log("Failed to run process: {}. feed_batch must be \ + npdata in process for brpc mode.".format(err_info))) + return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be npdata" call_result = self.client.predict( - feed=feed_batch, + feed=feed_batch[0], fetch=self._fetch_names, batch=True, log_id=typical_logid) - if isinstance(self.client, MultiLangClient): - if call_result is None or call_result["serving_status_code"] != 0: - return None - call_result.pop("serving_status_code") - return call_result - def postprocess(self, input_data, fetch_data, log_id=0): + elif self.client_type == "pipeline_grpc": + err, err_info = ChannelData.check_dictdata(feed_batch) + if err != 0: + _LOGGER.error( + self._log("Failed to run process: {}. feed_batch must be \ + npdata in process for pipeline_grpc mode." + .format(err_info))) + return call_result, ChannelDataErrcode.TYPE_ERROR.value, "feed_batch must be dict" + + call_result = self.client.predict( + feed_dict=feed_batch[0], + fetch=self._fetch_names, + asyn=False, + profile=False) + if call_result is None: + _LOGGER.error( + self._log("Failed in pipeline_grpc. call_result is None.")) + return call_result, ChannelDataErrcode.UNKNOW.value, "pipeline_grpc error" + if call_result.err_no != 0: + _LOGGER.error( + self._log("Failed in pipeline_grpc. err_no:{}, err_info:{}". + format(call_result.err_no, call_result.err_msg))) + return call_result, ChannelDataErrcode( + call_result.err_no).value, call_result.err_msg + + new_dict = {} + err_code = ChannelDataErrcode(call_result.err_no).value + err_info = call_result.err_msg + for idx, key in enumerate(call_result.key): + new_dict[key] = [call_result.value[idx]] + call_result = new_dict + + return call_result, err_code, err_info + + def postprocess(self, input_data, fetch_data, data_id=0, log_id=0): """ In postprocess stage, assemble data for next op or output. 
Args: input_data: data returned in preprocess stage, dict(for single predict) or list(for batch predict) fetch_data: data returned in process stage, dict(for single predict) or list(for batch predict) + data_id: inner unique id, increase auto log_id: logid, 0 default Returns: @@ -589,7 +708,8 @@ class Op(object): self.device_type, self.devices, self.mem_optim, self.ir_optim, self.precision, self.use_mkldnn, self.mkldnn_cache_capacity, self.mkldnn_op_list, - self.mkldnn_bf16_op_list)) + self.mkldnn_bf16_op_list, self.is_jump_op(), + self.get_output_channels_of_jump_ops())) p.daemon = True p.start() process.append(p) @@ -625,7 +745,8 @@ class Op(object): self.device_type, self.devices, self.mem_optim, self.ir_optim, self.precision, self.use_mkldnn, self.mkldnn_cache_capacity, self.mkldnn_op_list, - self.mkldnn_bf16_op_list)) + self.mkldnn_bf16_op_list, self.is_jump_op(), + self.get_output_channels_of_jump_ops())) # When a process exits, it attempts to terminate # all of its daemonic child processes. t.daemon = True @@ -810,16 +931,20 @@ class Op(object): midped_batch = None error_code = ChannelDataErrcode.OK.value + error_info = "" if self._timeout <= 0: # No retry try: if batch_input is False: - midped_batch = self.process(feed_batch, typical_logid) + midped_batch, error_code, error_info = self.process( + feed_batch, typical_logid) else: midped_batch = [] for idx in range(len(feed_batch)): - predict_res = self.process([feed_batch[idx]], - typical_logid) + predict_res, error_code, error_info = self.process( + [feed_batch[idx]], typical_logid) + if error_code != ChannelDataErrcode.OK.value: + break midped_batch.append(predict_res) except Exception as e: error_code = ChannelDataErrcode.UNKNOW.value @@ -832,14 +957,14 @@ class Op(object): try: # time out for each process if batch_input is False: - midped_batch = func_timeout.func_timeout( + midped_batch, error_code, error_info = func_timeout.func_timeout( self._timeout, self.process, args=(feed_batch, typical_logid)) else: midped_batch = [] for idx in range(len(feed_batch)): - predict_res = func_timeout.func_timeout( + predict_res, error_code, error_info = func_timeout.func_timeout( self._timeout, self.process, args=([feed_batch[idx]], typical_logid)) @@ -950,7 +1075,7 @@ class Op(object): prod_errcode, prod_errinfo = None, None try: postped_data, prod_errcode, prod_errinfo = self.postprocess( - parsed_data_dict[data_id], midped_data, + parsed_data_dict[data_id], midped_data, data_id, logid_dict.get(data_id)) except Exception as e: error_info = "(data_id={} log_id={}) {} Failed to postprocess: {}".format( @@ -1096,7 +1221,8 @@ class Op(object): def _run(self, concurrency_idx, input_channel, output_channels, is_thread_op, trace_buffer, model_config, workdir, thread_num, device_type, devices, mem_optim, ir_optim, precision, use_mkldnn, - mkldnn_cache_capacity, mkldnn_op_list, mkldnn_bf16_op_list): + mkldnn_cache_capacity, mkldnn_op_list, mkldnn_bf16_op_list, + is_jump_op, output_channels_of_jump_ops): """ _run() is the entry function of OP process / thread model.When client type is local_predictor in process mode, the CUDA environment needs to @@ -1123,6 +1249,8 @@ class Op(object): mkldnn_cache_capacity: cache capacity of mkldnn, 0 means no limit. mkldnn_op_list: OP list optimized by mkldnn, None default. mkldnn_bf16_op_list: OP list optimized by mkldnn bf16, None default. + is_jump_op: OP has jump op list or not, False default. + output_channels_of_jump_ops: all output channels of jump ops. 
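+
+        When is_jump_op is True and check_jumping() accepts a batch, the
+        postprocessed data is pushed to output_channels_of_jump_ops instead
+        of this op's own output_channels (see the channel-push logic below).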
Returns: None @@ -1263,27 +1391,46 @@ class Op(object): break if len(postped_data_dict) == 0: continue + # push data to channel (if run succ) start = int(round(_time() * 1000000)) try: profile_str = profiler.gen_profile_str() - for data_id, postped_data in postped_data_dict.items(): - if self._server_use_profile: - sys.stderr.write(profile_str) - self._push_to_output_channels( - data=postped_data, - channels=output_channels, - profile_str=profile_str, - client_need_profile=need_profile_dict[data_id], - profile_set=profile_dict[data_id]) - after_outchannel_time = _time() - _LOGGER.debug( - "(data_id={}) PUSH OUTPUT CHANNEL! op:{} push cost:{} ms". - format(data_id, self.name, (after_outchannel_time - - after_postp_time) * 1000)) - _LOGGER.debug( - "(data_id={}) PUSH OUTPUT CHANNEL! op:{} push data:{}". - format(data_id, self.name, postped_data.get_all_data())) + if self.is_jump_op() is True and self.check_jumping( + postped_data_dict) is True: + # push data to output channel of ops to be jumped + for data_id, postped_data in postped_data_dict.items(): + if self._server_use_profile: + sys.stderr.write(profile_str) + self._push_to_output_channels( + data=postped_data, + channels=output_channels_of_jump_ops, + profile_str=profile_str, + client_need_profile=need_profile_dict[data_id], + profile_set=profile_dict[data_id]) + after_outchannel_time = _time() + _LOGGER.debug( + "(data_id={}) PUSH OUTPUT CHANNEL OF JUMP OPs! op:{} push cost:{} ms". + format(data_id, self.name, (after_outchannel_time - + after_postp_time) * + 1000)) + else: + # push data to output channel. + for data_id, postped_data in postped_data_dict.items(): + if self._server_use_profile: + sys.stderr.write(profile_str) + self._push_to_output_channels( + data=postped_data, + channels=output_channels, + profile_str=profile_str, + client_need_profile=need_profile_dict[data_id], + profile_set=profile_dict[data_id]) + after_outchannel_time = _time() + _LOGGER.debug( + "(data_id={}) PUSH OUTPUT CHANNEL! op:{} push cost:{} ms". + format(data_id, self.name, (after_outchannel_time - + after_postp_time) * + 1000)) except ChannelStopError: _LOGGER.debug("{} Stop.".format(op_info_prefix)) self._finalize(is_thread_op) @@ -1406,7 +1553,7 @@ class RequestOp(Op): for idx, key in enumerate(request.key): dict_data[key] = request.value[idx] log_id = request.logid - _LOGGER.info("RequestOp unpack one request. log_id:{}, clientip:{} \ + _LOGGER.debug("RequestOp unpack one request. 
log_id:{}, clientip:{} \ name:{}, method:{}".format(log_id, request.clientip, request.name, request.method)) diff --git a/python/pipeline/pipeline_client.py b/python/pipeline/pipeline_client.py index 132cf043cd49f097c4ee47e36ce67f53f022b82a..af123adfd4c43b3bcb7bb715797f6092df7ebd7f 100644 --- a/python/pipeline/pipeline_client.py +++ b/python/pipeline/pipeline_client.py @@ -93,13 +93,19 @@ class PipelineClient(object): def _unpack_response_package(self, resp, fetch): return resp - def predict(self, feed_dict, fetch=None, asyn=False, profile=False): + def predict(self, + feed_dict, + fetch=None, + asyn=False, + profile=False, + log_id=0): if not isinstance(feed_dict, dict): raise TypeError( "feed must be dict type with format: {name: value}.") if fetch is not None and not isinstance(fetch, list): raise TypeError("fetch must be list type with format: [name].") req = self._pack_request_package(feed_dict, profile) + req.logid = log_id if not asyn: resp = self._stub.inference(req) return self._unpack_response_package(resp, fetch) diff --git a/python/pipeline/util.py b/python/pipeline/util.py old mode 100644 new mode 100755 index d7847f179de7557b5446958536008adc3c981f95..8bc15446b81c24162bbe2e236f204ffd1d0c23d1 --- a/python/pipeline/util.py +++ b/python/pipeline/util.py @@ -39,7 +39,7 @@ class AvailablePortGenerator(object): def port_is_available(port): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) + result = sock.connect_ex(('127.0.0.1', port)) if result != 0: return True else: diff --git a/tools/codestyle/clang_format.hook b/tools/codestyle/clang_format.hook old mode 100755 new mode 100644 diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook old mode 100755 new mode 100644 diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook old mode 100755 new mode 100644 diff --git a/tools/cpp_examples/demo-serving/proto/general_model_service.proto b/tools/cpp_examples/demo-serving/proto/general_model_service.proto old mode 100644 new mode 100755 index 803f7aa09435ac7a5715d62d78a6e12a671c7cb5..8fedb60e97ec5b81263687b47ff0794880da8671 --- a/tools/cpp_examples/demo-serving/proto/general_model_service.proto +++ b/tools/cpp_examples/demo-serving/proto/general_model_service.proto @@ -20,18 +20,34 @@ package baidu.paddle_serving.predictor.general_model; option cc_generic_services = true; message Tensor { - repeated bytes data = 1; - optional int32 elem_type = 2; - repeated int32 shape = 3; + repeated string data = 1; + repeated int32 int_data = 2; + repeated int64 int64_data = 3; + repeated float float_data = 4; + optional int32 elem_type = + 5; // 0 means int64, 1 means float32, 2 means int32, 3 means string + repeated int32 shape = 6; // shape should include batch + repeated int32 lod = 7; // only for fetch tensor currently + optional string name = 8; // get from the Model prototxt + optional string alias_name = 9; // get from the Model prototxt }; -message FeedInst { repeated Tensor tensor_array = 1; }; - -message FetchInst { repeated Tensor tensor_array = 1; }; +message Request { + repeated Tensor tensor = 1; + repeated string fetch_var_names = 2; + optional bool profile_server = 3 [ default = false ]; + required uint64 log_id = 4 [ default = 0 ]; +}; -message Request { repeated FeedInst insts = 1; }; +message Response { + repeated ModelOutput outputs = 1; + repeated int64 profile_time = 2; +}; -message Response { repeated FetchInst insts = 
1; }; +message ModelOutput { + repeated Tensor tensor = 1; + optional string engine_name = 2; +} service GeneralModelService { rpc inference(Request) returns (Response); diff --git a/tools/dockerfiles/build_scripts/build_utils.sh b/tools/dockerfiles/build_scripts/build_utils.sh old mode 100755 new mode 100644 diff --git a/tools/dockerfiles/root/.bashrc b/tools/dockerfiles/root/.bashrc old mode 100755 new mode 100644 diff --git a/tools/dockerfiles/root/.gitconfig b/tools/dockerfiles/root/.gitconfig old mode 100755 new mode 100644 diff --git a/tools/dockerfiles/root/.scripts/git-completion.sh b/tools/dockerfiles/root/.scripts/git-completion.sh old mode 100755 new mode 100644 diff --git a/tools/dockerfiles/root/.scripts/git-prompt.sh b/tools/dockerfiles/root/.scripts/git-prompt.sh old mode 100755 new mode 100644 diff --git a/tools/scripts/ipipe_py3.sh b/tools/scripts/ipipe_py3.sh index da2b7be10635632340d38302aa2c2ba2888c4f7d..47eb600a66a510de8f2906ad161579d5e8c58186 100644 --- a/tools/scripts/ipipe_py3.sh +++ b/tools/scripts/ipipe_py3.sh @@ -36,13 +36,14 @@ go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2 go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2 go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3 go get -u google.golang.org/grpc@v1.33.0 +go env -w GO111MODULE=auto build_whl_list=(build_cpu_server build_gpu_server build_client build_app) rpc_model_list=(grpc_fit_a_line grpc_yolov4 pipeline_imagenet bert_rpc_gpu bert_rpc_cpu ResNet50_rpc \ lac_rpc cnn_rpc bow_rpc lstm_rpc fit_a_line_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \ criteo_ctr_rpc_cpu criteo_ctr_rpc_gpu ocr_rpc yolov4_rpc_gpu faster_rcnn_hrnetv2p_w18_1x_encrypt \ -low_precision_resnet50_int8 ocr_c++_service) -http_model_list=(fit_a_line_http lac_http cnn_http bow_http lstm_http ResNet50_http bert_http \ +faster_rcnn_model_rpc low_precision_resnet50_int8 ocr_c++_service) +http_model_list=(fit_a_line_http lac_http imdb_http_proto imdb_http_json imdb_grpc ResNet50_http bert_http \ pipeline_ocr_cpu_http) function setproxy() { @@ -120,31 +121,66 @@ function check() { fi } +function check_gpu_memory() { + gpu_memory=`nvidia-smi --id=$1 --format=csv,noheader --query-gpu=memory.used | awk '{print $1}'` + echo -e "${GREEN_COLOR}-------id-$1 gpu_memory_used: ${gpu_memory}${RES}" + if [ ${gpu_memory} -le 100 ]; then + echo "-------GPU-$1 is not used" + status="GPU-$1 is not used" + else + echo "-------GPU_memory used is expected" + fi +} + function check_result() { if [ $? == 0 ]; then echo -e "${GREEN_COLOR}$1 execute normally${RES}" if [ $1 == "server" ]; then sleep $2 - tail ${dir}server_log.txt | tee -a ${log_dir}server_total.txt + cat ${dir}server_log.txt | tee -a ${log_dir}server_total.txt fi if [ $1 == "client" ]; then - tail ${dir}client_log.txt | tee -a ${log_dir}client_total.txt + cat ${dir}client_log.txt | tee -a ${log_dir}client_total.txt grep -E "${error_words}" ${dir}client_log.txt > /dev/null if [ $? 
== 0 ]; then + if [ "${status}" != "" ]; then + status="${status}|Failed" + else + status="Failed" + fi echo -e "${RED_COLOR}$1 error command${RES}\n" | tee -a ${log_dir}server_total.txt ${log_dir}client_total.txt - echo -e "--------------pipeline.log:----------------\n" + echo "--------------server log:--------------" + cat ${dir}server_log.txt + echo "--------------client log:--------------" + cat ${dir}client_log.txt + echo "--------------pipeline.log:----------------" cat PipelineServingLogs/pipeline.log - echo -e "-------------------------------------------\n" + echo "-------------------------------------------\n" error_log $2 else + if [ "${status}" != "" ]; then + error_log $2 + fi echo -e "${GREEN_COLOR}$2${RES}\n" | tee -a ${log_dir}server_total.txt ${log_dir}client_total.txt fi fi else echo -e "${RED_COLOR}$1 error command${RES}\n" | tee -a ${log_dir}server_total.txt ${log_dir}client_total.txt - tail ${dir}client_log.txt | tee -a ${log_dir}client_total.txt + echo "--------------server log:--------------" + cat ${dir}server_log.txt + echo "--------------client log:--------------" + cat ${dir}client_log.txt + echo "--------------pipeline.log:----------------" + cat PipelineServingLogs/pipeline.log + echo "-------------------------------------------\n" + if [ "${status}" != "" ]; then + status="${status}|Failed" + else + status="Failed" + fi error_log $2 fi + status="" } function error_log() { @@ -163,7 +199,7 @@ function error_log() { echo "deployment: ${deployment// /_}" | tee -a ${log_dir}error_models.txt echo "py_version: ${py_version}" | tee -a ${log_dir}error_models.txt echo "cuda_version: ${cuda_version}" | tee -a ${log_dir}error_models.txt - echo "status: Failed" | tee -a ${log_dir}error_models.txt + echo "status: ${status}" | tee -a ${log_dir}error_models.txt echo -e "-----------------------------\n\n" | tee -a ${log_dir}error_models.txt prefix=${arg//\//_} for file in ${dir}* @@ -192,7 +228,7 @@ function link_data() { function before_hook() { setproxy cd ${build_path}/python - ${py_version} -m pip install --upgrade pip + ${py_version} -m pip install --upgrade pip==21.1.3 ${py_version} -m pip install requests ${py_version} -m pip install -r requirements.txt ${py_version} -m pip install numpy==1.16.4 @@ -325,7 +361,7 @@ function low_precision_resnet50_int8 () { ${py_version} -m paddle_serving_client.convert --dirname ResNet50_quant echo -e "${GREEN_COLOR}low_precision_resnet50_int8_GPU_RPC server started${RES}" | tee -a ${log_dir}server_total.txt ${py_version} -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0 --use_trt --precision int8 > ${dir}server_log.txt 2>&1 & - check_result server 10 + check_result server 15 echo -e "${GREEN_COLOR}low_precision_resnet50_int8_GPU_RPC client started${RES}" | tee -a ${log_dir}client_total.txt ${py_version} resnet50_client.py > ${dir}client_log.txt 2>&1 check_result client "low_precision_resnet50_int8_GPU_RPC server test completed" @@ -341,7 +377,7 @@ function faster_rcnn_hrnetv2p_w18_1x_encrypt() { ${py_version} encrypt.py unsetproxy echo -e "${GREEN_COLOR}faster_rcnn_hrnetv2p_w18_1x_ENCRYPTION_GPU_RPC server started${RES}" | tee -a ${log_dir}server_total.txt - ${py_version} -m paddle_serving_server.serve --model encrypt_server/ --port 9494 --use_trt --gpu_ids 0 --use_encryption_model > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model encrypt_server/ --port 9494 --gpu_ids 0 --use_encryption_model > ${dir}server_log.txt 2>&1 & check_result server 3 echo -e 
"${GREEN_COLOR}faster_rcnn_hrnetv2p_w18_1x_ENCRYPTION_GPU_RPC client started${RES}" | tee -a ${log_dir}client_total.txt ${py_version} test_encryption.py 000000570688.jpg > ${dir}client_log.txt 2>&1 @@ -379,6 +415,7 @@ function bert_rpc_gpu() { ls -hlst ${py_version} -m paddle_serving_server.serve --model bert_seq128_model/ --port 8860 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & check_result server 15 + check_gpu_memory 0 nvidia-smi head data-c.txt | ${py_version} bert_client.py --model bert_seq128_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1 check_result client "bert_GPU_RPC server test completed" @@ -429,6 +466,7 @@ function ResNet50_rpc() { sed -i 's/9696/8863/g' resnet50_rpc_client.py ${py_version} -m paddle_serving_server.serve --model ResNet50_vd_model --port 8863 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & check_result server 8 + check_gpu_memory 0 nvidia-smi ${py_version} resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1 check_result client "ResNet50_GPU_RPC server test completed" @@ -446,6 +484,7 @@ function ResNet101_rpc() { sed -i "22cclient.connect(['127.0.0.1:8864'])" image_rpc_client.py ${py_version} -m paddle_serving_server.serve --model ResNet101_vd_model --port 8864 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & check_result server 8 + check_gpu_memory 0 nvidia-smi ${py_version} image_rpc_client.py ResNet101_vd_client_config/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1 check_result client "ResNet101_GPU_RPC server test completed" @@ -536,10 +575,11 @@ function faster_rcnn_model_rpc() { data_dir=${data}detection/faster_rcnn_r50_fpn_1x_coco/ link_data ${data_dir} sed -i 's/9494/8870/g' test_client.py - ${py_version} -m paddle_serving_server.serve --model serving_server --port 8870 --gpu_ids 0 --thread 2 --use_trt > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model serving_server --port 8870 --gpu_ids 1 --thread 8 > ${dir}server_log.txt 2>&1 & echo "faster rcnn running ..." 
nvidia-smi check_result server 10 + check_gpu_memory 1 ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1 nvidia-smi check_result client "faster_rcnn_GPU_RPC server test completed" @@ -556,6 +596,7 @@ function cascade_rcnn_rpc() { sed -i "s/9292/8879/g" test_client.py ${py_version} -m paddle_serving_server.serve --model serving_server --port 8879 --gpu_ids 0 --thread 2 > ${dir}server_log.txt 2>&1 & check_result server 8 + check_gpu_memory 0 nvidia-smi ${py_version} test_client.py > ${dir}client_log.txt 2>&1 nvidia-smi @@ -571,8 +612,9 @@ function deeplabv3_rpc() { data_dir=${data}deeplabv3/ link_data ${data_dir} sed -i "s/9494/8880/g" deeplabv3_client.py - ${py_version} -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 8880 --thread 2 > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 8880 --thread 4 > ${dir}server_log.txt 2>&1 & check_result server 10 + check_gpu_memory 0 nvidia-smi ${py_version} deeplabv3_client.py > ${dir}client_log.txt 2>&1 nvidia-smi @@ -590,6 +632,7 @@ function mobilenet_rpc() { sed -i "s/9393/8881/g" mobilenet_tutorial.py ${py_version} -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 8881 > ${dir}server_log.txt 2>&1 & check_result server 8 + check_gpu_memory 0 nvidia-smi ${py_version} mobilenet_tutorial.py > ${dir}client_log.txt 2>&1 nvidia-smi @@ -605,8 +648,9 @@ function unet_rpc() { data_dir=${data}unet_for_image_seg/ link_data ${data_dir} sed -i "s/9494/8882/g" seg_client.py - ${py_version} -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 8882 > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model unet_model --gpu_ids 1 --port 8882 > ${dir}server_log.txt 2>&1 & check_result server 8 + check_gpu_memory 1 nvidia-smi ${py_version} seg_client.py > ${dir}client_log.txt 2>&1 nvidia-smi @@ -624,6 +668,7 @@ function resnetv2_rpc() { sed -i 's/9393/8883/g' resnet50_v2_tutorial.py ${py_version} -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 8883 > ${dir}server_log.txt 2>&1 & check_result server 10 + check_gpu_memory 0 nvidia-smi ${py_version} resnet50_v2_tutorial.py > ${dir}client_log.txt 2>&1 nvidia-smi @@ -671,8 +716,9 @@ function criteo_ctr_rpc_gpu() { data_dir=${data}criteo_ctr/ link_data ${data_dir} sed -i "s/8885/8886/g" test_client.py - ${py_version} -m paddle_serving_server.serve --model ctr_serving_model/ --port 8886 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model ctr_serving_model/ --port 8886 --gpu_ids 1 > ${dir}server_log.txt 2>&1 & check_result server 8 + check_gpu_memory 1 nvidia-smi ${py_version} test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 > ${dir}client_log.txt 2>&1 nvidia-smi @@ -691,6 +737,7 @@ function yolov4_rpc_gpu() { ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 8887 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & nvidia-smi check_result server 8 + check_gpu_memory 0 ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1 nvidia-smi check_result client "yolov4_GPU_RPC server test completed" @@ -708,6 +755,7 @@ function senta_rpc_cpu() { ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 8887 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & nvidia-smi check_result server 8 + check_gpu_memory 0 ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1 nvidia-smi 
check_result client "senta_GPU_RPC server test completed" @@ -720,10 +768,9 @@ function fit_a_line_http() { check_dir ${dir} unsetproxy cd ${build_path}/python/examples/fit_a_line - sed -i "s/9393/8871/g" test_server.py - ${py_version} test_server.py > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 > ${dir}server_log.txt 2>&1 & check_result server 10 - curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:8871/uci/prediction > ${dir}client_log.txt 2>&1 + ${py_version} test_httpclient.py uci_housing_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1 check_result client "fit_a_line_CPU_HTTP server test completed" kill_server_process } @@ -733,46 +780,50 @@ function lac_http() { check_dir ${dir} unsetproxy cd ${build_path}/python/examples/lac - ${py_version} lac_web_service.py lac_model/ lac_workdir 8872 > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model lac_model/ --port 9292 > ${dir}server_log.txt 2>&1 & check_result server 10 - curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://127.0.0.1:8872/lac/prediction > ${dir}client_log.txt 2>&1 + echo "我爱北京天安门" | ${py_version} lac_http_client.py lac_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1 check_result client "lac_CPU_HTTP server test completed" kill_server_process } -function cnn_http() { - dir=${log_dir}http_model/cnn_http/ +function imdb_http_proto() { + dir=${log_dir}http_model/imdb_http_proto/ check_dir ${dir} unsetproxy cd ${build_path}/python/examples/imdb - ${py_version} text_classify_service.py imdb_cnn_model/ workdir/ 8873 imdb.vocab > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 > ${dir}server_log.txt 2>&1 & check_result server 10 - curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8873/imdb/prediction > ${dir}client_log.txt 2>&1 - check_result client "cnn_CPU_HTTP server test completed" + head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1 + check_result client "imdb_CPU_HTTP-proto server test completed" kill_server_process } -function bow_http() { - dir=${log_dir}http_model/bow_http/ +function imdb_http_json() { + dir=${log_dir}http_model/imdb_http_json/ check_dir ${dir} unsetproxy cd ${build_path}/python/examples/imdb - ${py_version} text_classify_service.py imdb_bow_model/ workdir/ 8874 imdb.vocab > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 > ${dir}server_log.txt 2>&1 & check_result server 10 - curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8874/imdb/prediction > ${dir}client_log.txt 2>&1 - check_result client "bow_CPU_HTTP server test completed" + sed -i "s/#client.set_http_proto(True)/client.set_http_proto(False)/g" test_http_client.py + head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1 + check_result client "imdb_CPU_HTTP-json server test completed" kill_server_process } 
-function lstm_http() { - dir=${log_dir}http_model/lstm_http/ +function imdb_grpc() { + dir=${log_dir}http_model/imdb_grpc/ check_dir ${dir} unsetproxy cd ${build_path}/python/examples/imdb - ${py_version} text_classify_service.py imdb_bow_model/ workdir/ 8875 imdb.vocab > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model imdb_cnn_model/ --port 9292 --gpu_ids 1 > ${dir}server_log.txt 2>&1 & check_result server 10 - curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://127.0.0.1:8875/imdb/prediction > ${dir}client_log.txt 2>&1 - check_result client "lstm_CPU_HTTP server test completed" + check_gpu_memory 1 + sed -i "s/client.set_http_proto(False)/#client.set_http_proto(False)/g" test_http_client.py + sed -i "s/#client.set_use_grpc_client(True)/client.set_use_grpc_client(True)/g" test_http_client.py + head test_data/part-0 | ${py_version} test_http_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab > ${dir}client_log.txt 2>&1 + check_result client "imdb_GPU_GRPC server test completed" kill_server_process } @@ -781,65 +832,70 @@ function ResNet50_http() { check_dir ${dir} unsetproxy cd ${build_path}/python/examples/imagenet - ${py_version} resnet50_web_service.py ResNet50_vd_model gpu 8876 > ${dir}server_log.txt 2>&1 & + ${py_version} -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & check_result server 10 - curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://127.0.0.1:8876/image/prediction > ${dir}client_log.txt 2>&1 + check_gpu_memory 0 + ${py_version} resnet50_http_client.py ResNet50_vd_client_config/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1 check_result client "ResNet50_GPU_HTTP server test completed" kill_server_process } function bert_http() { - dir=${log_dir}http_model/ResNet50_http/ + dir=${log_dir}http_model/bert_http/ check_dir ${dir} unsetproxy cd ${build_path}/python/examples/bert cp data-c.txt.1 data-c.txt cp vocab.txt.1 vocab.txt - export CUDA_VISIBLE_DEVICES=0 - ${py_version} bert_web_service.py bert_seq128_model/ 8878 > ${dir}server_log.txt 2>&1 & - check_result server 8 - curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:8878/bert/prediction > ${dir}client_log.txt 2>&1 + export CUDA_VISIBLE_DEVICES=0,1 + ${py_version} -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 > ${dir}server_log.txt 2>&1 & + check_result server 10 + check_gpu_memory 0 + head data-c.txt | ${py_version} bert_httpclient.py --model bert_seq128_client/serving_client_conf.prototxt > ${dir}client_log.txt 2>&1 check_result client "bert_GPU_HTTP server test completed" kill_server_process } function grpc_fit_a_line() { - dir=${log_dir}rpc_model/grpc_fit_a_line/ - check_dir ${dir} - unsetproxy - cd ${build_path}/python/examples/grpc_impl_example/fit_a_line - data_dir=${data}fit_a_line/ - link_data ${data_dir} - ${py_version} test_server.py uci_housing_model/ > ${dir}server_log.txt 2>&1 & - check_result server 5 - echo "sync predict" > ${dir}client_log.txt 2>&1 - ${py_version} test_sync_client.py >> ${dir}client_log.txt 2>&1 - check_result client "grpc_impl_example_fit_a_line_sync_CPU_gRPC server sync test completed" - echo "async predict" >> ${dir}client_log.txt 2>&1 - ${py_version} 
test_asyn_client.py >> ${dir}client_log.txt 2>&1 - check_result client "grpc_impl_example_fit_a_line_asyn_CPU_gRPC server asyn test completed" - echo "batch predict" >> ${dir}client_log.txt 2>&1 - ${py_version} test_batch_client.py >> ${dir}client_log.txt 2>&1 - check_result client "grpc_impl_example_fit_a_line_batch_CPU_gRPC server batch test completed" - echo "timeout predict" >> ${dir}client_log.txt 2>&1 - ${py_version} test_timeout_client.py >> ${dir}client_log.txt 2>&1 - check_result client "grpc_impl_example_fit_a_line_timeout_CPU_gRPC server timeout test completed" - kill_server_process + echo "pass" +# dir=${log_dir}rpc_model/grpc_fit_a_line/ +# check_dir ${dir} +# unsetproxy +# cd ${build_path}/python/examples/grpc_impl_example/fit_a_line +# data_dir=${data}fit_a_line/ +# link_data ${data_dir} +# ${py_version} test_server.py uci_housing_model/ > ${dir}server_log.txt 2>&1 & +# check_result server 5 +# echo "sync predict" > ${dir}client_log.txt 2>&1 +# ${py_version} test_sync_client.py >> ${dir}client_log.txt 2>&1 +# check_result client "grpc_impl_example_fit_a_line_sync_CPU_gRPC server sync test completed" +# echo "async predict" >> ${dir}client_log.txt 2>&1 +# ${py_version} test_asyn_client.py >> ${dir}client_log.txt 2>&1 +# check_result client "grpc_impl_example_fit_a_line_asyn_CPU_gRPC server asyn test completed" +# echo "batch predict" >> ${dir}client_log.txt 2>&1 +# ${py_version} test_batch_client.py >> ${dir}client_log.txt 2>&1 +# check_result client "grpc_impl_example_fit_a_line_batch_CPU_gRPC server batch test completed" +# echo "timeout predict" >> ${dir}client_log.txt 2>&1 +# ${py_version} test_timeout_client.py >> ${dir}client_log.txt 2>&1 +# check_result client "grpc_impl_example_fit_a_line_timeout_CPU_gRPC server timeout test completed" +# kill_server_process } function grpc_yolov4() { - dir=${log_dir}rpc_model/grpc_yolov4/ - cd ${build_path}/python/examples/grpc_impl_example/yolov4 - check_dir ${dir} - data_dir=${data}yolov4/ - link_data ${data_dir} - echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC server started${RES}" - ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang > ${dir}server_log.txt 2>&1 & - check_result server 10 - echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC client started${RES}" - ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1 - check_result client "grpc_yolov4_GPU_GRPC server test completed" - kill_server_process + echo "pass" +# dir=${log_dir}rpc_model/grpc_yolov4/ +# cd ${build_path}/python/examples/grpc_impl_example/yolov4 +# check_dir ${dir} +# data_dir=${data}yolov4/ +# link_data ${data_dir} +# echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC server started${RES}" +# ${py_version} -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang > ${dir}server_log.txt 2>&1 & +# check_result server 15 +# check_gpu_memory 0 +# echo -e "${GREEN_COLOR}grpc_impl_example_yolov4_GPU_gRPC client started${RES}" +# ${py_version} test_client.py 000000570688.jpg > ${dir}client_log.txt 2>&1 +# check_result client "grpc_yolov4_GPU_GRPC server test completed" +# kill_server_process } function ocr_c++_service() { @@ -857,6 +913,7 @@ function ocr_c++_service() { echo -e "${GREEN_COLOR}OCR_C++_Service_GPU_RPC server started${RES}" $py_version -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0 > ${dir}server_log.txt 2>&1 & check_result server 8 + check_gpu_memory 0 echo -e 
"${GREEN_COLOR}OCR_C++_Service_GPU_RPC client started${RES}" echo "------------------first:" $py_version ocr_cpp_client.py ocr_det_client ocr_rec_client