Commit af65b28f authored by BohaoWu

Merge branch 'develop' of https://github.com/PaddlePaddle/Serving into develop

...@@ -54,6 +54,7 @@ option(SERVER "Compile Paddle Serving Server" OFF)
option(APP "Compile Paddle Serving App package" OFF)
option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution" OFF)
option(PACK "Compile for whl" OFF)
option(WITH_TRT "Compile Paddle Serving with TRT" OFF)
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
......
...@@ -45,32 +45,26 @@ nvidia-docker exec -it test bash
```
```shell
pip install paddle-serving-client==0.4.0
pip install paddle-serving-server==0.4.0 # CPU
pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
```
You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source by adding `-i https://pypi.tuna.tsinghua.edu.cn/simple` to the pip command) to speed up the download.
If you need to install packages compiled from the develop branch, please download them from the [latest packages list](./doc/LATEST_PACKAGES.md) and install them with the `pip install` command.
Packages of paddle-serving-server and paddle-serving-server-gpu support Centos 6/7, Ubuntu 16/18, and Windows 10.
Packages of paddle-serving-client and paddle-serving-app support Linux and Windows, but paddle-serving-client only supports python2.7/3.5/3.6/3.7.
It is recommended to install paddle >= 1.8.4.
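If you are unsure which paddle version your environment has, a quick check such as the following can be used (a minimal sketch; the printed version is whatever is actually installed):

```python
# minimal sanity check of the installed paddle version (sketch)
import paddle
print(paddle.__version__)  # the recommendation above is >= 1.8.4
```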
<h2 align="center"> Pre-built services with Paddle Serving</h2> For **Windows Users**, please read the document [Paddle Serving for Windows Users](./doc/WINDOWS_TUTORIAL.md)
<h3 align="center">Latest release</h4> <h2 align="center"> Pre-built services with Paddle Serving</h2>
<p align="center">
<a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/ocr">Optical Character Recognition</a>
<br>
<a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/faster_rcnn_model">Object Detection</a>
<br>
<a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/deeplabv3">Image Segmentation</a>
<p>
<h3 align="center">Chinese Word Segmentation</h4> <h3 align="center">Chinese Word Segmentation</h4>
...@@ -111,11 +105,11 @@ tar -xzf uci_housing.tar.gz
Paddle Serving provides HTTP and RPC based services for users to access
### RPC service
A user can also start an RPC service with `paddle_serving_server.serve`. An RPC service is usually faster than an HTTP service, although the user needs to do some coding based on Paddle Serving's python client API. Note that we do not specify `--name` here.
``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
```
<center>
...@@ -123,41 +117,63 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
|--------------|------|-----------|--------------------------------|
| `thread` | int | `4` | Concurrency of current service |
| `port` | int | `9292` | Exposed port of current service to users|
| `name` | str | `""` | Service name, can be used to generate HTTP request url |
| `model` | str | `""` | Path of paddle model directory to be served |
| `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
| `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |
</center>
```python
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
### RPC service
A user can also start a RPC service with `paddle_serving_server.serve`. RPC service is usually faster than HTTP service, although a user needs to do some coding based on Paddle Serving's python client API. Note that we do not specify `--name` here.
``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
```
``` python
# A user can visit rpc service through paddle_serving_client API
from paddle_serving_client import Client
import numpy as np
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
        -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": np.array(data).reshape(1,13,1)}, fetch=["price"])
print(fetch_map)
```
Here, the `client.predict` function has two arguments. `feed` is a `python dict` with the model input variables' alias names and values. `fetch` assigns the prediction variables to be returned from the server. In the example, the names `"x"` and `"price"` were assigned when the servable model was saved during training.
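The returned `fetch_map` is a Python dict keyed by the fetched variable names. A minimal sketch of reading the prediction out of it (assuming the call above succeeded and `"price"` was fetched):

```python
# a minimal sketch: fetch_map maps each fetched variable name to its value
price = fetch_map["price"]  # typically a numpy array, one row per input instance
print(price)
```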
### WEB service
Users can also put the data format processing logic on the server side, so that they can access the service directly with `curl`. Refer to the following example, located at `python/examples/fit_a_line`:
```python
from paddle_serving_server.web_service import WebService
import numpy as np
class UciService(WebService):
    def preprocess(self, feed=[], fetch=[]):
        # gather the 13 features of every input instance into one float32 batch
        is_batch = True
        new_data = np.zeros((len(feed), 1, 13)).astype("float32")
        for i, ins in enumerate(feed):
            nums = np.array(ins["x"]).reshape(1, 1, 13)
            new_data[i] = nums
        feed = {"x": new_data}
        return feed, fetch, is_batch
uci_service = UciService(name="uci")
uci_service.load_model_config("uci_housing_model")
uci_service.prepare_server(workdir="workdir", port=9292)
uci_service.run_rpc_service()
uci_service.run_web_service()
```
For the client side, we use `curl` to send an HTTP POST request to the service we just started (any HTTP client library, such as [requests](https://requests.readthedocs.io/en/master/), works as well):
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
The response is:
```
{"result":{"price":[[18.901151657104492]]}}
```
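For reference, the same request can also be sent from Python instead of `curl`; below is a minimal sketch using the [requests](https://requests.readthedocs.io/en/master/) library (assuming the web service above is running locally on port 9292):

```python
# a minimal sketch of calling the HTTP service with the requests library
import requests

payload = {
    "feed": [{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
                    -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}],
    "fetch": ["price"],
}
resp = requests.post("http://127.0.0.1:9292/uci/prediction", json=payload)
print(resp.json())  # e.g. {"result": {"price": [[18.901151657104492]]}}
```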
<h2 align="center">Some Key Features of Paddle Serving</h2> <h2 align="center">Some Key Features of Paddle Serving</h2>
- Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed **with one line command**. - Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed **with one line command**.
...@@ -212,6 +228,10 @@ To connect with other users and contributors, welcome to join our [Slack channel ...@@ -212,6 +228,10 @@ To connect with other users and contributors, welcome to join our [Slack channel
If you want to contribute code to Paddle Serving, please reference [Contribution Guidelines](doc/CONTRIBUTE.md) If you want to contribute code to Paddle Serving, please reference [Contribution Guidelines](doc/CONTRIBUTE.md)
- Special Thanks to [@BeyondYourself](https://github.com/BeyondYourself) in complementing the gRPC tutorial, updating the FAQ doc and modifying the mdkir command
- Special Thanks to [@mcl-stone](https://github.com/mcl-stone) in updating faster_rcnn benchmark
- Special Thanks to [@cg82616424](https://github.com/cg82616424) in updating the unet benchmark and modifying resize comment error
### Feedback ### Feedback
For any feedback or to report a bug, please propose a [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues). For any feedback or to report a bug, please propose a [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues).
......
...@@ -47,21 +47,24 @@ nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/se
nvidia-docker exec -it test bash
```
```shell
pip install paddle-serving-client==0.4.0
pip install paddle-serving-server==0.4.0 # CPU
pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
```
You may need to use a domestic mirror source (e.g. the Tsinghua mirror; add `-i https://pypi.tuna.tsinghua.edu.cn/simple` to the pip command) to speed up the download.
If you need packages compiled from the develop branch, please get the download links from the [latest packages list](./doc/LATEST_PACKAGES.md) and install them with `pip install`.
The paddle-serving-server and paddle-serving-server-gpu packages support Centos 6/7, Ubuntu 16/18 and Windows 10.
The paddle-serving-client and paddle-serving-app packages support Linux and Windows, and paddle-serving-client only supports python2.7/3.5/3.6.
Installing paddle 1.8.4 or later is recommended.
For **Windows 10 users**, please refer to the document [Paddle Serving for Windows Users](./doc/WINDOWS_TUTORIAL_CN.md).
<h2 align="center">Pre-built services with Paddle Serving</h2>
...@@ -105,13 +108,12 @@ tar -xzf uci_housing.tar.gz
Paddle Serving provides HTTP and RPC based services for users to access
<h3 align="center">RPC服务</h3>
<h3 align="center">HTTP服务</h3> 用户还可以使用`paddle_serving_server.serve`启动RPC服务。 尽管用户需要基于Paddle Serving的python客户端API进行一些开发,但是RPC服务通常比HTTP服务更快。需要指出的是这里我们没有指定`--name`
Paddle Serving提供了一个名为`paddle_serving_server.serve`的内置python模块,可以使用单行命令启动RPC服务或HTTP服务。如果我们指定参数`--name uci`,则意味着我们将拥有一个HTTP服务,其URL为$IP:$PORT/uci/prediction`。
``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
```
<center>
...@@ -124,22 +126,12 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
| `mem_optim_off` | - | - | Disable memory optimization |
| `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |
We use the `curl` command to send an HTTP POST request to the service we just started. Users can also use a python library to send HTTP POST requests; see [requests](https://requests.readthedocs.io/en/master/).
</center>
``` python
# A user can visit rpc service through paddle_serving_client API
from paddle_serving_client import Client
...@@ -149,12 +141,45 @@ client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
        -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": np.array(data).reshape(1,13,1)}, fetch=["price"])
print(fetch_map)
```
Here, the `client.predict` function has two arguments. `feed` is a `python dict` with the model input variables' alias names and values. `fetch` specifies the prediction variables to be returned from the server. In the example, the tensor names `"x"` and `"price"` were assigned when the servable model was saved during training.
<h3 align="center">HTTP服务</h3>
用户也可以将数据格式处理逻辑放在服务器端进行,这样就可以直接用curl去访问服务,参考如下案例,在目录`python/examples/fit_a_line`
```python
from paddle_serving_server.web_service import WebService
import numpy as np
class UciService(WebService):
    def preprocess(self, feed=[], fetch=[]):
        is_batch = True
        new_data = np.zeros((len(feed), 1, 13)).astype("float32")
        for i, ins in enumerate(feed):
            nums = np.array(ins["x"]).reshape(1, 1, 13)
            new_data[i] = nums
        feed = {"x": new_data}
        return feed, fetch, is_batch
uci_service = UciService(name="uci")
uci_service.load_model_config("uci_housing_model")
uci_service.prepare_server(workdir="workdir", port=9292)
uci_service.run_rpc_service()
uci_service.run_web_service()
```
On the client side, send:
```
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
The response is:
```
{"result":{"price":[[18.901151657104492]]}}
```
<h2 align="center">Paddle Serving的核心功能</h2> <h2 align="center">Paddle Serving的核心功能</h2>
- 与Paddle训练紧密连接,绝大部分Paddle模型可以 **一键部署**. - 与Paddle训练紧密连接,绝大部分Paddle模型可以 **一键部署**.
...@@ -207,6 +232,10 @@ print(fetch_map) ...@@ -207,6 +232,10 @@ print(fetch_map)
如果您想为Paddle Serving贡献代码,请参考 [Contribution Guidelines](doc/CONTRIBUTE.md) 如果您想为Paddle Serving贡献代码,请参考 [Contribution Guidelines](doc/CONTRIBUTE.md)
- Special thanks to [@BeyondYourself](https://github.com/BeyondYourself) for providing the gRPC tutorial, updating the FAQ doc and organizing the file directories.
- Special thanks to [@mcl-stone](https://github.com/mcl-stone) for providing the faster_rcnn benchmark script.
- Special thanks to [@cg82616424](https://github.com/cg82616424) for providing the unet benchmark script and fixing some comment errors.
### Feedback
For any feedback or bug reports, please file a [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues).
......
...@@ -34,7 +34,11 @@ message( "WITH_GPU = ${WITH_GPU}")
SET(PADDLE_VERSION "1.8.4")
if (WITH_GPU)
  if (WITH_TRT)
    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
  else()
    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
  endif()
else()
  if (WITH_AVX)
    if (WITH_MKLML)
...@@ -50,7 +54,23 @@ endif()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz")
MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}")
if (WITH_GPU OR WITH_MKLML)
  if (WITH_TRT)
ExternalProject_Add(
"extern_paddle"
${EXTERNAL_PROJECT_LOG_ARGS}
URL "${PADDLE_LIB_PATH}"
PREFIX "${PADDLE_SOURCES_DIR}"
DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party
)
else()
ExternalProject_Add(
"extern_paddle" "extern_paddle"
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
URL "${PADDLE_LIB_PATH}" URL "${PADDLE_LIB_PATH}"
...@@ -64,7 +84,8 @@ ExternalProject_Add( ...@@ -64,7 +84,8 @@ ExternalProject_Add(
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib && ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party && ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party &&
${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so ${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so
) )
endif()
else() else()
ExternalProject_Add( ExternalProject_Add(
"extern_paddle" "extern_paddle"
...@@ -92,8 +113,16 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib) ...@@ -92,8 +113,16 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL) ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a) SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL) ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a) SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so)
if (WITH_TRT)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
endif()
ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xxhash/lib/libxxhash.a)
...@@ -102,3 +131,8 @@ LIST(APPEND external_project_dependencies paddle)
LIST(APPEND paddle_depend_libs
  xxhash)
if(WITH_TRT)
LIST(APPEND paddle_depend_libs
nvinfer nvinfer_plugin)
endif()
...@@ -44,6 +44,7 @@ message EngineDesc {
  optional bool static_optimization = 14;
  optional bool force_update_static_cache = 15;
  optional bool enable_ir_optimization = 16;
  optional bool use_trt = 17;
};
// model_toolkit conf
......
...@@ -12,8 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License
#execute_process(COMMAND go env -w GO111MODULE=off)
add_subdirectory(cube-server)
add_subdirectory(cube-api)
add_subdirectory(cube-builder)
#add_subdirectory(cube-transfer)
#add_subdirectory(cube-agent)
...@@ -218,25 +218,15 @@ class PredictorClient {
  int destroy_predictor();
int batch_predict(
const std::vector<std::vector<std::vector<float>>>& float_feed_batch,
const std::vector<std::string>& float_feed_name,
const std::vector<std::vector<int>>& float_shape,
const std::vector<std::vector<std::vector<int64_t>>>& int_feed_batch,
const std::vector<std::string>& int_feed_name,
const std::vector<std::vector<int>>& int_shape,
const std::vector<std::string>& fetch_name,
PredictorRes& predict_res_batch, // NOLINT
const int& pid,
const uint64_t log_id);
  int numpy_predict(
      const std::vector<std::vector<py::array_t<float>>>& float_feed_batch,
      const std::vector<std::string>& float_feed_name,
      const std::vector<std::vector<int>>& float_shape,
      const std::vector<std::vector<int>>& float_lod_slot_batch,
      const std::vector<std::vector<py::array_t<int64_t>>>& int_feed_batch,
      const std::vector<std::string>& int_feed_name,
      const std::vector<std::vector<int>>& int_shape,
      const std::vector<std::vector<int>>& int_lod_slot_batch,
      const std::vector<std::string>& fetch_name,
      PredictorRes& predict_res_batch,  // NOLINT
      const int& pid,
......
...@@ -137,227 +137,15 @@ int PredictorClient::create_predictor() {
  return 0;
}
int PredictorClient::batch_predict(
const std::vector<std::vector<std::vector<float>>> &float_feed_batch,
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<std::vector<int64_t>>> &int_feed_batch,
const std::vector<std::string> &int_feed_name,
const std::vector<std::vector<int>> &int_shape,
const std::vector<std::string> &fetch_name,
PredictorRes &predict_res_batch,
const int &pid,
const uint64_t log_id) {
int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
predict_res_batch.clear();
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
int fetch_name_num = fetch_name.size();
_api.thrd_initialize();
std::string variant_tag;
_predictor = _api.fetch_predictor("general_model", &variant_tag);
predict_res_batch.set_variant_tag(variant_tag);
VLOG(2) << "fetch general model predictor done.";
VLOG(2) << "float feed name size: " << float_feed_name.size();
VLOG(2) << "int feed name size: " << int_feed_name.size();
VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size;
Request req;
req.set_log_id(log_id);
for (auto &name : fetch_name) {
req.add_fetch_var_names(name);
}
for (int bi = 0; bi < batch_size; bi++) {
VLOG(2) << "prepare batch " << bi;
std::vector<Tensor *> tensor_vec;
FeedInst *inst = req.add_insts();
std::vector<std::vector<float>> float_feed = float_feed_batch[bi];
std::vector<std::vector<int64_t>> int_feed = int_feed_batch[bi];
for (auto &name : float_feed_name) {
tensor_vec.push_back(inst->add_tensor_array());
}
for (auto &name : int_feed_name) {
tensor_vec.push_back(inst->add_tensor_array());
}
VLOG(2) << "batch [" << bi << "] int_feed_name and float_feed_name "
<< "prepared";
int vec_idx = 0;
VLOG(2) << "tensor_vec size " << tensor_vec.size() << " float shape "
<< float_shape.size();
for (auto &name : float_feed_name) {
int idx = _feed_name_to_idx[name];
Tensor *tensor = tensor_vec[idx];
VLOG(2) << "prepare float feed " << name << " shape size "
<< float_shape[vec_idx].size();
for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
tensor->add_shape(float_shape[vec_idx][j]);
}
tensor->set_elem_type(1);
for (uint32_t j = 0; j < float_feed[vec_idx].size(); ++j) {
tensor->add_float_data(float_feed[vec_idx][j]);
}
vec_idx++;
}
VLOG(2) << "batch [" << bi << "] "
<< "float feed value prepared";
vec_idx = 0;
for (auto &name : int_feed_name) {
int idx = _feed_name_to_idx[name];
Tensor *tensor = tensor_vec[idx];
if (_type[idx] == 0) {
VLOG(2) << "prepare int64 feed " << name << " shape size "
<< int_shape[vec_idx].size();
VLOG(3) << "feed var name " << name << " index " << vec_idx
<< "first data " << int_feed[vec_idx][0];
for (uint32_t j = 0; j < int_feed[vec_idx].size(); ++j) {
tensor->add_int64_data(int_feed[vec_idx][j]);
}
} else if (_type[idx] == 2) {
VLOG(2) << "prepare int32 feed " << name << " shape size "
<< int_shape[vec_idx].size();
VLOG(3) << "feed var name " << name << " index " << vec_idx
<< "first data " << int32_t(int_feed[vec_idx][0]);
for (uint32_t j = 0; j < int_feed[vec_idx].size(); ++j) {
tensor->add_int_data(int32_t(int_feed[vec_idx][j]));
}
}
for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
tensor->add_shape(int_shape[vec_idx][j]);
}
tensor->set_elem_type(_type[idx]);
vec_idx++;
}
VLOG(2) << "batch [" << bi << "] "
<< "int feed value prepared";
}
int64_t preprocess_end = timeline.TimeStampUS();
int64_t client_infer_start = timeline.TimeStampUS();
Response res;
int64_t client_infer_end = 0;
int64_t postprocess_start = 0;
int64_t postprocess_end = 0;
if (FLAGS_profile_client) {
if (FLAGS_profile_server) {
req.set_profile_server(true);
}
}
res.Clear();
if (_predictor->inference(&req, &res) != 0) {
LOG(ERROR) << "failed call predictor with req: " << req.ShortDebugString();
_api.thrd_clear();
return -1;
} else {
client_infer_end = timeline.TimeStampUS();
postprocess_start = client_infer_end;
VLOG(2) << "get model output num";
uint32_t model_num = res.outputs_size();
VLOG(2) << "model num: " << model_num;
for (uint32_t m_idx = 0; m_idx < model_num; ++m_idx) {
VLOG(2) << "process model output index: " << m_idx;
auto output = res.outputs(m_idx);
ModelRes model;
model.set_engine_name(output.engine_name());
int idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
int shape_size = output.insts(0).tensor_array(idx).shape_size();
VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
<< shape_size;
model._shape_map[name].resize(shape_size);
for (int i = 0; i < shape_size; ++i) {
model._shape_map[name][i] =
output.insts(0).tensor_array(idx).shape(i);
}
int lod_size = output.insts(0).tensor_array(idx).lod_size();
if (lod_size > 0) {
model._lod_map[name].resize(lod_size);
for (int i = 0; i < lod_size; ++i) {
model._lod_map[name][i] = output.insts(0).tensor_array(idx).lod(i);
}
}
idx += 1;
}
idx = 0;
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
if (_fetch_name_to_type[name] == 0) {
VLOG(2) << "ferch var " << name << "type int64";
int size = output.insts(0).tensor_array(idx).int64_data_size();
model._int64_value_map[name] = std::vector<int64_t>(
output.insts(0).tensor_array(idx).int64_data().begin(),
output.insts(0).tensor_array(idx).int64_data().begin() + size);
} else if (_fetch_name_to_type[name] == 1) {
VLOG(2) << "fetch var " << name << "type float";
int size = output.insts(0).tensor_array(idx).float_data_size();
model._float_value_map[name] = std::vector<float>(
output.insts(0).tensor_array(idx).float_data().begin(),
output.insts(0).tensor_array(idx).float_data().begin() + size);
} else if (_fetch_name_to_type[name] == 2) {
VLOG(2) << "fetch var " << name << "type int32";
int size = output.insts(0).tensor_array(idx).int_data_size();
model._int32_value_map[name] = std::vector<int32_t>(
output.insts(0).tensor_array(idx).int_data().begin(),
output.insts(0).tensor_array(idx).int_data().begin() + size);
}
idx += 1;
}
predict_res_batch.add_model_res(std::move(model));
}
postprocess_end = timeline.TimeStampUS();
}
if (FLAGS_profile_client) {
std::ostringstream oss;
oss << "PROFILE\t"
<< "pid:" << pid << "\t"
<< "prepro_0:" << preprocess_start << " "
<< "prepro_1:" << preprocess_end << " "
<< "client_infer_0:" << client_infer_start << " "
<< "client_infer_1:" << client_infer_end << " ";
if (FLAGS_profile_server) {
int op_num = res.profile_time_size() / 2;
for (int i = 0; i < op_num; ++i) {
oss << "op" << i << "_0:" << res.profile_time(i * 2) << " ";
oss << "op" << i << "_1:" << res.profile_time(i * 2 + 1) << " ";
}
}
oss << "postpro_0:" << postprocess_start << " ";
oss << "postpro_1:" << postprocess_end;
fprintf(stderr, "%s\n", oss.str().c_str());
}
_api.thrd_clear();
return 0;
}
int PredictorClient::numpy_predict(
    const std::vector<std::vector<py::array_t<float>>> &float_feed_batch,
    const std::vector<std::string> &float_feed_name,
    const std::vector<std::vector<int>> &float_shape,
    const std::vector<std::vector<int>> &float_lod_slot_batch,
    const std::vector<std::vector<py::array_t<int64_t>>> &int_feed_batch,
    const std::vector<std::string> &int_feed_name,
    const std::vector<std::vector<int>> &int_shape,
    const std::vector<std::vector<int>> &int_lod_slot_batch,
    const std::vector<std::string> &fetch_name,
    PredictorRes &predict_res_batch,
    const int &pid,
...@@ -412,6 +200,9 @@ int PredictorClient::numpy_predict(
      for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
        tensor->add_shape(float_shape[vec_idx][j]);
      }
      for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) {
        tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
      }
      tensor->set_elem_type(1);
      const int float_shape_size = float_shape[vec_idx].size();
      switch (float_shape_size) {
...@@ -470,6 +261,9 @@ int PredictorClient::numpy_predict(
      for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
        tensor->add_shape(int_shape[vec_idx][j]);
      }
      for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) {
        tensor->add_lod(int_lod_slot_batch[vec_idx][j]);
      }
      tensor->set_elem_type(_type[idx]);
      if (_type[idx] == 0) {
......
...@@ -95,42 +95,18 @@ PYBIND11_MODULE(serving_client, m) {
           [](PredictorClient &self) { self.create_predictor(); })
      .def("destroy_predictor",
           [](PredictorClient &self) { self.destroy_predictor(); })
.def("batch_predict",
[](PredictorClient &self,
const std::vector<std::vector<std::vector<float>>>
&float_feed_batch,
const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<std::vector<int64_t>>>
&int_feed_batch,
const std::vector<std::string> &int_feed_name,
const std::vector<std::vector<int>> &int_shape,
const std::vector<std::string> &fetch_name,
PredictorRes &predict_res_batch,
const int &pid,
const uint64_t log_id) {
return self.batch_predict(float_feed_batch,
float_feed_name,
float_shape,
int_feed_batch,
int_feed_name,
int_shape,
fetch_name,
predict_res_batch,
pid,
log_id);
},
py::call_guard<py::gil_scoped_release>())
.def("numpy_predict", .def("numpy_predict",
[](PredictorClient &self, [](PredictorClient &self,
const std::vector<std::vector<py::array_t<float>>> const std::vector<std::vector<py::array_t<float>>>
&float_feed_batch, &float_feed_batch,
const std::vector<std::string> &float_feed_name, const std::vector<std::string> &float_feed_name,
const std::vector<std::vector<int>> &float_shape, const std::vector<std::vector<int>> &float_shape,
const std::vector<std::vector<int>> &float_lod_slot_batch,
const std::vector<std::vector<py::array_t<int64_t>>> const std::vector<std::vector<py::array_t<int64_t>>>
&int_feed_batch, &int_feed_batch,
const std::vector<std::string> &int_feed_name, const std::vector<std::string> &int_feed_name,
const std::vector<std::vector<int>> &int_shape, const std::vector<std::vector<int>> &int_shape,
const std::vector<std::vector<int>> &int_lod_slot_batch,
const std::vector<std::string> &fetch_name, const std::vector<std::string> &fetch_name,
PredictorRes &predict_res_batch, PredictorRes &predict_res_batch,
const int &pid, const int &pid,
...@@ -138,9 +114,11 @@ PYBIND11_MODULE(serving_client, m) { ...@@ -138,9 +114,11 @@ PYBIND11_MODULE(serving_client, m) {
return self.numpy_predict(float_feed_batch, return self.numpy_predict(float_feed_batch,
float_feed_name, float_feed_name,
float_shape, float_shape,
float_lod_slot_batch,
int_feed_batch, int_feed_batch,
int_feed_name, int_feed_name,
int_shape, int_shape,
int_lod_slot_batch,
fetch_name, fetch_name,
predict_res_batch, predict_res_batch,
pid, pid,
......
...@@ -9,7 +9,7 @@ endif()
target_include_directories(serving PUBLIC
  ${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
)
include_directories(${CUDNN_ROOT}/include/)
if(WITH_GPU)
  target_link_libraries(serving -Wl,--whole-archive fluid_gpu_engine
    -Wl,--no-whole-archive)
...@@ -29,7 +29,11 @@ if(WITH_GPU)
endif()
if(WITH_MKL OR WITH_GPU)
  if (WITH_TRT)
    target_link_libraries(serving -liomp5 -lmklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
  else()
    target_link_libraries(serving -liomp5 -lmklml_intel -lmkldnn -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
  endif()
else()
  target_link_libraries(serving openblas -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
endif()
......
...@@ -73,8 +73,6 @@ int GeneralReaderOp::inference() {
  // read request from client
  const Request *req = dynamic_cast<const Request *>(get_request_message());
  uint64_t log_id = req->log_id();
  int batch_size = req->insts_size();
  int input_var_num = 0;
  std::vector<int64_t> elem_type;
  std::vector<int64_t> elem_size;
...@@ -83,7 +81,6 @@ int GeneralReaderOp::inference() {
  GeneralBlob *res = mutable_data<GeneralBlob>();
  TensorVector *out = &res->tensor_vector;
res->SetBatchSize(batch_size);
  res->SetLogId(log_id);
  if (!res) {
...@@ -98,11 +95,11 @@ int GeneralReaderOp::inference() {
  VLOG(2) << "(logid=" << log_id
          << ") start to call load general model_conf op";
  baidu::paddle_serving::predictor::Resource &resource =
      baidu::paddle_serving::predictor::Resource::instance();
  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
      resource.get_general_model_config();
...@@ -122,13 +119,11 @@ int GeneralReaderOp::inference() {
  elem_type.resize(var_num);
  elem_size.resize(var_num);
  capacity.resize(var_num);
  // prepare basic information for input
  for (int i = 0; i < var_num; ++i) {
    paddle::PaddleTensor lod_tensor;
    elem_type[i] = req->insts(0).tensor_array(i).elem_type();
    VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i];
<< "] has elem type: " << elem_type[i];
    if (elem_type[i] == 0) {  // int64
      elem_size[i] = sizeof(int64_t);
      lod_tensor.dtype = paddle::PaddleDType::INT64;
...@@ -139,13 +134,24 @@ int GeneralReaderOp::inference() {
      elem_size[i] = sizeof(int32_t);
      lod_tensor.dtype = paddle::PaddleDType::INT32;
    }
    // implement lod tensor here
    if (req->insts(0).tensor_array(i).lod_size() > 0) {
lod_tensor.lod.resize(1);
lod_tensor.lod[0].push_back(0);
VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor"; VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
lod_tensor.lod.resize(1);
for (int k = 0; k < req->insts(0).tensor_array(i).lod_size(); ++k) {
lod_tensor.lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
}
capacity[i] = 1;
for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
int dim = req->insts(0).tensor_array(i).shape(k);
VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
<< "]: " << dim;
capacity[i] *= dim;
lod_tensor.shape.push_back(dim);
}
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor, capacity: " << capacity[i];
    } else {
lod_tensor.shape.push_back(batch_size);
      capacity[i] = 1;
      for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
        int dim = req->insts(0).tensor_array(i).shape(k);
...@@ -160,13 +166,11 @@ int GeneralReaderOp::inference() {
    lod_tensor.name = model_config->_feed_name[i];
    out->push_back(lod_tensor);
  }
  // specify the memory needed for output tensor_vector
  for (int i = 0; i < var_num; ++i) {
    if (out->at(i).lod.size() == 1) {
      int tensor_size = 0;
      const Tensor &tensor = req->insts(0).tensor_array(i);
const Tensor &tensor = req->insts(j).tensor_array(i);
      int data_len = 0;
      if (tensor.int64_data_size() > 0) {
        data_len = tensor.int64_data_size();
...@@ -188,23 +192,14 @@ int GeneralReaderOp::inference() {
      } else {
        sample_len = tensor.shape(0);
      }
      out->at(i).lod[0].push_back(cur_len + sample_len);
      VLOG(2) << "(logid=" << log_id << ") new len: " << cur_len + sample_len;
}
      out->at(i).data.Resize(tensor_size * elem_size[i]);
out->at(i).shape = {out->at(i).lod[0].back()};
for (int j = 1; j < req->insts(0).tensor_array(i).shape_size(); ++j) {
out->at(i).shape.push_back(req->insts(0).tensor_array(i).shape(j));
}
if (out->at(i).shape.size() == 1) {
out->at(i).shape.push_back(1);
}
VLOG(2) << "(logid=" << log_id << ") var[" << i VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is lod_tensor and len=" << out->at(i).lod[0].back(); << "] is lod_tensor and len=" << out->at(i).lod[0].back();
} else { } else {
out->at(i).data.Resize(batch_size * capacity[i] * elem_size[i]); out->at(i).data.Resize(capacity[i] * elem_size[i]);
VLOG(2) << "(logid=" << log_id << ") var[" << i VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] is tensor and capacity=" << batch_size * capacity[i]; << "] is tensor and capacity=" << capacity[i];
} }
} }
...@@ -215,58 +210,36 @@ int GeneralReaderOp::inference() { ...@@ -215,58 +210,36 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
<< "] is " << req->insts(0).tensor_array(i).int64_data(0); << "] is " << req->insts(0).tensor_array(i).int64_data(0);
int offset = 0; int offset = 0;
for (int j = 0; j < batch_size; ++j) { int elem_num = req->insts(0).tensor_array(i).int64_data_size();
int elem_num = req->insts(j).tensor_array(i).int64_data_size();
for (int k = 0; k < elem_num; ++k) { for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(j).tensor_array(i).int64_data(k); dst_ptr[offset + k] = req->insts(0).tensor_array(i).int64_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
} else {
offset += capacity[i];
}
      }
    } else if (elem_type[i] == 1) {
      float *dst_ptr = static_cast<float *>(out->at(i).data.data());
      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
              << "] is " << req->insts(0).tensor_array(i).float_data(0);
      int offset = 0;
      int elem_num = req->insts(0).tensor_array(i).float_data_size();
      int elem_num = req->insts(j).tensor_array(i).float_data_size();
      for (int k = 0; k < elem_num; ++k) {
        dst_ptr[offset + k] = req->insts(0).tensor_array(i).float_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
} else {
offset += capacity[i];
}
} }
    } else if (elem_type[i] == 2) {
      int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
              << "] is " << req->insts(0).tensor_array(i).int_data(0);
      int offset = 0;
      int elem_num = req->insts(0).tensor_array(i).int_data_size();
      int elem_num = req->insts(j).tensor_array(i).int_data_size();
      for (int k = 0; k < elem_num; ++k) {
        dst_ptr[offset + k] = req->insts(0).tensor_array(i).int_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
} else {
offset += capacity[i];
}
      }
    }
  }
  VLOG(2) << "(logid=" << log_id << ") output size: " << out->size();
  timeline.Pause();
  int64_t end = timeline.TimeStampUS();
  res->p_size = 0;
  res->_batch_size = 1;
  AddBlobInfo(res, start);
  AddBlobInfo(res, end);
......
...@@ -155,11 +155,13 @@ int GeneralResponseOp::inference() {
    }
    if (model_config->_is_lod_fetch[idx]) {
      if (in->at(idx).lod.size() > 0) {
        for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
          fetch_p->mutable_tensor_array(var_idx)->add_lod(
              in->at(idx).lod[0][j]);
        }
      }
    }
    VLOG(2) << "(logid=" << log_id << ") fetch var ["
            << model_config->_fetch_name[idx] << "] ready";
......
...@@ -13,7 +13,9 @@ set_source_files_properties(
    PROPERTIES
    COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
if (WITH_TRT)
  add_definitions(-DWITH_TRT)
endif()
target_link_libraries(pdserving
  brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
......
...@@ -38,6 +38,7 @@ class InferEngineCreationParams {
    _enable_ir_optimization = false;
    _static_optimization = false;
    _force_update_static_cache = false;
    _use_trt = false;
  }
  void set_path(const std::string& path) { _path = path; }
...@@ -50,12 +51,16 @@ class InferEngineCreationParams {
    _enable_ir_optimization = enable_ir_optimization;
  }
void set_use_trt(bool use_trt) { _use_trt = use_trt; }
  bool enable_memory_optimization() const {
    return _enable_memory_optimization;
  }
  bool enable_ir_optimization() const { return _enable_ir_optimization; }
  bool use_trt() const { return _use_trt; }
  void set_static_optimization(bool static_optimization = false) {
    _static_optimization = static_optimization;
  }
...@@ -86,6 +91,7 @@ class InferEngineCreationParams {
  bool _enable_ir_optimization;
  bool _static_optimization;
  bool _force_update_static_cache;
bool _use_trt;
};
class InferEngine {
...@@ -172,6 +178,10 @@ class ReloadableInferEngine : public InferEngine {
                                      force_update_static_cache);
    }
    if (conf.has_use_trt()) {
      _infer_engine_params.set_use_trt(conf.use_trt());
    }
    if (!check_need_reload() || load(_infer_engine_params) != 0) {
      LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
      return -1;
...@@ -553,8 +563,12 @@ class CloneDBReloadableInferEngine
};
template <typename FluidFamilyCore>
#ifdef WITH_TRT
class FluidInferEngine : public DBReloadableInferEngine<FluidFamilyCore> {
#else
class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
#endif
 public:  // NOLINT
  FluidInferEngine() {}
  ~FluidInferEngine() {}
......
...@@ -51,8 +51,8 @@ class WeightedRandomRender : public EndpointRouterBase {
        new (std::nothrow) Factory<WeightedRandomRender, EndpointRouterBase>();
    if (factory == NULL) {
      RAW_LOG(ERROR,
              "Failed regist factory: WeightedRandomRender->EndpointRouterBase "
              "in macro!");
      return -1;
    }
...@@ -63,8 +63,8 @@ class WeightedRandomRender : public EndpointRouterBase {
    if (FactoryPool<EndpointRouterBase>::instance().register_factory(
            "WeightedRandomRender", factory) != 0) {
      RAW_LOG(INFO,
              "Factory has been registed: "
              "WeightedRandomRender->EndpointRouterBase.");
    }
    return 0;
......
...@@ -56,21 +56,25 @@ the script of client side bert_client.py is as follow:
[//file]:#bert_client.py
``` python
import sys
from paddle_serving_client import Client
from paddle_serving_client.utils import benchmark_args
from paddle_serving_app.reader import ChineseBertReader
import numpy as np
args = benchmark_args()
reader = ChineseBertReader({"max_seq_len": 128})
fetch = ["pooled_output"]
endpoint_list = ['127.0.0.1:9292']
client = Client()
client.load_client_config(args.model)
client.connect(endpoint_list)
for line in sys.stdin:
    feed_dict = reader.process(line)
    for key in feed_dict.keys():
        feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
    result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
```
run
......
...@@ -52,18 +52,23 @@ pip install paddle_serving_app
``` python
import sys
from paddle_serving_client import Client
from paddle_serving_client.utils import benchmark_args
from paddle_serving_app.reader import ChineseBertReader
import numpy as np
args = benchmark_args()
reader = ChineseBertReader({"max_seq_len": 128})
fetch = ["pooled_output"]
endpoint_list = ['127.0.0.1:9292']
client = Client()
client.load_client_config(args.model)
client.connect(endpoint_list)
for line in sys.stdin:
    feed_dict = reader.process(line)
    for key in feed_dict.keys():
        feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
    result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
```
Run
......
...@@ -75,10 +75,12 @@ export PATH=$PATH:$GOPATH/bin
## Get go packages
```shell
go env -w GO111MODULE=on
go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
go get -u google.golang.org/grpc@v1.33.0
```
...@@ -98,17 +100,47 @@ make -j10 ...@@ -98,17 +100,47 @@ make -j10
you can execute `make install` to put targets under directory `./output`, you need to add`-DCMAKE_INSTALL_PREFIX=./output`to specify output path to cmake command shown above. you can execute `make install` to put targets under directory `./output`, you need to add`-DCMAKE_INSTALL_PREFIX=./output`to specify output path to cmake command shown above.
### Integrated GPU version paddle inference library ### Integrated GPU version paddle inference library
### CUDA_PATH is the CUDA install path; use the command `whereis cuda` to check, it should be /usr/local/cuda.
### CUDNN_LIBRARY and CUDA_CUDART_LIBRARY are the CUDA library paths, usually /usr/local/cuda/lib64/
``` shell ``` shell
export CUDA_PATH='/usr/local'
export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
export CUDA_CUDART_LIBRARY="/usr/local/cuda/lib64/"
mkdir server-build-gpu && cd server-build-gpu mkdir server-build-gpu && cd server-build-gpu
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-DSERVER=ON \ -DSERVER=ON \
-DWITH_GPU=ON .. -DWITH_GPU=ON ..
make -j10 make -j10
``` ```
### Integrated TRT version paddle inference library
```
export CUDA_PATH='/usr/local'
export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
export CUDA_CUDART_LIBRARY="/usr/local/cuda/lib64/"
mkdir server-build-trt && cd server-build-trt
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-DSERVER=ON \
-DWITH_GPU=ON \
-DWITH_TRT=ON ..
make -j10
```
execute `make install` to put targets under directory `./output` execute `make install` to put targets under directory `./output`
**Attention:** After the compilation is successful, you need to set the path of `SERVING_BIN`. See [Note](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md#Note) for details. **Attention:** After the compilation is successful, you need to set the path of `SERVING_BIN`. See [Note](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md#Note) for details.
...@@ -134,7 +166,10 @@ execute `make install` to put targets under directory `./output` ...@@ -134,7 +166,10 @@ execute `make install` to put targets under directory `./output`
```bash ```bash
mkdir app-build && cd app-build mkdir app-build && cd app-build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DAPP=ON .. cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DAPP=ON ..
make make
``` ```
...@@ -143,12 +178,14 @@ make ...@@ -143,12 +178,14 @@ make
## Install wheel package ## Install wheel package
Regardless of the client, server or App part, after compiling, install the whl package in `python/dist/` in the temporary directory(`server-build-cpu`, `server-build-gpu`, `client-build`,`app-build`) of the compilation process. Regardless of the client, server or App part, after compiling, install the whl package in `python/dist/` in the temporary directory(`server-build-cpu`, `server-build-gpu`, `client-build`,`app-build`) of the compilation process.
for example: `cd server-build-cpu/python/dist && pip install -U xxxxx.whl`
## Note ## Note
When running the python server, it will check the `SERVING_BIN` environment variable. If you want to use your own compiled binary file, set the environment variable to the path of the corresponding binary file, usually`export SERVING_BIN=${BUILD_DIR}/core/general-server/serving`. When running the python server, it will check the `SERVING_BIN` environment variable. If you want to use your own compiled binary file, set the environment variable to the path of the corresponding binary file, usually`export SERVING_BIN=${BUILD_DIR}/core/general-server/serving`.
BUILD_DIR is the absolute path of server-build-cpu or server-build-gpu.
for example: cd server-build-cpu && export SERVING_BIN=${PWD}/core/general-server/serving
...@@ -165,7 +202,9 @@ Please use the example under `python/examples` to verify. ...@@ -165,7 +202,9 @@ Please use the example under `python/examples` to verify.
| WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF | | WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF |
| WITH_MKL | Compile Paddle Serving with MKL support | OFF | | WITH_MKL | Compile Paddle Serving with MKL support | OFF |
| WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF | | WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF |
| CUDNN_ROOT | Define CuDNN library and header path | | | CUDNN_LIBRARY | Define CuDNN library and header path | |
| CUDA_TOOLKIT_ROOT_DIR | Define CUDA PATH | |
| TENSORRT_ROOT | Define TensorRT PATH | |
| CLIENT | Compile Paddle Serving Client | OFF | | CLIENT | Compile Paddle Serving Client | OFF |
| SERVER | Compile Paddle Serving Server | OFF | | SERVER | Compile Paddle Serving Server | OFF |
| APP | Compile Paddle Serving App package | OFF | | APP | Compile Paddle Serving App package | OFF |
...@@ -180,7 +219,8 @@ To compile the Paddle Serving GPU version on bare metal, you need to install the ...@@ -180,7 +219,8 @@ To compile the Paddle Serving GPU version on bare metal, you need to install the
- CUDA - CUDA
- CuDNN - CuDNN
- NCCL2
To compile the TensorRT version, you need to install the TensorRT library.
Note here: Note here:
...@@ -190,21 +230,12 @@ Note here: ...@@ -190,21 +230,12 @@ Note here:
The following is the base library version matching relationship used by the PaddlePaddle release version for reference: The following is the base library version matching relationship used by the PaddlePaddle release version for reference:
| | CUDA | CuDNN | NCCL2 | | | CUDA | CuDNN | TensorRT |
| :----: | :-----: | :----------------------: | :----: | | :----: | :-----: | :----------------------: | :----: |
| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 | | post9 | 9.0 | CuDNN 7.3.1 for CUDA 9.0 | |
| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 | | post10 | 10.0 | CuDNN 7.5.1 for CUDA 10.0| |
| trt | 10.1 | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5 |
### How to make the compiler detect the CuDNN library ### How to make the compiler detect the CuDNN library
Download the corresponding CuDNN version from the NVIDIA developer official website and decompress it, then add `-DCUDNN_LIBRARY` to the cmake command to specify the CuDNN path.
### How to make the compiler detect the nccl library
After downloading the corresponding version of the nccl2 library from the NVIDIA developer official website and decompressing it, add the following environment variables (take nccl2.1.4 as an example):
```shell
export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
```
...@@ -72,10 +72,12 @@ export PATH=$PATH:$GOPATH/bin ...@@ -72,10 +72,12 @@ export PATH=$PATH:$GOPATH/bin
## 获取 Go packages ## 获取 Go packages
```shell ```shell
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway go env -w GO111MODULE=on
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/golang/protobuf/protoc-gen-go go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u google.golang.org/grpc go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
go get -u google.golang.org/grpc@v1.33.0
``` ```
...@@ -85,31 +87,68 @@ go get -u google.golang.org/grpc ...@@ -85,31 +87,68 @@ go get -u google.golang.org/grpc
``` shell ``` shell
mkdir server-build-cpu && cd server-build-cpu mkdir server-build-cpu && cd server-build-cpu
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON .. cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DSERVER=ON ..
make -j10 make -j10
``` ```
可以执行`make install`把目标产出放在`./output`目录下,cmake阶段需添加`-DCMAKE_INSTALL_PREFIX=./output`选项来指定存放路径。 可以执行`make install`把目标产出放在`./output`目录下,cmake阶段需添加`-DCMAKE_INSTALL_PREFIX=./output`选项来指定存放路径。
### 集成GPU版本Paddle Inference Library ### 集成GPU版本Paddle Inference Library
### CUDA_PATH是cuda的安装路径,可以使用命令行whereis cuda命令确认你的cuda安装路径,通常应该是/usr/local/cuda
### CUDNN_LIBRARY CUDA_CUDART_LIBRARY 是cuda库文件的路径,通常应该是/usr/local/cuda/lib64/
``` shell ``` shell
export CUDA_PATH='/usr/local'
export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
export CUDA_CUDART_LIBRARY="/usr/local/cuda/lib64/"
mkdir server-build-gpu && cd server-build-gpu mkdir server-build-gpu && cd server-build-gpu
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON -DWITH_GPU=ON .. cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-DSERVER=ON \
-DWITH_GPU=ON ..
make -j10 make -j10
``` ```
执行`make install`可以把目标产出放在`./output`目录下。 ### 集成TensorRT版本Paddle Inference Library
**注意:** 编译成功后,需要设置`SERVING_BIN`路径,详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项) ```
export CUDA_PATH='/usr/local'
export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
export CUDA_CUDART_LIBRARY="/usr/local/cuda/lib64/"
mkdir server-build-trt && cd server-build-trt
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-DSERVER=ON \
-DWITH_GPU=ON \
-DWITH_TRT=ON ..
make -j10
```
执行`make install`可以把目标产出放在`./output`目录下。
**注意:** 编译成功后,需要设置`SERVING_BIN`路径,详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)
## 编译Client部分 ## 编译Client部分
``` shell ``` shell
mkdir client-build && cd client-build mkdir client-build && cd client-build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT=ON .. cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCLIENT=ON ..
make -j10 make -j10
``` ```
...@@ -121,7 +160,11 @@ make -j10 ...@@ -121,7 +160,11 @@ make -j10
```bash ```bash
mkdir app-build && cd app-build mkdir app-build && cd app-build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCMAKE_INSTALL_PREFIX=./output -DAPP=ON .. cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCMAKE_INSTALL_PREFIX=./output \
-DAPP=ON ..
make make
``` ```
...@@ -130,12 +173,16 @@ make ...@@ -130,12 +173,16 @@ make
## 安装wheel包 ## 安装wheel包
无论是Client端,Server端还是App部分,编译完成后,安装编译过程临时目录(`server-build-cpu``server-build-gpu``client-build``app-build`)下的`python/dist/` 中的whl包即可。 无论是Client端,Server端还是App部分,编译完成后,安装编译过程临时目录(`server-build-cpu``server-build-gpu``client-build``app-build`)下的`python/dist/` 中的whl包即可。
例如:cd server-build-cpu/python/dist && pip install -U xxxxx.whl
## 注意事项 ## 注意事项
运行python端Server时,会检查`SERVING_BIN`环境变量,如果想使用自己编译的二进制文件,请将设置该环境变量为对应二进制文件的路径,通常是`export SERVING_BIN=${BUILD_DIR}/core/general-server/serving` 运行python端Server时,会检查`SERVING_BIN`环境变量,如果想使用自己编译的二进制文件,请将设置该环境变量为对应二进制文件的路径,通常是`export SERVING_BIN=${BUILD_DIR}/core/general-server/serving`
其中BUILD_DIR为server-build-cpu或server-build-gpu的绝对路径。
可以cd server-build-cpu路径下,执行export SERVING_BIN=${PWD}/core/general-server/serving
...@@ -152,7 +199,10 @@ make ...@@ -152,7 +199,10 @@ make
| WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF | | WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF |
| WITH_MKL | Compile Paddle Serving with MKL support | OFF | | WITH_MKL | Compile Paddle Serving with MKL support | OFF |
| WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF | | WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF |
| CUDNN_ROOT | Define CuDNN library and header path | | | WITH_TRT | Compile Paddle Serving with TensorRT | OFF |
| CUDNN_LIBRARY | Define CuDNN library and header path | |
| CUDA_TOOLKIT_ROOT_DIR | Define CUDA PATH | |
| TENSORRT_ROOT | Define TensorRT PATH | |
| CLIENT | Compile Paddle Serving Client | OFF | | CLIENT | Compile Paddle Serving Client | OFF |
| SERVER | Compile Paddle Serving Server | OFF | | SERVER | Compile Paddle Serving Server | OFF |
| APP | Compile Paddle Serving App package | OFF | | APP | Compile Paddle Serving App package | OFF |
...@@ -167,7 +217,8 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选 ...@@ -167,7 +217,8 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选
- CUDA - CUDA
- CuDNN - CuDNN
- NCCL2
编译TensorRT版本,需要安装TensorRT库。
这里要注意的是: 这里要注意的是:
...@@ -176,21 +227,12 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选 ...@@ -176,21 +227,12 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选
以下是PaddlePaddle发布版本所使用的基础库版本匹配关系,供参考: 以下是PaddlePaddle发布版本所使用的基础库版本匹配关系,供参考:
| | CUDA | CuDNN | NCCL2 | | | CUDA | CuDNN | TensorRT |
| :----: | :-----: | :----------------------: | :----: | | :----: | :-----: | :----------------------: | :----: |
| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 | | post9 | 9.0 | CuDNN 7.3.1 for CUDA 9.0 | |
| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 | | post10 | 10.0 | CuDNN 7.5.1 for CUDA 10.0| |
| trt | 10.1 | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5 |
### 如何让Paddle Serving编译系统探测到CuDNN库 ### 如何让Paddle Serving编译系统探测到CuDNN库
从NVIDIA developer官网下载对应版本CuDNN并在本地解压后,在cmake编译命令中增加`-DCUDNN_ROOT`参数,指定CuDNN库所在路径。 从NVIDIA developer官网下载对应版本CuDNN并在本地解压后,在cmake编译命令中增加`-DCUDNN_LIBRARY`参数,指定CuDNN库所在路径。
### 如何让Paddle Serving编译系统探测到nccl库
从NVIDIA developer官网下载对应版本nccl2库并解压后,增加如下环境变量 (以nccl2.1.4为例):
```shell
export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
```
...@@ -25,10 +25,10 @@ You can get images in two ways: ...@@ -25,10 +25,10 @@ You can get images in two ways:
## Image description ## Image description
Runtime images cannot be used for compilation. Runtime images cannot be used for compilation.
If you want to customize your Serving based on source code, use the version with the suffix -devel.
| Description | OS | TAG | Dockerfile | | Description | OS | TAG | Dockerfile |
| :----------------------------------------------------------: | :-----: | :--------------------------: | :----------------------------------------------------------: | | :----------------------------------------------------------: | :-----: | :--------------------------: | :----------------------------------------------------------: |
...@@ -40,3 +40,13 @@ Runtime images cannot be used for compilation. ...@@ -40,3 +40,13 @@ Runtime images cannot be used for compilation.
| GPU (cuda10.0-cudnn7) development | CentOS7 | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) | | GPU (cuda10.0-cudnn7) development | CentOS7 | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) |
| CPU development (Used to compile packages on Ubuntu) | CentOS6 | <None> | [Dockerfile.centos6.devel](../tools/Dockerfile.centos6.devel) | | CPU development (Used to compile packages on Ubuntu) | CentOS6 | <None> | [Dockerfile.centos6.devel](../tools/Dockerfile.centos6.devel) |
| GPU (cuda9.0-cudnn7) development (Used to compile packages on Ubuntu) | CentOS6 | <None> | [Dockerfile.centos6.cuda9.0-cudnn7.devel](../tools/Dockerfile.centos6.cuda9.0-cudnn7.devel) | | GPU (cuda9.0-cudnn7) development (Used to compile packages on Ubuntu) | CentOS6 | <None> | [Dockerfile.centos6.cuda9.0-cudnn7.devel](../tools/Dockerfile.centos6.cuda9.0-cudnn7.devel) |
## Requirements for running CUDA containers
Running a CUDA container requires a machine with at least one CUDA-capable GPU and a driver compatible with the CUDA toolkit version you are using.
The machine running the CUDA container **only requires the NVIDIA driver**, the CUDA toolkit doesn't have to be installed.
For the relationship between CUDA toolkit version, Driver version and GPU architecture, please refer to [nvidia-docker wiki](https://github.com/NVIDIA/nvidia-docker/wiki/CUDA).
...@@ -25,10 +25,10 @@ ...@@ -25,10 +25,10 @@
## 镜像说明 ## 镜像说明
运行时镜像不能用于开发编译。 运行时镜像不能用于开发编译。
若需要基于源代码二次开发编译,请使用后缀为-devel的版本。
| 镜像说明 | 操作系统 | TAG | Dockerfile | | 镜像说明 | 操作系统 | TAG | Dockerfile |
| -------------------------------------------------- | -------- | ---------------------------- | ------------------------------------------------------------ | | -------------------------------------------------- | -------- | ---------------------------- | ------------------------------------------------------------ |
...@@ -40,3 +40,13 @@ ...@@ -40,3 +40,13 @@
| GPU (cuda10.0-cudnn7) 开发镜像 | CentOS7 | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) | | GPU (cuda10.0-cudnn7) 开发镜像 | CentOS7 | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) |
| CPU 开发镜像 (用于编译 Ubuntu 包) | CentOS6 | <无> | [Dockerfile.centos6.devel](../tools/Dockerfile.centos6.devel) | | CPU 开发镜像 (用于编译 Ubuntu 包) | CentOS6 | <无> | [Dockerfile.centos6.devel](../tools/Dockerfile.centos6.devel) |
| GPU (cuda9.0-cudnn7) 开发镜像 (用于编译 Ubuntu 包) | CentOS6 | <无> | [Dockerfile.centos6.cuda9.0-cudnn7.devel](../tools/Dockerfile.centos6.cuda9.0-cudnn7.devel) | | GPU (cuda9.0-cudnn7) 开发镜像 (用于编译 Ubuntu 包) | CentOS6 | <无> | [Dockerfile.centos6.cuda9.0-cudnn7.devel](../tools/Dockerfile.centos6.cuda9.0-cudnn7.devel) |
## 运行CUDA容器的要求
运行CUDA容器需要至少具有一个支持CUDA的GPU以及与您所使用的CUDA工具包版本兼容的驱动程序。
运行CUDA容器的机器**只需要相应的NVIDIA驱动程序**,而CUDA工具包不是必要的。
相关CUDA工具包版本、驱动版本和GPU架构的关系请参阅 [nvidia-docker wiki](https://github.com/NVIDIA/nvidia-docker/wiki/CUDA)
# FAQ # FAQ
- Q: 如何调整RPC服务的等待时间,避免超时?
A: 使用set_rpc_timeout_ms设置更长的等待时间,单位为毫秒,默认时间为20秒。
示例: ## 基础知识
```
from paddle_serving_client import Client
client = Client() #### Q: Paddle Serving 、Paddle Inference、PaddleHub Serving三者的区别及联系?
client.load_client_config(sys.argv[1])
client.set_rpc_timeout_ms(100000)
client.connect(["127.0.0.1:9393"])
```
- Q: 如何使用自己编译的Paddle Serving进行预测? **A:** paddle serving是远程服务,即发起预测的设备(手机、浏览器、客户端等)与实际预测的硬件不在一起。 paddle inference是一个library,适合嵌入到一个大系统中保证预测效率,paddle serving调用了paddle inference做远程服务。paddlehub serving可以认为是一个示例,都会使用paddle serving作为统一预测服务入口。如果在web端交互,一般是调用远程服务的形式,可以使用paddle serving的web service搭建。
A: 通过pip命令安装自己编译出的whl包,并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。 #### Q: paddle-serving是否支持Int32数据类型
- Q: 执行GPU预测时遇到InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0. **A:** 在protobuf中定义的feed_type和fetch_type编号与数据类型的对应关系如下:
A: 将显卡驱动对应的libcuda.so的目录添加到LD_LIBRARY_PATH环境变量中 ​ 0-int64
- Q: 执行GPU预测时遇到ExternalError: Cudnn error, CUDNN_STATUS_BAD_PARAM at (/home/scmbuild/workspaces_cluster.dev/baidu.lib.paddlepaddle/baidu/lib/paddlepaddle/Paddle/paddle/fluid/operators/batch_norm_op.cu:198) ​ 1-float32
A: 将cudnn的lib64路径添加到LD_LIBRARY_PATH,安装自pypi的Paddle Serving中post9版使用的是cudnn 7.3,post10使用的是cudnn 7.5。如果是使用自己编译的Paddle Serving,可以在log/serving.INFO日志文件中查看对应的cudnn版本。 ​ 2-int32
- Q: 执行GPU预测时遇到Error: Failed to find dynamic library: libcublas.so #### Q: paddle-serving是否支持windows和Linux环境下的多线程调用
A: 将cuda的lib64路径添加到LD_LIBRARY_PATH, post9版本的Paddle Serving使用的是cuda 9.0,post10版本使用的cuda 10.0。 **A:** 客户端可以发起多线程访问调用服务端
- Q: 部署和预测中的日志信息在哪里查看? #### Q: paddle-serving如何修改消息大小限制
- A: server端的日志分为两部分,一部分打印到标准输出,一部分打印到启动服务时的目录下的log/serving.INFO文件中。 **A:** 在server端和client端均可通过FLAGS_max_body_size来扩大数据量限制,单位为字节,默认为64MB
client端的日志直接打印到标准输出。 #### Q: paddle-serving客户端目前支持哪些语言
通过在部署服务之前 'export GLOG_v=3'可以输出更为详细的日志信息。 **A:** java c++ python
#### Q: paddle-serving目前支持哪些协议
**A:** http rpc
## 编译问题
#### Q: 如何使用自己编译的Paddle Serving进行预测?
**A:** 通过pip命令安装自己编译出的whl包,并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。
#### Q: 使用Java客户端,mvn compile过程出现"No compiler is provided in this environment. Perhaps you are running on a JRE rather than a JDK?"错误
**A:** 没有安装JDK,或者JAVA_HOME路径配置错误(正确配置是JDK路径,常见错误配置成JRE路径,例如正确路径参考JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.262.b10-0.el7_8.x86_64/")。Java JDK安装参考https://segmentfault.com/a/1190000015389941
## 部署问题
#### Q: GPU环境运行Serving报错,GPU count is: 0。
```
terminate called after throwing an instance of 'paddle::platform::EnforceNotMet'
what():
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&, char const*, int)
1 paddle::platform::SetDeviceId(int)
2 paddle::AnalysisConfig::fraction_of_gpu_memory_for_pool() const
3 std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig, (paddle::PaddleEngineKind)2>(paddle::AnalysisConfig const&)
4 std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(paddle::AnalysisConfig const&)
----------------------
Error Message Summary:
----------------------
InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0.
[Hint: Expected id < GetCUDADeviceCount(), but received id:0 >= GetCUDADeviceCount():0.] at (/home/scmbuild/workspaces_cluster.dev/baidu.lib.paddlepaddle/baidu/lib/paddlepaddle/Paddle/paddle/fluid/platform/gpu_info.cc:211)
```
**A:** libcuda.so没有链接成功。首先在机器上找到libcuda.so,ldd检查libnvidia版本与nvidia-smi中版本一致(libnvidia-fatbinaryloader.so.418.39,与NVIDIA-SMI 418.39 Driver Version: 418.39),然后用export导出libcuda.so的路径即可(例如libcuda.so在/usr/lib64/,export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/)
#### Q: 遇到 GPU not found, please check your environment or use cpu version by "pip install paddle_serving_server"
**A:** 检查环境中是否有N卡:ls /dev/ | grep nvidia
#### Q: 目前Paddle Serving支持哪些镜像环境?
**A:** 目前(0.4.0)仅支持CentOS,具体列表查阅[这里](https://github.com/PaddlePaddle/Serving/blob/develop/doc/DOCKER_IMAGES.md)
#### Q: python编译的GCC版本与serving的版本不匹配
**A:** 1)使用[GPU docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md#gpunvidia-docker)解决环境问题
​ 2)修改anaconda的虚拟环境下安装的python的gcc版本[参考](https://www.jianshu.com/p/c498b3d86f77)
#### Q: paddle-serving是否支持本地离线安装
**A:** 支持离线部署,需要把一些相关的[依赖包](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md)提前准备安装好
## 预测问题
#### Q: 使用GPU第一次预测时特别慢,如何调整RPC服务的等待时间避免超时?
**A:** GPU第一次预测需要初始化。使用set_rpc_timeout_ms设置更长的等待时间,单位为毫秒,默认时间为20秒。
示例:
```
from paddle_serving_client import Client
client = Client()
client.load_client_config(sys.argv[1])
client.set_rpc_timeout_ms(100000)
client.connect(["127.0.0.1:9393"])
```
#### Q: 执行GPU预测时遇到InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0.
**A:** 将显卡驱动对应的libcuda.so的目录添加到LD_LIBRARY_PATH环境变量中
#### Q: 执行GPU预测时遇到ExternalError: Cudnn error, CUDNN_STATUS_BAD_PARAM at (../batch_norm_op.cu:198)
**A:** 将cudnn的lib64路径添加到LD_LIBRARY_PATH,安装自pypi的Paddle Serving中post9版使用的是cudnn 7.3,post10使用的是cudnn 7.5。如果是使用自己编译的Paddle Serving,可以在log/serving.INFO日志文件中查看对应的cudnn版本。
#### Q: 执行GPU预测时遇到Error: Failed to find dynamic library: libcublas.so
**A:** 将cuda的lib64路径添加到LD_LIBRARY_PATH, post9版本的Paddle Serving使用的是cuda 9.0,post10版本使用的cuda 10.0。
#### Q: Client端fetch的变量名如何设置
**A:** 可以查看配置文件serving_server_conf.prototxt,获取需要的变量名
#### Q: 如何使用多语言客户端
**A:** 多语言客户端要与多语言服务端配套使用。当前版本下(0.4.0),服务端需要将Server改为MultiLangServer(如果是以命令行启动的话只需要添加--use_multilang参数),Python客户端需要将Client改为MultiLangClient,同时去除load_client_config的过程。[Java客户端参考文档](https://github.com/PaddlePaddle/Serving/blob/develop/doc/JAVA_SDK_CN.md)
#### Q: 如何在Windows下使用Paddle Serving
**A:** 当前版本(0.4.0)在Windows上可以运行多语言RPC客户端,或使用HTTP方式访问。如果使用多语言RPC客户端,需要在Linux环境(比如本机容器,或远程Linux机器)中运行多语言服务端;如果使用HTTP方式,需要在Linux环境中运行普通服务端
#### Q: libnvinfer.so: cannot open shared object file: No such file or directory)
**A:** 参考该文档安装TensorRT: https://blog.csdn.net/hesongzefairy/article/details/105343525
## 日志排查
#### Q: 部署和预测中的日志信息在哪里查看?
**A:** server端的日志分为两部分,一部分打印到标准输出,一部分打印到启动服务时的目录下的log/serving.INFO文件中。
client端的日志直接打印到标准输出。
通过在部署服务之前 'export GLOG_v=3'可以输出更为详细的日志信息。
#### Q: paddle-serving启动成功后,相关的日志在哪里设置
**A:** 1)警告是glog组件打印的,告知glog初始化之前日志打印在STDERR
​ 2)一般采用GLOG_v方式启动服务同时设置日志级别。
例如:
```
GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999
```
#### Q: (GLOG_v=2下)Server端日志一切正常,但Client端始终得不到正确的预测结果
**A:** 可能是配置文件有问题,检查下配置文件(is_load_tensor,fetch_type等有没有问题)
#### Q: 如何给Server传递Logid
**A:** Logid默认为0(后续应该有自动生成Logid的计划,当前版本0.4.0),Client端通过在predict函数中指定log_id参数传递
## 性能优化
# gRPC接口 # gRPC接口使用介绍
gRPC 接口实现形式类似 Web Service: - [1.与bRPC接口对比](#1与brpc接口对比)
- [1.1 服务端对比](#11-服务端对比)
- [1.2 客户端对比](#12-客户端对比)
- [1.3 其他](#13-其他)
- [2.示例:线性回归预测服务](#2示例线性回归预测服务)
- [获取数据](#获取数据)
- [开启 gRPC 服务端](#开启-grpc-服务端)
- [客户端预测](#客户端预测)
- [同步预测](#同步预测)
- [异步预测](#异步预测)
- [Batch 预测](#batch-预测)
- [通用 pb 预测](#通用-pb-预测)
- [预测超时](#预测超时)
- [List 输入](#list-输入)
- [3.更多示例](#3更多示例)
![](grpc_impl.png) 使用gRPC接口,Client端可以在Win/Linux/MacOS平台上调用不同语言。gRPC 接口实现结构如下:
## 与bRPC接口对比 ![](https://github.com/PaddlePaddle/Serving/blob/develop/doc/grpc_impl.png)
1. gRPC Server 端 `load_model_config` 函数添加 `client_config_path` 参数: ## 1.与bRPC接口对比
```python #### 1.1 服务端对比
* gRPC Server 端 `load_model_config` 函数添加 `client_config_path` 参数:
```
def load_model_config(self, server_config_paths, client_config_path=None) def load_model_config(self, server_config_paths, client_config_path=None)
``` ```
在一些例子中 bRPC Server 端与 bRPC Client 端的配置文件可能不同(如 在cube local 中,Client 端的数据先交给 cube,经过 cube 处理后再交给预测库),此时 gRPC Server 端需要手动设置 gRPC Client 端的配置`client_config_path`
**`client_config_path` 默认为 `<server_config_path>/serving_server_conf.prototxt`。**
在一些例子中 bRPC Server 端与 bRPC Client 端的配置文件可能是不同的(如 cube local 例子中,Client 端的数据先交给 cube,经过 cube 处理后再交给预测库),所以 gRPC Server 端需要获取 gRPC Client 端的配置;同时为了取消 gRPC Client 端手动加载配置文件的过程,所以设计 gRPC Server 端同时加载两个配置文件。`client_config_path` 默认为 `<server_config_path>/serving_server_conf.prototxt` #### 1.2 客户端对比
2. gRPC Client 端取消 `load_client_config` 步骤: * gRPC Client 端取消 `load_client_config` 步骤:
`connect` 步骤通过 RPC 获取相应的 prototxt(从任意一个 endpoint 获取即可)。 `connect` 步骤通过 RPC 获取相应的 prototxt(从任意一个 endpoint 获取即可)。
3. gRPC Client 需要通过 RPC 方式设置 timeout 时间(调用形式与 bRPC Client保持一致) * gRPC Client 需要通过 RPC 方式设置 timeout 时间(调用形式与 bRPC Client保持一致)
因为 bRPC Client 在 `connect` 后无法更改 timeout 时间,所以当 gRPC Server 收到变更 timeout 的调用请求时会重新创建 bRPC Client 实例以变更 bRPC Client timeout时间,同时 gRPC Client 会设置 gRPC 的 deadline 时间。 因为 bRPC Client 在 `connect` 后无法更改 timeout 时间,所以当 gRPC Server 收到变更 timeout 的调用请求时会重新创建 bRPC Client 实例以变更 bRPC Client timeout时间,同时 gRPC Client 会设置 gRPC 的 deadline 时间。
**注意,设置 timeout 接口和 Inference 接口不能同时调用(非线程安全),出于性能考虑暂时不加锁。** **注意,设置 timeout 接口和 Inference 接口不能同时调用(非线程安全),出于性能考虑暂时不加锁。**
4. gRPC Client 端 `predict` 函数添加 `asyn``is_python` 参数: * gRPC Client 端 `predict` 函数添加 `asyn``is_python` 参数:
```python ```
def predict(self, feed, fetch, need_variant_tag=False, asyn=False, is_python=True) def predict(self, feed, fetch, need_variant_tag=False, asyn=False, is_python=True)
``` ```
其中,`asyn` 为异步调用选项。当 `asyn=True` 时为异步调用,返回 `MultiLangPredictFuture` 对象,通过 `MultiLangPredictFuture.result()` 阻塞获取预测值;当 `asyn=False` 为同步调用。 1. `asyn` 为异步调用选项。当 `asyn=True` 时为异步调用,返回 `MultiLangPredictFuture` 对象,通过 `MultiLangPredictFuture.result()` 阻塞获取预测值;当 `asyn=False` 为同步调用。
2. `is_python` 为 proto 格式选项。当 `is_python=True` 时,基于 numpy bytes 格式进行数据传输,目前只适用于 Python;当 `is_python=False` 时,以普通数据格式传输,更加通用。使用 numpy bytes 格式传输耗时比普通数据格式小很多(详见 [#654](https://github.com/PaddlePaddle/Serving/pull/654))。
#### 1.3 其他
* 异常处理:当 gRPC Server 端的 bRPC Client 预测失败(返回 `None`)时,gRPC Client 端同样返回None。其他 gRPC 异常会在 Client 内部捕获,并在返回的 fetch_map 中添加一个 "status_code" 字段来区分是否预测正常(参考 timeout 样例)。
* 由于 gRPC 只支持 pick_first 和 round_robin 负载均衡策略,ABTEST 特性还未打齐。
* 系统兼容性:
* [x] CentOS
* [x] macOS
* [x] Windows
* 已经支持的客户端语言:
- Python
- Java
- Go
## 2.示例:线性回归预测服务
以下是采用gRPC实现的关于线性回归预测的一个示例,具体代码详见此[链接](https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/grpc_impl_example/fit_a_line)
#### 获取数据
```shell
sh get_data.sh
```
#### 开启 gRPC 服务端
``` shell
python test_server.py uci_housing_model/
```
也可以通过下面的一行代码开启默认 gRPC 服务:
```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang
```
注:--use_multilang参数用来启用多语言客户端
### 客户端预测
#### 同步预测
``` shell
python test_sync_client.py
```
#### 异步预测
``` shell
python test_asyn_client.py
```
#### Batch 预测
``` shell
python test_batch_client.py
```
`is_python` 为 proto 格式选项。当 `is_python=True` 时,基于 numpy bytes 格式进行数据传输,目前只适用于 Python;当 `is_python=False` 时,以普通数据格式传输,更加通用。使用 numpy bytes 格式传输耗时比普通数据格式小很多(详见 [#654](https://github.com/PaddlePaddle/Serving/pull/654))。 #### 通用 pb 预测
5. 异常处理:当 gRPC Server 端的 bRPC Client 预测失败(返回 `None`)时,gRPC Client 端同样返回None。其他 gRPC 异常会在 Client 内部捕获,并在返回的 fetch_map 中添加一个 "status_code" 字段来区分是否预测正常(参考 timeout 样例)。 ``` shell
python test_general_pb_client.py
```
6. 由于 gRPC 只支持 pick_first 和 round_robin 负载均衡策略,ABTEST 特性还未打齐。 #### 预测超时
7. 经测试,gRPC 版本可以在 Windows、macOS 平台使用。 ``` shell
python test_timeout_client.py
```
8. 计划支持的客户端语言: #### List 输入
- [x] Python ``` shell
- [ ] Java python test_list_input_client.py
- [ ] Go ```
- [ ] JavaScript
## Python 端的一些例子 ## 3.更多示例
详见 `python/examples/grpc_impl_example` 下的示例文件。 详见[`python/examples/grpc_impl_example`](https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/grpc_impl_example)下的示例文件。
...@@ -24,13 +24,13 @@ inference_model_dir = "your_inference_model" ...@@ -24,13 +24,13 @@ inference_model_dir = "your_inference_model"
serving_client_dir = "serving_client_dir" serving_client_dir = "serving_client_dir"
serving_server_dir = "serving_server_dir" serving_server_dir = "serving_server_dir"
feed_var_names, fetch_var_names = inference_model_to_serving( feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_client_dir, serving_server_dir) inference_model_dir, serving_server_dir, serving_client_dir)
``` ```
if your model file and params file are both standalone, please use the following api. if your model file and params file are both standalone, please use the following api.
``` ```
feed_var_names, fetch_var_names = inference_model_to_serving( feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_client_dir, serving_server_dir, inference_model_dir, serving_server_dir, serving_client_dir,
model_filename="model", params_filename="params") model_filename="model", params_filename="params")
``` ```
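For convenience, the call above can be put into a fully self-contained form. This is a minimal sketch; the import path below follows the paddle_serving_client package layout and the directory names are placeholders:

```python
from paddle_serving_client.io import inference_model_to_serving

inference_model_dir = "your_inference_model"
serving_client_dir = "serving_client_dir"
serving_server_dir = "serving_server_dir"

# Note the argument order used above: server directory before client directory.
feed_var_names, fetch_var_names = inference_model_to_serving(
    inference_model_dir, serving_server_dir, serving_client_dir)
print(feed_var_names, fetch_var_names)
```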
...@@ -23,11 +23,11 @@ inference_model_dir = "your_inference_model" ...@@ -23,11 +23,11 @@ inference_model_dir = "your_inference_model"
serving_client_dir = "serving_client_dir" serving_client_dir = "serving_client_dir"
serving_server_dir = "serving_server_dir" serving_server_dir = "serving_server_dir"
feed_var_names, fetch_var_names = inference_model_to_serving( feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_client_dir, serving_server_dir) inference_model_dir, serving_server_dir, serving_client_dir)
``` ```
如果模型中有模型描述文件`model_filename` 和 模型参数文件`params_filename`,那么请用 如果模型中有模型描述文件`model_filename` 和 模型参数文件`params_filename`,那么请用
``` ```
feed_var_names, fetch_var_names = inference_model_to_serving( feed_var_names, fetch_var_names = inference_model_to_serving(
inference_model_dir, serving_client_dir, serving_server_dir, inference_model_dir, serving_server_dir, serving_client_dir,
model_filename="model", params_filename="params") model_filename="model", params_filename="params")
``` ```
...@@ -18,6 +18,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py2-none-an ...@@ -18,6 +18,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py2-none-an
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl
#cuda 10.0 #cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
#cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
``` ```
### Python 2 ### Python 2
``` ```
...@@ -25,6 +27,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10- ...@@ -25,6 +27,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl
#cuda 10.0 #cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl
##cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl
``` ```
## Client ## Client
...@@ -36,6 +40,10 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp37-none-a ...@@ -36,6 +40,10 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp37-none-a
``` ```
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp36-none-any.whl https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp36-none-any.whl
``` ```
### Python 3.5
```
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp35-none-any.whl
```
### Python 2.7 ### Python 2.7
``` ```
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp27-none-any.whl https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp27-none-any.whl
......
...@@ -7,14 +7,46 @@ Paddle Serving is usually used for the deployment of single model, but the end-t ...@@ -7,14 +7,46 @@ Paddle Serving is usually used for the deployment of single model, but the end-t
Paddle Serving provides a user-friendly programming framework for multi-model composite services, Pipeline Serving, which aims to reduce the threshold of programming, improve resource utilization (especially GPU), and improve the prediction efficiency. Paddle Serving provides a user-friendly programming framework for multi-model composite services, Pipeline Serving, which aims to reduce the threshold of programming, improve resource utilization (especially GPU), and improve the prediction efficiency.
## Architecture Design ## Architecture Design
The Server side is built based on gRPC and graph execution engine. The relationship between them is shown in the following figure. The Server side is built based on <b>RPC Service</b> and <b>graph execution engine</b>. The relationship between them is shown in the following figure.
<center> <center>
<img src='pipeline_serving-image1.png' height = "250" align="middle"/> <img src='pipeline_serving-image1.png' height = "250" align="middle"/>
</center> </center>
### Graph Execution Engine
### 1. RPC Service
In order to meet the needs of different users, the RPC service starts one Web server and one RPC server at the same time and can process two types of requests: RESTful API and gRPC. The gRPC gateway receives RESTful API requests and forwards them to the gRPC server through a reverse proxy; gRPC requests are received by the gRPC server directly. Both types of requests are therefore handled by the gRPC Service in a unified manner, which keeps the processing logic consistent.
#### <b>1.1 Request and Response of proto</b>
gRPC service and gRPC gateway service are generated with service.proto.
```proto
message Request {
repeated string key = 1;
repeated string value = 2;
optional string name = 3;
optional string method = 4;
optional int64 logid = 5;
optional string clientip = 6;
};
message Response {
optional int32 err_no = 1;
optional string err_msg = 2;
repeated string key = 3;
repeated string value = 4;
};
```
The `key` and `value` in the Request are paired string arrays. The `name` and `method` correspond to the URL of the RESTful API: {ip}:{port}/{name}/{method}. The `logid` and `clientip` fields make it easy for users to correlate service-level requests and customize strategies.
In Response, `err_no` and `err_msg` express the correctness and error information of the processing result, and `key` and `value` are the returned results.
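As a rough illustration of how these fields appear on the wire, the sketch below posts a RESTful request to the gateway with plain `requests`; the service name `uci`, the port 18080, and the feed key `x` are assumptions made for the example, not values taken from this document:

```python
import json
import requests

# URL pattern described above: {ip}:{port}/{name}/{method}
url = "http://127.0.0.1:18080/uci/prediction"

payload = {
    "key": ["x"],                                   # feed variable names
    "value": ["0.0137, -0.1136, 0.2553, -0.0692"],  # serialized values, paired with key
    "logid": 10000,                                 # optional: correlate requests end to end
    "clientip": "127.0.0.1"                         # optional
}

result = requests.post(url, data=json.dumps(payload)).json()
# err_no / err_msg carry the status, key / value carry the fetch results.
print(result.get("err_no"), result.get("err_msg"))
print(dict(zip(result.get("key", []), result.get("value", []))))
```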
### 2. Graph Execution Engine
The graph execution engine consists of OPs and Channels, and the connected OPs share one Channel. The graph execution engine consists of OPs and Channels, and the connected OPs share one Channel.
...@@ -28,7 +60,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s ...@@ -28,7 +60,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s
</center> </center>
### OP Design #### <b>2.1 OP Design</b>
- The default function of a single OP is to access a single Paddle Serving Service based on the input Channel data and put the result into the output Channel. - The default function of a single OP is to access a single Paddle Serving Service based on the input Channel data and put the result into the output Channel.
- OP supports user customization, including preprocess, process, postprocess functions that can be inherited and implemented by the user. - OP supports user customization, including preprocess, process, postprocess functions that can be inherited and implemented by the user.
...@@ -36,7 +68,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s ...@@ -36,7 +68,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s
- OP can obtain data from multiple different RPC requests for Auto-Batching. - OP can obtain data from multiple different RPC requests for Auto-Batching.
- OP can be started by a thread or process. - OP can be started by a thread or process.
### Channel Design #### <b>2.2 Channel Design</b>
- Channel is the data structure for sharing data between OPs, responsible for sharing data or sharing data status information. - Channel is the data structure for sharing data between OPs, responsible for sharing data or sharing data status information.
- Outputs from multiple OPs can be stored in the same Channel, and data from the same Channel can be used by multiple OPs. - Outputs from multiple OPs can be stored in the same Channel, and data from the same Channel can be used by multiple OPs.
...@@ -47,8 +79,17 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s ...@@ -47,8 +79,17 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s
</center> </center>
#### <b>2.3 Client Type Design</b>
### Extreme Case Consideration - The prediction type (client_type) of an Op supports three modes: brpc, grpc and local_predictor
- brpc: Using bRPC Client to interact with remote Serving by network, performance is better than grpc.
- grpc: Using gRPC Client to interact with remote Serving by network, cross-platform deployment supported.
- local_predictor: Load the model and predict in the local service without interacting with the network. Supports multi-card deployment and TensorRT prediction.
- Selection:
- Time cost (lower is better): local_predictor < brpc <= grpc
- Microservice: splitting the brpc or grpc mode into independent services simplifies development and deployment and improves resource utilization
#### <b>2.4 Extreme Case Consideration</b>
- Request timeout - Request timeout
...@@ -65,9 +106,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s ...@@ -65,9 +106,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s
- For output buffer, you can use a similar process as input buffer, which adjusts the concurrency of OP3 and OP4 to control the buffer length of output buffer. (The length of the output buffer depends on the speed at which downstream OPs obtain data from the output buffer) - For output buffer, you can use a similar process as input buffer, which adjusts the concurrency of OP3 and OP4 to control the buffer length of output buffer. (The length of the output buffer depends on the speed at which downstream OPs obtain data from the output buffer)
- The amount of data in the Channel will not exceed `worker_num` of gRPC, that is, it will not exceed the thread pool size. - The amount of data in the Channel will not exceed `worker_num` of gRPC, that is, it will not exceed the thread pool size.
## Detailed Design ## ★ Detailed Design
### User Interface Design
#### 1. General OP Definition #### 1. General OP Definition
...@@ -79,11 +118,13 @@ def __init__(name=None, ...@@ -79,11 +118,13 @@ def __init__(name=None,
server_endpoints=[], server_endpoints=[],
fetch_list=[], fetch_list=[],
client_config=None, client_config=None,
client_type=None,
concurrency=1, concurrency=1,
timeout=-1, timeout=-1,
retry=1, retry=1,
batch_size=1, batch_size=1,
auto_batching_timeout=None) auto_batching_timeout=None,
local_service_handler=None)
``` ```
The meaning of each parameter is as follows: The meaning of each parameter is as follows:
...@@ -92,14 +133,16 @@ The meaning of each parameter is as follows: ...@@ -92,14 +133,16 @@ The meaning of each parameter is as follows:
| :-------------------: | :----------------------------------------------------------: | | :-------------------: | :----------------------------------------------------------: |
| name | (str) String used to identify the OP type, which must be globally unique. | | name | (str) String used to identify the OP type, which must be globally unique. |
| input_ops | (list) A list of all previous OPs of the current Op. | | input_ops | (list) A list of all previous OPs of the current Op. |
| server_endpoints | (list) List of endpoints for remote Paddle Serving Service. If this parameter is not set, the OP will not access the remote Paddle Serving Service, that is, the process operation will not be performed. | | server_endpoints | (list) List of endpoints for the remote Paddle Serving Service. If this parameter is not set, the OP is considered to run in local_predictor mode and the configuration is read from local_service_conf. |
| fetch_list | (list) List of fetch variable names for remote Paddle Serving Service. | | fetch_list | (list) List of fetch variable names for remote Paddle Serving Service. |
| client_config | (str) The path of the client configuration file corresponding to the Paddle Serving Service. | | client_config | (str) The path of the client configuration file corresponding to the Paddle Serving Service. |
| client_type | (str) brpc, grpc or local_predictor. local_predictor performs in-process prediction without starting a Serving service. |
| concurrency | (int) The number of concurrent OPs. | | concurrency | (int) The number of concurrent OPs. |
| timeout | (int) The timeout time of the process operation, in ms. If the value is less than zero, no timeout is considered. | | timeout | (int) The timeout time of the process operation, in ms. If the value is less than zero, no timeout is considered. |
| retry | (int) Timeout number of retries. When the value is 1, no retries are made. | | retry | (int) Timeout number of retries. When the value is 1, no retries are made. |
| batch_size | (int) The expected batch_size of Auto-Batching, since building batches may time out, the actual batch_size may be less than the set value. | | batch_size | (int) The expected batch_size of Auto-Batching, since building batches may time out, the actual batch_size may be less than the set value. |
| auto_batching_timeout | (float) Timeout for building batches of Auto-Batching (the unit is ms). | | auto_batching_timeout | (float) Timeout for building batches of Auto-Batching (the unit is ms). When batch_size > 1, auto_batching_timeout should be set; otherwise requests will block waiting when there are too few of them to fill a batch. |
| local_service_handler | (object) Local predictor handler, either passed in through the Op constructor parameters or created inside Op init(). |
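To make the table concrete, here is a minimal sketch of constructing one OP with these parameters. The import path and the endpoint/configuration values are assumptions borrowed from the imdb example referenced later in this document:

```python
from paddle_serving_server.pipeline import Op, RequestOp  # assumed import path

read_op = RequestOp()                 # entry OP of the graph (see section 3)
bow_op = Op(name="bow",
            input_ops=[read_op],
            server_endpoints=["127.0.0.1:9393"],
            fetch_list=["prediction"],
            client_config="imdb_bow_client_conf/serving_client_conf.prototxt",
            client_type="brpc",
            concurrency=1,
            timeout=3000,                 # ms
            retry=1,
            batch_size=1,
            auto_batching_timeout=2000)   # ms; needed when batch_size > 1
```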
#### 2. General OP Secondary Development Interface #### 2. General OP Secondary Development Interface
...@@ -156,7 +199,7 @@ def init_op(self): ...@@ -156,7 +199,7 @@ def init_op(self):
It should be **noted** that in the threaded version of OP, each OP will only call this function once, so the loaded resources must be thread safe. It should be **noted** that in the threaded version of OP, each OP will only call this function once, so the loaded resources must be thread safe.
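For example, a hedged sketch of loading a shared, read-only resource in `init_op` (the reader class is only an illustration; any thread-safe object would do, and the import path of Op is an assumption):

```python
from paddle_serving_app.reader import ChineseBertReader
from paddle_serving_server.pipeline import Op  # assumed import path


class TokenizeOp(Op):
    def init_op(self):
        # Called once per OP; in the threaded version all threads share this
        # object, so only load resources that are safe for concurrent reads.
        self.reader = ChineseBertReader({"max_seq_len": 128})
```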
#### 3. RequestOp Definition #### 3. RequestOp Definition and Secondary Development Interface
RequestOp is used to process RPC data received by Pipeline Server, and the processed data will be added to the graph execution engine. Its constructor is as follows: RequestOp is used to process RPC data received by Pipeline Server, and the processed data will be added to the graph execution engine. Its constructor is as follows:
...@@ -164,7 +207,7 @@ RequestOp is used to process RPC data received by Pipeline Server, and the proce ...@@ -164,7 +207,7 @@ RequestOp is used to process RPC data received by Pipeline Server, and the proce
def __init__(self) def __init__(self)
``` ```
#### 4. RequestOp Secondary Development Interface When the default RequestOp cannot meet the parameter parsing requirements, you can customize the request parameter parsing method by rewriting the following two interfaces.
| Interface or Variable | Explain | | Interface or Variable | Explain |
| :---------------------------------------: | :----------------------------------------------------------: | | :---------------------------------------: | :----------------------------------------------------------: |
...@@ -188,7 +231,7 @@ def unpack_request_package(self, request): ...@@ -188,7 +231,7 @@ def unpack_request_package(self, request):
The return value is required to be a dictionary type. The return value is required to be a dictionary type.
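A minimal sketch of such a customized RequestOp, assuming the import path used by the pipeline examples; it simply copies the paired `key`/`value` arrays of the Request proto into a dictionary:

```python
from paddle_serving_server.pipeline import RequestOp  # assumed import path


class MyRequestOp(RequestOp):
    def unpack_request_package(self, request):
        # `request` follows the proto shown in section 1.1: paired string arrays.
        dict_data = {}
        for key, value in zip(request.key, request.value):
            dict_data[key] = value
        return dict_data  # must be a dictionary
```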
#### 5. ResponseOp Definition #### 4. ResponseOp Definition and Secondary Development Interface
ResponseOp is used to process the prediction results of the graph execution engine. The processed data will be used as the RPC return value of Pipeline Server. Its constructor is as follows: ResponseOp is used to process the prediction results of the graph execution engine. The processed data will be used as the RPC return value of Pipeline Server. Its constructor is as follows:
...@@ -198,7 +241,7 @@ def __init__(self, input_ops) ...@@ -198,7 +241,7 @@ def __init__(self, input_ops)
`input_ops` is the last OP of graph execution engine. Users can construct different DAGs by setting different `input_ops` without modifying the topology of OPs. `input_ops` is the last OP of graph execution engine. Users can construct different DAGs by setting different `input_ops` without modifying the topology of OPs.
#### 6. ResponseOp Secondary Development Interface When the default ResponseOp cannot meet the requirements of the result return format, you can customize the return package packaging method by rewriting the following two interfaces.
| Interface or Variable | Explain | | Interface or Variable | Explain |
| :------------------------------------------: | :----------------------------------------------------------: | | :------------------------------------------: | :----------------------------------------------------------: |
...@@ -237,7 +280,7 @@ def pack_response_package(self, channeldata): ...@@ -237,7 +280,7 @@ def pack_response_package(self, channeldata):
return resp return resp
``` ```
#### 7. PipelineServer Definition #### 5. PipelineServer Definition
The definition of PipelineServer is relatively simple, as follows: The definition of PipelineServer is relatively simple, as follows:
...@@ -251,22 +294,137 @@ server.run_server() ...@@ -251,22 +294,137 @@ server.run_server()
Where `response_op` is the responseop mentioned above, PipelineServer will initialize Channels according to the topology relationship of each OP and build the calculation graph. `config_yml_path` is the configuration file of PipelineServer. The example file is as follows: Where `response_op` is the responseop mentioned above, PipelineServer will initialize Channels according to the topology relationship of each OP and build the calculation graph. `config_yml_path` is the configuration file of PipelineServer. The example file is as follows:
```yaml ```yaml
rpc_port: 18080 # gRPC port # gRPC port
worker_num: 1 # gRPC thread pool size (the number of processes in the process version servicer). The default is 1 rpc_port: 18080
build_dag_each_worker: false # Whether to use process server or not. The default is false
http_port: 0 # HTTP service port. Do not start HTTP service when the value is less or equals 0. The default value is 0. # http port, do not start HTTP service when the value is less or equals 0. The default value is 0.
http_port: 18071
# gRPC thread pool size (the number of processes in the process version servicer). The default is 1
worker_num: 1
# Whether to use process server or not. The default is false
build_dag_each_worker: false
dag: dag:
is_thread_op: true # Whether to use the thread version of OP. The default is true # Whether to use the thread version of OP. The default is true
client_type: brpc # Use brpc or grpc client. The default is brpc is_thread_op: true
retry: 1 # The number of times DAG executor retries after failure. The default value is 1, that is, no retrying
use_profile: false # Whether to print the log on the server side. The default is false # The number of times DAG executor retries after failure. The default value is 1, that is, no retrying
retry: 1
# Whether to print the log on the server side. The default is false
use_profile: false
# Monitoring time interval of Tracer (in seconds). Do not start monitoring when the value is less than 1. The default value is -1
tracer: tracer:
interval_s: 600 # Monitoring time interval of Tracer (in seconds). Do not start monitoring when the value is less than 1. The default value is -1 interval_s: 600
op:
bow:
# Concurrency, when is_thread_op=True, it's thread concurrency; otherwise, it's process concurrency
concurrency: 1
# Client types, brpc, grpc and local_predictor
client_type: brpc
# Retry times, no retry by default
retry: 1
# Prediction timeout, ms
timeout: 3000
# Serving IPs
server_endpoints: ["127.0.0.1:9393"]
# Client config of bow model
client_config: "imdb_bow_client_conf/serving_client_conf.prototxt"
# Fetch list
fetch_list: ["prediction"]
# Batch size, default 1
batch_size: 1
# Batch query timeout
auto_batching_timeout: 2000
```
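A minimal server-side sketch that consumes a configuration like the one above. The import paths and the per-OP arguments are assumptions based on the example in the next chapter; depending on the version, some of these arguments may instead be picked up from the `op` block of the yaml:

```python
from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server.pipeline import PipelineServer  # assumed import paths

read_op = RequestOp()
bow_op = Op(name="bow",                         # matches the `op: bow:` block above
            input_ops=[read_op],
            server_endpoints=["127.0.0.1:9393"],
            fetch_list=["prediction"],
            client_config="imdb_bow_client_conf/serving_client_conf.prototxt",
            client_type="brpc")
response_op = ResponseOp(input_ops=[bow_op])

server = PipelineServer()
server.set_response_op(response_op)
server.prepare_server("config.yml")             # the yaml shown above
server.run_server()
```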
### 6. Special usages
#### 6.1 <b>Business custom error type</b>
Users can define business-specific error codes by inheriting ProductErrCode and returning them in the result list of an Op's preprocess or postprocess. Based on the custom error code, the framework skips the processing of the subsequent OPs.
```python
class ProductErrCode(enum.Enum):
"""
ProductErrCode is a base class for recording business error code.
product developers inherit this class and extend more error codes.
"""
pass
```
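A hedged sketch of how a product team might extend it; the subclass name, member names and numeric values are all illustrative, and the base class is repeated only to keep the snippet self-contained:

```python
import enum


class ProductErrCode(enum.Enum):
    """Base class from the framework, as shown above."""
    pass


class MyProductErrCode(ProductErrCode):
    # Business-specific error codes; names and values are made up.
    PRODUCT_PARAM_ERROR = 10001
    PRODUCT_MODEL_MISSING = 10002


# In a custom Op, the code is returned through the prod_errcode slot of the
# preprocess/postprocess return value (see 6.2 below), for example:
#     return input_dict, False, MyProductErrCode.PRODUCT_PARAM_ERROR, "bad field: x"
```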
#### <b>6.2 Skip process stage</b>
The 2nd element of the result list returned by preprocess is `is_skip_process`. When it is True, the process stage of the current OP is skipped and execution goes directly to postprocess.
```python
def preprocess(self, input_dicts, data_id, log_id):
"""
In preprocess stage, assembling data for process stage. users can
override this function for model feed features.
Args:
input_dicts: input data to be preprocessed
data_id: inner unique id
log_id: global unique id for RTT
Return:
input_dict: data for process stage
is_skip_process: skip process stage or not, False default
prod_errcode: None default, otherwise, product errores occured.
It is handled in the same way as exception.
prod_errinfo: "" default
"""
# multiple previous Op
if len(input_dicts) != 1:
_LOGGER.critical(
self._log(
"Failed to run preprocess: this Op has multiple previous "
"inputs. Please override this func."))
os._exit(-1)
(_, input_dict), = input_dicts.items()
return input_dict, False, None, ""
``` ```
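As a hedged variant of the hook above, the sketch below skips the process stage for requests that already carry a cached result; the `cached` field and the Op import path are purely illustrative assumptions:

```python
from paddle_serving_server.pipeline import Op  # assumed import path


class CachedOp(Op):
    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        if input_dict.get("cached") is not None:
            # Second return value True: skip process and go straight to postprocess.
            return input_dict, True, None, ""
        return input_dict, False, None, ""
```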
#### <b>6.3 Custom proto Request and Response</b>
When the default proto structure does not meet business requirements, the proto can be customized, as long as the Request and Response message structures in the following two files are kept identical:
> pipeline/gateway/proto/gateway.proto
> pipeline/proto/pipeline_service.proto
Then recompile the Serving Server.
#### <b>6.4 Custom URL</b>
## Example The gRPC gateway processes POST requests. The default `method` is `prediction`, for example: 127.0.0.1:8080/ocr/prediction. Users can customize `name` and `method`, so existing URLs can be switched to the new service seamlessly.
```proto
service PipelineService {
rpc inference(Request) returns (Response) {
option (google.api.http) = {
post : "/{name=*}/{method=*}"
body : "*"
};
}
};
```
***
## ★ Classic examples
Here, we build a simple imdb model enable example to show how to use Pipeline Serving. The relevant code can be found in the `python/examples/pipeline/imdb_model_ensemble` folder. The Server-side structure in the example is shown in the following figure: Here, we build a simple imdb model enable example to show how to use Pipeline Serving. The relevant code can be found in the `python/examples/pipeline/imdb_model_ensemble` folder. The Server-side structure in the example is shown in the following figure:
...@@ -277,7 +435,7 @@ Here, we build a simple imdb model enable example to show how to use Pipeline Se ...@@ -277,7 +435,7 @@ Here, we build a simple imdb model enable example to show how to use Pipeline Se
</center> </center>
### Get the model file and start the Paddle Serving Service ### 1. Get the model file and start the Paddle Serving Service
```shell ```shell
cd python/examples/pipeline/imdb_model_ensemble cd python/examples/pipeline/imdb_model_ensemble
...@@ -288,7 +446,83 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow. ...@@ -288,7 +446,83 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.
PipelineServing also supports local automatic startup of PaddleServingService. Please refer to the example `python/examples/pipeline/ocr`. PipelineServing also supports local automatic startup of PaddleServingService. Please refer to the example `python/examples/pipeline/ocr`.
### Start PipelineServer ### 2. Create config.yaml
Because there is a lot of configuration information in config.yaml, only part of the OP configuration is shown here. For full information, please refer to `python/examples/pipeline/imdb_model_ensemble/config.yaml`.
```yaml
op:
bow:
# Concurrency, when is_thread_op=True, it's thread concurrency; otherwise, it's process concurrency
concurrency: 1
# Client types, brpc, grpc and local_predictor
client_type: brpc
# Retry times, no retry by default
retry: 1
# Prediction timeout, ms
timeout: 3000
# Serving IPs
server_endpoints: ["127.0.0.1:9393"]
# Client config of bow model
client_config: "imdb_bow_client_conf/serving_client_conf.prototxt"
# Fetch list
fetch_list: ["prediction"]
# Batch request size, default 1
batch_size: 1
# Batch query timeout
auto_batching_timeout: 2000
cnn:
# Concurrency
concurrency: 1
# Client types, brpc, grpc and local_predictor
client_type: brpc
# Retry times, no retry by default
retry: 1
# Prediction timeout, ms
timeout: 3000
# Serving IPs
server_endpoints: ["127.0.0.1:9292"]
# Client config of cnn model
client_config: "imdb_cnn_client_conf/serving_client_conf.prototxt"
# Fetch list
fetch_list: ["prediction"]
# Batch request size, default 1
batch_size: 1
# Batch query timeout
auto_batching_timeout: 2000
combine:
# Concurrency
concurrency: 1
# Retry times, no retry by default
retry: 1
# Prediction timeout, ms
timeout: 3000
# Batch request size, default 1
batch_size: 1
# Batch query timeout, ms
auto_batching_timeout: 2000
```
### 3. Start PipelineServer
Run the following code Run the following code
...@@ -359,7 +593,7 @@ server.prepare_server('config.yml') ...@@ -359,7 +593,7 @@ server.prepare_server('config.yml')
server.run_server() server.run_server()
``` ```
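Only a fragment of the server script is shown above. A condensed sketch of how the ops are wired together is given below; the op class and variable names are simplified from the example, and the op names must match the entries in config.yml:

```python
# Condensed sketch of the imdb ensemble server (see
# python/examples/pipeline/imdb_model_ensemble for the full version).
from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server.pipeline import PipelineServer

class CombineOp(Op):
    def preprocess(self, input_dicts, data_id, log_id):
        # Merge the outputs of the two upstream ops into one dict
        # (the real example combines the two predictions).
        combined = {name: data["prediction"] for name, data in input_dicts.items()}
        return combined, False, None, ""

read_op = RequestOp()
bow_op = Op(name="bow", input_ops=[read_op])
cnn_op = Op(name="cnn", input_ops=[read_op])
combine_op = CombineOp(name="combine", input_ops=[bow_op, cnn_op])
response_op = ResponseOp(input_ops=[combine_op])

server = PipelineServer()
server.set_response_op(response_op)
server.prepare_server('config.yml')
server.run_server()
```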
### Perform prediction through PipelineClient ### 4. Perform prediction through PipelineClient
```python ```python
from paddle_serving_client.pipeline import PipelineClient from paddle_serving_client.pipeline import PipelineClient
...@@ -385,13 +619,16 @@ for f in futures: ...@@ -385,13 +619,16 @@ for f in futures:
exit(1) exit(1)
``` ```
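A minimal, self-contained client sketch is given below; the endpoint (rpc_port 18080) and the feed key `words` follow the example config, and the input sentence is made up:

```python
# Minimal PipelineClient sketch for the imdb ensemble example.
from paddle_serving_client.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18080'])

words = 'i am very sad | 0'  # illustrative input line
ret = client.predict(feed_dict={"words": words}, fetch=["prediction"])
print(ret)
```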
***
## ★ Performance analysis
## How to optimize with the timeline tool ### 1. How to optimize with the timeline tool
In order to better optimize the performance, PipelineServing provides a timeline tool to monitor the time of each stage of the whole service. In order to better optimize the performance, PipelineServing provides a timeline tool to monitor the time of each stage of the whole service.
### Output profile information on server side ### 2. Output profile information on server side
The server is controlled by the `use_profile` field in yaml: The server is controlled by the `use_profile` field in yaml:
...@@ -418,8 +655,29 @@ if __name__ == "__main__": ...@@ -418,8 +655,29 @@ if __name__ == "__main__":
Specific operation: open Chrome browser, input in the address bar `chrome://tracing/` , jump to the tracing page, click the load button, open the saved `trace` file, and then visualize the time information of each stage of the prediction service. Specific operation: open Chrome browser, input in the address bar `chrome://tracing/` , jump to the tracing page, click the load button, open the saved `trace` file, and then visualize the time information of each stage of the prediction service.
### Output profile information on client side ### 3. Output profile information on client side
The profile function can be enabled by setting `profile=True` in the `predict` interface on the client side. The profile function can be enabled by setting `profile=True` in the `predict` interface on the client side.
After the function is enabled, the client will print the log information corresponding to the prediction to the standard output during the prediction process, and the subsequent analysis and processing are the same as that of the server. After the function is enabled, the client will print the log information corresponding to the prediction to the standard output during the prediction process, and the subsequent analysis and processing are the same as that of the server.
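For example, reusing the `client` and `words` from the client sketch above, profiling can be switched on like this:

```python
# Client-side profiling: timing logs for this request are printed to stdout.
ret = client.predict(feed_dict={"words": words},
                     fetch=["prediction"],
                     profile=True)
```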
### 4. Analytical methods
```
cost of one single OP:
op_cost = process(pre + mid + post)
OP Concurrency:
op_concurrency = op_cost(s) * qps_expected
Service throughput:
service_throughput = 1 / slowest_op_cost * op_concurrency
Service average cost:
service_avg_cost = ∑op_concurrency in critical Path
Channel accumulations:
channel_acc_size = QPS(down - up) * time
Average cost of batch predictor:
avg_batch_cost = (N * pre + mid + post) / N
```
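As a worked illustration of these formulas (all stage costs below are made-up numbers):

```python
# Toy numbers for illustration only (seconds).
pre, mid, post = 0.002, 0.014, 0.004     # preprocess / process / postprocess cost
op_cost = pre + mid + post               # 0.020 s per request on this OP

qps_expected = 500
op_concurrency = op_cost * qps_expected                 # 10 concurrent workers
service_throughput = 1 / op_cost * op_concurrency       # 500 QPS if this is the slowest OP

N = 8                                                   # auto-batching batch size
avg_batch_cost = (N * pre + mid + post) / N             # preprocess runs per sample
print(op_concurrency, service_throughput, avg_batch_cost)
```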
...@@ -7,15 +7,47 @@ Paddle Serving 通常用于单模型的一键部署,但端到端的深度学 ...@@ -7,15 +7,47 @@ Paddle Serving 通常用于单模型的一键部署,但端到端的深度学
Paddle Serving 提供了用户友好的多模型组合服务编程框架,Pipeline Serving,旨在降低编程门槛,提高资源使用率(尤其是GPU设备),提升整体的预估效率。 Paddle Serving 提供了用户友好的多模型组合服务编程框架,Pipeline Serving,旨在降低编程门槛,提高资源使用率(尤其是GPU设备),提升整体的预估效率。
## 整体架构设计 ## 整体架构设计
Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示。 Server端基于<b>RPC服务层</b><b>图执行引擎</b>构建,两者的关系如下图所示。
<center> <center>
<img src='pipeline_serving-image1.png' height = "250" align="middle"/> <img src='pipeline_serving-image1.png' height = "250" align="middle"/>
</center> </center>
### 图执行引擎
### 1. RPC服务层
为满足用户不同的使用需求,RPC服务层同时启动1个Web服务器和1个RPC服务器,可同时处理RESTful API、gRPC 2种类型请求。gRPC gateway接收RESTful API请求,通过反向代理服务器将请求转发给gRPC Service;gRPC请求由gRPC Service接收,所以,2种类型的请求统一由gRPC Service处理,确保处理逻辑一致。
#### <b>1.1 proto的输入输出结构</b>
gRPC服务和gRPC gateway服务统一用service.proto生成。
```proto
message Request {
repeated string key = 1;
repeated string value = 2;
optional string name = 3;
optional string method = 4;
optional int64 logid = 5;
optional string clientip = 6;
};
message Response {
optional int32 err_no = 1;
optional string err_msg = 2;
repeated string key = 3;
repeated string value = 4;
};
```
Request中`key``value`是配对的string数组。 `name``method`对应RESTful API的URL://{ip}:{port}/{name}/{method}。`logid``clientip`便于用户串联服务级请求和自定义策略。
Response中`err_no``err_msg`表达处理结果的正确性和错误信息,`key``value`为返回结果。
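下面是一个按上述 proto 结构发起 RESTful 请求的简单示意(端口、服务名与输入内容均为假设值,需按实际配置修改):

```python
# 简单示意:按默认 proto 的 key/value 结构发起 RESTful 请求
import json
import requests

url = "http://127.0.0.1:18071/ocr/prediction"  # //{ip}:{http_port}/{name}/{method}
payload = {"key": ["words"], "value": ["待预测的文本"]}
resp = requests.post(url=url, data=json.dumps(payload))
print(resp.json())  # 返回 err_no / err_msg / key / value
```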
### 2. 图执行引擎
图执行引擎由 OP 和 Channel 构成,相连接的 OP 之间会共享一个 Channel。 图执行引擎由 OP 和 Channel 构成,相连接的 OP 之间会共享一个 Channel。
...@@ -29,7 +61,7 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示 ...@@ -29,7 +61,7 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示
</center> </center>
### OP的设计 #### <b>2.1 OP的设计</b>
- 单个 OP 默认的功能是根据输入的 Channel 数据,访问一个 Paddle Serving 的单模型服务,并将结果存在输出的 Channel - 单个 OP 默认的功能是根据输入的 Channel 数据,访问一个 Paddle Serving 的单模型服务,并将结果存在输出的 Channel
- 单个 OP 可以支持用户自定义,包括 preprocess,process,postprocess 三个函数都可以由用户继承和实现 - 单个 OP 可以支持用户自定义,包括 preprocess,process,postprocess 三个函数都可以由用户继承和实现
...@@ -37,7 +69,7 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示 ...@@ -37,7 +69,7 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示
- 单个 OP 可以获取多个不同 RPC 请求的数据,以实现 Auto-Batching - 单个 OP 可以获取多个不同 RPC 请求的数据,以实现 Auto-Batching
- OP 可以由线程或进程启动 - OP 可以由线程或进程启动
### Channel的设计 #### <b>2.2 Channel的设计</b>
- Channel 是 OP 之间共享数据的数据结构,负责共享数据或者共享数据状态信息 - Channel 是 OP 之间共享数据的数据结构,负责共享数据或者共享数据状态信息
- Channel 可以支持多个OP的输出存储在同一个 Channel,同一个 Channel 中的数据可以被多个 OP 使用 - Channel 可以支持多个OP的输出存储在同一个 Channel,同一个 Channel 中的数据可以被多个 OP 使用
...@@ -47,8 +79,18 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示 ...@@ -47,8 +79,18 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示
<img src='pipeline_serving-image3.png' height = "500" align="middle"/> <img src='pipeline_serving-image3.png' height = "500" align="middle"/>
</center> </center>
#### <b>2.3 预测类型的设计</b>
- OP的预测类型(client_type)有3种类型,brpc、grpc和local_predictor
- brpc: 使用bRPC Client与远端的Serving服务网络交互,性能优于grpc
- grpc: 使用gRPC Client与远端的Serving服务网络交互,支持跨平台部署
- local_predictor: 本地服务内加载模型并完成预测,不需要与网络交互。支持多卡部署,和TensorRT高性能预测。
- 选型:
- 延时(越少越好): local_predict < brpc <= grpc
- 微服务: brpc或grpc模型分拆成独立服务,简化开发和部署复杂度,提升资源利用率
### 极端情况的考虑
#### <b>2.4 极端情况的考虑</b>
- 请求超时的处理 - 请求超时的处理
...@@ -65,9 +107,11 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示 ...@@ -65,9 +107,11 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示
- 对于 output buffer,可以采用和 input buffer 类似的处理方法,即调整 OP3 和 OP4 的并发数,使得 output buffer 的缓冲长度得到控制(output buffer 的长度取决于下游 OP 从 output buffer 获取数据的速度) - 对于 output buffer,可以采用和 input buffer 类似的处理方法,即调整 OP3 和 OP4 的并发数,使得 output buffer 的缓冲长度得到控制(output buffer 的长度取决于下游 OP 从 output buffer 获取数据的速度)
- 同时 Channel 中数据量不会超过 gRPC 的 `worker_num`,即线程池大小 - 同时 Channel 中数据量不会超过 gRPC 的 `worker_num`,即线程池大小
### 用户接口设计 ***
## ★ 详细设计
#### 1. 普通 OP 定义 ### 1. 普通 OP 定义
普通 OP 作为图执行引擎中的基本单元,其构造函数如下: 普通 OP 作为图执行引擎中的基本单元,其构造函数如下:
...@@ -77,11 +121,13 @@ def __init__(name=None, ...@@ -77,11 +121,13 @@ def __init__(name=None,
server_endpoints=[], server_endpoints=[],
fetch_list=[], fetch_list=[],
client_config=None, client_config=None,
client_type=None,
concurrency=1, concurrency=1,
timeout=-1, timeout=-1,
retry=1, retry=1,
batch_size=1, batch_size=1,
auto_batching_timeout=None) auto_batching_timeout=None,
local_service_handler=None)
``` ```
各参数含义如下 各参数含义如下
...@@ -90,17 +136,21 @@ def __init__(name=None, ...@@ -90,17 +136,21 @@ def __init__(name=None,
| :-------------------: | :----------------------------------------------------------: | | :-------------------: | :----------------------------------------------------------: |
| name | (str)用于标识 OP 类型的字符串,该字段必须全局唯一。 | | name | (str)用于标识 OP 类型的字符串,该字段必须全局唯一。 |
| input_ops | (list)当前 OP 的所有前继 OP 的列表。 | | input_ops | (list)当前 OP 的所有前继 OP 的列表。 |
| server_endpoints | (list)远程 Paddle Serving Service 的 endpoints 列表。如果不设置该参数,则不访问远程 Paddle Serving Service,即 不会执行 process 操作。 | | server_endpoints | (list)远程 Paddle Serving Service 的 endpoints 列表。如果不设置该参数,认为是local_precditor模式,从local_service_conf中读取配置。 |
| fetch_list | (list)远程 Paddle Serving Service 的 fetch 列表。 | | fetch_list | (list)远程 Paddle Serving Service 的 fetch 列表。 |
| client_config | (str)Paddle Serving Service 对应的 Client 端配置文件路径。 | | client_config | (str)Paddle Serving Service 对应的 Client 端配置文件路径。 |
| client_type | (str) 可选择brpc、grpc或local_predictor。local_predictor不启动Serving服务,进程内预测。 |
| concurrency | (int)OP 的并发数。 | | concurrency | (int)OP 的并发数。 |
| timeout | (int)process 操作的超时时间,单位为毫秒。若该值小于零,则视作不超时。 | | timeout | (int)process 操作的超时时间,单位为毫秒。若该值小于零,则视作不超时。 |
| retry | (int)超时重试次数。当该值为 1 时,不进行重试。 | | retry | (int)超时重试次数。当该值为 1 时,不进行重试。 |
| batch_size | (int)进行 Auto-Batching 的期望 batch_size 大小,由于构建 batch 可能超时,实际 batch_size 可能小于设定值。 | | batch_size | (int)进行 Auto-Batching 的期望 batch_size 大小,由于构建 batch 可能超时,实际 batch_size 可能小于设定值,默认为 1。 |
| auto_batching_timeout | (float)进行 Auto-Batching 构建 batch 的超时时间,单位为毫秒。 | | auto_batching_timeout | (float)进行 Auto-Batching 构建 batch 的超时时间,单位为毫秒。batch_size > 1时,要设置auto_batching_timeout,否则请求数量不足batch_size时会阻塞等待。 |
| local_service_handler | (object) local predictor handler,Op init()入参赋值 或 在Op init()中创建|
#### 2. 普通 OP二次开发接口 ### 2. 普通 OP二次开发接口
OP 二次开发的目的是满足业务开发人员控制OP处理策略。
| 变量或接口 | 说明 | | 变量或接口 | 说明 |
| :----------------------------------------------: | :----------------------------------------------------------: | | :----------------------------------------------: | :----------------------------------------------------------: |
...@@ -154,7 +204,7 @@ def init_op(self): ...@@ -154,7 +204,7 @@ def init_op(self):
需要**注意**的是,在线程版 OP 中,每个 OP 只会调用一次该函数,故加载的资源必须要求是线程安全的。 需要**注意**的是,在线程版 OP 中,每个 OP 只会调用一次该函数,故加载的资源必须要求是线程安全的。
#### 3. RequestOp 定义 ### 3. RequestOp 定义 与 二次开发接口
RequestOp 用于处理 Pipeline Server 接收到的 RPC 数据,处理后的数据将会被加入到图执行引擎中。其构造函数如下: RequestOp 用于处理 Pipeline Server 接收到的 RPC 数据,处理后的数据将会被加入到图执行引擎中。其构造函数如下:
...@@ -162,7 +212,7 @@ RequestOp 用于处理 Pipeline Server 接收到的 RPC 数据,处理后的数 ...@@ -162,7 +212,7 @@ RequestOp 用于处理 Pipeline Server 接收到的 RPC 数据,处理后的数
def __init__(self) def __init__(self)
``` ```
#### 4. RequestOp 二次开发接口 当默认的RequestOp无法满足参数解析需求时,可通过重写下面2个接口自定义请求参数解析方法。
| 变量或接口 | 说明 | | 变量或接口 | 说明 |
| :---------------------------------------: | :----------------------------------------: | | :---------------------------------------: | :----------------------------------------: |
...@@ -186,7 +236,7 @@ def unpack_request_package(self, request): ...@@ -186,7 +236,7 @@ def unpack_request_package(self, request):
要求返回值是一个字典类型。 要求返回值是一个字典类型。
#### 5. ResponseOp 定义 #### 4. ResponseOp 定义 与 二次开发接口
ResponseOp 用于处理图执行引擎的预测结果,处理后的数据将会作为 Pipeline Server 的RPC 返回值,其构造函数如下: ResponseOp 用于处理图执行引擎的预测结果,处理后的数据将会作为 Pipeline Server 的RPC 返回值,其构造函数如下:
...@@ -196,7 +246,7 @@ def __init__(self, input_ops) ...@@ -196,7 +246,7 @@ def __init__(self, input_ops)
其中,`input_ops` 是图执行引擎的最后一个 OP,用户可以通过设置不同的 `input_ops` 以在不修改 OP 的拓扑关系下构造不同的 DAG。 其中,`input_ops` 是图执行引擎的最后一个 OP,用户可以通过设置不同的 `input_ops` 以在不修改 OP 的拓扑关系下构造不同的 DAG。
#### 6. ResponseOp 二次开发接口 当默认的 ResponseOp 无法满足结果返回格式要求时,可通过重写下面2个接口自定义返回包打包方法。
| 变量或接口 | 说明 | | 变量或接口 | 说明 |
| :------------------------------------------: | :-----------------------------------------: | | :------------------------------------------: | :-----------------------------------------: |
...@@ -235,7 +285,7 @@ def pack_response_package(self, channeldata): ...@@ -235,7 +285,7 @@ def pack_response_package(self, channeldata):
return resp return resp
``` ```
#### 7. PipelineServer定义 #### 5. PipelineServer定义
PipelineServer 的定义比较简单,如下所示: PipelineServer 的定义比较简单,如下所示:
...@@ -249,22 +299,134 @@ server.run_server() ...@@ -249,22 +299,134 @@ server.run_server()
其中,`response_op` 为上面提到的 ResponseOp,PipelineServer 将会根据各个 OP 的拓扑关系初始化 Channel 并构建计算图。`config_yml_path` 为 PipelineServer 的配置文件,示例文件如下: 其中,`response_op` 为上面提到的 ResponseOp,PipelineServer 将会根据各个 OP 的拓扑关系初始化 Channel 并构建计算图。`config_yml_path` 为 PipelineServer 的配置文件,示例文件如下:
```yaml ```yaml
rpc_port: 18080 # gRPC端口号 # gRPC端口号
worker_num: 1 # gRPC线程池大小(进程版 Servicer 中为进程数),默认为 1 rpc_port: 18080
build_dag_each_worker: false # 是否使用进程版 Servicer,默认为 false
http_port: 0 # HTTP 服务的端口号,若该值小于或等于 0 则不开启 HTTP 服务,默认为 0 # http端口号,若该值小于或等于 0 则不开启 HTTP 服务,默认为 0
http_port: 18071
# #worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程,每个进程内构建grpcSever和DAG
worker_num: 1
# 是否使用进程版 Servicer,默认为 false
build_dag_each_worker: false
dag: dag:
is_thread_op: true # 是否使用线程版Op,默认为 true # op资源类型, True, 为线程模型;False,为进程模型,默认为 True
client_type: brpc # 使用 brpc 或 grpc client,默认为 brpc is_thread_op: true
retry: 1 # DAG Executor 在失败后重试次数,默认为 1,即不重试
use_profile: false # 是否在 Server 端打印日志,默认为 false # DAG Executor 在失败后重试次数,默认为 1,即不重试
retry: 1
# 是否在 Server 端打印日志,默认为 false
use_profile: false
# 跟踪框架吞吐,每个OP和channel的工作情况。无tracer时不生成数据
tracer: tracer:
interval_s: 600 # Tracer 监控的时间间隔,单位为秒。当该值小于 1 时不启动监控,默认为 -1 interval_s: 600 # 监控的时间间隔,单位为秒。当该值小于 1 时不启动监控,默认为 -1
op:
bow:
# 并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 1
# client连接类型,brpc
client_type: brpc
# Serving交互重试次数,默认不重试
retry: 1
# Serving交互超时时间, 单位ms
timeout: 3000
# Serving IPs
server_endpoints: ["127.0.0.1:9393"]
# bow模型client端配置
client_config: "imdb_bow_client_conf/serving_client_conf.prototxt"
# Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["prediction"]
# 批量查询Serving的数量, 默认1。batch_size>1要设置 auto_batching_timeout,否则不足batch_size时会阻塞
batch_size: 1
# 批量查询超时,与batch_size配合使用
auto_batching_timeout: 2000
``` ```
### 6. 特殊用法
#### 6.1 <b>业务自定义错误类型</b>
用户可根据业务场景自定义错误码,继承ProductErrCode,在Op的preprocess或postprocess的返回列表中返回自定义错误码,下一阶段处理会根据自定义错误码跳过后置OP处理。
```python
class ProductErrCode(enum.Enum):
"""
ProductErrCode is a base class for recording business error code.
product developers inherit this class and extend more error codes.
"""
pass
```
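下面给出一个继承 ProductErrCode 并在 preprocess 中返回自定义错误码的简单示意(错误码取值与返回形式均为假设,具体以框架实现为准):

```python
# 简单示意:扩展业务错误码,并在 preprocess 返回值中携带
import enum

from paddle_serving_server.pipeline import Op

class ProductErrCode(enum.Enum):
    pass

class MyErrCode(ProductErrCode):
    EMPTY_INPUT = 10001  # 假设的业务错误码:输入为空

class MyOp(Op):
    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        if not input_dict:
            # 返回自定义错误码与错误信息,后续阶段将跳过后置 OP 处理
            return input_dict, False, MyErrCode.EMPTY_INPUT.value, "empty input"
        return input_dict, False, None, ""
```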
#### <b>6.2 跳过OP process阶段</b>
preprocess返回结果列表的第二个返回值为`is_skip_process`,当其为`True`时表示跳过当前OP的process阶段,直接进入postprocess处理。
```python
def preprocess(self, input_dicts, data_id, log_id):
"""
In preprocess stage, assembling data for process stage. users can
override this function for model feed features.
Args:
input_dicts: input data to be preprocessed
data_id: inner unique id
log_id: global unique id for RTT
Return:
input_dict: data for process stage
is_skip_process: skip process stage or not, False default
prod_errcode: None default, otherwise, product errores occured.
It is handled in the same way as exception.
prod_errinfo: "" default
"""
# multiple previous Op
if len(input_dicts) != 1:
_LOGGER.critical(
self._log(
"Failed to run preprocess: this Op has multiple previous "
"inputs. Please override this func."))
os._exit(-1)
(_, input_dict), = input_dicts.items()
return input_dict, False, None, ""
```
#### <b>6.3 自定义proto Request 和 Response结构</b>
## 例子 当默认proto结构不满足业务需求时,可以自定义,但需保持下面2个文件中proto的Request和Response message结构一致。
> pipeline/gateway/proto/gateway.proto
> pipeline/proto/pipeline_service.proto
再重新编译Serving Server。
#### <b>6.4 自定义URL</b>
grpc gateway处理post请求,默认`method``prediction`,例如:127.0.0.1:8080/ocr/prediction。用户可自定义name和method,对于已有url的服务可无缝切换
```proto
service PipelineService {
rpc inference(Request) returns (Response) {
option (google.api.http) = {
post : "/{name=*}/{method=*}"
body : "*"
};
}
};
```
***
## ★ 典型示例
这里通过搭建简单的 imdb model ensemble 例子来展示如何使用 Pipeline Serving,相关代码在 `python/examples/pipeline/imdb_model_ensemble` 文件夹下可以找到,例子中的 Server 端结构如下图所示: 这里通过搭建简单的 imdb model ensemble 例子来展示如何使用 Pipeline Serving,相关代码在 `python/examples/pipeline/imdb_model_ensemble` 文件夹下可以找到,例子中的 Server 端结构如下图所示:
...@@ -275,7 +437,7 @@ dag: ...@@ -275,7 +437,7 @@ dag:
</center> </center>
### 获取模型文件并启动 Paddle Serving Service ### 1. 获取模型文件并启动 Paddle Serving Service
```shell ```shell
cd python/examples/pipeline/imdb_model_ensemble cd python/examples/pipeline/imdb_model_ensemble
...@@ -286,9 +448,84 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow. ...@@ -286,9 +448,84 @@ python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.
PipelineServing 也支持本地自动启动 PaddleServingService,请参考 `python/examples/pipeline/ocr` 下的例子。 PipelineServing 也支持本地自动启动 PaddleServingService,请参考 `python/examples/pipeline/ocr` 下的例子。
### 启动 PipelineServer ### 2. 创建config.yaml
由于config.yaml配置信息量很多,这里仅展示OP部分配置,全量信息参考`python/examples/pipeline/imdb_model_ensemble/config.yaml`
```yaml
op:
bow:
# 并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 1
# client连接类型,brpc
client_type: brpc
# Serving交互重试次数,默认不重试
retry: 1
# Serving交互超时时间, 单位ms
timeout: 3000
# Serving IPs
server_endpoints: ["127.0.0.1:9393"]
# bow模型client端配置
client_config: "imdb_bow_client_conf/serving_client_conf.prototxt"
# Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["prediction"]
# 批量查询Serving的数量, 默认1。batch_size>1要设置auto_batching_timeout,否则不足batch_size时会阻塞
batch_size: 1
# 批量查询超时,与batch_size配合使用
auto_batching_timeout: 2000
cnn:
# 并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 1
# client连接类型,brpc
client_type: brpc
# Serving交互重试次数,默认不重试
retry: 1
# 预测超时时间, 单位ms
timeout: 3000
运行下面代码 # Serving IPs
server_endpoints: ["127.0.0.1:9292"]
# cnn模型client端配置
client_config: "imdb_cnn_client_conf/serving_client_conf.prototxt"
# Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["prediction"]
# 批量查询Serving的数量, 默认1。
batch_size: 1
# 批量查询超时,与batch_size配合使用
auto_batching_timeout: 2000
combine:
# 并发数,is_thread_op=True时,为线程并发;否则为进程并发
concurrency: 1
# Serving交互重试次数,默认不重试
retry: 1
# 预测超时时间, 单位ms
timeout: 3000
# 批量查询Serving的数量, 默认1。
batch_size: 1
# 批量查询超时,与batch_size配合使用
auto_batching_timeout: 2000
```
### 3. 启动 PipelineServer
代码示例中,重点留意3个自定义Op的preprocess、postprocess处理,以及Combine Op初始化列表input_ops=[bow_op, cnn_op],设置Combine Op的前置OP列表。
```python ```python
from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
...@@ -356,7 +593,7 @@ server.prepare_server('config.yml') ...@@ -356,7 +593,7 @@ server.prepare_server('config.yml')
server.run_server() server.run_server()
``` ```
### 通过 PipelineClient 执行预测 ### 4. 通过 PipelineClient 执行预测
```python ```python
from paddle_serving_client.pipeline import PipelineClient from paddle_serving_client.pipeline import PipelineClient
...@@ -382,13 +619,16 @@ for f in futures: ...@@ -382,13 +619,16 @@ for f in futures:
exit(1) exit(1)
``` ```
***
## ★ 性能分析
## 如何通过 Timeline 工具进行优化 ### 1. 如何通过 Timeline 工具进行优化
为了更好地对性能进行优化,PipelineServing 提供了 Timeline 工具,对整个服务的各个阶段时间进行打点。 为了更好地对性能进行优化,PipelineServing 提供了 Timeline 工具,对整个服务的各个阶段时间进行打点。
### 在 Server 端输出 Profile 信息 ### 2. 在 Server 端输出 Profile 信息
Server 端用 yaml 中的 `use_profile` 字段进行控制: Server 端用 yaml 中的 `use_profile` 字段进行控制:
...@@ -415,8 +655,29 @@ if __name__ == "__main__": ...@@ -415,8 +655,29 @@ if __name__ == "__main__":
具体操作:打开 chrome 浏览器,在地址栏输入 `chrome://tracing/` ,跳转至 tracing 页面,点击 load 按钮,打开保存的 `trace` 文件,即可将预测服务的各阶段时间信息可视化。 具体操作:打开 chrome 浏览器,在地址栏输入 `chrome://tracing/` ,跳转至 tracing 页面,点击 load 按钮,打开保存的 `trace` 文件,即可将预测服务的各阶段时间信息可视化。
### 在 Client 端输出 Profile 信息 ### 3. 在 Client 端输出 Profile 信息
Client 端在 `predict` 接口设置 `profile=True`,即可开启 Profile 功能。 Client 端在 `predict` 接口设置 `profile=True`,即可开启 Profile 功能。
开启该功能后,Client 端在预测的过程中会将该次预测对应的日志信息打印到标准输出,后续分析处理同 Server。 开启该功能后,Client 端在预测的过程中会将该次预测对应的日志信息打印到标准输出,后续分析处理同 Server。
### 4. 分析方法
```
单OP耗时:
op_cost = process(pre + mid + post)
OP期望并发数:
op_concurrency = 单OP耗时(s) * 期望QPS
服务吞吐量:
service_throughput = 1 / 最慢OP的耗时 * 并发数
服务平响:
service_avg_cost = ∑op_concurrency 【关键路径】
Channel堆积:
channel_acc_size = QPS(down - up) * time
批量预测平均耗时:
avg_batch_cost = (N * pre + mid + post) / N
```
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
([简体中文](RUN_IN_DOCKER_CN.md)|English) ([简体中文](RUN_IN_DOCKER_CN.md)|English)
One of the biggest benefits of Docker is portability: it can be deployed on multiple operating systems and mainstream cloud computing platforms. The Paddle Serving Docker image can be deployed on Linux, Mac and Windows platforms.
## Requirements ## Requirements
Docker (GPU version requires nvidia-docker to be installed on the GPU machine) Docker (GPU version requires nvidia-docker to be installed on the GPU machine)
...@@ -30,63 +32,9 @@ The `-p` option is to map the `9292` port of the container to the `9292` port of ...@@ -30,63 +32,9 @@ The `-p` option is to map the `9292` port of the container to the `9292` port of
### Install PaddleServing ### Install PaddleServing
In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it: The image comes with `paddle_serving_server`, `paddle_serving_client`, and `paddle_serving_app` matching the image tag version. If users do not need to change the version, they can use them directly, which is suitable for environments without Internet access.
```bash
pip install paddle-serving-server
```
You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source of the following example) to speed up the download:
```shell
pip install paddle-serving-server -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### Test example
Get the trained Boston house price prediction model by the following command:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- Test HTTP service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci >std.log 2>err.log &
```
Running on the Client side (inside or outside the container):
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- Test RPC service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 >std.log 2>err.log &
```
Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
```bash
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
If you need to change the version, please refer to the instructions on the homepage to download the pip package of the corresponding version.
## GPU ## GPU
...@@ -98,7 +46,7 @@ The GPU version is basically the same as the CPU version, with only some differe ...@@ -98,7 +46,7 @@ The GPU version is basically the same as the CPU version, with only some differe
Refer to [this document](DOCKER_IMAGES.md) for a docker image, the following is an example of an `cuda9.0-cudnn7` image: Refer to [this document](DOCKER_IMAGES.md) for a docker image, the following is an example of an `cuda9.0-cudnn7` image:
```shell ```shell
nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7 docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
``` ```
### Create container ### Create container
...@@ -108,77 +56,21 @@ nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/se ...@@ -108,77 +56,21 @@ nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/se
nvidia-docker exec -it test bash nvidia-docker exec -it test bash
``` ```
The `-p` option is to map the `9292` port of the container to the `9292` port of the host. or
### Install PaddleServing
In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it:
```bash
pip install paddle-serving-server-gpu
```
You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source of the following example) to speed up the download:
```shell
pip install paddle-serving-server-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### Test example
When running the GPU Server, you need to set the GPUs used by the prediction service through the `--gpu_ids` option, and the CPU is used by default. An error will be reported when the value of `--gpu_ids` exceeds the environment variable `CUDA_VISIBLE_DEVICES`. The following example specifies to use a GPU with index 0:
```shell
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9292 --gpu_ids 0
```
Get the trained Boston house price prediction model by the following command:
```bash ```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz docker run --gpus all -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
tar -xzf uci_housing.tar.gz docker exec -it test bash
``` ```
- Test HTTP service The `-p` option is to map the `9292` port of the container to the `9292` port of the host.
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --name uci --gpu_ids 0
```
Running on the Client side (inside or outside the container):
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- Test RPC service
Running on the Server side (inside the container):
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0
```
Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
```bash
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
### Install PaddleServing
The image comes with `paddle_serving_server_gpu`, `paddle_serving_client`, and `paddle_serving_app` matching the image tag version. If users do not need to change the version, they can use them directly, which is suitable for environments without Internet access.
If you need to change the version, please refer to the instructions on the homepage to download the pip package of the corresponding version.
## Attention ## Precautions
Runtime images cannot be used for compilation. If you want to compile from source, refer to [COMPILE](COMPILE.md). Runtime images cannot be used for compilation. If you want to compile from source, refer to [COMPILE](COMPILE.md).
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
(简体中文|[English](RUN_IN_DOCKER.md)) (简体中文|[English](RUN_IN_DOCKER.md))
Docker最大的好处之一就是可移植性,可在多种操作系统和主流的云计算平台部署。使用Paddle Serving Docker镜像可在Linux、Mac和Windows平台部署。
## 环境要求 ## 环境要求
Docker(GPU版本需要在GPU机器上安装nvidia-docker) Docker(GPU版本需要在GPU机器上安装nvidia-docker)
...@@ -18,7 +20,6 @@ Docker(GPU版本需要在GPU机器上安装nvidia-docker) ...@@ -18,7 +20,6 @@ Docker(GPU版本需要在GPU机器上安装nvidia-docker)
docker pull hub.baidubce.com/paddlepaddle/serving:latest docker pull hub.baidubce.com/paddlepaddle/serving:latest
``` ```
### 创建容器并进入 ### 创建容器并进入
```bash ```bash
...@@ -30,74 +31,11 @@ docker exec -it test bash ...@@ -30,74 +31,11 @@ docker exec -it test bash
### 安装PaddleServing ### 安装PaddleServing
为了减小镜像的体积,镜像中没有安装Serving包,要执行下面命令进行安装。 镜像里自带对应镜像tag版本的`paddle_serving_server``paddle_serving_client``paddle_serving_app`,如果用户不需要更改版本,可以直接使用,适用于没有外网服务的环境。
```bash
pip install paddle-serving-server
```
您可能需要使用国内镜像源(例如清华源)来加速下载。
```shell
pip install paddle-serving-server -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 测试example
通过下面命令获取训练好的Boston房价预估模型:
```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
```
- 测试HTTP服务
在Server端(容器内)运行: 如果需要更换版本,请参照首页的指导,下载对应版本的pip包。
```bash ## GPU 版本
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci >std.log 2>err.log &
```
在Client端(容器内或容器外)运行:
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- 测试RPC服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 >std.log 2>err.log &
```
在Client端(容器内或容器外,需要安装`paddle-serving-client`包)运行下面Python代码:
```python
from paddle_serving_client import Client
client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
## GPU版本
GPU版本与CPU版本基本一致,只有部分接口命名的差别(GPU版本需要在GPU机器上安装nvidia-docker)。
### 获取镜像
参考[该文档](DOCKER_IMAGES_CN.md)获取镜像,这里以 `cuda9.0-cudnn7` 的镜像为例:
```shell
nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
```
### 创建容器并进入 ### 创建容器并进入
...@@ -105,74 +43,19 @@ nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7 ...@@ -105,74 +43,19 @@ nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7 nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
nvidia-docker exec -it test bash nvidia-docker exec -it test bash
``` ```
或者
`-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。
### 安装PaddleServing
为了减小镜像的体积,镜像中没有安装Serving包,要执行下面命令进行安装。
```bash
pip install paddle-serving-server-gpu
```
您可能需要使用国内镜像源(例如清华源)来加速下载。
```shell
pip install paddle-serving-server-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 测试example
在运行GPU版Server时需要通过`--gpu_ids`选项设置预测服务使用的GPU,缺省状态默认使用CPU。当设置的`--gpu_ids`超出环境变量`CUDA_VISIBLE_DEVICES`时会报错。下面的示例为指定使用索引为0的GPU:
```shell
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9292 --gpu_ids 0
```
通过下面命令获取训练好的Boston房价预估模型:
```bash ```bash
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz docker run --gpus all -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
tar -xzf uci_housing.tar.gz docker exec -it test bash
``` ```
- 测试HTTP服务 `-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。
在Server端(容器内)运行:
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --name uci --gpu_ids 0
```
在Client端(容器内或容器外)运行:
```bash
curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
```
- 测试RPC服务
在Server端(容器内)运行:
```bash
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0
```
在Client端(容器内或容器外,需要安装`paddle-serving-client`包)运行下面Python代码: ### 安装PaddleServing
```bash 镜像里自带对应镜像tag版本的`paddle_serving_server_gpu``paddle_serving_client``paddle_serving_app`,如果用户不需要更改版本,可以直接使用,适用于没有外网服务的环境。
from paddle_serving_client import Client
client = Client() 如果需要更换版本,请参照首页的指导,下载对应版本的pip包。
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
## 注意事项 ## 注意事项
......
## Paddle Serving for Windows Users
(English|[简体中文](./WINDOWS_TUTORIAL_CN.md))
### Summary
This document guides users on how to build a Paddle Serving service on the Windows platform. Due to the limited support of third-party libraries, the Windows platform currently only supports building local predictor prediction services through web services. If you want to experience all the features, you need to use Docker for Windows to simulate the operating environment of Linux.
### Running Paddle Serving on Native Windows System
**Configure Python environment variables to PATH**: **We only support Python 3.5+ on Native Windows System.** First, you need to add the directory where the Python executable program is located to the PATH. Usually in **System Properties/My Computer Properties**-**Advanced**-**Environment Variables**, click Path and add the path at the beginning. For example, `C:\Users\$USER\AppData\Local\Programs\Python\Python36`, and finally click **OK** on each dialog. If you enter python on Powershell, you can enter the python interactive interface, indicating that the environment variable configuration is successful.
**Install wget**: Because all the downloads in this tutorial, as well as the built-in model download function in `paddle_serving_app`, use the wget tool, download the binary package from the [link](http://gnuwin32.sourceforge.net/packages/wget.htm), unzip it and copy it to `C:\Windows\System32`. If there is a security prompt, allow it.
**Install Git**: For details, see [Git official website](https://git-scm.com/downloads)
**Install the necessary C++ library (optional)**: Some users may encounter the problem that the dll cannot be linked during the `import paddle` stage. It is recommended to [Install Visual Studio Community Edition](https://visualstudio.microsoft.com/), and install the relevant components of C++.
**Install Paddle and Serving**: In Powershell, execute
```
python -m pip install -U paddle_serving_server paddle_serving_client paddle_serving_app paddlepaddle
```
for GPU users,
```
python -m pip install -U paddle_serving_server_gpu paddle_serving_client paddle_serving_app paddlepaddle-gpu
```
**Git clone Serving Project:**
```
git clone https://github.com/paddlepaddle/Serving
pip install -r python/requirements_win.txt
```
**Run OCR example**:
```
cd Serving/python/examples/ocr
python -m paddle_serving_app.package --get_model ocr_rec
tar -xzvf ocr_rec.tar.gz
python -m paddle_serving_app.package --get_model ocr_det
tar -xzvf ocr_det.tar.gz
python ocr_debugger_server.py cpu &
python ocr_web_client.py
```
### Create a new Paddle Serving Web Service on Windows
Currently Windows supports the Local Predictor of the Web Service framework. The server code framework is as follows
```
# filename:your_webservice.py
from paddle_serving_server.web_service import WebService
# If it is the GPU version, please use from paddle_serving_server_gpu.web_service import WebService
class YourWebService(WebService):
def preprocess(self, feed=[], fetch=[]):
#Implement pre-processing here
#feed_dict is key: var names, value: numpy array input
#fetch_names is a list of fetch variable names
#is_batch indicates whether the numpy array in the value of feed_dict contains the batch dimension
return feed_dict, fetch_names, is_batch
def postprocess(self, feed={}, fetch=[], fetch_map=None):
#fetch map is the returned dictionary after prediction, the key is the fetch names given when the process returns, and the value is the var specific value corresponding to the fetch names
#After processing here, the result needs to be converted into a dictionary again, and the type of values should be a list, so that it can be serialized in JSON to facilitate web return
return response
your_service = YourWebService(name="XXX")
your_service.load_model_config("your_model_path")
your_service.prepare_server(workdir="workdir", port=9292)
# If you are a GPU user, you can refer to the python example under python/examples/ocr
your_service.run_debugger_service()
# Windows platform cannot use run_rpc_service() interface
your_service.run_web_service()
```
Client code example
```
# filename:your_client.py
import requests
import json
import base64
import os, sys
import time
import cv2 # If you need to upload pictures
# Used for image reading, the principle is to use base64 encoding file content
def cv2_to_base64(image):
return base64.b64encode(image).decode(
'utf8') #data.tostring()).decode('utf8')
headers = {"Content-type": "application/json"}
url = "http://127.0.0.1:9292/XXX/prediction" # XXX depends on the initial name parameter of the server YourService
r = requests.post(url=url, headers=headers, data=json.dumps(data))
print(r.json())
```
The user only needs to follow the above instructions and implement the relevant content in the corresponding function. For more information, please refer to [How to develop a new Web Service?](./NEW_WEB_SERVICE.md)
Execute after development
```
python your_webservice.py &
python your_client.py
```
Because the service needs to occupy a port, there may be a security prompt during startup; please allow it, and an IP address will be generated. It should be noted that when the Windows platform starts the service, the local IP address may not be 127.0.0.1. You need to confirm the IP address first and then decide which IP the Client should use for access.
### Docker for Windows User Guide
The above content applies to native Windows. If users want to experience the complete functionality, they need to use Docker tools to simulate a Linux environment.
Please refer to [Docker Desktop](https://www.docker.com/products/docker-desktop) to install Docker
After installation, start the docker linux engine and download the relevant image. In the Serving directory
```
docker pull hub.baidubce.com/paddlepaddle/serving:latest-devel
# There is no expose port here, users can set -p to perform port mapping as needed
docker run --rm -dit --name serving_devel -v $PWD:/Serving hub.baidubce.com/paddlepaddle/serving:latest-devel
docker exec -it serving_devel bash
cd /Serving
```
The rest of the operations are exactly the same as the Linux version.
## Windows平台使用Paddle Serving指导
([English](./WINDOWS_TUTORIAL.md)|简体中文)
### 综述
本文档指导用户如何在Windows平台手把手搭建Paddle Serving服务。由于受限第三方库的支持,Windows平台目前只支持用web service的方式搭建local predictor预测服务。如果想要体验全部的服务,需要使用Docker for Windows,来模拟Linux的运行环境。
### 原生Windows系统运行Paddle Serving
**配置Python环境变量到PATH****目前原生Windows仅支持Python 3.5或更高版本**。首先需要将Python的可执行程序所在目录加入到PATH当中。通常在**系统属性/我的电脑属性**-**高级**-**环境变量** ,点选Path,并在开头加上路径。例如`C:\Users\$USER\AppData\Local\Programs\Python\Python36`,最后连续点击**确定** 。在Powershell上如果输入python可以进入python交互界面,说明环境变量配置成功。
**安装wget工具**:由于教程当中所有的下载,以及`paddle_serving_app`当中内嵌的模型下载功能,都是用到wget工具,在链接[下载wget](http://gnuwin32.sourceforge.net/packages/wget.htm),解压后复制到`C:\Windows\System32`下,如有安全提示需要通过。
**安装Git工具**: 详情参见[Git官网](https://git-scm.com/downloads)
**安装必要的C++库(可选)**:部分用户可能会在`import paddle`阶段遇见dll无法链接的问题,建议可以[安装Visual Studio社区版本](https://visualstudio.microsoft.com/),并且安装C++的相关组件。
**安装Paddle和Serving**:在Powershell,执行
```
python -m pip install -U paddle_serving_server paddle_serving_client paddle_serving_app paddlepaddle
```
如果是GPU用户
```
python -m pip install -U paddle_serving_server_gpu paddle_serving_client paddle_serving_app paddlepaddle-gpu
```
**下载Serving库**
```
git clone https://github.com/paddlepaddle/Serving
pip install -r python/requirements_win.txt
```
**运行OCR示例**
```
cd Serving/python/examples/ocr
python -m paddle_serving_app.package --get_model ocr_rec
tar -xzvf ocr_rec.tar.gz
python -m paddle_serving_app.package --get_model ocr_det
tar -xzvf ocr_det.tar.gz
python ocr_debugger_server.py cpu &
python ocr_web_client.py
```
### 创建新的Windows支持的Paddle Serving服务
目前Windows支持Web Service框架的Local Predictor。服务端代码框架如下
```
# filename:your_webservice.py
from paddle_serving_server.web_service import WebService
# 如果是GPU版本,请使用 from paddle_serving_server_gpu.web_service import WebService
class YourWebService(WebService):
def preprocess(self, feed=[], fetch=[]):
#在这里实现前处理
#feed_dict是 key: var names, value: numpy array input
#fetch_names 是fetch变量名列表
#is_batch的含义是feed_dict的value里的numpy array是否包含了batch维度
return feed_dict, fetch_names, is_batch
def postprocess(self, feed={}, fetch=[], fetch_map=None):
#fetch map是经过预测之后的返回字典,key是process返回时给定的fetch names,value是对应fetch names的var具体值
#在这里做处理之后,结果需重新转换成字典,并且values的类型应是列表list,这样可以JSON序列化方便web返回
return response
your_service = YourWebService(name="XXX")
your_service.load_model_config("your_model_path")
your_service.prepare_server(workdir="workdir", port=9292)
# 如果是GPU用户,可以参照python/examples/ocr下的python示例
your_service.run_debugger_service()
# Windows平台不可以使用 run_rpc_service()接口
your_service.run_web_service()
```
客户端代码示例
```
# filename:your_client.py
import requests
import json
import base64
import os, sys
import time
import cv2 # 如果需要上传图片
# 用于图片读取,原理是采用base64编码文件内容
def cv2_to_base64(image):
return base64.b64encode(image).decode(
'utf8') #data.tostring()).decode('utf8')
headers = {"Content-type": "application/json"}
url = "http://127.0.0.1:9292/XXX/prediction" # XXX取决于服务端YourService的初始化name参数
r = requests.post(url=url, headers=headers, data=json.dumps(data))
print(r.json())
```
用户只需要按照如上指示,在对应函数中实现相关内容即可。更多信息请参见[如何开发一个新的Web Service?](./NEW_WEB_SERVICE_CN.md)
开发完成后执行
```
python your_webservice.py &
python your_client.py
```
因为需要占用端口,因此启动过程可能会有安全提示,请点选通过,就会有IP地址生成。需要注意的是,Windows平台启动服务时,本地IP地址可能不是127.0.0.1,需要确认好IP地址再看Client应该如何设定访问IP。
### Docker for Windows 使用指南
以上内容用于原生的Windows,如果用户想要体验完整的功能,需要使用Docker工具,来模拟Linux系统。
安装Docker请参考[Docker Desktop](https://www.docker.com/products/docker-desktop)
安装之后启动docker的linux engine,下载相关镜像。在Serving目录下
```
docker pull hub.baidubce.com/paddlepaddle/serving:latest-devel
# 此处没有expose端口,用户可根据需要设置-p来进行端口映射
docker run --rm -dit --name serving_devel -v $PWD:/Serving hub.baidubce.com/paddlepaddle/serving:latest-devel
docker exec -it serving_devel bash
cd /Serving
```
其余操作与Linux版本完全一致。
## Tutorial of Java Client for Paddle Serving
(English|[简体中文](./README_CN.md))
### Development Environment
In order to facilitate Java development, we provide a precompiled Serving project inside the Java image. The way to get the image and enter the development environment is
```
docker pull hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker run --rm -dit --name java_serving hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker exec -it java_serving bash
cd Serving/java
```
The Serving folder corresponds to the develop branch at the time the docker image was generated. You need to `git pull` to the latest version or `git checkout` to the desired branch.
### Install client dependencies
Because there are many dependent libraries, they have already been compiled once when the image was generated, so the user only needs to perform the following operations
```
mvn compile
mvn install
cd examples
mvn compile
mvn install
```
### Start the server
Take the fit_a_line model as an example, the server starts
```
cd ../../python/examples/fit_a_line
sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang &
```
Client prediction
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
```
Take yolov4 as an example, the server starts
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang & #It needs to be executed in GPU Docker, otherwise the execution method of CPU must be used.
```
Client prediction
```
# in /Serving/java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
# The case of yolov4 needs to specify a picture as input
```
### Customization guidance
The above example is running in CPU mode. If GPU mode is required, there are two options.
The first is to run GPU Serving and the Java Client in the same image. After starting the corresponding image, the user needs to move /Serving/java from the java image into that image.
The second is to deploy GPU Serving and the Java Client separately. If they are on the same host, you can find the IP address of the corresponding container through ifconfig, change the endpoint passed to client.connect in `examples/src/main/java/PaddleServingClientExample.java`, and then compile it again. Alternatively, select `--net=host` when docker starts to bind the network devices of docker and the host, so that it runs directly without customizing the java code.
**It should be noted that in the example, all models need to use `--use_multilang` to start GRPC multi-programming language support, and the port number is 9393. If you need another port, you need to modify it in the java file**
**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). The Pipeline Serving Client for Java will be released in the next version (0.4.1).**
## 用于Paddle Serving的Java客户端
([English](./README.md)|简体中文)
### 开发环境
为了方便用户使用java进行开发,我们提供了编译好的Serving工程放置在java镜像当中,获取镜像并进入开发环境的方式是
```
docker pull hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker run --rm -dit --name java_serving hub.baidubce.com/paddlepaddle/serving:0.4.0-java
docker exec -it java_serving bash
cd Serving/java
```
Serving文件夹是镜像生成时的develop分支工程目录,需要git pull 到最新版本,或者git checkout 到想要的分支。
### 安装客户端依赖
由于依赖库数量庞大,因此镜像已经在生成时编译过一次,用户执行以下操作即可
```
mvn compile
mvn install
cd examples
mvn compile
mvn install
```
### 启动服务端
以fit_a_line模型为例,服务端启动
```
cd ../../python/examples/fit_a_line
sh get_data.sh
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang &
```
客户端预测
```
cd ../../../java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
```
以yolov4为例子,服务端启动
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang & #需要在GPU Docker当中执行,否则要使用CPU的执行方式。
```
客户端预测
```
# in /Serving/java/examples/target
java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
# yolov4的案例需要指定一个图片作为输入
```
### 二次开发指导
上述示例是在CPU模式下运行,如果需要GPU模式,可以有两种选择。
第一种是GPU Serving和Java Client在同一个镜像,需要用户在启动对应的镜像后,把java镜像当中的/Serving/java移动到对应的镜像中。
第二种是GPU Serving和Java Client分开部署,如果在同一台宿主机,可以通过ifconfig了解对应容器的IP地址,然后在`examples/src/main/java/PaddleServingClientExample.java`当中对client.connect时的endpoint做修改,然后再编译一次。 或者在docker启动时选择 `--net=host`来绑定docker和宿主机的网络设备,这样不需要定制java代码可以直接运行。
**需要注意的是,在示例中,所有模型都需要使用`--use_multilang`来启动GRPC多编程语言支持,以及端口号都是9393,如果需要别的端口,需要在java文件里修改**
**目前Serving已推出Pipeline模式(详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)),下个版本(0.4.1)面向Java的Pipeline Serving Client将会发布,敬请期待。**
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<version>4.11</version> <version>4.13.1</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "core/configure/inferencer_configure.pb.h" #include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/framework/infer.h" #include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT #include "paddle_inference_api.h" // NOLINT
//#include "predictor/framework/infer.h"
namespace baidu { namespace baidu {
namespace paddle_serving { namespace paddle_serving {
......
...@@ -2,6 +2,7 @@ FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp) ...@@ -2,6 +2,7 @@ FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs}) add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs})
target_include_directories(fluid_gpu_engine PUBLIC target_include_directories(fluid_gpu_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/) ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_gpu_engine pdserving extern_paddle configure) add_dependencies(fluid_gpu_engine pdserving extern_paddle configure)
target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz) target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
......
...@@ -190,7 +190,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore { ...@@ -190,7 +190,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
paddle::AnalysisConfig analysis_config; paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path); analysis_config.SetModel(data_path);
analysis_config.EnableUseGpu(100, FLAGS_gpuid); analysis_config.EnableUseGpu(1500, FLAGS_gpuid);
analysis_config.SwitchSpecifyInputNames(true); analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1); analysis_config.SetCpuMathLibraryNumThreads(1);
...@@ -198,12 +198,68 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore { ...@@ -198,12 +198,68 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
analysis_config.EnableMemoryOptim(); analysis_config.EnableMemoryOptim();
} }
#if 0 // todo: support flexible shape
int min_seq_len = 1;
int max_seq_len = 512;
int opt_seq_len = 128;
int head_number = 12;
int batch = 50;
std::vector<int> min_in_shape = {batch, min_seq_len, 1};
std::vector<int> max_in_shape = {batch, max_seq_len, 1};
std::vector<int> opt_in_shape = {batch, opt_seq_len, 1};
std::string input1_name = "src_text_a_ids";
std::string input2_name = "pos_text_a_ids";
std::string input3_name = "sent_text_a_ids";
std::string input4_name = "stack_0.tmp_0";
std::map<std::string, std::vector<int>> min_input_shape = {
{input1_name, min_in_shape},
{input2_name, min_in_shape},
{input3_name, min_in_shape},
{input4_name, {batch, head_number, min_seq_len, min_seq_len}},
};
std::map<std::string, std::vector<int>> max_input_shape = {
{input1_name, max_in_shape},
{input2_name, max_in_shape},
{input3_name, max_in_shape},
{input4_name, {batch, head_number, max_seq_len, max_seq_len}},
};
std::map<std::string, std::vector<int>> opt_input_shape = {
{input1_name, opt_in_shape},
{input2_name, opt_in_shape},
{input3_name, opt_in_shape},
{input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
};
analysis_config.SetTRTDynamicShapeInfo(
min_input_shape, max_input_shape, opt_input_shape);
#endif
int max_batch = 32;
int min_subgraph_size = 3;
if (params.use_trt()) {
analysis_config.EnableTensorRtEngine(
1 << 20,
max_batch,
min_subgraph_size,
paddle::AnalysisConfig::Precision::kFloat32,
false,
false);
LOG(INFO) << "create TensorRT predictor";
} else {
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) { if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true); analysis_config.SwitchIrOptim(true);
} else { } else {
analysis_config.SwitchIrOptim(false); analysis_config.SwitchIrOptim(false);
} }
}
AutoLock lock(GlobalPaddleCreateMutex::instance()); AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = _core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config); paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
......
...@@ -80,6 +80,16 @@ if (SERVER) ...@@ -80,6 +80,16 @@ if (SERVER)
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES}) DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_TRT)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" trt
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
else() else()
add_custom_command( add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
......
...@@ -18,16 +18,20 @@ import sys ...@@ -18,16 +18,20 @@ import sys
from paddle_serving_client import Client from paddle_serving_client import Client
from paddle_serving_client.utils import benchmark_args from paddle_serving_client.utils import benchmark_args
from paddle_serving_app.reader import ChineseBertReader from paddle_serving_app.reader import ChineseBertReader
import numpy as np
args = benchmark_args() args = benchmark_args()
reader = ChineseBertReader({"max_seq_len": 128}) reader = ChineseBertReader({"max_seq_len": 128})
fetch = ["pooled_output"] fetch = ["pooled_output"]
endpoint_list = ["127.0.0.1:9292"] endpoint_list = ['127.0.0.1:9292']
client = Client() client = Client()
client.load_client_config(args.model) client.load_client_config(args.model)
client.connect(endpoint_list) client.connect(endpoint_list)
for line in sys.stdin: for line in sys.stdin:
feed_dict = reader.process(line) feed_dict = reader.process(line)
result = client.predict(feed=feed_dict, fetch=fetch) for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
#print(feed_dict)
result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
print(result)
...@@ -13,10 +13,11 @@ ...@@ -13,10 +13,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader import ChineseBertReader from paddle_serving_app.reader import ChineseBertReader
import sys import sys
import os import os
import numpy as np
class BertService(WebService): class BertService(WebService):
...@@ -27,18 +28,21 @@ class BertService(WebService): ...@@ -27,18 +28,21 @@ class BertService(WebService):
}) })
def preprocess(self, feed=[], fetch=[]): def preprocess(self, feed=[], fetch=[]):
feed_res = [ feed_res = []
self.reader.process(ins["words"].encode("utf-8")) for ins in feed is_batch = False
] for ins in feed:
return feed_res, fetch feed_dict = self.reader.process(ins["words"].encode("utf-8"))
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape(
(len(feed_dict[key]), 1))
feed_res.append(feed_dict)
return feed_res, fetch, is_batch
bert_service = BertService(name="bert") bert_service = BertService(name="bert")
bert_service.load() bert_service.load()
bert_service.load_model_config(sys.argv[1]) bert_service.load_model_config(sys.argv[1])
gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"]
bert_service.set_gpus(gpu_ids)
bert_service.prepare_server( bert_service.prepare_server(
workdir="workdir", port=int(sys.argv[2]), device="gpu") workdir="workdir", port=int(sys.argv[2]), device="cpu")
bert_service.run_rpc_service() bert_service.run_rpc_service()
bert_service.run_web_service() bert_service.run_web_service()
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from paddle_serving_client import Client from paddle_serving_client import Client
from paddle_serving_app.reader import ChineseBertReader from paddle_serving_app.reader import ChineseBertReader
import sys import sys
import numpy as np
client = Client() client = Client()
client.load_client_config("./bert_seq32_client/serving_client_conf.prototxt") client.load_client_config("./bert_seq32_client/serving_client_conf.prototxt")
...@@ -28,12 +29,21 @@ expected_shape = { ...@@ -28,12 +29,21 @@ expected_shape = {
"pooled_output": (4, 768) "pooled_output": (4, 768)
} }
batch_size = 4 batch_size = 4
feed_batch = [] feed_batch = {}
batch_len = 0
for line in sys.stdin: for line in sys.stdin:
feed = reader.process(line) feed = reader.process(line)
if batch_len == 0:
for key in feed.keys():
val_len = len(feed[key])
feed_batch[key] = np.array(feed[key]).reshape((1, val_len, 1))
continue
if len(feed_batch) < batch_size: if len(feed_batch) < batch_size:
feed_batch.append(feed) for key in feed.keys():
np.concatenate([
feed_batch[key], np.array(feed[key]).reshape((1, val_len, 1))
])
else: else:
fetch_map = client.predict(feed=feed_batch, fetch=fetch) fetch_map = client.predict(feed=feed_batch, fetch=fetch)
feed_batch = [] feed_batch = []
......
...@@ -11,31 +11,30 @@ ...@@ -11,31 +11,30 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing
import os from paddle_serving_client import Client
import sys from paddle_serving_app.reader import *
from paddle_serving_server_gpu import OpMaker import numpy as np
from paddle_serving_server_gpu import OpSeqMaker
from paddle_serving_server_gpu import Server
op_maker = OpMaker() preprocess = Sequential([
read_op = op_maker.create('general_reader') File2Image(), BGR2RGB(), Div(255.0),
general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer') Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False),
response_op = op_maker.create('general_response') Resize(800, 1333), Transpose((2, 0, 1)), PadStride(32)
])
op_seq_maker = OpSeqMaker() postprocess = RCNNPostprocess("label_list.txt", "output")
op_seq_maker.add_op(read_op) client = Client()
op_seq_maker.add_op(general_dist_kv_infer_op) client.load_client_config("serving_client/serving_client_conf.prototxt")
op_seq_maker.add_op(response_op) client.connect(['127.0.0.1:9292'])
im = preprocess('000000570688.jpg')
server = Server() fetch_map = client.predict(
server.set_op_sequence(op_seq_maker.get_op_sequence()) feed={
server.set_num_threads(4) "image": im,
server.load_model_config(sys.argv[1]) "im_info": np.array(list(im.shape[1:]) + [1.0]),
server.prepare_server( "im_shape": np.array(list(im.shape[1:]) + [1.0])
workdir="work_dir1", },
port=9292, fetch=["multiclass_nms_0.tmp_0"],
device="cpu", batch=False)
cube_conf="./cube/conf/cube.conf") fetch_map["image"] = '000000570688.jpg'
server.run_server() print(fetch_map)
postprocess(fetch_map)
print(fetch_map)
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
import time import time
import criteo_reader as criteo import criteo_reader as criteo
from paddle_serving_client.metric import auc from paddle_serving_client.metric import auc
import numpy as np
import sys import sys
py_version = sys.version_info[0] py_version = sys.version_info[0]
...@@ -49,7 +49,8 @@ for ei in range(1000): ...@@ -49,7 +49,8 @@ for ei in range(1000):
data = reader().__next__() data = reader().__next__()
feed_dict = {} feed_dict = {}
for i in range(1, 27): for i in range(1, 27):
feed_dict["sparse_{}".format(i - 1)] = data[0][i] feed_dict["sparse_{}".format(i - 1)] = np.array(data[0][i]).reshape(-1)
feed_dict["sparse_{}.lod".format(i - 1)] = [0, len(data[0][i])]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"]) fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
end = time.time() end = time.time()
print(end - start) print(end - start)
## Criteo CTR with Sparse Parameter Indexing Service
([简体中文](./README_CN.md)|English)
### Get Sample Dataset
go to directory `python/examples/criteo_ctr_with_cube`
```
sh get_data.sh
```
### Download Model and Sparse Parameter Sequence Files
```
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
```
After the files are moved, the serving model will be in `./ctr_serving_model_kv` and the client configuration in `./ctr_client_conf`.
### Start Sparse Parameter Indexing Service
```
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
```
Here, the sparse parameters of the model are served by Cube, the sparse parameter indexing service.
### Start the RPC Predictor (4 serving threads by default, configurable in test_server.py)
```
python test_server.py ctr_serving_model_kv
```
### Run Prediction
```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
```
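For reference, the following is a condensed sketch of what the client does under the hood (the full script `test_client.py` is included later in this change); it assumes the `ctr_client_conf` and `raw_data` directories prepared in the steps above:

```python
from paddle_serving_client import Client
import criteo as criteo

client = Client()
client.load_client_config("ctr_client_conf/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

dataset = criteo.CriteoDataset()
dataset.setup(1000001)  # sparse feature hashing space, must match training
reader = dataset.infer_reader(["raw_data/part-0"], 1, 100)  # filelist, batch, buf_size

for data in reader():
    # one dense slot plus 26 hashed categorical slots, keyed by the embedding inputs
    feed_dict = {"dense_input": data[0][0]}
    for i in range(1, 27):
        feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][i]
    fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
    print(fetch_map["prob"])
```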
### Benchmark
CPU: Intel(R) Xeon(R) CPU 6148 @ 2.40GHz
Model: [Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
Server core/thread num: 4/8
Run
```
bash benchmark.sh
```
Each client thread sends 1,000 batches.
| client thread num | prepro | client infer | op0 | op1 | op2 | postpro | avg_latency | qps |
| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
| 1 | 0.035 | 1.596 | 0.021 | 0.518 | 0.0024 | 0.0025 | 6.774 | 147.7 |
| 2 | 0.034 | 1.780 | 0.027 | 0.463 | 0.0020 | 0.0023 | 6.931 | 288.3 |
| 4 | 0.038 | 2.954 | 0.025 | 0.455 | 0.0019 | 0.0027 | 8.378 | 477.5 |
| 8 | 0.044 | 8.230 | 0.028 | 0.464 | 0.0023 | 0.0034 | 14.191 | 563.8 |
| 16 | 0.048 | 21.037 | 0.028 | 0.455 | 0.0025 | 0.0041 | 27.236 | 587.5 |
Average latency per thread:
![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
The QPS per thread is shown below:
![qps](../../../doc/criteo-cube-benchmark-qps.png)
## 带稀疏参数索引服务的CTR预测服务
(简体中文|[English](./README.md))
### 获取样例数据
进入目录 `python/examples/criteo_ctr_with_cube`
```
sh get_data.sh
```
### 下载模型和稀疏参数序列文件
```
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
```
执行脚本后会在当前目录有ctr_server_model_kv和ctr_client_config文件夹。
### 启动稀疏参数索引服务
```
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
```
此处,模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中。
### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置)
```
python test_server.py ctr_serving_model_kv
```
### 执行预测
```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
```
### Benchmark
设备 :Intel(R) Xeon(R) CPU 6148 @ 2.40GHz
模型 :[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
server core/thread num : 4/8
执行
```
bash benchmark.sh
```
客户端每个线程会发送1000个batch
| client thread num | prepro | client infer | op0 | op1 | op2 | postpro | avg_latency | qps |
| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
| 1 | 0.035 | 1.596 | 0.021 | 0.518 | 0.0024 | 0.0025 | 6.774 | 147.7 |
| 2 | 0.034 | 1.780 | 0.027 | 0.463 | 0.0020 | 0.0023 | 6.931 | 288.3 |
| 4 | 0.038 | 2.954 | 0.025 | 0.455 | 0.0019 | 0.0027 | 8.378 | 477.5 |
| 8 | 0.044 | 8.230 | 0.028 | 0.464 | 0.0023 | 0.0034 | 14.191 | 563.8 |
| 16 | 0.048 | 21.037 | 0.028 | 0.455 | 0.0025 | 0.0041 | 27.236 | 587.5 |
平均每个线程耗时图如下
![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
每个线程QPS耗时如下
![qps](../../../doc/criteo-cube-benchmark-qps.png)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
parser.add_argument(
'--train_data_path',
type=str,
default='./data/raw/train.txt',
help="The path of training dataset")
parser.add_argument(
'--sparse_only',
type=bool,
default=False,
help="Whether we use sparse features only")
parser.add_argument(
'--test_data_path',
type=str,
default='./data/raw/valid.txt',
help="The path of testing dataset")
parser.add_argument(
'--batch_size',
type=int,
default=1000,
help="The size of mini-batch (default:1000)")
parser.add_argument(
'--embedding_size',
type=int,
default=10,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="The number of passes to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store (default: models)')
parser.add_argument(
'--sparse_feature_dim',
type=int,
default=1000001,
help='sparse feature hashing space for index processing')
parser.add_argument(
'--is_local',
type=int,
default=1,
help='Local train or distributed train (default: 1)')
parser.add_argument(
'--cloud_train',
type=int,
default=0,
help='Local train or distributed train on paddlecloud (default: 0)')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
help='Whether start pserver in async mode to support ASGD')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
help='Whether split variables into blocks when update_method is pserver')
parser.add_argument(
'--role',
type=str,
default='pserver', # trainer or pserver
help='The role of this node: trainer or pserver (default: pserver)')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current pserver endpoint (default: 127.0.0.1:6000)')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='The id of the current trainer (default: 0)')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The number of trainers (default: 1)')
return parser.parse_args()
rm profile_log
export FLAGS_profile_client=1
export FLAGS_profile_server=1
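# the profiling flags make client and server emit op-level timing, which ../util/show_profile.py aggregates below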
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
python test_server.py ctr_serving_model_kv > serving_log 2>&1 &
for thread_num in 1 4 16
do
for batch_size in 1 4 16 64
do
$PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
echo "batch size : $batch_size"
echo "thread num : $thread_num"
echo "========================================"
echo "batch size : $batch_size" >> profile_log
$PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
tail -n 3 profile >> profile_log
done
done
ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
rm profile_log
#wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
#tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
#wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
#tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
cp ../../../build_server/core/cube/cube-api/cube-cli .
python gen_key.py
for thread_num in 1 4 16 32
do
for batch_size in 1000
do
./cube-cli -config_file ./cube/conf/cube.conf -keys key -dict test_dict -thread_num $thread_num --batch $batch_size > profile 2>&1
echo "batch size : $batch_size"
echo "thread num : $thread_num"
echo "========================================"
echo "batch size : $batch_size" >> profile_log
echo "thread num : $thread_num" >> profile_log
tail -n 8 profile >> profile_log
done
done
ps -ef|grep 'cube'|grep -v grep|cut -c 9-15 | xargs kill -9
ps -ef | grep cube | awk {'print $2'} | xargs kill -9
rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
ps -ef | grep test | awk {'print $2'} | xargs kill -9
ps -ef | grep serving | awk {'print $2'} | xargs kill -9
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
class CriteoDataset(object):
def setup(self, sparse_feature_dim):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.cont_diff_ = [
20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.hash_dim_ = sparse_feature_dim
# here, training data are lines with line_index < train_idx_
self.train_idx_ = 41256555
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
def _process_line(self, line):
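# each raw Criteo line is tab-separated: label, 13 continuous features, 26 categorical features;
# continuous values are scaled with cont_min_/cont_diff_, categorical values are hashed into hash_dim_ buckets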
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
if features[idx] == '':
dense_feature.append(0.0)
else:
dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
return dense_feature, sparse_feature, [int(features[0])]
def infer_reader(self, filelist, batch, buf_size):
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, label = self._process_line(
line)
#yield dense_feature, sparse_feature, label
yield [dense_feature] + sparse_feature + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
def generate_sample(self, line):
def data_iter():
dense_feature, sparse_feature, label = self._process_line(line)
feature_name = ["dense_input"]
for idx in self.categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
return data_iter
if __name__ == "__main__":
criteo_dataset = CriteoDataset()
criteo_dataset.setup(int(sys.argv[1]))
criteo_dataset.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import sys
import paddle.fluid.incubate.data_generator as dg
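# data generator used as the dataset pipe command in local_train.py: generate_sample yields
# (feature_name, value) pairs for dense_input, C1..C26 and label for every training line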
class CriteoDataset(dg.MultiSlotDataGenerator):
def setup(self, sparse_feature_dim):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.cont_diff_ = [
20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.hash_dim_ = sparse_feature_dim
# here, training data are lines with line_index < train_idx_
self.train_idx_ = 41256555
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
if features[idx] == '':
dense_feature.append(0.0)
else:
dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
return dense_feature, sparse_feature, [int(features[0])]
def infer_reader(self, filelist, batch, buf_size):
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, label = self._process_line(
line)
#yield dense_feature, sparse_feature, label
yield [dense_feature] + sparse_feature + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
def generate_sample(self, line):
def data_iter():
dense_feature, sparse_feature, label = self._process_line(line)
feature_name = ["dense_input"]
for idx in self.categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
return data_iter
if __name__ == "__main__":
criteo_dataset = CriteoDataset()
criteo_dataset.setup(int(sys.argv[1]))
criteo_dataset.run_from_stdin()
[{
"dict_name": "test_dict",
"shard": 1,
"dup": 1,
"timeout": 200,
"retry": 3,
"backup_request": 100,
"type": "ipport_list",
"load_balancer": "rr",
"nodes": [{
"ipport_list": "list://127.0.0.1:8027"
}]
}]
--port=8027
--dict_split=1
--in_mem=true
--log_dir=./log/
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
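# build a full (base) cube dictionary from ./cube_model and start the cube service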
mkdir -p cube_model
mkdir -p cube/data
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
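# convert the trained SparseFeatFactors embedding into 8-bit quantized key-value records,
# build the base cube dictionary from them, and start the cube service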
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/data/ctr_prediction/ctr_data.tar.gz
tar -zxvf ctr_data.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from __future__ import print_function
from args import parse_args
import os
import paddle.fluid as fluid
import sys
from network_conf import dnn_model
dense_feature_dim = 13
def train():
args = parse_args()
sparse_only = args.sparse_only
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_input_ids = [
fluid.layers.data(
name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
for i in range(1, 27)
]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
#nn_input = None if sparse_only else dense_input
nn_input = dense_input
predict_y, loss, auc_var, batch_auc_var, infer_vars = dnn_model(
nn_input, sparse_input_ids, label, args.embedding_size,
args.sparse_feature_dim)
optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
optimizer.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_use_var([dense_input] + sparse_input_ids + [label])
python_executable = "python"
pipe_command = "{} criteo_reader.py {}".format(python_executable,
args.sparse_feature_dim)
dataset.set_pipe_command(pipe_command)
dataset.set_batch_size(128)
thread_num = 10
dataset.set_thread(thread_num)
whole_filelist = [
"raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
]
print(whole_filelist)
dataset.set_filelist(whole_filelist[:100])
dataset.load_into_memory()
fluid.layers.Print(auc_var)
epochs = 1
for i in range(epochs):
exe.train_from_dataset(
program=fluid.default_main_program(), dataset=dataset, debug=True)
print("epoch {} finished".format(i))
import paddle_serving_client.io as server_io
feed_var_dict = {}
feed_var_dict['dense_input'] = dense_input
for i, sparse in enumerate(sparse_input_ids):
feed_var_dict["embedding_{}.tmp_0".format(i)] = sparse
fetch_var_dict = {"prob": predict_y}
feed_kv_dict = {}
feed_kv_dict['dense_input'] = dense_input
for i, emb in enumerate(infer_vars):
feed_kv_dict["embedding_{}.tmp_0".format(i)] = emb
fetch_var_dict = {"prob": predict_y}
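# two serving configs are saved: ctr_serving_model takes the raw sparse ids and keeps the
# embedding lookup inside the graph, while ctr_serving_model_kv exposes the embedding output
# variables as inputs so the lookup can be done outside the graph by the cube service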
server_io.save_model("ctr_serving_model", "ctr_client_conf", feed_var_dict,
fetch_var_dict, fluid.default_main_program())
server_io.save_model("ctr_serving_model_kv", "ctr_client_conf_kv",
feed_kv_dict, fetch_var_dict,
fluid.default_main_program())
if __name__ == '__main__':
train()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import paddle.fluid as fluid
import math
def dnn_model(dense_input, sparse_inputs, label, embedding_size,
sparse_feature_dim):
def embedding_layer(input):
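# return both the raw embedding output (exported later for cube lookup) and
# its sequence-pooled sum, which feeds the MLP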
emb = fluid.layers.embedding(
input=input,
is_sparse=True,
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
x = fluid.layers.sequence_pool(input=emb, pool_type='sum')
return emb, x
def mlp_input_tensor(emb_sums, dense_tensor):
#if isinstance(dense_tensor, fluid.Variable):
# return fluid.layers.concat(emb_sums, axis=1)
#else:
return fluid.layers.concat(emb_sums + [dense_tensor], axis=1)
def mlp(mlp_input):
fc1 = fluid.layers.fc(input=mlp_input,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(mlp_input.shape[1]))))
fc2 = fluid.layers.fc(input=fc1,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))))
fc3 = fluid.layers.fc(input=fc2,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))))
pre = fluid.layers.fc(input=fc3,
size=2,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc3.shape[1]))))
return pre
emb_pair_sums = list(map(embedding_layer, sparse_inputs))
emb_sums = [x[1] for x in emb_pair_sums]
infer_vars = [x[0] for x in emb_pair_sums]
mlp_in = mlp_input_tensor(emb_sums, dense_input)
predict = mlp(mlp_in)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
return predict, avg_cost, auc_var, batch_auc_var, infer_vars
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import Client
import sys
import os
import criteo as criteo
import time
from paddle_serving_client.metric import auc
py_version = sys.version_info[0]
client = Client()
client.load_client_config(sys.argv[1])
client.connect(["127.0.0.1:9292"])
batch = 1
buf_size = 100
dataset = criteo.CriteoDataset()
dataset.setup(1000001)
test_filelists = ["{}/part-0".format(sys.argv[2])]
reader = dataset.infer_reader(test_filelists, batch, buf_size)
label_list = []
prob_list = []
start = time.time()
for ei in range(10000):
if py_version == 2:
data = reader().next()
else:
data = reader().__next__()
feed_dict = {}
feed_dict['dense_input'] = data[0][0]
for i in range(1, 27):
feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][i]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
prob_list.append(fetch_map['prob'][0][1])
label_list.append(data[0][-1][0])
print(auc(label_list, prob_list))
end = time.time()
print(end - start)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer')
response_op = op_maker.create('general_response')
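# op sequence: general_reader parses the request, general_dist_kv_infer looks up sparse
# embeddings from the cube service before running inference, general_response returns the result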
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1])
server.prepare_server(
workdir="work_dir1",
port=9292,
device="cpu",
cube_conf="./cube/conf/cube.conf")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
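# quantized variant of the dist-kv infer op, for cube dictionaries built with
# seq_generator's 8-bit quantization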
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1])
server.prepare_server(
workdir="work_dir1",
port=9292,
device="cpu",
cube_conf="./cube/conf/cube.conf")
server.run_server()
...@@ -15,77 +15,109 @@ ...@@ -15,77 +15,109 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_client import Client from __future__ import unicode_literals, absolute_import
import sys
import os import os
import criteo as criteo import sys
import time import time
import json
import requests
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args from paddle_serving_client.utils import benchmark_args, show_latency
from paddle_serving_client.metric import auc from paddle_serving_app.reader import ChineseBertReader
from paddle_serving_app.reader import *
import numpy as np
py_version = sys.version_info[0]
args = benchmark_args() args = benchmark_args()
def single_func(idx, resource): def single_func(idx, resource):
client = Client() img = "./000000570688.jpg"
print([resource["endpoint"][idx % len(resource["endpoint"])]]) profile_flags = False
client.load_client_config('ctr_client_conf/serving_client_conf.prototxt') latency_flags = False
client.connect(['127.0.0.1:9292']) if os.getenv("FLAGS_profile_client"):
batch = 1 profile_flags = True
buf_size = 100 if os.getenv("FLAGS_serving_latency"):
dataset = criteo.CriteoDataset() latency_flags = True
dataset.setup(1000001) latency_list = []
test_filelists = [
"./raw_data/part-%d" % x for x in range(len(os.listdir("./raw_data")))
]
reader = dataset.infer_reader(test_filelists[len(test_filelists) - 40:],
batch, buf_size)
if args.request == "rpc": if args.request == "rpc":
fetch = ["prob"] preprocess = Sequential([
File2Image(), BGR2RGB(), Div(255.0),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], False),
Resize(640, 640), Transpose((2, 0, 1))
])
postprocess = RCNNPostprocess("label_list.txt", "output")
client = Client()
client.load_client_config(args.model)
client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
start = time.time() start = time.time()
itr = 1000 for i in range(turns):
for ei in range(itr): if args.batch_size >= 1:
if args.batch_size > 0: l_start = time.time()
feed_batch = [] feed_batch = []
b_start = time.time()
im = preprocess(img)
for bi in range(args.batch_size): for bi in range(args.batch_size):
if py_version == 2: print("1111batch")
data = reader().next() print(bi)
else: feed_batch.append({
data = reader().__next__() "image": im,
feed_dict = {} "im_info": np.array(list(im.shape[1:]) + [1.0]),
feed_dict['dense_input'] = data[0][0] "im_shape": np.array(list(im.shape[1:]) + [1.0])
for i in range(1, 27): })
feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][ # im = preprocess(img)
i] b_end = time.time()
feed_batch.append(feed_dict)
result = client.predict(feed=feed_batch, fetch=fetch) if profile_flags:
sys.stderr.write(
"PROFILE\tpid:{}\tbert_pre_0:{} bert_pre_1:{}\n".format(
os.getpid(),
int(round(b_start * 1000000)),
int(round(b_end * 1000000))))
#result = client.predict(feed=feed_batch, fetch=fetch)
fetch_map = client.predict(
feed=feed_batch, fetch=["multiclass_nms"])
fetch_map["image"] = img
postprocess(fetch_map)
l_end = time.time()
if latency_flags:
latency_list.append(l_end * 1000 - l_start * 1000)
else: else:
print("unsupport batch size {}".format(args.batch_size)) print("unsupport batch size {}".format(args.batch_size))
else:
elif args.request == "http": raise ValueError("not implemented {} request".format(args.request))
raise ("Not support http service.")
end = time.time() end = time.time()
qps = itr * args.batch_size / (end - start) if latency_flags:
return [[end - start, qps]] return [[end - start], latency_list]
else:
return [[end - start]]
if __name__ == '__main__': if __name__ == '__main__':
multi_thread_runner = MultiThreadRunner() multi_thread_runner = MultiThreadRunner()
endpoint_list = ["127.0.0.1:9292"] endpoint_list = ["127.0.0.1:7777"]
#result = single_func(0, {"endpoint": endpoint_list}) turns = 10
start = time.time() start = time.time()
result = multi_thread_runner.run(single_func, args.thread, result = multi_thread_runner.run(
{"endpoint": endpoint_list}) single_func, args.thread, {"endpoint": endpoint_list,
"turns": turns})
end = time.time() end = time.time()
total_cost = end - start total_cost = end - start
avg_cost = 0 avg_cost = 0
qps = 0
for i in range(args.thread): for i in range(args.thread):
avg_cost += result[0][i * 2 + 0] avg_cost += result[0][i]
qps += result[0][i * 2 + 1]
avg_cost = avg_cost / args.thread avg_cost = avg_cost / args.thread
print("total cost: {}".format(total_cost))
print("average total cost {} s.".format(avg_cost)) print("total cost: {}s".format(total_cost))
print("qps {} ins/s".format(qps)) print("each thread cost: {}s. ".format(avg_cost))
print("qps: {}samples/s".format(args.batch_size * args.thread * turns /
total_cost))
if os.getenv("FLAGS_serving_latency"):
show_latency(result[1])
rm profile_log*
export CUDA_VISIBLE_DEVICES=0
export FLAGS_profile_server=1
export FLAGS_profile_client=1
export FLAGS_serving_latency=1
gpu_id=0
#save cpu and gpu utilization log
if [ -d utilization ];then
rm -rf utilization
else
mkdir utilization
fi
#start server
$PYTHONROOT/bin/python3 -m paddle_serving_server_gpu.serve --model $1 --port 7777 --thread 4 --gpu_ids 0 --ir_optim > elog 2>&1 &
sleep 5
#warm up
$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
for thread_num in 1 4 8 16
do
for batch_size in 1
do
job_bt=`date '+%Y%m%d%H%M%S'`
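# sample GPU memory and utilization every 100 ms in the background during this run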
nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
gpu_memory_pid=$!
$PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
kill ${gpu_memory_pid}
kill `ps -ef|grep used_memory|awk '{print $2}'`
echo "model_name:" $1
echo "thread_num:" $thread_num
echo "batch_size:" $batch_size
echo "=================Done===================="
echo "model_name:$1" >> profile_log_$1
echo "batch_size:$batch_size" >> profile_log_$1
$PYTHONROOT/bin/python3 cpu_utilization.py >> profile_log_$1
job_et=`date '+%Y%m%d%H%M%S'`
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1
awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
rm -rf gpu_use.log gpu_utilization.log
$PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
tail -n 8 profile >> profile_log_$1
echo "" >> profile_log_$1
done
done
#Divided log
awk 'BEGIN{RS="\n\n"}{i++}{print > "bert_log_"i}' profile_log_$1
mkdir bert_log && mv bert_log_* bert_log
ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
background
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
...@@ -36,6 +36,7 @@ fetch_map = client.predict( ...@@ -36,6 +36,7 @@ fetch_map = client.predict(
"im_info": np.array(list(im.shape[1:]) + [1.0]), "im_info": np.array(list(im.shape[1:]) + [1.0]),
"im_shape": np.array(list(im.shape[1:]) + [1.0]) "im_shape": np.array(list(im.shape[1:]) + [1.0])
}, },
fetch=["multiclass_nms"]) fetch=["multiclass_nms"],
batch=False)
fetch_map["image"] = sys.argv[3] fetch_map["image"] = sys.argv[3]
postprocess(fetch_map) postprocess(fetch_map)
...@@ -14,12 +14,6 @@ sh get_data.sh ...@@ -14,12 +14,6 @@ sh get_data.sh
### Start server ### Start server
``` shell
python test_server.py uci_housing_model/
```
You can also start the default RPC service with the following line of code:
```shell ```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
``` ```
...@@ -40,7 +34,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt ...@@ -40,7 +34,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt
Start a web service with default web service hosting modules: Start a web service with default web service hosting modules:
``` shell ``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci python test_server.py
``` ```
### Client prediction ### Client prediction
......
...@@ -41,7 +41,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt ...@@ -41,7 +41,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt
通过下面的一行代码开启默认web服务: 通过下面的一行代码开启默认web服务:
``` shell ``` shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci python test_server.py
``` ```
### 客户端预测 ### 客户端预测
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from paddle_serving_client import Client from paddle_serving_client import Client
import sys import sys
import numpy as np
client = Client() client = Client()
client.load_client_config(sys.argv[1]) client.load_client_config(sys.argv[1])
...@@ -27,5 +28,9 @@ test_reader = paddle.batch( ...@@ -27,5 +28,9 @@ test_reader = paddle.batch(
batch_size=1) batch_size=1)
for data in test_reader(): for data in test_reader():
fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"]) new_data = np.zeros((1, 1, 13)).astype("float32")
new_data[0] = data[0][0]
fetch_map = client.predict(
feed={"x": new_data}, fetch=["price"], batch=True)
print("{} {}".format(fetch_map["price"][0], data[0][1][0])) print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
print(fetch_map)
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from paddle_serving_client import Client from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
import paddle import paddle
import numpy as np
def single_func(idx, resource): def single_func(idx, resource):
...@@ -26,6 +27,7 @@ def single_func(idx, resource): ...@@ -26,6 +27,7 @@ def single_func(idx, resource):
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584,
0.6283, 0.4919, 0.1856, 0.0795, -0.0332 0.6283, 0.4919, 0.1856, 0.0795, -0.0332
] ]
x = np.array(x)
for i in range(1000): for i in range(1000):
fetch_map = client.predict(feed={"x": x}, fetch=["price"]) fetch_map = client.predict(feed={"x": x}, fetch=["price"])
if fetch_map is None: if fetch_map is None:
......
...@@ -13,24 +13,24 @@ ...@@ -13,24 +13,24 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
import os from paddle_serving_server.web_service import WebService
import sys import numpy as np
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker() class UciService(WebService):
op_seq_maker.add_op(read_op) def preprocess(self, feed=[], fetch=[]):
op_seq_maker.add_op(general_infer_op) feed_batch = []
op_seq_maker.add_op(response_op) is_batch = True
new_data = np.zeros((len(feed), 1, 13)).astype("float32")
for i, ins in enumerate(feed):
nums = np.array(ins["x"]).reshape(1, 1, 13)
new_data[i] = nums
feed = {"x": new_data}
return feed, fetch, is_batch
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence()) uci_service = UciService(name="uci")
server.load_model_config(sys.argv[1]) uci_service.load_model_config("uci_housing_model")
server.prepare_server(workdir="work_dir1", port=9393, device="cpu") uci_service.prepare_server(workdir="workdir", port=9292)
server.run_server() uci_service.run_rpc_service()
uci_service.run_web_service()
...@@ -38,7 +38,8 @@ start = time.time() ...@@ -38,7 +38,8 @@ start = time.time()
image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg" image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"
for i in range(10): for i in range(10):
img = seq(image_file) img = seq(image_file)
fetch_map = client.predict(feed={"image": img}, fetch=["score"]) fetch_map = client.predict(
feed={"image": img}, fetch=["score"], batch=False)
prob = max(fetch_map["score"][0]) prob = max(fetch_map["score"][0])
label = label_dict[fetch_map["score"][0].tolist().index(prob)].strip( label = label_dict[fetch_map["score"][0].tolist().index(prob)].strip(
).replace(",", "") ).replace(",", "")
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import sys import sys
from paddle_serving_client import Client from paddle_serving_client import Client
import numpy as np
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
if len(sys.argv) != 4: if len(sys.argv) != 4:
...@@ -43,12 +44,13 @@ class ImageService(WebService): ...@@ -43,12 +44,13 @@ class ImageService(WebService):
def preprocess(self, feed=[], fetch=[]): def preprocess(self, feed=[], fetch=[]):
feed_batch = [] feed_batch = []
is_batch = True
for ins in feed: for ins in feed:
if "image" not in ins: if "image" not in ins:
raise ("feed data error!") raise ("feed data error!")
img = self.seq(ins["image"]) img = self.seq(ins["image"])
feed_batch.append({"image": img}) feed_batch.append({"image": img[np.newaxis, :]})
return feed_batch, fetch return feed_batch, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map={}): def postprocess(self, feed=[], fetch=[], fetch_map={}):
score_list = fetch_map["score"] score_list = fetch_map["score"]
......
...@@ -17,7 +17,8 @@ import os ...@@ -17,7 +17,8 @@ import os
import sys import sys
import time import time
import requests import requests
from paddle_serving_app.reader import IMDBDataset import numpy as np
from paddle_serving_app.reader.imdb_reader import IMDBDataset
from paddle_serving_client import Client from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import MultiThreadRunner, benchmark_args, show_latency from paddle_serving_client.utils import MultiThreadRunner, benchmark_args, show_latency
...@@ -47,11 +48,17 @@ def single_func(idx, resource): ...@@ -47,11 +48,17 @@ def single_func(idx, resource):
for i in range(1000): for i in range(1000):
if args.batch_size >= 1: if args.batch_size >= 1:
feed_batch = [] feed_batch = []
feed = {"words": [], "words.lod": [0]}
for bi in range(args.batch_size): for bi in range(args.batch_size):
word_ids, label = imdb_dataset.get_words_and_label(dataset[ word_ids, label = imdb_dataset.get_words_and_label(dataset[
bi]) bi])
feed_batch.append({"words": word_ids}) feed["words.lod"].append(feed["words.lod"][-1] + len(
result = client.predict(feed=feed_batch, fetch=["prediction"]) word_ids))
feed["words"].extend(word_ids)
feed["words"] = np.array(feed["words"]).reshape(
len(feed["words"]), 1)
result = client.predict(
feed=feed, fetch=["prediction"], batch=True)
if result is None: if result is None:
raise ("predict failed.") raise ("predict failed.")
else: else:
......
...@@ -13,8 +13,9 @@ ...@@ -13,8 +13,9 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_client import Client from paddle_serving_client import Client
from paddle_serving_app.reader import IMDBDataset from paddle_serving_app.reader.imdb_reader import IMDBDataset
import sys import sys
import numpy as np
client = Client() client = Client()
client.load_client_config(sys.argv[1]) client.load_client_config(sys.argv[1])
...@@ -28,7 +29,12 @@ imdb_dataset.load_resource(sys.argv[2]) ...@@ -28,7 +29,12 @@ imdb_dataset.load_resource(sys.argv[2])
for line in sys.stdin: for line in sys.stdin:
word_ids, label = imdb_dataset.get_words_and_label(line) word_ids, label = imdb_dataset.get_words_and_label(line)
feed = {"words": word_ids} word_len = len(word_ids)
feed = {
"words": np.array(word_ids).reshape(word_len, 1),
"words.lod": [0, word_len]
}
#print(feed)
fetch = ["prediction"] fetch = ["prediction"]
fetch_map = client.predict(feed=feed, fetch=fetch) fetch_map = client.predict(feed=feed, fetch=fetch, batch=True)
print("{} {}".format(fetch_map["prediction"][0], label[0])) print("{} {}".format(fetch_map["prediction"][0], label[0]))
...@@ -14,8 +14,9 @@ ...@@ -14,8 +14,9 @@
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader import IMDBDataset from paddle_serving_app.reader.imdb_reader import IMDBDataset
import sys import sys
import numpy as np
class IMDBService(WebService): class IMDBService(WebService):
...@@ -26,10 +27,16 @@ class IMDBService(WebService): ...@@ -26,10 +27,16 @@ class IMDBService(WebService):
self.dataset.load_resource(args["dict_file_path"]) self.dataset.load_resource(args["dict_file_path"])
def preprocess(self, feed={}, fetch=[]): def preprocess(self, feed={}, fetch=[]):
res_feed = [{ feed_batch = []
"words": self.dataset.get_words_only(ins["words"]) words_lod = [0]
} for ins in feed] is_batch = True
return res_feed, fetch for ins in feed:
words = self.dataset.get_words_only(ins["words"])
words = np.array(words).reshape(len(words), 1)
words_lod.append(words_lod[-1] + len(words))
feed_batch.append(words)
feed = {"words": np.concatenate(feed_batch), "words.lod": words_lod}
return feed, fetch, is_batch
imdb_service = IMDBService(name="imdb") imdb_service = IMDBService(name="imdb")
......
...@@ -19,6 +19,7 @@ from paddle_serving_app.reader import LACReader ...@@ -19,6 +19,7 @@ from paddle_serving_app.reader import LACReader
import sys import sys
import os import os
import io import io
import numpy as np
client = Client() client = Client()
client.load_client_config(sys.argv[1]) client.load_client_config(sys.argv[1])
...@@ -31,7 +32,17 @@ for line in sys.stdin: ...@@ -31,7 +32,17 @@ for line in sys.stdin:
feed_data = reader.process(line) feed_data = reader.process(line)
if len(feed_data) <= 0: if len(feed_data) <= 0:
continue continue
fetch_map = client.predict(feed={"words": feed_data}, fetch=["crf_decode"]) print(feed_data)
#fetch_map = client.predict(feed={"words": np.array(feed_data).reshape(len(feed_data), 1), "words.lod": [0, len(feed_data)]}, fetch=["crf_decode"], batch=True)
fetch_map = client.predict(
feed={
"words": np.array(feed_data + feed_data).reshape(
len(feed_data) * 2, 1),
"words.lod": [0, len(feed_data), 2 * len(feed_data)]
},
fetch=["crf_decode"],
batch=True)
print(fetch_map)
begin = fetch_map['crf_decode.lod'][0] begin = fetch_map['crf_decode.lod'][0]
end = fetch_map['crf_decode.lod'][1] end = fetch_map['crf_decode.lod'][1]
segs = reader.parse_result(line, fetch_map["crf_decode"][begin:end]) segs = reader.parse_result(line, fetch_map["crf_decode"][begin:end])
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
import sys import sys
from paddle_serving_app.reader import LACReader from paddle_serving_app.reader import LACReader
import numpy as np
class LACService(WebService): class LACService(WebService):
...@@ -23,13 +24,21 @@ class LACService(WebService): ...@@ -23,13 +24,21 @@ class LACService(WebService):
def preprocess(self, feed={}, fetch=[]): def preprocess(self, feed={}, fetch=[]):
feed_batch = [] feed_batch = []
fetch = ["crf_decode"]
lod_info = [0]
is_batch = True
for ins in feed: for ins in feed:
if "words" not in ins: if "words" not in ins:
raise ("feed data error!") raise ("feed data error!")
feed_data = self.reader.process(ins["words"]) feed_data = self.reader.process(ins["words"])
feed_batch.append({"words": feed_data}) feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1))
fetch = ["crf_decode"] lod_info.append(lod_info[-1] + len(feed_data))
return feed_batch, fetch feed_dict = {
"words": np.concatenate(
feed_batch, axis=0),
"words.lod": lod_info
}
return feed_dict, fetch, is_batch
def postprocess(self, feed={}, fetch=[], fetch_map={}): def postprocess(self, feed={}, fetch=[], fetch_map={}):
batch_ret = [] batch_ret = []
......
...@@ -34,9 +34,9 @@ python ocr_web_server.py gpu ...@@ -34,9 +34,9 @@ python ocr_web_server.py gpu
``` ```
python ocr_web_client.py python ocr_web_client.py
``` ```
If you want a faster web service, please try Web Debugger Service If you want a faster web service, please try Web LocalPredictor Service
## Web Debugger Service ## Web LocalPredictor Service
``` ```
#choose one of cpu/gpu commands as following #choose one of cpu/gpu commands as following
#for cpu user #for cpu user
...@@ -45,7 +45,7 @@ python ocr_debugger_server.py cpu ...@@ -45,7 +45,7 @@ python ocr_debugger_server.py cpu
python ocr_debugger_server.py gpu python ocr_debugger_server.py gpu
``` ```
## Web Debugger Client Prediction ## Web LocalPredictor Client Prediction
``` ```
python ocr_web_client.py python ocr_web_client.py
``` ```
...@@ -61,7 +61,7 @@ Dataset: RCTW 500 sample images ...@@ -61,7 +61,7 @@ Dataset: RCTW 500 sample images
| engine | client read image(ms) | client-server tras time(ms) | server read image(ms) | det pre(ms) | det infer(ms) | det post(ms) | rec pre(ms) | rec infer(ms) | rec post(ms) | server-client trans time(ms) | server side time consumption(ms) | server side overhead(ms) | total time(ms) | | engine | client read image(ms) | client-server tras time(ms) | server read image(ms) | det pre(ms) | det infer(ms) | det post(ms) | rec pre(ms) | rec infer(ms) | rec post(ms) | server-client trans time(ms) | server side time consumption(ms) | server side overhead(ms) | total time(ms) |
|------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------| |------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------|
| Serving web service | 8.69 | 13.41 | 109.97 | 2.82 | 87.76 | 4.29 | 3.98 | 78.51 | 3.66 | 4.12 | 181.02 | 136.49 | 317.51 | | Serving web service | 8.69 | 13.41 | 109.97 | 2.82 | 87.76 | 4.29 | 3.98 | 78.51 | 3.66 | 4.12 | 181.02 | 136.49 | 317.51 |
| Serving Debugger web service | 8.73 | 16.42 | 115.27 | 2.93 | 20.63 | 3.97 | 4.48 | 13.84 | 3.60 | 6.91 | 49.45 | 147.33 | 196.78 | | Serving LocalPredictor web service | 8.73 | 16.42 | 115.27 | 2.93 | 20.63 | 3.97 | 4.48 | 13.84 | 3.60 | 6.91 | 49.45 | 147.33 | 196.78 |
## Appendix: For Users who want to launch Det or Rec only ## Appendix: For Users who want to launch Det or Rec only
if you are going to detect images not recognize it or directly recognize the words from images. We also provide Det and Rec server for you. if you are going to detect images not recognize it or directly recognize the words from images. We also provide Det and Rec server for you.
......
...@@ -34,8 +34,8 @@ python ocr_web_server.py gpu ...@@ -34,8 +34,8 @@ python ocr_web_server.py gpu
python ocr_web_client.py python ocr_web_client.py
``` ```
如果用户需要更快的执行速度,请尝试Debugger版Web服务 如果用户需要更快的执行速度,请尝试LocalPredictor版Web服务
## 启动Debugger版Web服务 ## 启动LocalPredictor版Web服务
``` ```
#根据CPU/GPU设备选择一种启动方式 #根据CPU/GPU设备选择一种启动方式
#for cpu user #for cpu user
...@@ -60,7 +60,7 @@ GPU: Nvidia Tesla V100单卡 ...@@ -60,7 +60,7 @@ GPU: Nvidia Tesla V100单卡
| engine | 客户端读图(ms) | 客户端发送请求到服务端(ms) | 服务端读图(ms) | 检测预处理耗时(ms) | 检测模型耗时(ms) | 检测后处理耗时(ms) | 识别预处理耗时(ms) | 识别模型耗时(ms) | 识别后处理耗时(ms) | 服务端回传客户端时间(ms) | 服务端整体耗时(ms) | 空跑耗时(ms) | 整体耗时(ms) | | engine | 客户端读图(ms) | 客户端发送请求到服务端(ms) | 服务端读图(ms) | 检测预处理耗时(ms) | 检测模型耗时(ms) | 检测后处理耗时(ms) | 识别预处理耗时(ms) | 识别模型耗时(ms) | 识别后处理耗时(ms) | 服务端回传客户端时间(ms) | 服务端整体耗时(ms) | 空跑耗时(ms) | 整体耗时(ms) |
|------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------| |------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------|
| Serving web service | 8.69 | 13.41 | 109.97 | 2.82 | 87.76 | 4.29 | 3.98 | 78.51 | 3.66 | 4.12 | 181.02 | 136.49 | 317.51 | | Serving web service | 8.69 | 13.41 | 109.97 | 2.82 | 87.76 | 4.29 | 3.98 | 78.51 | 3.66 | 4.12 | 181.02 | 136.49 | 317.51 |
| Serving Debugger web service | 8.73 | 16.42 | 115.27 | 2.93 | 20.63 | 3.97 | 4.48 | 13.84 | 3.60 | 6.91 | 49.45 | 147.33 | 196.78 | | Serving LocalPredictor web service | 8.73 | 16.42 | 115.27 | 2.93 | 20.63 | 3.97 | 4.48 | 13.84 | 3.60 | 6.91 | 49.45 | 147.33 | 196.78 |
## 附录: 检测/识别单服务启动 ## 附录: 检测/识别单服务启动
......
...@@ -53,7 +53,9 @@ class OCRService(WebService): ...@@ -53,7 +53,9 @@ class OCRService(WebService):
self.ori_h, self.ori_w, _ = im.shape self.ori_h, self.ori_w, _ = im.shape
det_img = self.det_preprocess(im) det_img = self.det_preprocess(im)
_, self.new_h, self.new_w = det_img.shape _, self.new_h, self.new_w = det_img.shape
return {"image": det_img[np.newaxis, :].copy()}, ["concat_1.tmp_0"] return {
"image": det_img[np.newaxis, :].copy()
}, ["concat_1.tmp_0"], True
def postprocess(self, feed={}, fetch=[], fetch_map=None): def postprocess(self, feed={}, fetch=[], fetch_map=None):
det_out = fetch_map["concat_1.tmp_0"] det_out = fetch_map["concat_1.tmp_0"]
......
...@@ -54,7 +54,7 @@ class OCRService(WebService):
        det_img = self.det_preprocess(im)
        _, self.new_h, self.new_w = det_img.shape
        print(det_img)
-       return {"image": det_img}, ["concat_1.tmp_0"]
+       return {"image": det_img}, ["concat_1.tmp_0"], False
    def postprocess(self, feed={}, fetch=[], fetch_map=None):
        det_out = fetch_map["concat_1.tmp_0"]
......
...@@ -26,7 +26,7 @@ if sys.argv[1] == 'gpu':
    from paddle_serving_server_gpu.web_service import WebService
elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
-from paddle_serving_app.local_predict import Debugger
+from paddle_serving_app.local_predict import LocalPredictor
import time
import re
import base64
...@@ -39,13 +39,12 @@ class OCRService(WebService):
            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
                (2, 0, 1))
        ])
-       self.det_client = Debugger()
+       self.det_client = LocalPredictor()
        if sys.argv[1] == 'gpu':
            self.det_client.load_model_config(
-               det_model_config, gpu=True, profile=False)
+               det_model_config, use_gpu=True, gpu_id=1)
        elif sys.argv[1] == 'cpu':
-           self.det_client.load_model_config(
-               det_model_config, gpu=False, profile=False)
+           self.det_client.load_model_config(det_model_config)
        self.ocr_reader = OCRReader()
    def preprocess(self, feed=[], fetch=[]):
...@@ -58,7 +57,7 @@ class OCRService(WebService):
        det_img = det_img[np.newaxis, :]
        det_img = det_img.copy()
        det_out = self.det_client.predict(
-           feed={"image": det_img}, fetch=["concat_1.tmp_0"])
+           feed={"image": det_img}, fetch=["concat_1.tmp_0"], batch=True)
        filter_func = FilterBoxes(10, 10)
        post_func = DBPostProcess({
            "thresh": 0.3,
...@@ -91,7 +90,7 @@ class OCRService(WebService):
            imgs[id] = norm_img
        feed = {"image": imgs.copy()}
        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
-       return feed, fetch
+       return feed, fetch, True
    def postprocess(self, feed={}, fetch=[], fetch_map=None):
        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
...@@ -107,7 +106,8 @@ ocr_service.load_model_config("ocr_rec_model")
ocr_service.prepare_server(workdir="workdir", port=9292)
ocr_service.init_det_debugger(det_model_config="ocr_det_model")
if sys.argv[1] == 'gpu':
-   ocr_service.run_debugger_service(gpu=True)
+   ocr_service.set_gpus("2")
+   ocr_service.run_debugger_service()
elif sys.argv[1] == 'cpu':
    ocr_service.run_debugger_service()
ocr_service.run_web_service()
...@@ -36,4 +36,5 @@ for img_file in os.listdir(test_img_dir):
    image = cv2_to_base64(image_data1)
    data = {"feed": [{"image": image}], "fetch": ["res"]}
    r = requests.post(url=url, headers=headers, data=json.dumps(data))
+   print(r)
    print(r.json())
...@@ -50,7 +50,7 @@ class OCRService(WebService):
        ori_h, ori_w, _ = im.shape
        det_img = self.det_preprocess(im)
        det_out = self.det_client.predict(
-           feed={"image": det_img}, fetch=["concat_1.tmp_0"])
+           feed={"image": det_img}, fetch=["concat_1.tmp_0"], batch=False)
        _, new_h, new_w = det_img.shape
        filter_func = FilterBoxes(10, 10)
        post_func = DBPostProcess({
...@@ -77,10 +77,10 @@ class OCRService(WebService):
            max_wh_ratio = max(max_wh_ratio, wh_ratio)
        for img in img_list:
            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
-           feed = {"image": norm_img}
-           feed_list.append(feed)
+           feed_list.append(norm_img[np.newaxis, :])
+       feed_batch = {"image": np.concatenate(feed_list, axis=0)}
        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
-       return feed_list, fetch
+       return feed_batch, fetch, True
    def postprocess(self, feed={}, fetch=[], fetch_map=None):
        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
......
...@@ -52,7 +52,7 @@ class OCRService(WebService):
        imgs[i] = norm_img
        feed = {"image": imgs.copy()}
        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
-       return feed, fetch
+       return feed, fetch, True
    def postprocess(self, feed={}, fetch=[], fetch_map=None):
        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
......
...@@ -51,10 +51,17 @@ class OCRService(WebService):
            max_wh_ratio = max(max_wh_ratio, wh_ratio)
        for img in img_list:
            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
-           feed = {"image": norm_img}
-           feed_list.append(feed)
+           #feed = {"image": norm_img}
+           feed_list.append(norm_img)
+       if len(feed_list) == 1:
+           feed_batch = {
+               "image": np.concatenate(
+                   feed_list, axis=0)[np.newaxis, :]
+           }
+       else:
+           feed_batch = {"image": np.concatenate(feed_list, axis=0)}
        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
-       return feed_list, fetch
+       return feed_batch, fetch, True
    def postprocess(self, feed={}, fetch=[], fetch_map=None):
        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from paddle_serving_client import Client
from paddle_serving_app.reader import Sequential, File2Image, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes
client = Client()
client.load_client_config("ocr_det_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9494"])
read_image_file = File2Image()
preprocess = Sequential([
ResizeByFactor(32, 960), Div(255),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
(2, 0, 1))
])
post_func = DBPostProcess({
"thresh": 0.3,
"box_thresh": 0.5,
"max_candidates": 1000,
"unclip_ratio": 1.5,
"min_size": 3
})
filter_func = FilterBoxes(10, 10)
# "name" is the path of the image to run detection on; here it is assumed to
# come from the command line, e.g. `python ocr_det_client.py some_image.jpg`
name = sys.argv[1]
img = read_image_file(name)
ori_h, ori_w, _ = img.shape
img = preprocess(img)
new_h, new_w, _ = img.shape
ratio_list = [float(new_h) / ori_h, float(new_w) / ori_w]
outputs = client.predict(feed={"image": img}, fetch=["concat_1.tmp_0"])
dt_boxes_list = post_func(outputs["concat_1.tmp_0"], [ratio_list])
dt_boxes = filter_func(dt_boxes_list[0], [ori_h, ori_w])
# Imagenet Pipeline WebService
This document takes the Imagenet service as an example to introduce how to use Pipeline WebService.
## Get model
```
sh get_model.sh
```
## Start server
```
python resnet50_web_service.py &>log.txt &
```
## RPC test
```
python pipeline_rpc_client.py
```
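For reference, `pipeline_rpc_client.py` boils down to the following condensed sketch of the script shipped with this example (the port matches `rpc_port` in config.yml and `daisy.jpg` comes with the example data):
```python
from paddle_serving_server_gpu.pipeline import PipelineClient
import base64

# connect to the rpc_port configured in config.yml
client = PipelineClient()
client.connect(['127.0.0.1:9999'])

# the pipeline expects a base64-encoded image string under the key "image"
with open("daisy.jpg", 'rb') as f:
    image = base64.b64encode(f.read()).decode('utf8')

ret = client.predict(feed_dict={"image": image}, fetch=["label", "prob"])
print(ret)
```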
# Imagenet Pipeline WebService
This document takes the Uci service as an example to introduce how to use Pipeline WebService.
## Get model
```
sh get_data.sh
```
## Start server
```
python web_service.py &>log.txt &
```
## Test
```
curl -X POST -k http://localhost:18082/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
```
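The same request can also be issued from Python; this is a minimal sketch assuming the `requests` package is installed and the server above is listening on port 18082:
```python
import json
import requests

# same payload as the curl command above
url = "http://localhost:18082/uci/prediction"
data = {
    "key": ["x"],
    "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, "
              "-0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]
}
r = requests.post(url=url, data=json.dumps(data))
print(r.json())
```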
#worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own grpc server and DAG
##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the grpc thread pool of the main thread
worker_num: 1
#http port; rpc_port and http_port must not both be empty. When rpc_port is valid and http_port is empty, no http_port is generated automatically
http_port: 18082
rpc_port: 9999
dag:
    #op resource type: True for the thread model, False for the process model
    is_thread_op: False
op:
    imagenet:
        #when the op has no server_endpoints configured, local service settings are read from local_service_conf
        local_service_conf:
            #concurrency; thread-level when is_thread_op=True, otherwise process-level
            concurrency: 2
            #model path
            model_config: ResNet50_vd_model
            #device IDs: "" or unset means CPU inference; "0" or "0,1,2" means GPU inference on the listed cards
            devices: "0" # "0,1"
            #client type: brpc, grpc or local_predictor; local_predictor runs inference in-process without starting a Serving service
            client_type: local_predictor
            #list of fetch results, named by the alias_name of fetch_var in client_config
            fetch_list: ["score"]
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imagenet-example/ResNet50_vd.tar.gz
tar -xzvf ResNet50_vd.tar.gz
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imagenet-example/image_data.tar.gz
tar -xzvf image_data.tar.gz
tench, Tinca tinca,
goldfish, Carassius auratus,
great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias,
tiger shark, Galeocerdo cuvieri,
hammerhead, hammerhead shark,
electric ray, crampfish, numbfish, torpedo,
stingray,
cock,
hen,
ostrich, Struthio camelus,
brambling, Fringilla montifringilla,
goldfinch, Carduelis carduelis,
house finch, linnet, Carpodacus mexicanus,
junco, snowbird,
indigo bunting, indigo finch, indigo bird, Passerina cyanea,
robin, American robin, Turdus migratorius,
bulbul,
jay,
magpie,
chickadee,
water ouzel, dipper,
kite,
bald eagle, American eagle, Haliaeetus leucocephalus,
vulture,
great grey owl, great gray owl, Strix nebulosa,
European fire salamander, Salamandra salamandra,
common newt, Triturus vulgaris,
eft,
spotted salamander, Ambystoma maculatum,
axolotl, mud puppy, Ambystoma mexicanum,
bullfrog, Rana catesbeiana,
tree frog, tree-frog,
tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui,
loggerhead, loggerhead turtle, Caretta caretta,
leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea,
mud turtle,
terrapin,
box turtle, box tortoise,
banded gecko,
common iguana, iguana, Iguana iguana,
American chameleon, anole, Anolis carolinensis,
whiptail, whiptail lizard,
agama,
frilled lizard, Chlamydosaurus kingi,
alligator lizard,
Gila monster, Heloderma suspectum,
green lizard, Lacerta viridis,
African chameleon, Chamaeleo chamaeleon,
Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis,
African crocodile, Nile crocodile, Crocodylus niloticus,
American alligator, Alligator mississipiensis,
triceratops,
thunder snake, worm snake, Carphophis amoenus,
ringneck snake, ring-necked snake, ring snake,
hognose snake, puff adder, sand viper,
green snake, grass snake,
king snake, kingsnake,
garter snake, grass snake,
water snake,
vine snake,
night snake, Hypsiglena torquata,
boa constrictor, Constrictor constrictor,
rock python, rock snake, Python sebae,
Indian cobra, Naja naja,
green mamba,
sea snake,
horned viper, cerastes, sand viper, horned asp, Cerastes cornutus,
diamondback, diamondback rattlesnake, Crotalus adamanteus,
sidewinder, horned rattlesnake, Crotalus cerastes,
trilobite,
harvestman, daddy longlegs, Phalangium opilio,
scorpion,
black and gold garden spider, Argiope aurantia,
barn spider, Araneus cavaticus,
garden spider, Aranea diademata,
black widow, Latrodectus mactans,
tarantula,
wolf spider, hunting spider,
tick,
centipede,
black grouse,
ptarmigan,
ruffed grouse, partridge, Bonasa umbellus,
prairie chicken, prairie grouse, prairie fowl,
peacock,
quail,
partridge,
African grey, African gray, Psittacus erithacus,
macaw,
sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita,
lorikeet,
coucal,
bee eater,
hornbill,
hummingbird,
jacamar,
toucan,
drake,
red-breasted merganser, Mergus serrator,
goose,
black swan, Cygnus atratus,
tusker,
echidna, spiny anteater, anteater,
platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus,
wallaby, brush kangaroo,
koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus,
wombat,
jellyfish,
sea anemone, anemone,
brain coral,
flatworm, platyhelminth,
nematode, nematode worm, roundworm,
conch,
snail,
slug,
sea slug, nudibranch,
chiton, coat-of-mail shell, sea cradle, polyplacophore,
chambered nautilus, pearly nautilus, nautilus,
Dungeness crab, Cancer magister,
rock crab, Cancer irroratus,
fiddler crab,
king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica,
American lobster, Northern lobster, Maine lobster, Homarus americanus,
spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish,
crayfish, crawfish, crawdad, crawdaddy,
hermit crab,
isopod,
white stork, Ciconia ciconia,
black stork, Ciconia nigra,
spoonbill,
flamingo,
little blue heron, Egretta caerulea,
American egret, great white heron, Egretta albus,
bittern,
crane,
limpkin, Aramus pictus,
European gallinule, Porphyrio porphyrio,
American coot, marsh hen, mud hen, water hen, Fulica americana,
bustard,
ruddy turnstone, Arenaria interpres,
red-backed sandpiper, dunlin, Erolia alpina,
redshank, Tringa totanus,
dowitcher,
oystercatcher, oyster catcher,
pelican,
king penguin, Aptenodytes patagonica,
albatross, mollymawk,
grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus,
killer whale, killer, orca, grampus, sea wolf, Orcinus orca,
dugong, Dugong dugon,
sea lion,
Chihuahua,
Japanese spaniel,
Maltese dog, Maltese terrier, Maltese,
Pekinese, Pekingese, Peke,
Shih-Tzu,
Blenheim spaniel,
papillon,
toy terrier,
Rhodesian ridgeback,
Afghan hound, Afghan,
basset, basset hound,
beagle,
bloodhound, sleuthhound,
bluetick,
black-and-tan coonhound,
Walker hound, Walker foxhound,
English foxhound,
redbone,
borzoi, Russian wolfhound,
Irish wolfhound,
Italian greyhound,
whippet,
Ibizan hound, Ibizan Podenco,
Norwegian elkhound, elkhound,
otterhound, otter hound,
Saluki, gazelle hound,
Scottish deerhound, deerhound,
Weimaraner,
Staffordshire bullterrier, Staffordshire bull terrier,
American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier,
Bedlington terrier,
Border terrier,
Kerry blue terrier,
Irish terrier,
Norfolk terrier,
Norwich terrier,
Yorkshire terrier,
wire-haired fox terrier,
Lakeland terrier,
Sealyham terrier, Sealyham,
Airedale, Airedale terrier,
cairn, cairn terrier,
Australian terrier,
Dandie Dinmont, Dandie Dinmont terrier,
Boston bull, Boston terrier,
miniature schnauzer,
giant schnauzer,
standard schnauzer,
Scotch terrier, Scottish terrier, Scottie,
Tibetan terrier, chrysanthemum dog,
silky terrier, Sydney silky,
soft-coated wheaten terrier,
West Highland white terrier,
Lhasa, Lhasa apso,
flat-coated retriever,
curly-coated retriever,
golden retriever,
Labrador retriever,
Chesapeake Bay retriever,
German short-haired pointer,
vizsla, Hungarian pointer,
English setter,
Irish setter, red setter,
Gordon setter,
Brittany spaniel,
clumber, clumber spaniel,
English springer, English springer spaniel,
Welsh springer spaniel,
cocker spaniel, English cocker spaniel, cocker,
Sussex spaniel,
Irish water spaniel,
kuvasz,
schipperke,
groenendael,
malinois,
briard,
kelpie,
komondor,
Old English sheepdog, bobtail,
Shetland sheepdog, Shetland sheep dog, Shetland,
collie,
Border collie,
Bouvier des Flandres, Bouviers des Flandres,
Rottweiler,
German shepherd, German shepherd dog, German police dog, alsatian,
Doberman, Doberman pinscher,
miniature pinscher,
Greater Swiss Mountain dog,
Bernese mountain dog,
Appenzeller,
EntleBucher,
boxer,
bull mastiff,
Tibetan mastiff,
French bulldog,
Great Dane,
Saint Bernard, St Bernard,
Eskimo dog, husky,
malamute, malemute, Alaskan malamute,
Siberian husky,
dalmatian, coach dog, carriage dog,
affenpinscher, monkey pinscher, monkey dog,
basenji,
pug, pug-dog,
Leonberg,
Newfoundland, Newfoundland dog,
Great Pyrenees,
Samoyed, Samoyede,
Pomeranian,
chow, chow chow,
keeshond,
Brabancon griffon,
Pembroke, Pembroke Welsh corgi,
Cardigan, Cardigan Welsh corgi,
toy poodle,
miniature poodle,
standard poodle,
Mexican hairless,
timber wolf, grey wolf, gray wolf, Canis lupus,
white wolf, Arctic wolf, Canis lupus tundrarum,
red wolf, maned wolf, Canis rufus, Canis niger,
coyote, prairie wolf, brush wolf, Canis latrans,
dingo, warrigal, warragal, Canis dingo,
dhole, Cuon alpinus,
African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus,
hyena, hyaena,
red fox, Vulpes vulpes,
kit fox, Vulpes macrotis,
Arctic fox, white fox, Alopex lagopus,
grey fox, gray fox, Urocyon cinereoargenteus,
tabby, tabby cat,
tiger cat,
Persian cat,
Siamese cat, Siamese,
Egyptian cat,
cougar, puma, catamount, mountain lion, painter, panther, Felis concolor,
lynx, catamount,
leopard, Panthera pardus,
snow leopard, ounce, Panthera uncia,
jaguar, panther, Panthera onca, Felis onca,
lion, king of beasts, Panthera leo,
tiger, Panthera tigris,
cheetah, chetah, Acinonyx jubatus,
brown bear, bruin, Ursus arctos,
American black bear, black bear, Ursus americanus, Euarctos americanus,
ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus,
sloth bear, Melursus ursinus, Ursus ursinus,
mongoose,
meerkat, mierkat,
tiger beetle,
ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle,
ground beetle, carabid beetle,
long-horned beetle, longicorn, longicorn beetle,
leaf beetle, chrysomelid,
dung beetle,
rhinoceros beetle,
weevil,
fly,
bee,
ant, emmet, pismire,
grasshopper, hopper,
cricket,
walking stick, walkingstick, stick insect,
cockroach, roach,
mantis, mantid,
cicada, cicala,
leafhopper,
lacewing, lacewing fly,
"dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
damselfly,
admiral,
ringlet, ringlet butterfly,
monarch, monarch butterfly, milkweed butterfly, Danaus plexippus,
cabbage butterfly,
sulphur butterfly, sulfur butterfly,
lycaenid, lycaenid butterfly,
starfish, sea star,
sea urchin,
sea cucumber, holothurian,
wood rabbit, cottontail, cottontail rabbit,
hare,
Angora, Angora rabbit,
hamster,
porcupine, hedgehog,
fox squirrel, eastern fox squirrel, Sciurus niger,
marmot,
beaver,
guinea pig, Cavia cobaya,
sorrel,
zebra,
hog, pig, grunter, squealer, Sus scrofa,
wild boar, boar, Sus scrofa,
warthog,
hippopotamus, hippo, river horse, Hippopotamus amphibius,
ox,
water buffalo, water ox, Asiatic buffalo, Bubalus bubalis,
bison,
ram, tup,
bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis,
ibex, Capra ibex,
hartebeest,
impala, Aepyceros melampus,
gazelle,
Arabian camel, dromedary, Camelus dromedarius,
llama,
weasel,
mink,
polecat, fitch, foulmart, foumart, Mustela putorius,
black-footed ferret, ferret, Mustela nigripes,
otter,
skunk, polecat, wood pussy,
badger,
armadillo,
three-toed sloth, ai, Bradypus tridactylus,
orangutan, orang, orangutang, Pongo pygmaeus,
gorilla, Gorilla gorilla,
chimpanzee, chimp, Pan troglodytes,
gibbon, Hylobates lar,
siamang, Hylobates syndactylus, Symphalangus syndactylus,
guenon, guenon monkey,
patas, hussar monkey, Erythrocebus patas,
baboon,
macaque,
langur,
colobus, colobus monkey,
proboscis monkey, Nasalis larvatus,
marmoset,
capuchin, ringtail, Cebus capucinus,
howler monkey, howler,
titi, titi monkey,
spider monkey, Ateles geoffroyi,
squirrel monkey, Saimiri sciureus,
Madagascar cat, ring-tailed lemur, Lemur catta,
indri, indris, Indri indri, Indri brevicaudatus,
Indian elephant, Elephas maximus,
African elephant, Loxodonta africana,
lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens,
giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca,
barracouta, snoek,
eel,
coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch,
rock beauty, Holocanthus tricolor,
anemone fish,
sturgeon,
gar, garfish, garpike, billfish, Lepisosteus osseus,
lionfish,
puffer, pufferfish, blowfish, globefish,
abacus,
abaya,
"academic gown, academic robe, judges robe",
accordion, piano accordion, squeeze box,
acoustic guitar,
aircraft carrier, carrier, flattop, attack aircraft carrier,
airliner,
airship, dirigible,
altar,
ambulance,
amphibian, amphibious vehicle,
analog clock,
apiary, bee house,
apron,
ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin,
assault rifle, assault gun,
backpack, back pack, knapsack, packsack, rucksack, haversack,
bakery, bakeshop, bakehouse,
balance beam, beam,
balloon,
ballpoint, ballpoint pen, ballpen, Biro,
Band Aid,
banjo,
bannister, banister, balustrade, balusters, handrail,
barbell,
barber chair,
barbershop,
barn,
barometer,
barrel, cask,
barrow, garden cart, lawn cart, wheelbarrow,
baseball,
basketball,
bassinet,
bassoon,
bathing cap, swimming cap,
bath towel,
bathtub, bathing tub, bath, tub,
beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon,
beacon, lighthouse, beacon light, pharos,
beaker,
bearskin, busby, shako,
beer bottle,
beer glass,
bell cote, bell cot,
bib,
bicycle-built-for-two, tandem bicycle, tandem,
bikini, two-piece,
binder, ring-binder,
binoculars, field glasses, opera glasses,
birdhouse,
boathouse,
bobsled, bobsleigh, bob,
bolo tie, bolo, bola tie, bola,
bonnet, poke bonnet,
bookcase,
bookshop, bookstore, bookstall,
bottlecap,
bow,
bow tie, bow-tie, bowtie,
brass, memorial tablet, plaque,
brassiere, bra, bandeau,
breakwater, groin, groyne, mole, bulwark, seawall, jetty,
breastplate, aegis, egis,
broom,
bucket, pail,
buckle,
bulletproof vest,
bullet train, bullet,
butcher shop, meat market,
cab, hack, taxi, taxicab,
caldron, cauldron,
candle, taper, wax light,
cannon,
canoe,
can opener, tin opener,
cardigan,
car mirror,
carousel, carrousel, merry-go-round, roundabout, whirligig,
"carpenters kit, tool kit",
carton,
car wheel,
cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM,
cassette,
cassette player,
castle,
catamaran,
CD player,
cello, violoncello,
cellular telephone, cellular phone, cellphone, cell, mobile phone,
chain,
chainlink fence,
chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour,
chain saw, chainsaw,
chest,
chiffonier, commode,
chime, bell, gong,
china cabinet, china closet,
Christmas stocking,
church, church building,
cinema, movie theater, movie theatre, movie house, picture palace,
cleaver, meat cleaver, chopper,
cliff dwelling,
cloak,
clog, geta, patten, sabot,
cocktail shaker,
coffee mug,
coffeepot,
coil, spiral, volute, whorl, helix,
combination lock,
computer keyboard, keypad,
confectionery, confectionary, candy store,
container ship, containership, container vessel,
convertible,
corkscrew, bottle screw,
cornet, horn, trumpet, trump,
cowboy boot,
cowboy hat, ten-gallon hat,
cradle,
crane,
crash helmet,
crate,
crib, cot,
Crock Pot,
croquet ball,
crutch,
cuirass,
dam, dike, dyke,
desk,
desktop computer,
dial telephone, dial phone,
diaper, nappy, napkin,
digital clock,
digital watch,
dining table, board,
dishrag, dishcloth,
dishwasher, dish washer, dishwashing machine,
disk brake, disc brake,
dock, dockage, docking facility,
dogsled, dog sled, dog sleigh,
dome,
doormat, welcome mat,
drilling platform, offshore rig,
drum, membranophone, tympan,
drumstick,
dumbbell,
Dutch oven,
electric fan, blower,
electric guitar,
electric locomotive,
entertainment center,
envelope,
espresso maker,
face powder,
feather boa, boa,
file, file cabinet, filing cabinet,
fireboat,
fire engine, fire truck,
fire screen, fireguard,
flagpole, flagstaff,
flute, transverse flute,
folding chair,
football helmet,
forklift,
fountain,
fountain pen,
four-poster,
freight car,
French horn, horn,
frying pan, frypan, skillet,
fur coat,
garbage truck, dustcart,
gasmask, respirator, gas helmet,
gas pump, gasoline pump, petrol pump, island dispenser,
goblet,
go-kart,
golf ball,
golfcart, golf cart,
gondola,
gong, tam-tam,
gown,
grand piano, grand,
greenhouse, nursery, glasshouse,
grille, radiator grille,
grocery store, grocery, food market, market,
guillotine,
hair slide,
hair spray,
half track,
hammer,
hamper,
hand blower, blow dryer, blow drier, hair dryer, hair drier,
hand-held computer, hand-held microcomputer,
handkerchief, hankie, hanky, hankey,
hard disc, hard disk, fixed disk,
harmonica, mouth organ, harp, mouth harp,
harp,
harvester, reaper,
hatchet,
holster,
home theater, home theatre,
honeycomb,
hook, claw,
hoopskirt, crinoline,
horizontal bar, high bar,
horse cart, horse-cart,
hourglass,
iPod,
iron, smoothing iron,
"jack-o-lantern",
jean, blue jean, denim,
jeep, landrover,
jersey, T-shirt, tee shirt,
jigsaw puzzle,
jinrikisha, ricksha, rickshaw,
joystick,
kimono,
knee pad,
knot,
lab coat, laboratory coat,
ladle,
lampshade, lamp shade,
laptop, laptop computer,
lawn mower, mower,
lens cap, lens cover,
letter opener, paper knife, paperknife,
library,
lifeboat,
lighter, light, igniter, ignitor,
limousine, limo,
liner, ocean liner,
lipstick, lip rouge,
Loafer,
lotion,
loudspeaker, speaker, speaker unit, loudspeaker system, speaker system,
"loupe, jewelers loupe",
lumbermill, sawmill,
magnetic compass,
mailbag, postbag,
mailbox, letter box,
maillot,
maillot, tank suit,
manhole cover,
maraca,
marimba, xylophone,
mask,
matchstick,
maypole,
maze, labyrinth,
measuring cup,
medicine chest, medicine cabinet,
megalith, megalithic structure,
microphone, mike,
microwave, microwave oven,
military uniform,
milk can,
minibus,
miniskirt, mini,
minivan,
missile,
mitten,
mixing bowl,
mobile home, manufactured home,
Model T,
modem,
monastery,
monitor,
moped,
mortar,
mortarboard,
mosque,
mosquito net,
motor scooter, scooter,
mountain bike, all-terrain bike, off-roader,
mountain tent,
mouse, computer mouse,
mousetrap,
moving van,
muzzle,
nail,
neck brace,
necklace,
nipple,
notebook, notebook computer,
obelisk,
oboe, hautboy, hautbois,
ocarina, sweet potato,
odometer, hodometer, mileometer, milometer,
oil filter,
organ, pipe organ,
oscilloscope, scope, cathode-ray oscilloscope, CRO,
overskirt,
oxcart,
oxygen mask,
packet,
paddle, boat paddle,
paddlewheel, paddle wheel,
padlock,
paintbrush,
"pajama, pyjama, pjs, jammies",
palace,
panpipe, pandean pipe, syrinx,
paper towel,
parachute, chute,
parallel bars, bars,
park bench,
parking meter,
passenger car, coach, carriage,
patio, terrace,
pay-phone, pay-station,
pedestal, plinth, footstall,
pencil box, pencil case,
pencil sharpener,
perfume, essence,
Petri dish,
photocopier,
pick, plectrum, plectron,
pickelhaube,
picket fence, paling,
pickup, pickup truck,
pier,
piggy bank, penny bank,
pill bottle,
pillow,
ping-pong ball,
pinwheel,
pirate, pirate ship,
pitcher, ewer,
"plane, carpenters plane, woodworking plane",
planetarium,
plastic bag,
plate rack,
plow, plough,
"plunger, plumbers helper",
Polaroid camera, Polaroid Land camera,
pole,
police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria,
poncho,
pool table, billiard table, snooker table,
pop bottle, soda bottle,
pot, flowerpot,
"potters wheel",
power drill,
prayer rug, prayer mat,
printer,
prison, prison house,
projectile, missile,
projector,
puck, hockey puck,
punching bag, punch bag, punching ball, punchball,
purse,
quill, quill pen,
quilt, comforter, comfort, puff,
racer, race car, racing car,
racket, racquet,
radiator,
radio, wireless,
radio telescope, radio reflector,
rain barrel,
recreational vehicle, RV, R.V.,
reel,
reflex camera,
refrigerator, icebox,
remote control, remote,
restaurant, eating house, eating place, eatery,
revolver, six-gun, six-shooter,
rifle,
rocking chair, rocker,
rotisserie,
rubber eraser, rubber, pencil eraser,
rugby ball,
rule, ruler,
running shoe,
safe,
safety pin,
saltshaker, salt shaker,
sandal,
sarong,
sax, saxophone,
scabbard,
scale, weighing machine,
school bus,
schooner,
scoreboard,
screen, CRT screen,
screw,
screwdriver,
seat belt, seatbelt,
sewing machine,
shield, buckler,
shoe shop, shoe-shop, shoe store,
shoji,
shopping basket,
shopping cart,
shovel,
shower cap,
shower curtain,
ski,
ski mask,
sleeping bag,
slide rule, slipstick,
sliding door,
slot, one-armed bandit,
snorkel,
snowmobile,
snowplow, snowplough,
soap dispenser,
soccer ball,
sock,
solar dish, solar collector, solar furnace,
sombrero,
soup bowl,
space bar,
space heater,
space shuttle,
spatula,
speedboat,
"spider web, spiders web",
spindle,
sports car, sport car,
spotlight, spot,
stage,
steam locomotive,
steel arch bridge,
steel drum,
stethoscope,
stole,
stone wall,
stopwatch, stop watch,
stove,
strainer,
streetcar, tram, tramcar, trolley, trolley car,
stretcher,
studio couch, day bed,
stupa, tope,
submarine, pigboat, sub, U-boat,
suit, suit of clothes,
sundial,
sunglass,
sunglasses, dark glasses, shades,
sunscreen, sunblock, sun blocker,
suspension bridge,
swab, swob, mop,
sweatshirt,
swimming trunks, bathing trunks,
swing,
switch, electric switch, electrical switch,
syringe,
table lamp,
tank, army tank, armored combat vehicle, armoured combat vehicle,
tape player,
teapot,
teddy, teddy bear,
television, television system,
tennis ball,
thatch, thatched roof,
theater curtain, theatre curtain,
thimble,
thresher, thrasher, threshing machine,
throne,
tile roof,
toaster,
tobacco shop, tobacconist shop, tobacconist,
toilet seat,
torch,
totem pole,
tow truck, tow car, wrecker,
toyshop,
tractor,
trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi,
tray,
trench coat,
tricycle, trike, velocipede,
trimaran,
tripod,
triumphal arch,
trolleybus, trolley coach, trackless trolley,
trombone,
tub, vat,
turnstile,
typewriter keyboard,
umbrella,
unicycle, monocycle,
upright, upright piano,
vacuum, vacuum cleaner,
vase,
vault,
velvet,
vending machine,
vestment,
viaduct,
violin, fiddle,
volleyball,
waffle iron,
wall clock,
wallet, billfold, notecase, pocketbook,
wardrobe, closet, press,
warplane, military plane,
washbasin, handbasin, washbowl, lavabo, wash-hand basin,
washer, automatic washer, washing machine,
water bottle,
water jug,
water tower,
whiskey jug,
whistle,
wig,
window screen,
window shade,
Windsor tie,
wine bottle,
wing,
wok,
wooden spoon,
wool, woolen, woollen,
worm fence, snake fence, snake-rail fence, Virginia fence,
wreck,
yawl,
yurt,
web site, website, internet site, site,
comic book,
crossword puzzle, crossword,
street sign,
traffic light, traffic signal, stoplight,
book jacket, dust cover, dust jacket, dust wrapper,
menu,
plate,
guacamole,
consomme,
hot pot, hotpot,
trifle,
ice cream, icecream,
ice lolly, lolly, lollipop, popsicle,
French loaf,
bagel, beigel,
pretzel,
cheeseburger,
hotdog, hot dog, red hot,
mashed potato,
head cabbage,
broccoli,
cauliflower,
zucchini, courgette,
spaghetti squash,
acorn squash,
butternut squash,
cucumber, cuke,
artichoke, globe artichoke,
bell pepper,
cardoon,
mushroom,
Granny Smith,
strawberry,
orange,
lemon,
fig,
pineapple, ananas,
banana,
jackfruit, jak, jack,
custard apple,
pomegranate,
hay,
carbonara,
chocolate sauce, chocolate syrup,
dough,
meat loaf, meatloaf,
pizza, pizza pie,
potpie,
burrito,
red wine,
espresso,
cup,
eggnog,
alp,
bubble,
cliff, drop, drop-off,
coral reef,
geyser,
lakeside, lakeshore,
promontory, headland, head, foreland,
sandbar, sand bar,
seashore, coast, seacoast, sea-coast,
valley, vale,
volcano,
ballplayer, baseball player,
groom, bridegroom,
scuba diver,
rapeseed,
daisy,
"yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
corn,
acorn,
hip, rose hip, rosehip,
buckeye, horse chestnut, conker,
coral fungus,
agaric,
gyromitra,
stinkhorn, carrion fungus,
earthstar,
hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa,
bolete,
ear, spike, capitulum,
toilet tissue, toilet paper, bathroom tissue
...@@ -11,10 +11,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import sys
-import random
-with open("key", "w") as f:
-    for i in range(1000000):
-        f.write("{}\n".format(random.randint(0, 999999)))
+from paddle_serving_server_gpu.pipeline import PipelineClient
+import numpy as np
+import requests
+import json
+import cv2
+import base64
+import os
+client = PipelineClient()
+client.connect(['127.0.0.1:9999'])
+def cv2_to_base64(image):
+    return base64.b64encode(image).decode('utf8')
+with open("daisy.jpg", 'rb') as file:
+    image_data = file.read()
+    image = cv2_to_base64(image_data)
+for i in range(1):
+    ret = client.predict(feed_dict={"image": image}, fetch=["label", "prob"])
+    print(ret)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try:
from paddle_serving_server_gpu.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging
import numpy as np
import base64, cv2
class ImagenetOp(Op):
def init_op(self):
self.seq = Sequential([
Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225],
True)
])
self.label_dict = {}
label_idx = 0
with open("imagenet.label") as fin:
for line in fin:
self.label_dict[label_idx] = line.strip()
label_idx += 1
def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items()
data = base64.b64decode(input_dict["image"].encode('utf8'))
data = np.fromstring(data, np.uint8)
# Note: class variables(self.var) can only be used in process op mode
im = cv2.imdecode(data, cv2.IMREAD_COLOR)
img = self.seq(im)
return {"image": img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, log_id):
print(fetch_dict)
score_list = fetch_dict["score"]
result = {"label": [], "prob": []}
for score in score_list:
score = score.tolist()
max_score = max(score)
result["label"].append(self.label_dict[score.index(max_score)]
.strip().replace(",", ""))
result["prob"].append(max_score)
result["label"] = str(result["label"])
result["prob"] = str(result["prob"])
return result, None, ""
class ImageService(WebService):
def get_pipeline_response(self, read_op):
image_op = ImagenetOp(name="imagenet", input_ops=[read_op])
return image_op
uci_service = ImageService(name="imagenet")
uci_service.prepare_pipeline_config("config.yml")
uci_service.run_service()
# IMDB model ensemble examples
## Get models
```
sh get_data.sh
```
## Start servers
```
python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
python test_pipeline_server.py &>pipeline.log &
```
## Start clients
```
python test_pipeline_client.py
```
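For reference, `test_pipeline_client.py` used above is roughly the following sketch; the port matches `rpc_port` in this example's config.yml:
```python
from paddle_serving_server.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18070'])  # rpc_port from config.yml

words = 'i am very sad | 0'
futures = []
for i in range(4):
    # asynchronous requests; "logid" is an optional request id carried through the pipeline
    futures.append(
        client.predict(
            feed_dict={"words": words, "logid": 10000 + i},
            fetch=["prediction"],
            asyn=True,
            profile=False))

for f in futures:
    res = f.result()
    if res.err_no != 0:
        print("predict failed: {}".format(res))
    print(res)
```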
...@@ -8,8 +8,8 @@ sh get_data.sh
## Start servers
```
-python -m paddle_serving_server_gpu.serve --model imdb_cnn_model --port 9292 &> cnn.log &
-python -m paddle_serving_server_gpu.serve --model imdb_bow_model --port 9393 &> bow.log &
+python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
+python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
python test_pipeline_server.py &>pipeline.log &
```
...@@ -17,8 +17,3 @@ python test_pipeline_server.py &>pipeline.log &
```
python test_pipeline_client.py
```
-## HTTP test
-```
-curl -X POST -k http://localhost:9999/prediction -d '{"key": ["words"], "value": ["i am very sad | 0"]}'
-```
-rpc_port: 18085
+#rpc port; rpc_port and http_port must not both be empty. If rpc_port is empty and http_port is not, rpc_port is automatically set to http_port+1
+rpc_port: 18070
+#http port; rpc_port and http_port must not both be empty. When rpc_port is valid and http_port is empty, no http_port is generated automatically
+http_port: 18071
+#worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own grpc server and DAG
+#When build_dag_each_worker=False, the framework sets max_workers=worker_num for the grpc thread pool of the main thread
worker_num: 4
-build_dag_each_worker: false
-http_port: 9999
+#build_dag_each_worker: False, the framework builds one DAG inside the process; True, each process builds multiple independent DAGs
+build_dag_each_worker: False
dag:
-    is_thread_op: false
-    client_type: brpc
+    #op resource type: True for the thread model, False for the process model
+    is_thread_op: True
+    #retry count
    retry: 1
-    use_profile: false
+    #profiling: True generates Timeline performance data (with some performance cost); False disables it
+    use_profile: False
+    #maximum channel length, default 0
+    channel_size: 0
+    #tracer: tracks framework throughput and the status of each OP and channel; without a tracer no data is generated
    tracer:
+        #trace interval, in seconds
        interval_s: 10
+op:
+    bow:
+        #concurrency; thread-level when is_thread_op=True, otherwise process-level
+        concurrency: 1
+        #client connection type, brpc
+        client_type: brpc
+        #retry count for Serving requests, no retry by default
+        retry: 1
+        #timeout for Serving requests, in ms
+        timeout: 3000
+        #Serving IPs
+        server_endpoints: ["127.0.0.1:9393"]
+        #client-side config for the bow model
+        client_config: "imdb_bow_client_conf/serving_client_conf.prototxt"
+        #list of fetch results, named by the alias_name of fetch_var in client_config
+        fetch_list: ["prediction"]
+        #number of requests batched per Serving call, default 1. With batch_size>1, auto_batching_timeout must be set, otherwise an incomplete batch blocks
+        batch_size: 1
+        #batching timeout, used together with batch_size
+        auto_batching_timeout: 2000
+    cnn:
+        #concurrency; thread-level when is_thread_op=True, otherwise process-level
+        concurrency: 1
+        #client connection type, brpc
+        client_type: brpc
+        #retry count for Serving requests, no retry by default
+        retry: 1
+        #timeout, in ms
+        timeout: 3000
+        #Serving IPs
+        server_endpoints: ["127.0.0.1:9292"]
+        #client-side config for the cnn model
+        client_config: "imdb_cnn_client_conf/serving_client_conf.prototxt"
+        #list of fetch results, named by the alias_name of fetch_var in client_config
+        fetch_list: ["prediction"]
+        #number of requests batched per Serving call, default 1. With batch_size>1, auto_batching_timeout must be set, otherwise an incomplete batch blocks
+        batch_size: 1
+        #batching timeout, used together with batch_size
+        auto_batching_timeout: 2000
+    combine:
+        #concurrency; thread-level when is_thread_op=True, otherwise process-level
+        concurrency: 1
+        #retry count for Serving requests, no retry by default
+        retry: 1
+        #timeout, in ms
+        timeout: 3000
+        #number of requests batched per Serving call, default 1. With batch_size>1, auto_batching_timeout must be set, otherwise an incomplete batch blocks
+        batch_size: 1
+        #batching timeout, used together with batch_size
+        auto_batching_timeout: 2000
...@@ -15,21 +15,22 @@ from paddle_serving_server.pipeline import PipelineClient
import numpy as np
client = PipelineClient()
-client.connect(['127.0.0.1:18080'])
+client.connect(['127.0.0.1:18070'])
words = 'i am very sad | 0'
futures = []
-for i in range(4):
+for i in range(100):
    futures.append(
        client.predict(
-           feed_dict={"words": words},
+           feed_dict={"words": words,
+                      "logid": 10000 + i},
            fetch=["prediction"],
            asyn=True,
            profile=False))
for f in futures:
    res = f.result()
-   if res["ecode"] != 0:
+   if res.err_no != 0:
        print("predict failed: {}".format(res))
    print(res)
...@@ -15,10 +15,14 @@
from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server.pipeline import PipelineServer
from paddle_serving_server.pipeline.proto import pipeline_service_pb2
-from paddle_serving_server.pipeline.channel import ChannelDataEcode
+from paddle_serving_server.pipeline.channel import ChannelDataErrcode
import numpy as np
-from paddle_serving_app.reader import IMDBDataset
+from paddle_serving_app.reader.imdb_reader import IMDBDataset
import logging
+try:
+    from paddle_serving_server.web_service import WebService
+except ImportError:
+    from paddle_serving_server_gpu.web_service import WebService
_LOGGER = logging.getLogger()
user_handler = logging.StreamHandler()
...@@ -41,74 +45,68 @@ class ImdbRequestOp(RequestOp):
                continue
            words = request.value[idx]
            word_ids, _ = self.imdb_dataset.get_words_and_label(words)
-           dictdata[key] = np.array(word_ids)
-       return dictdata
+           word_len = len(word_ids)
+           dictdata[key] = np.array(word_ids).reshape(word_len, 1)
+           dictdata["{}.lod".format(key)] = np.array([0, word_len])
+       log_id = None
+       if request.logid is not None:
+           log_id = request.logid
+       return dictdata, log_id, None, ""
class CombineOp(Op):
-   def preprocess(self, input_data):
+   def preprocess(self, input_data, data_id, log_id):
+       #_LOGGER.info("Enter CombineOp::preprocess")
        combined_prediction = 0
        for op_name, data in input_data.items():
            _LOGGER.info("{}: {}".format(op_name, data["prediction"]))
            combined_prediction += data["prediction"]
        data = {"prediction": combined_prediction / 2}
-       return data
+       return data, False, None, ""
class ImdbResponseOp(ResponseOp):
    # Here ImdbResponseOp is consistent with the default ResponseOp implementation
    def pack_response_package(self, channeldata):
        resp = pipeline_service_pb2.Response()
-       resp.ecode = channeldata.ecode
-       if resp.ecode == ChannelDataEcode.OK.value:
+       resp.err_no = channeldata.error_code
+       if resp.err_no == ChannelDataErrcode.OK.value:
            feed = channeldata.parse()
            # ndarray to string
            for name, var in feed.items():
                resp.value.append(var.__repr__())
                resp.key.append(name)
        else:
-           resp.error_info = channeldata.error_info
+           resp.err_msg = channeldata.error_info
        return resp
read_op = ImdbRequestOp()
-bow_op = Op(name="bow",
-           input_ops=[read_op],
-           server_endpoints=["127.0.0.1:9393"],
-           fetch_list=["prediction"],
-           client_config="imdb_bow_client_conf/serving_client_conf.prototxt",
-           concurrency=1,
-           timeout=-1,
-           retry=1,
-           batch_size=3,
-           auto_batching_timeout=1000)
-cnn_op = Op(name="cnn",
-           input_ops=[read_op],
-           server_endpoints=["127.0.0.1:9292"],
-           fetch_list=["prediction"],
-           client_config="imdb_cnn_client_conf/serving_client_conf.prototxt",
-           concurrency=1,
-           timeout=-1,
-           retry=1,
-           batch_size=1,
-           auto_batching_timeout=None)
-combine_op = CombineOp(
-   name="combine",
-   input_ops=[bow_op, cnn_op],
-   concurrency=1,
-   timeout=-1,
-   retry=1,
-   batch_size=2,
-   auto_batching_timeout=None)
+class BowOp(Op):
+    def init_op(self):
+        pass
+class CnnOp(Op):
+    def init_op(self):
+        pass
+bow_op = BowOp("bow", input_ops=[read_op])
+cnn_op = CnnOp("cnn", input_ops=[read_op])
+combine_op = CombineOp("combine", input_ops=[bow_op, cnn_op])
# fetch output of bow_op
#response_op = ImdbResponseOp(input_ops=[bow_op])
# fetch output of combine_op
response_op = ImdbResponseOp(input_ops=[combine_op])
# use default ResponseOp implementation
#response_op = ResponseOp(input_ops=[combine_op])
server = PipelineServer()
server.set_response_op(response_op)
......
...@@ -28,31 +28,9 @@ python web_service.py &>log.txt &
python pipeline_http_client.py
```
<!--
## More (PipelineServing)
-You can choose one of the following versions to start Service.
-### Remote Service Version
-```
-python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 12000 --gpu_id 0 &> det.log &
-python -m paddle_serving_server_gpu.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python remote_service_pipeline_server.py &>pipeline.log &
-```
-### Local Service Version
-```
-python local_service_pipeline_server.py &>pipeline.log &
-```
-### Hybrid Service Version
-```
-python -m paddle_serving_server_gpu.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python hybrid_service_pipeline_server.py &>pipeline.log &
-```
## Client Prediction
### RPC
......
...@@ -31,26 +31,6 @@ python pipeline_http_client.py
<!--
## More (PipelineServing)
-You can choose any one of the following versions to start the service.
-### Remote service version
-```
-python -m paddle_serving_server.serve --model ocr_det_model --port 12000 --gpu_id 0 &> det.log &
-python -m paddle_serving_server.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python remote_service_pipeline_server.py &>pipeline.log &
-```
-### Local service version
-```
-python local_service_pipeline_server.py &>pipeline.log &
-```
-### Hybrid service version
-```
-python -m paddle_serving_server_gpu.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python hybrid_service_pipeline_server.py &>pipeline.log &
-```
## Start the client
### RPC
......
-rpc_port: 18080
-worker_num: 4
-build_dag_each_worker: false
+#rpc port; rpc_port and http_port must not both be empty. If rpc_port is empty and http_port is not, rpc_port is automatically set to http_port+1
+rpc_port: 18090
+#http port; rpc_port and http_port must not both be empty. When rpc_port is valid and http_port is empty, no http_port is generated automatically
http_port: 9999
+#worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own grpc server and DAG
+##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the grpc thread pool of the main thread
+worker_num: 1
+#build_dag_each_worker: False, the framework builds one DAG inside the process; True, each process builds multiple independent DAGs
+build_dag_each_worker: false
dag:
-    is_thread_op: false
-    client_type: brpc
+    #op resource type: True for the thread model, False for the process model
+    is_thread_op: False
+    #retry count
    retry: 1
+    #profiling: True generates Timeline performance data (with some performance cost); False disables it
    use_profile: false
op:
    det:
+        #concurrency; thread-level when is_thread_op=True, otherwise process-level
        concurrency: 2
+        #when the op has no server_endpoints configured, local service settings are read from local_service_conf
        local_service_conf:
+            #client type: brpc, grpc or local_predictor; local_predictor runs inference in-process without starting a Serving service
+            client_type: local_predictor
+            #det model path
            model_config: ocr_det_model
+            #list of fetch results, named by the alias_name of fetch_var in client_config
+            fetch_list: ["concat_1.tmp_0"]
+            #device IDs: "" or unset means CPU inference; "0" or "0,1,2" means GPU inference on the listed cards
            devices: "0"
    rec:
-        concurrency: 1
+        #concurrency; thread-level when is_thread_op=True, otherwise process-level
+        concurrency: 2
+        #timeout, in ms
        timeout: -1
+        #retry count for Serving requests, no retry by default
        retry: 1
+        #when the op has no server_endpoints configured, local service settings are read from local_service_conf
        local_service_conf:
+            #client type: brpc, grpc or local_predictor; local_predictor runs inference in-process without starting a Serving service
+            client_type: local_predictor
+            #rec model path
            model_config: ocr_rec_model
+            #list of fetch results, named by the alias_name of fetch_var in client_config
+            fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
+            #device IDs: "" or unset means CPU inference; "0" or "0,1,2" means GPU inference on the listed cards
            devices: "0"
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server_gpu.pipeline import PipelineServer
from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2
from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode
from paddle_serving_server_gpu.pipeline import LocalRpcServiceHandler
import numpy as np
import cv2
import time
import base64
import json
from paddle_serving_app.reader import OCRReader
from paddle_serving_app.reader import Sequential, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
import time
import re
import base64
import logging
_LOGGER = logging.getLogger()
class DetOp(Op):
def init_op(self):
self.det_preprocess = Sequential([
ResizeByFactor(32, 960), Div(255),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
(2, 0, 1))
])
self.filter_func = FilterBoxes(10, 10)
self.post_func = DBPostProcess({
"thresh": 0.3,
"box_thresh": 0.5,
"max_candidates": 1000,
"unclip_ratio": 1.5,
"min_size": 3
})
def preprocess(self, input_dicts):
(_, input_dict), = input_dicts.items()
data = base64.b64decode(input_dict["image"].encode('utf8'))
data = np.fromstring(data, np.uint8)
# Note: class variables(self.var) can only be used in process op mode
self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
self.ori_h, self.ori_w, _ = self.im.shape
det_img = self.det_preprocess(self.im)
_, self.new_h, self.new_w = det_img.shape
return {"image": det_img}
def postprocess(self, input_dicts, fetch_dict):
det_out = fetch_dict["concat_1.tmp_0"]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
dt_boxes_list = self.post_func(det_out, [ratio_list])
dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
out_dict = {"dt_boxes": dt_boxes, "image": self.im}
return out_dict
class RecOp(Op):
def init_op(self):
self.ocr_reader = OCRReader()
self.get_rotate_crop_image = GetRotateCropImage()
self.sorted_boxes = SortedBoxes()
def preprocess(self, input_dicts):
(_, input_dict), = input_dicts.items()
im = input_dict["image"]
dt_boxes = input_dict["dt_boxes"]
dt_boxes = self.sorted_boxes(dt_boxes)
feed_list = []
img_list = []
max_wh_ratio = 0
for i, dtbox in enumerate(dt_boxes):
boximg = self.get_rotate_crop_image(im, dt_boxes[i])
img_list.append(boximg)
h, w = boximg.shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for img in img_list:
norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
feed = {"image": norm_img}
feed_list.append(feed)
return feed_list
def postprocess(self, input_dicts, fetch_dict):
rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
res_lst = []
for res in rec_res:
res_lst.append(res[0])
res = {"res": str(res_lst)}
return res
read_op = RequestOp()
det_op = DetOp(
name="det",
input_ops=[read_op],
local_rpc_service_handler=LocalRpcServiceHandler(
model_config="ocr_det_model",
workdir="det_workdir", # defalut: "workdir"
thread_num=2, # defalut: 2
devices="0", # gpu0. defalut: "" (cpu)
mem_optim=True, # defalut: True
ir_optim=False, # defalut: False
available_port_generator=None), # defalut: None
concurrency=1)
rec_op = RecOp(
name="rec",
input_ops=[det_op],
server_endpoints=["127.0.0.1:12001"],
fetch_list=["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"],
client_config="ocr_rec_client/serving_client_conf.prototxt",
concurrency=1)
response_op = ResponseOp(input_ops=[rec_op])
server = PipelineServer("ocr")
server.set_response_op(response_op)
server.prepare_server('config.yml')
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server_gpu.pipeline import PipelineServer
from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2
from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode
from paddle_serving_server_gpu.pipeline import LocalRpcServiceHandler
import numpy as np
import cv2
import time
import base64
import json
from paddle_serving_app.reader import OCRReader
from paddle_serving_app.reader import Sequential, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
import time
import re
import base64
import logging
_LOGGER = logging.getLogger()
class DetOp(Op):
def init_op(self):
self.det_preprocess = Sequential([
ResizeByFactor(32, 960), Div(255),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
(2, 0, 1))
])
self.filter_func = FilterBoxes(10, 10)
self.post_func = DBPostProcess({
"thresh": 0.3,
"box_thresh": 0.5,
"max_candidates": 1000,
"unclip_ratio": 1.5,
"min_size": 3
})
def preprocess(self, input_dicts):
(_, input_dict), = input_dicts.items()
data = base64.b64decode(input_dict["image"].encode('utf8'))
data = np.fromstring(data, np.uint8)
# Note: class variables(self.var) can only be used in process op mode
self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
self.ori_h, self.ori_w, _ = self.im.shape
det_img = self.det_preprocess(self.im)
_, self.new_h, self.new_w = det_img.shape
return {"image": det_img}
def postprocess(self, input_dicts, fetch_dict):
det_out = fetch_dict["concat_1.tmp_0"]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
dt_boxes_list = self.post_func(det_out, [ratio_list])
dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
out_dict = {"dt_boxes": dt_boxes, "image": self.im}
return out_dict
class RecOp(Op):
def init_op(self):
self.ocr_reader = OCRReader()
self.get_rotate_crop_image = GetRotateCropImage()
self.sorted_boxes = SortedBoxes()
def preprocess(self, input_dicts):
(_, input_dict), = input_dicts.items()
im = input_dict["image"]
dt_boxes = input_dict["dt_boxes"]
dt_boxes = self.sorted_boxes(dt_boxes)
feed_list = []
img_list = []
max_wh_ratio = 0
for i, dtbox in enumerate(dt_boxes):
boximg = self.get_rotate_crop_image(im, dt_boxes[i])
img_list.append(boximg)
h, w = boximg.shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for img in img_list:
norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
feed = {"image": norm_img}
feed_list.append(feed)
return feed_list
def postprocess(self, input_dicts, fetch_dict):
rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
res_lst = []
for res in rec_res:
res_lst.append(res[0])
res = {"res": str(res_lst)}
return res
read_op = RequestOp()
det_op = DetOp(
name="det",
input_ops=[read_op],
local_rpc_service_handler=LocalRpcServiceHandler(
model_config="ocr_det_model",
workdir="det_workdir", # default: "workdir"
thread_num=2, # default: 2
devices="0", # gpu0. default: "" (cpu)
mem_optim=True, # default: True
ir_optim=False, # default: False
available_port_generator=None), # default: None
concurrency=1)
rec_op = RecOp(
name="rec",
input_ops=[det_op],
local_rpc_service_handler=LocalRpcServiceHandler(
model_config="ocr_rec_model"),
concurrency=1)
response_op = ResponseOp(input_ops=[rec_op])
server = PipelineServer("ocr")
server.set_response_op(response_op)
server.prepare_server('config.yml')
server.run_server()
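A minimal client for the pipeline server above could look like the sketch below. This is illustrative only: the endpoint must match the rpc_port in config.yml (the client example in this commit connects to 18090), and the image path is a placeholder.

```python
import base64
from paddle_serving_server_gpu.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18090'])
with open("imgs/1.jpg", "rb") as f:  # placeholder image path
    image = base64.b64encode(f.read()).decode('utf8')
ret = client.predict(feed_dict={"image": image}, fetch=["res"])
print(ret)
```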
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
import requests import requests
import json import json
......
...@@ -20,7 +20,7 @@ import base64 ...@@ -20,7 +20,7 @@ import base64
import os import os
client = PipelineClient() client = PipelineClient()
client.connect(['127.0.0.1:18080']) client.connect(['127.0.0.1:18090'])
def cv2_to_base64(image): def cv2_to_base64(image):
...@@ -33,6 +33,6 @@ for img_file in os.listdir(test_img_dir): ...@@ -33,6 +33,6 @@ for img_file in os.listdir(test_img_dir):
image_data = file.read() image_data = file.read()
image = cv2_to_base64(image_data) image = cv2_to_base64(image_data)
for i in range(4): for i in range(1):
ret = client.predict(feed_dict={"image": image}, fetch=["res"]) ret = client.predict(feed_dict={"image": image}, fetch=["res"])
print(ret) print(ret)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server_gpu.pipeline import PipelineServer
from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2
from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode
import numpy as np
import cv2
import time
import base64
import json
from paddle_serving_app.reader import OCRReader
from paddle_serving_app.reader import Sequential, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
import time
import re
import base64
import logging
_LOGGER = logging.getLogger()
class DetOp(Op):
def init_op(self):
self.det_preprocess = Sequential([
ResizeByFactor(32, 960), Div(255),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
(2, 0, 1))
])
self.filter_func = FilterBoxes(10, 10)
self.post_func = DBPostProcess({
"thresh": 0.3,
"box_thresh": 0.5,
"max_candidates": 1000,
"unclip_ratio": 1.5,
"min_size": 3
})
def preprocess(self, input_dicts):
(_, input_dict), = input_dicts.items()
data = base64.b64decode(input_dict["image"].encode('utf8'))
data = np.fromstring(data, np.uint8)
# Note: class variables(self.var) can only be used in process op mode
self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
self.ori_h, self.ori_w, _ = self.im.shape
det_img = self.det_preprocess(self.im)
_, self.new_h, self.new_w = det_img.shape
return {"image": det_img}
def postprocess(self, input_dicts, fetch_dict):
det_out = fetch_dict["concat_1.tmp_0"]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
dt_boxes_list = self.post_func(det_out, [ratio_list])
dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
out_dict = {"dt_boxes": dt_boxes, "image": self.im}
return out_dict
class RecOp(Op):
def init_op(self):
self.ocr_reader = OCRReader()
self.get_rotate_crop_image = GetRotateCropImage()
self.sorted_boxes = SortedBoxes()
def preprocess(self, input_dicts):
(_, input_dict), = input_dicts.items()
im = input_dict["image"]
dt_boxes = input_dict["dt_boxes"]
dt_boxes = self.sorted_boxes(dt_boxes)
feed_list = []
img_list = []
max_wh_ratio = 0
for i, dtbox in enumerate(dt_boxes):
boximg = self.get_rotate_crop_image(im, dt_boxes[i])
img_list.append(boximg)
h, w = boximg.shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for img in img_list:
norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
feed = {"image": norm_img}
feed_list.append(feed)
return feed_list
def postprocess(self, input_dicts, fetch_dict):
rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
res_lst = []
for res in rec_res:
res_lst.append(res[0])
res = {"res": str(res_lst)}
return res
read_op = RequestOp()
det_op = DetOp(
name="det",
input_ops=[read_op],
server_endpoints=["127.0.0.1:12000"],
fetch_list=["concat_1.tmp_0"],
client_config="ocr_det_client/serving_client_conf.prototxt",
concurrency=1)
rec_op = RecOp(
name="rec",
input_ops=[det_op],
server_endpoints=["127.0.0.1:12001"],
fetch_list=["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"],
client_config="ocr_rec_client/serving_client_conf.prototxt",
concurrency=1)
response_op = ResponseOp(input_ops=[rec_op])
server = PipelineServer("ocr")
server.set_response_op(response_op)
server.prepare_server('config.yml')
server.run_server()
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try: try:
from paddle_serving_server_gpu.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server_gpu.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import cv2 import cv2
...@@ -43,7 +43,7 @@ class DetOp(Op): ...@@ -43,7 +43,7 @@ class DetOp(Op):
"min_size": 3 "min_size": 3
}) })
def preprocess(self, input_dicts): def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items() (_, input_dict), = input_dicts.items()
data = base64.b64decode(input_dict["image"].encode('utf8')) data = base64.b64decode(input_dict["image"].encode('utf8'))
data = np.fromstring(data, np.uint8) data = np.fromstring(data, np.uint8)
...@@ -52,9 +52,9 @@ class DetOp(Op): ...@@ -52,9 +52,9 @@ class DetOp(Op):
self.ori_h, self.ori_w, _ = self.im.shape self.ori_h, self.ori_w, _ = self.im.shape
det_img = self.det_preprocess(self.im) det_img = self.det_preprocess(self.im)
_, self.new_h, self.new_w = det_img.shape _, self.new_h, self.new_w = det_img.shape
return {"image": det_img} return {"image": det_img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict): def postprocess(self, input_dicts, fetch_dict, log_id):
det_out = fetch_dict["concat_1.tmp_0"] det_out = fetch_dict["concat_1.tmp_0"]
ratio_list = [ ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
...@@ -62,7 +62,8 @@ class DetOp(Op): ...@@ -62,7 +62,8 @@ class DetOp(Op):
dt_boxes_list = self.post_func(det_out, [ratio_list]) dt_boxes_list = self.post_func(det_out, [ratio_list])
dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
out_dict = {"dt_boxes": dt_boxes, "image": self.im} out_dict = {"dt_boxes": dt_boxes, "image": self.im}
return out_dict print("out dict", out_dict)
return out_dict, None, ""
class RecOp(Op): class RecOp(Op):
...@@ -71,7 +72,7 @@ class RecOp(Op): ...@@ -71,7 +72,7 @@ class RecOp(Op):
self.get_rotate_crop_image = GetRotateCropImage() self.get_rotate_crop_image = GetRotateCropImage()
self.sorted_boxes = SortedBoxes() self.sorted_boxes = SortedBoxes()
def preprocess(self, input_dicts): def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items() (_, input_dict), = input_dicts.items()
im = input_dict["image"] im = input_dict["image"]
dt_boxes = input_dict["dt_boxes"] dt_boxes = input_dict["dt_boxes"]
...@@ -85,19 +86,22 @@ class RecOp(Op): ...@@ -85,19 +86,22 @@ class RecOp(Op):
h, w = boximg.shape[0:2] h, w = boximg.shape[0:2]
wh_ratio = w * 1.0 / h wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio) max_wh_ratio = max(max_wh_ratio, wh_ratio)
for img in img_list: _, w, h = self.ocr_reader.resize_norm_img(img_list[0],
max_wh_ratio).shape
imgs = np.zeros((len(img_list), 3, w, h)).astype('float32')
for id, img in enumerate(img_list):
norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio) norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
feed = {"image": norm_img} imgs[id] = norm_img
feed_list.append(feed) feed = {"image": imgs.copy()}
return feed_list return feed, False, None, ""
def postprocess(self, input_dicts, fetch_dict): def postprocess(self, input_dicts, fetch_dict, log_id):
rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True) rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
res_lst = [] res_lst = []
for res in rec_res: for res in rec_res:
res_lst.append(res[0]) res_lst.append(res[0])
res = {"res": str(res_lst)} res = {"res": str(res_lst)}
return res return res, None, ""
class OcrService(WebService): class OcrService(WebService):
......
...@@ -15,5 +15,5 @@ python web_service.py &>log.txt & ...@@ -15,5 +15,5 @@ python web_service.py &>log.txt &
## Http test ## Http test
``` ```
curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' curl -X POST -k http://localhost:18082/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
``` ```
...@@ -15,5 +15,5 @@ python web_service.py &>log.txt & ...@@ -15,5 +15,5 @@ python web_service.py &>log.txt &
## 测试 ## 测试
``` ```
curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' curl -X POST -k http://localhost:18082/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
``` ```
worker_num: 4 #worker_num, the maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own gRPC server and DAG
http_port: 18080 ##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the gRPC thread pool of the main thread
worker_num: 1
#HTTP port. rpc_port and http_port must not both be empty. When rpc_port is available and http_port is empty, http_port is not generated automatically
http_port: 18082
dag: dag:
is_thread_op: false #Op resource type: True for the thread model, False for the process model
is_thread_op: False
op: op:
uci: uci:
#When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf: local_service_conf:
#Concurrency; thread-level concurrency when is_thread_op=True, otherwise process-level
concurrency: 2
#Path of the uci model
model_config: uci_housing_model model_config: uci_housing_model
devices: "" # "0,1"
#Compute device IDs. CPU prediction when devices is "" or unset; GPU prediction when devices is "0" or "0,1,2", indicating which GPU cards to use
devices: "0" # "0,1"
#Client type: brpc, grpc or local_predictor. local_predictor does not start a Serving service and predicts in-process
client_type: local_predictor
#Fetch result list, using the alias_name of fetch_var in client_config
fetch_list: ["price"]
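Since config.yml above is plain YAML, it can be inspected or modified programmatically. The sketch below is not part of the commit and assumes the pyyaml package; it switches the example back to CPU prediction by clearing devices.

```python
import yaml

with open("config.yml") as f:
    conf = yaml.safe_load(f)
conf["op"]["uci"]["local_service_conf"]["devices"] = ""  # "" selects CPU prediction
with open("config.yml", "w") as f:
    yaml.safe_dump(conf, f)  # note: comments are not preserved by safe_dump
```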
...@@ -17,6 +17,7 @@ except ImportError: ...@@ -17,6 +17,7 @@ except ImportError:
from paddle_serving_server.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import sys
_LOGGER = logging.getLogger() _LOGGER = logging.getLogger()
...@@ -25,19 +26,32 @@ class UciOp(Op): ...@@ -25,19 +26,32 @@ class UciOp(Op):
def init_op(self): def init_op(self):
self.separator = "," self.separator = ","
def preprocess(self, input_dicts): def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items() (_, input_dict), = input_dicts.items()
_LOGGER.info(input_dict) _LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format(
log_id, input_dict))
x_value = input_dict["x"] x_value = input_dict["x"]
proc_dict = {}
if sys.version_info.major == 2:
if isinstance(x_value, (str, unicode)): if isinstance(x_value, (str, unicode)):
input_dict["x"] = np.array( input_dict["x"] = np.array(
[float(x.strip()) for x in x_value.split(self.separator)]) [float(x.strip())
return input_dict for x in x_value.split(self.separator)]).reshape(1, 13)
_LOGGER.error("input_dict:{}".format(input_dict))
else:
if isinstance(x_value, str):
input_dict["x"] = np.array(
[float(x.strip())
for x in x_value.split(self.separator)]).reshape(1, 13)
_LOGGER.error("input_dict:{}".format(input_dict))
return input_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict): def postprocess(self, input_dicts, fetch_dict, log_id):
# _LOGGER.info(fetch_dict) _LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format(
log_id, fetch_dict))
fetch_dict["price"] = str(fetch_dict["price"][0][0]) fetch_dict["price"] = str(fetch_dict["price"][0][0])
return fetch_dict return fetch_dict, None, ""
class UciService(WebService): class UciService(WebService):
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
from paddle_serving_app.reader import Sequential, File2Image, Resize, CenterCrop from paddle_serving_app.reader import Sequential, File2Image, Resize, CenterCrop
from paddle_serving_app.reader import RGB2BGR, Transpose, Div, Normalize from paddle_serving_app.reader import RGB2BGR, Transpose, Div, Normalize
from paddle_serving_app.local_predict import Debugger from paddle_serving_app.local_predict import LocalPredictor
import sys import sys
debugger = Debugger() debugger = LocalPredictor()
debugger.load_model_config(sys.argv[1], gpu=True) debugger.load_model_config(sys.argv[1], gpu=True)
seq = Sequential([ seq = Sequential([
......
...@@ -18,7 +18,7 @@ from paddle_serving_client import Client ...@@ -18,7 +18,7 @@ from paddle_serving_client import Client
from paddle_serving_app.reader import LACReader, SentaReader from paddle_serving_app.reader import LACReader, SentaReader
import os import os
import sys import sys
import numpy as np
#senta_web_service.py #senta_web_service.py
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
from paddle_serving_client import Client from paddle_serving_client import Client
...@@ -36,20 +36,36 @@ class SentaService(WebService): ...@@ -36,20 +36,36 @@ class SentaService(WebService):
#定义senta模型预测服务的预处理,调用顺序:lac reader->lac模型预测->预测结果后处理->senta reader #定义senta模型预测服务的预处理,调用顺序:lac reader->lac模型预测->预测结果后处理->senta reader
def preprocess(self, feed=[], fetch=[]): def preprocess(self, feed=[], fetch=[]):
feed_data = [{
"words": self.lac_reader.process(x["words"])
} for x in feed]
lac_result = self.lac_client.predict(
feed=feed_data, fetch=["crf_decode"])
feed_batch = [] feed_batch = []
is_batch = True
words_lod = [0]
for ins in feed:
if "words" not in ins:
raise ("feed data error!")
feed_data = self.lac_reader.process(ins["words"])
words_lod.append(words_lod[-1] + len(feed_data))
feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1))
words = np.concatenate(feed_batch, axis=0)
lac_result = self.lac_client.predict(
feed={"words": words,
"words.lod": words_lod},
fetch=["crf_decode"],
batch=True)
result_lod = lac_result["crf_decode.lod"] result_lod = lac_result["crf_decode.lod"]
feed_batch = []
words_lod = [0]
for i in range(len(feed)): for i in range(len(feed)):
segs = self.lac_reader.parse_result( segs = self.lac_reader.parse_result(
feed[i]["words"], feed[i]["words"],
lac_result["crf_decode"][result_lod[i]:result_lod[i + 1]]) lac_result["crf_decode"][result_lod[i]:result_lod[i + 1]])
feed_data = self.senta_reader.process(segs) feed_data = self.senta_reader.process(segs)
feed_batch.append({"words": feed_data}) feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1))
return feed_batch, fetch words_lod.append(words_lod[-1] + len(feed_data))
return {
"words": np.concatenate(feed_batch),
"words.lod": words_lod
}, fetch, is_batch
senta_service = SentaService(name="senta") senta_service = SentaService(name="senta")
......
#UNET_BENCHMARK usage notes
## Features
* benchmark testing
## Notes
* Place the sample images (one or more) under the img_data path; jpg and jpeg are supported
* The number of images should be greater than or equal to the concurrency
## TODO
* http benchmark
#!/bin/bash
python unet_benchmark.py --thread 1 --batch_size 1 --model ../unet_client/serving_client_conf.prototxt
# thread/batch can be modified as you wish
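The benchmark script also reads two optional environment flags; the command below is an assumed variant of benchmark.sh, not part of the commit.

```shell
# FLAGS_serving_latency=1 prints a per-request latency summary,
# FLAGS_profile_client=1 writes PROFILE timing lines to stderr.
FLAGS_serving_latency=1 FLAGS_profile_client=1 python unet_benchmark.py \
    --thread 2 --batch_size 1 --model ../unet_client/serving_client_conf.prototxt
```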
# -*- coding: utf-8 -*-
#
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
unet benchmark script
20201130 first edition by cg82616424
"""
from __future__ import unicode_literals, absolute_import
import os
import sys  # needed for the sys.stderr profiling output below
import time
import json
import requests
from paddle_serving_client import Client
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency
from paddle_serving_app.reader import Sequential, File2Image, Resize, Transpose, BGR2RGB, SegPostprocess
args = benchmark_args()
def get_img_names(path):
"""
Brief:
get image files (jpg/jpeg) under this path;
return None if the path does not exist or is not a directory
Args:
path (string): image file path
Returns:
list: images names under this folder
"""
if not os.path.exists(path):
return None
if not os.path.isdir(path):
return None
list_name = []
for f_handler in os.listdir(path):
file_path = os.path.join(path, f_handler)
if os.path.isdir(file_path):
continue
else:
if not file_path.endswith(".jpeg") and not file_path.endswith(
".jpg"):
continue
list_name.append(file_path)
return list_name
def preprocess_img(img_list):
"""
Brief:
prepare img data for benchmark
Args:
img_list(list): list of image file paths
Returns:
list of preprocessed image arrays
"""
preprocess = Sequential([File2Image(), Resize((512, 512))])
result_list = []
for img in img_list:
img_tmp = preprocess(img)
result_list.append(img_tmp)
return result_list
def benckmark_worker(idx, resource):
"""
Brief:
benchmark single worker for unet
Args:
idx(int): worker idx, used to select the backend unet service endpoint
resource(dict): unet serving endpoint dict
Returns:
latency
TODO:
http benchmarks
"""
profile_flags = False
latency_flags = False
postprocess = SegPostprocess(2)
if os.getenv("FLAGS_profile_client"):
profile_flags = True
if os.getenv("FLAGS_serving_latency"):
latency_flags = True
latency_list = []
client_handler = Client()
client_handler.load_client_config(args.model)
client_handler.connect(
[resource["endpoint"][idx % len(resource["endpoint"])]])
start = time.time()
turns = resource["turns"]
img_list = resource["img_list"]
for i in range(turns):
if args.batch_size >= 1:
l_start = time.time()
feed_batch = []
b_start = time.time()
for bi in range(args.batch_size):
feed_batch.append({"image": img_list[bi]})
b_end = time.time()
if profile_flags:
sys.stderr.write(
"PROFILE\tpid:{}\tunt_pre_0:{} unet_pre_1:{}\n".format(
os.getpid(),
int(round(b_start * 1000000)),
int(round(b_end * 1000000))))
result = client_handler.predict(
feed={"image": img_list[bi]}, fetch=["output"])
#result["filename"] = "./img_data/N0060.jpg" % (os.getpid(), idx, time.time())
#postprocess(result) # if you want to measure post process time, you have to uncomment this line
l_end = time.time()
if latency_flags:
latency_list.append(l_end * 1000 - l_start * 1000)
else:
print("unsupport batch size {}".format(args.batch_size))
end = time.time()
if latency_flags:
return [[end - start], latency_list]
else:
return [[end - start]]
if __name__ == '__main__':
"""
usage:
"""
img_file_list = get_img_names("./img_data")
img_content_list = preprocess_img(img_file_list)
multi_thread_runner = MultiThreadRunner()
endpoint_list = ["127.0.0.1:9494"]
turns = 1
start = time.time()
result = multi_thread_runner.run(benckmark_worker, args.thread, {
"endpoint": endpoint_list,
"turns": turns,
"img_list": img_content_list
})
end = time.time()
total_cost = end - start
avg_cost = 0
for i in range(args.thread):
avg_cost += result[0][i]
avg_cost = avg_cost / args.thread
print("total cost: {}s".format(total_cost))
print("each thread cost: {}s. ".format(avg_cost))
print("qps: {}samples/s".format(args.batch_size * args.thread * turns /
total_cost))
if os.getenv("FLAGS_serving_latency"):
show_latency(result[1])
...@@ -35,6 +35,7 @@ fetch_map = client.predict( ...@@ -35,6 +35,7 @@ fetch_map = client.predict(
"image": im, "image": im,
"im_size": np.array(list(im.shape[1:])), "im_size": np.array(list(im.shape[1:])),
}, },
fetch=["save_infer_model/scale_0.tmp_0"]) fetch=["save_infer_model/scale_0.tmp_0"],
batch=False)
fetch_map["image"] = sys.argv[1] fetch_map["image"] = sys.argv[1]
postprocess(fetch_map) postprocess(fetch_map)
...@@ -160,10 +160,10 @@ Therefore, a local prediction tool is built into the paddle_serving_app, which i ...@@ -160,10 +160,10 @@ Therefore, a local prediction tool is built into the paddle_serving_app, which i
Taking [fit_a_line prediction service](../examples/fit_a_line) as an example, the following code can be used to run local prediction. Taking [fit_a_line prediction service](../examples/fit_a_line) as an example, the following code can be used to run local prediction.
```python ```python
from paddle_serving_app.local_predict import Debugger from paddle_serving_app.local_predict import LocalPredictor
import numpy as np import numpy as np
debugger = Debugger() debugger = LocalPredictor()
debugger.load_model_config("./uci_housing_model", gpu=False) debugger.load_model_config("./uci_housing_model", gpu=False)
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332] -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
......
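The hunk above cuts off before the prediction call. As a hedged illustration (not the elided documentation text), a complete local prediction for this model could finish roughly as follows, using the LocalPredictor API introduced in this commit and the "x"/"price" variable names from the uci_housing example:

```python
from paddle_serving_app.local_predict import LocalPredictor
import numpy as np

predictor = LocalPredictor()
predictor.load_model_config("./uci_housing_model", use_gpu=False)
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
        -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
fetch_map = predictor.predict(
    feed={"x": np.array(data).reshape(1, 13)}, fetch=["price"], batch=True)
print(fetch_map["price"])
```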
...@@ -147,10 +147,10 @@ Paddle Serving框架的server预测op使用了Paddle 的预测框架,在部署 ...@@ -147,10 +147,10 @@ Paddle Serving框架的server预测op使用了Paddle 的预测框架,在部署
[fit_a_line预测服务](../examples/fit_a_line)为例,使用以下代码即可执行本地预测。 [fit_a_line预测服务](../examples/fit_a_line)为例,使用以下代码即可执行本地预测。
```python ```python
from paddle_serving_app.local_predict import Debugger from paddle_serving_app.local_predict import LocalPredictor
import numpy as np import numpy as np
debugger = Debugger() debugger = LocalPredictor()
debugger.load_model_config("./uci_housing_model", gpu=False) debugger.load_model_config("./uci_housing_model", gpu=False)
data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332] -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
......
...@@ -31,7 +31,13 @@ logger = logging.getLogger("fluid") ...@@ -31,7 +31,13 @@ logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
class Debugger(object): class LocalPredictor(object):
"""
Prediction in the current process of the local environment (in-process
call). Compared with RPC/HTTP, LocalPredictor has better performance
because there is no network or serialization overhead.
"""
def __init__(self): def __init__(self):
self.feed_names_ = [] self.feed_names_ = []
self.fetch_names_ = [] self.fetch_names_ = []
...@@ -42,13 +48,41 @@ class Debugger(object): ...@@ -42,13 +48,41 @@ class Debugger(object):
self.fetch_names_to_idx_ = {} self.fetch_names_to_idx_ = {}
self.fetch_names_to_type_ = {} self.fetch_names_to_type_ = {}
def load_model_config(self, model_path, gpu=False, profile=True, cpu_num=1): def load_model_config(self,
model_path,
use_gpu=False,
gpu_id=0,
use_profile=False,
thread_num=1,
mem_optim=True,
ir_optim=False,
use_trt=False,
use_feed_fetch_ops=False):
"""
Load model config and set the engine config for the paddle predictor
Args:
model_path: model config path.
use_gpu: run prediction on GPU, False by default.
gpu_id: GPU card id, 0 by default.
use_profile: enable the predictor profiler, False by default.
thread_num: number of CPU math library threads, 1 by default.
mem_optim: enable memory optimization, True by default.
ir_optim: enable IR (computation graph) optimization, False by default.
use_trt: enable NVIDIA TensorRT optimization, False by default.
use_feed_fetch_ops: use feed/fetch ops, False by default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path) client_config = "{}/serving_server_conf.prototxt".format(model_path)
model_conf = m_config.GeneralModelConfig() model_conf = m_config.GeneralModelConfig()
f = open(client_config, 'r') f = open(client_config, 'r')
model_conf = google.protobuf.text_format.Merge( model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf) str(f.read()), model_conf)
config = AnalysisConfig(model_path) config = AnalysisConfig(model_path)
logger.info("load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_feed_fetch_ops:{}".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_feed_fetch_ops))
self.feed_names_ = [var.alias_name for var in model_conf.feed_var] self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var] self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
...@@ -64,19 +98,43 @@ class Debugger(object): ...@@ -64,19 +98,43 @@ class Debugger(object):
self.fetch_names_to_idx_[var.alias_name] = i self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type self.fetch_names_to_type_[var.alias_name] = var.fetch_type
if not gpu: if use_profile:
config.disable_gpu()
else:
config.enable_use_gpu(100, 0)
if profile:
config.enable_profile() config.enable_profile()
if mem_optim:
config.enable_memory_optim()
config.switch_ir_optim(ir_optim)
config.set_cpu_math_library_num_threads(thread_num)
config.switch_use_feed_fetch_ops(use_feed_fetch_ops)
config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
config.set_cpu_math_library_num_threads(cpu_num)
config.switch_ir_optim(False) if not use_gpu:
config.switch_use_feed_fetch_ops(False) config.disable_gpu()
else:
config.enable_use_gpu(100, gpu_id)
if use_trt:
config.enable_tensorrt_engine(
workspace_size=1 << 20,
max_batch_size=32,
min_subgraph_size=3,
use_static=False,
use_calib_mode=False)
self.predictor = create_paddle_predictor(config) self.predictor = create_paddle_predictor(config)
def predict(self, feed=None, fetch=None): def predict(self, feed=None, fetch=None, batch=False, log_id=0):
"""
Predict locally
Args:
feed: dict of input variables (numpy arrays).
fetch: list of fetch variable names.
batch: whether the feed is already batched, False by default. If batch
is False, a new dimension is prepended to the shape (np.newaxis).
log_id: id used for logging.
Returns:
fetch_map: dict
"""
if feed is None or fetch is None: if feed is None or fetch is None:
raise ValueError("You should specify feed and fetch for prediction") raise ValueError("You should specify feed and fetch for prediction")
fetch_list = [] fetch_list = []
...@@ -121,9 +179,18 @@ class Debugger(object): ...@@ -121,9 +179,18 @@ class Debugger(object):
name]) name])
if self.feed_types_[name] == 0: if self.feed_types_[name] == 0:
feed[name] = feed[name].astype("int64") feed[name] = feed[name].astype("int64")
else: elif self.feed_types_[name] == 1:
feed[name] = feed[name].astype("float32") feed[name] = feed[name].astype("float32")
elif self.feed_types_[name] == 2:
feed[name] = feed[name].astype("int32")
else:
raise ValueError("local predictor receives wrong data type")
input_tensor = self.predictor.get_input_tensor(name) input_tensor = self.predictor.get_input_tensor(name)
if "{}.lod".format(name) in feed:
input_tensor.set_lod([feed["{}.lod".format(name)]])
if batch == False:
input_tensor.copy_from_cpu(feed[name][np.newaxis, :])
else:
input_tensor.copy_from_cpu(feed[name]) input_tensor.copy_from_cpu(feed[name])
output_tensors = [] output_tensors = []
output_names = self.predictor.get_output_names() output_names = self.predictor.get_output_names()
...@@ -139,5 +206,6 @@ class Debugger(object): ...@@ -139,5 +206,6 @@ class Debugger(object):
for i, name in enumerate(fetch): for i, name in enumerate(fetch):
fetch_map[name] = outputs[i] fetch_map[name] = outputs[i]
if len(output_tensors[i].lod()) > 0: if len(output_tensors[i].lod()) > 0:
fetch_map[name + ".lod"] = output_tensors[i].lod()[0] fetch_map[name + ".lod"] = np.array(output_tensors[i].lod()[
0]).astype('int32')
return fetch_map return fetch_map
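To exercise the new engine options documented in load_model_config above, a call might look like this sketch (the model directory is a placeholder, and use_trt requires the TensorRT-enabled GPU build selected by the cuda_version == "trt" branch later in this commit):

```python
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config(
    "serving_server",  # placeholder model directory
    use_gpu=True,
    gpu_id=0,
    mem_optim=True,
    ir_optim=False,
    use_trt=True)
```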
...@@ -19,3 +19,5 @@ from .image_reader import RCNNPostprocess, SegPostprocess, PadStride, BlazeFaceP ...@@ -19,3 +19,5 @@ from .image_reader import RCNNPostprocess, SegPostprocess, PadStride, BlazeFaceP
from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
from .lac_reader import LACReader from .lac_reader import LACReader
from .senta_reader import SentaReader from .senta_reader import SentaReader
#from .imdb_reader import IMDBDataset
from .ocr_reader import OCRReader
...@@ -22,18 +22,17 @@ import yaml ...@@ -22,18 +22,17 @@ import yaml
import copy import copy
import argparse import argparse
import logging import logging
import paddle.fluid as fluid
import json import json
FORMAT = '%(asctime)s-%(levelname)s: %(message)s' FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT) logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
precision_map = { #precision_map = {
'trt_int8': fluid.core.AnalysisConfig.Precision.Int8, # 'trt_int8': fluid.core.AnalysisConfig.Precision.Int8,
'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32, # 'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
'trt_fp16': fluid.core.AnalysisConfig.Precision.Half # 'trt_fp16': fluid.core.AnalysisConfig.Precision.Half
} #}
class Resize(object): class Resize(object):
......
...@@ -233,7 +233,12 @@ class Client(object): ...@@ -233,7 +233,12 @@ class Client(object):
# key)) # key))
pass pass
def predict(self, feed=None, fetch=None, need_variant_tag=False, log_id=0): def predict(self,
feed=None,
fetch=None,
batch=False,
need_variant_tag=False,
log_id=0):
self.profile_.record('py_prepro_0') self.profile_.record('py_prepro_0')
if feed is None or fetch is None: if feed is None or fetch is None:
...@@ -260,7 +265,10 @@ class Client(object): ...@@ -260,7 +265,10 @@ class Client(object):
int_feed_names = [] int_feed_names = []
float_feed_names = [] float_feed_names = []
int_shape = [] int_shape = []
int_lod_slot_batch = []
float_lod_slot_batch = []
float_shape = [] float_shape = []
fetch_names = [] fetch_names = []
counter = 0 counter = 0
batch_size = len(feed_batch) batch_size = len(feed_batch)
...@@ -277,31 +285,56 @@ class Client(object): ...@@ -277,31 +285,56 @@ class Client(object):
for i, feed_i in enumerate(feed_batch): for i, feed_i in enumerate(feed_batch):
int_slot = [] int_slot = []
float_slot = [] float_slot = []
int_lod_slot = []
float_lod_slot = []
for key in feed_i: for key in feed_i:
if key not in self.feed_names_: if ".lod" not in key and key not in self.feed_names_:
raise ValueError("Wrong feed name: {}.".format(key)) raise ValueError("Wrong feed name: {}.".format(key))
if ".lod" in key:
continue
#if not isinstance(feed_i[key], np.ndarray): #if not isinstance(feed_i[key], np.ndarray):
self.shape_check(feed_i, key) self.shape_check(feed_i, key)
if self.feed_types_[key] in int_type: if self.feed_types_[key] in int_type:
if i == 0: if i == 0:
int_feed_names.append(key) int_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = feed_i[key][np.newaxis, :]
if isinstance(feed_i[key], np.ndarray): if isinstance(feed_i[key], np.ndarray):
int_shape.append(list(feed_i[key].shape)) shape_lst.extend(list(feed_i[key].shape))
int_shape.append(shape_lst)
else: else:
int_shape.append(self.feed_shapes_[key]) int_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
int_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
int_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray): if isinstance(feed_i[key], np.ndarray):
int_slot.append(feed_i[key]) int_slot.append(feed_i[key])
self.has_numpy_input = True self.has_numpy_input = True
else: else:
int_slot.append(feed_i[key]) int_slot.append(feed_i[key])
self.all_numpy_input = False self.all_numpy_input = False
elif self.feed_types_[key] in float_type: elif self.feed_types_[key] in float_type:
if i == 0: if i == 0:
float_feed_names.append(key) float_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = feed_i[key][np.newaxis, :]
if isinstance(feed_i[key], np.ndarray): if isinstance(feed_i[key], np.ndarray):
float_shape.append(list(feed_i[key].shape)) shape_lst.extend(list(feed_i[key].shape))
float_shape.append(shape_lst)
else: else:
float_shape.append(self.feed_shapes_[key]) float_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
float_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
float_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray): if isinstance(feed_i[key], np.ndarray):
float_slot.append(feed_i[key]) float_slot.append(feed_i[key])
self.has_numpy_input = True self.has_numpy_input = True
...@@ -310,6 +343,8 @@ class Client(object): ...@@ -310,6 +343,8 @@ class Client(object):
self.all_numpy_input = False self.all_numpy_input = False
int_slot_batch.append(int_slot) int_slot_batch.append(int_slot)
float_slot_batch.append(float_slot) float_slot_batch.append(float_slot)
int_lod_slot_batch.append(int_lod_slot)
float_lod_slot_batch.append(float_lod_slot)
self.profile_.record('py_prepro_1') self.profile_.record('py_prepro_1')
self.profile_.record('py_client_infer_0') self.profile_.record('py_client_infer_0')
...@@ -317,14 +352,13 @@ class Client(object): ...@@ -317,14 +352,13 @@ class Client(object):
result_batch_handle = self.predictorres_constructor() result_batch_handle = self.predictorres_constructor()
if self.all_numpy_input: if self.all_numpy_input:
res = self.client_handle_.numpy_predict( res = self.client_handle_.numpy_predict(
float_slot_batch, float_feed_names, float_shape, int_slot_batch, float_slot_batch, float_feed_names, float_shape,
int_feed_names, int_shape, fetch_names, result_batch_handle, float_lod_slot_batch, int_slot_batch, int_feed_names, int_shape,
self.pid, log_id) int_lod_slot_batch, fetch_names, result_batch_handle, self.pid,
log_id)
elif self.has_numpy_input == False: elif self.has_numpy_input == False:
res = self.client_handle_.batch_predict( raise ValueError(
float_slot_batch, float_feed_names, float_shape, int_slot_batch, "Please make sure all of your inputs are numpy array")
int_feed_names, int_shape, fetch_names, result_batch_handle,
self.pid, log_id)
else: else:
raise ValueError( raise ValueError(
"Please make sure the inputs are all in list type or all in numpy.array type" "Please make sure the inputs are all in list type or all in numpy.array type"
...@@ -354,8 +388,9 @@ class Client(object): ...@@ -354,8 +388,9 @@ class Client(object):
name)) name))
result_map[name].shape = shape result_map[name].shape = shape
if name in self.lod_tensor_set: if name in self.lod_tensor_set:
result_map["{}.lod".format( tmp_lod = result_batch_handle.get_lod(mi, name)
name)] = result_batch_handle.get_lod(mi, name) if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == float32_type: elif self.fetch_names_to_type_[name] == float32_type:
result_map[name] = result_batch_handle.get_float_by_name( result_map[name] = result_batch_handle.get_float_by_name(
mi, name) mi, name)
...@@ -367,9 +402,9 @@ class Client(object): ...@@ -367,9 +402,9 @@ class Client(object):
shape = result_batch_handle.get_shape(mi, name) shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape result_map[name].shape = shape
if name in self.lod_tensor_set: if name in self.lod_tensor_set:
result_map["{}.lod".format( tmp_lod = result_batch_handle.get_lod(mi, name)
name)] = result_batch_handle.get_lod(mi, name) if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == int32_type: elif self.fetch_names_to_type_[name] == int32_type:
# result_map[name] will be py::array(numpy array) # result_map[name] will be py::array(numpy array)
result_map[name] = result_batch_handle.get_int32_by_name( result_map[name] = result_batch_handle.get_int32_by_name(
...@@ -382,8 +417,9 @@ class Client(object): ...@@ -382,8 +417,9 @@ class Client(object):
shape = result_batch_handle.get_shape(mi, name) shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape result_map[name].shape = shape
if name in self.lod_tensor_set: if name in self.lod_tensor_set:
result_map["{}.lod".format( tmp_lod = result_batch_handle.get_lod(mi, name)
name)] = result_batch_handle.get_lod(mi, name) if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
multi_result_map.append(result_map) multi_result_map.append(result_map)
ret = None ret = None
if len(model_engine_names) == 1: if len(model_engine_names) == 1:
......
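The lod-aware feed handling added above can be driven from user code roughly as follows. This is a sketch: the config path, endpoint and word ids are placeholders, while the "words"/"words.lod"/"crf_decode" names and batch=True follow the senta/lac example earlier in this commit.

```python
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("lac_client/serving_client_conf.prototxt")  # placeholder path
client.connect(["127.0.0.1:9292"])  # placeholder endpoint

words = np.array([8, 233, 52, 601], dtype="int64").reshape(4, 1)  # one variable-length sample
fetch_map = client.predict(
    feed={"words": words, "words.lod": [0, 4]},  # lod marks sample boundaries
    fetch=["crf_decode"],
    batch=True)
print(fetch_map)
```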
...@@ -74,7 +74,8 @@ def save_model(server_model_folder, ...@@ -74,7 +74,8 @@ def save_model(server_model_folder,
fetch_var = model_conf.FetchVar() fetch_var = model_conf.FetchVar()
fetch_var.alias_name = key fetch_var.alias_name = key
fetch_var.name = fetch_var_dict[key].name fetch_var.name = fetch_var_dict[key].name
fetch_var.is_lod_tensor = fetch_var_dict[key].lod_level >= 1 #fetch_var.is_lod_tensor = fetch_var_dict[key].lod_level >= 1
fetch_var.is_lod_tensor = 1
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64: if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64:
fetch_var.fetch_type = 0 fetch_var.fetch_type = 0
if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32: if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32:
...@@ -91,9 +92,12 @@ def save_model(server_model_folder, ...@@ -91,9 +92,12 @@ def save_model(server_model_folder,
fetch_var.shape.extend(tmp_shape) fetch_var.shape.extend(tmp_shape)
config.fetch_var.extend([fetch_var]) config.fetch_var.extend([fetch_var])
cmd = "mkdir -p {}".format(client_config_folder) try:
save_dirname = os.path.normpath(client_config_folder)
os.system(cmd) os.makedirs(save_dirname)
except OSError as e:
if e.errno != errno.EEXIST:
raise
with open("{}/serving_client_conf.prototxt".format(client_config_folder), with open("{}/serving_client_conf.prototxt".format(client_config_folder),
"w") as fout: "w") as fout:
fout.write(str(config)) fout.write(str(config))
......
...@@ -23,13 +23,13 @@ import paddle_serving_server as paddle_serving_server ...@@ -23,13 +23,13 @@ import paddle_serving_server as paddle_serving_server
from .version import serving_server_version from .version import serving_server_version
from contextlib import closing from contextlib import closing
import collections import collections
import fcntl
import shutil import shutil
import numpy as np import numpy as np
import grpc import grpc
from .proto import multi_lang_general_model_service_pb2 from .proto import multi_lang_general_model_service_pb2
import sys import sys
if sys.platform.startswith('win') is False:
import fcntl
sys.path.append( sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto')) os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc from .proto import multi_lang_general_model_service_pb2_grpc
...@@ -584,7 +584,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. ...@@ -584,7 +584,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
else: else:
raise Exception("error type.") raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape)) tensor.shape.extend(list(model_result[name].shape))
if name in self.lod_tensor_set_: if "{}.lod".format(name) in model_result:
tensor.lod.extend(model_result["{}.lod".format(name)] tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist()) .tolist())
inst.tensor_array.append(tensor) inst.tensor_array.append(tensor)
......
...@@ -52,6 +52,20 @@ class WebService(object): ...@@ -52,6 +52,20 @@ class WebService(object):
def load_model_config(self, model_config): def load_model_config(self, model_config):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
self.model_config = model_config self.model_config = model_config
import os
from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format
if os.path.isdir(model_config):
client_config = "{}/serving_server_conf.prototxt".format(
model_config)
elif os.path.isfile(model_config):
client_config = model_config
model_conf = m_config.GeneralModelConfig()
f = open(client_config, 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.feed_names = [var.alias_name for var in model_conf.feed_var]
self.fetch_names = [var.alias_name for var in model_conf.fetch_var]
def _launch_rpc_service(self): def _launch_rpc_service(self):
op_maker = OpMaker() op_maker = OpMaker()
...@@ -112,13 +126,14 @@ class WebService(object): ...@@ -112,13 +126,14 @@ class WebService(object):
if "fetch" not in request.json: if "fetch" not in request.json:
abort(400) abort(400)
try: try:
feed, fetch = self.preprocess(request.json["feed"], feed, fetch, is_batch = self.preprocess(request.json["feed"],
request.json["fetch"]) request.json["fetch"])
if isinstance(feed, dict) and "fetch" in feed: if isinstance(feed, dict) and "fetch" in feed:
del feed["fetch"] del feed["fetch"]
if len(feed) == 0: if len(feed) == 0:
raise ValueError("empty input") raise ValueError("empty input")
fetch_map = self.client.predict(feed=feed, fetch=fetch) fetch_map = self.client.predict(
feed=feed, fetch=fetch, batch=is_batch)
result = self.postprocess( result = self.postprocess(
feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map) feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
result = {"result": result} result = {"result": result}
...@@ -171,24 +186,22 @@ class WebService(object): ...@@ -171,24 +186,22 @@ class WebService(object):
self.app_instance = app_instance self.app_instance = app_instance
def _launch_local_predictor(self): def _launch_local_predictor(self):
from paddle_serving_app.local_predict import Debugger from paddle_serving_app.local_predict import LocalPredictor
self.client = Debugger() self.client = LocalPredictor()
self.client.load_model_config( self.client.load_model_config(
"{}".format(self.model_config), gpu=False, profile=False) "{}".format(self.model_config), use_gpu=False)
def run_web_service(self): def run_web_service(self):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
self.app_instance.run(host="0.0.0.0", self.app_instance.run(host="0.0.0.0", port=self.port, threaded=True)
port=self.port,
threaded=False,
processes=1)
def get_app_instance(self): def get_app_instance(self):
return self.app_instance return self.app_instance
def preprocess(self, feed=[], fetch=[]): def preprocess(self, feed=[], fetch=[]):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
return feed, fetch is_batch = True
return feed, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map=None): def postprocess(self, feed=[], fetch=[], fetch_map=None):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
......
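As the diff above shows, preprocess now returns a third value (is_batch) that is forwarded to predict(). A hypothetical override following the new contract could look like this sketch (the class name and the 13-feature "x" input are illustrative, modeled on the fit_a_line/uci_housing example):

```python
import numpy as np
from paddle_serving_server.web_service import WebService

class UciWebService(WebService):
    def preprocess(self, feed=[], fetch=[]):
        # stack all request instances into one (N, 13) batch and mark it as batched
        batch = np.array([ins["x"] for ins in feed], dtype="float32").reshape(-1, 13)
        return {"x": batch}, fetch, True
```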
...@@ -25,7 +25,9 @@ from .version import serving_server_version ...@@ -25,7 +25,9 @@ from .version import serving_server_version
from contextlib import closing from contextlib import closing
import argparse import argparse
import collections import collections
import fcntl import sys
if sys.platform.startswith('win') is False:
import fcntl
import shutil import shutil
import numpy as np import numpy as np
import grpc import grpc
...@@ -73,6 +75,8 @@ def serve_args(): ...@@ -73,6 +75,8 @@ def serve_args():
default=False, default=False,
action="store_true", action="store_true",
help="Use Multi-language-service") help="Use Multi-language-service")
parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument( parser.add_argument(
"--product_name", "--product_name",
type=str, type=str,
...@@ -205,6 +209,7 @@ class Server(object): ...@@ -205,6 +209,7 @@ class Server(object):
self.cur_path = os.getcwd() self.cur_path = os.getcwd()
self.use_local_bin = False self.use_local_bin = False
self.gpuid = 0 self.gpuid = 0
self.use_trt = False
self.model_config_paths = None # for multi-model in a workflow self.model_config_paths = None # for multi-model in a workflow
self.product_name = None self.product_name = None
self.container_id = None self.container_id = None
...@@ -271,6 +276,9 @@ class Server(object): ...@@ -271,6 +276,9 @@ class Server(object):
def set_gpuid(self, gpuid=0): def set_gpuid(self, gpuid=0):
self.gpuid = gpuid self.gpuid = gpuid
def set_trt(self):
self.use_trt = True
def _prepare_engine(self, model_config_paths, device): def _prepare_engine(self, model_config_paths, device):
if self.model_toolkit_conf == None: if self.model_toolkit_conf == None:
self.model_toolkit_conf = server_sdk.ModelToolkitConf() self.model_toolkit_conf = server_sdk.ModelToolkitConf()
...@@ -290,6 +298,7 @@ class Server(object): ...@@ -290,6 +298,7 @@ class Server(object):
engine.enable_ir_optimization = self.ir_optimization engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False engine.static_optimization = False
engine.force_update_static_cache = False engine.force_update_static_cache = False
engine.use_trt = self.use_trt
if device == "cpu": if device == "cpu":
engine.type = "FLUID_CPU_ANALYSIS_DIR" engine.type = "FLUID_CPU_ANALYSIS_DIR"
...@@ -396,7 +405,10 @@ class Server(object): ...@@ -396,7 +405,10 @@ class Server(object):
for line in version_file.readlines(): for line in version_file.readlines():
if re.match("cuda_version", line): if re.match("cuda_version", line):
cuda_version = line.split("\"")[1] cuda_version = line.split("\"")[1]
if cuda_version != "trt":
device_version = "serving-gpu-cuda" + cuda_version + "-" device_version = "serving-gpu-cuda" + cuda_version + "-"
else:
device_version = "serving-gpu-" + cuda_version + "-"
folder_name = device_version + serving_server_version folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz" tar_name = folder_name + ".tar.gz"
...@@ -645,7 +657,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. ...@@ -645,7 +657,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
else: else:
raise Exception("error type.") raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape)) tensor.shape.extend(list(model_result[name].shape))
if name in self.lod_tensor_set_: if "{}.lod".format(name) in model_result:
tensor.lod.extend(model_result["{}.lod".format(name)] tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist()) .tolist())
inst.tensor_array.append(tensor) inst.tensor_array.append(tensor)
......
...@@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss ...@@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
server.set_memory_optimize(mem_optim) server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim) server.set_ir_optimize(ir_optim)
server.set_max_body_size(max_body_size) server.set_max_body_size(max_body_size)
if args.use_trt:
server.set_trt()
if args.product_name != None: if args.product_name != None:
server.set_product_name(args.product_name) server.set_product_name(args.product_name)
......
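With --use_trt wired into start_gpu_card_model above, a TensorRT-enabled server could be launched roughly as below. This is an assumed command line: --model/--port/--gpu_ids are taken to be the existing serve options, only --use_trt is added by this commit, and it requires the "trt" GPU server build selected by the cuda_version check above.

```shell
python -m paddle_serving_server_gpu.serve \
    --model uci_housing_model --port 9292 --gpu_ids 0 --use_trt
```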
...@@ -58,6 +58,20 @@ class WebService(object): ...@@ -58,6 +58,20 @@ class WebService(object):
def load_model_config(self, model_config): def load_model_config(self, model_config):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
self.model_config = model_config self.model_config = model_config
import os
from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format
if os.path.isdir(model_config):
client_config = "{}/serving_server_conf.prototxt".format(
model_config)
elif os.path.isfile(model_config):
client_config = model_config
model_conf = m_config.GeneralModelConfig()
f = open(client_config, 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
self.feed_names = [var.alias_name for var in model_conf.feed_var]
self.fetch_names = [var.alias_name for var in model_conf.fetch_var]
def set_gpus(self, gpus): def set_gpus(self, gpus):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
...@@ -167,13 +181,14 @@ class WebService(object): ...@@ -167,13 +181,14 @@ class WebService(object):
if "fetch" not in request.json: if "fetch" not in request.json:
abort(400) abort(400)
try: try:
feed, fetch = self.preprocess(request.json["feed"], feed, fetch, is_batch = self.preprocess(request.json["feed"],
request.json["fetch"]) request.json["fetch"])
if isinstance(feed, dict) and "fetch" in feed: if isinstance(feed, dict) and "fetch" in feed:
del feed["fetch"] del feed["fetch"]
if len(feed) == 0: if len(feed) == 0:
raise ValueError("empty input") raise ValueError("empty input")
fetch_map = self.client.predict(feed=feed, fetch=fetch) fetch_map = self.client.predict(
feed=feed, fetch=fetch, batch=is_batch)
result = self.postprocess( result = self.postprocess(
feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map) feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
result = {"result": result} result = {"result": result}
...@@ -232,24 +247,22 @@ class WebService(object): ...@@ -232,24 +247,22 @@ class WebService(object):
self.app_instance = app_instance self.app_instance = app_instance
def _launch_local_predictor(self, gpu): def _launch_local_predictor(self, gpu):
from paddle_serving_app.local_predict import Debugger from paddle_serving_app.local_predict import LocalPredictor
self.client = Debugger() self.client = LocalPredictor()
self.client.load_model_config( self.client.load_model_config(
"{}".format(self.model_config), gpu=gpu, profile=False) "{}".format(self.model_config), use_gpu=True, gpu_id=self.gpus[0])
def run_web_service(self): def run_web_service(self):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
self.app_instance.run(host="0.0.0.0", self.app_instance.run(host="0.0.0.0", port=self.port, threaded=True)
port=self.port,
threaded=False,
processes=4)
def get_app_instance(self): def get_app_instance(self):
return self.app_instance return self.app_instance
def preprocess(self, feed=[], fetch=[]): def preprocess(self, feed=[], fetch=[]):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
return feed, fetch is_batch = True
return feed, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map=None): def postprocess(self, feed=[], fetch=[], fetch_map=None):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
......
...@@ -15,5 +15,5 @@ from . import logger # this module must be the first to import ...@@ -15,5 +15,5 @@ from . import logger # this module must be the first to import
from .operator import Op, RequestOp, ResponseOp from .operator import Op, RequestOp, ResponseOp
from .pipeline_server import PipelineServer from .pipeline_server import PipelineServer
from .pipeline_client import PipelineClient from .pipeline_client import PipelineClient
from .local_rpc_service_handler import LocalRpcServiceHandler from .local_service_handler import LocalServiceHandler
from .analyse import Analyst from .analyse import Analyst
...@@ -312,7 +312,7 @@ class OpAnalyst(object): ...@@ -312,7 +312,7 @@ class OpAnalyst(object):
# reduce op times # reduce op times
op_times = { op_times = {
op_name: sum(step_times.values()) op_name: sum(list(step_times.values()))
for op_name, step_times in op_times.items() for op_name, step_times in op_times.items()
} }
......
...@@ -32,7 +32,10 @@ import copy ...@@ -32,7 +32,10 @@ import copy
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
class ChannelDataEcode(enum.Enum): class ChannelDataErrcode(enum.Enum):
"""
ChannelData error code
"""
OK = 0 OK = 0
TIMEOUT = 1 TIMEOUT = 1
NOT_IMPLEMENTED = 2 NOT_IMPLEMENTED = 2
...@@ -42,9 +45,21 @@ class ChannelDataEcode(enum.Enum): ...@@ -42,9 +45,21 @@ class ChannelDataEcode(enum.Enum):
CLOSED_ERROR = 6 CLOSED_ERROR = 6
NO_SERVICE = 7 NO_SERVICE = 7
UNKNOW = 8 UNKNOW = 8
PRODUCT_ERROR = 9
class ProductErrCode(enum.Enum):
"""
ProductErrCode is a base class for recording business error codes.
Product developers inherit this class and extend it with more error codes.
"""
pass
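# Illustrative only (not part of this file): product developers would extend the
# base class above with their own business error codes, for example:
#
#     class ProductDemoErrCode(ProductErrCode):
#         OUT_OF_STOCK = 1001
#         RISK_REJECTED = 1002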
class ChannelDataType(enum.Enum): class ChannelDataType(enum.Enum):
"""
Channel data type
"""
DICT = 0 DICT = 0
CHANNEL_NPDATA = 1 CHANNEL_NPDATA = 1
ERROR = 2 ERROR = 2
...@@ -56,20 +71,23 @@ class ChannelData(object): ...@@ -56,20 +71,23 @@ class ChannelData(object):
npdata=None, npdata=None,
dictdata=None, dictdata=None,
data_id=None, data_id=None,
ecode=None, log_id=None,
error_code=None,
error_info=None, error_info=None,
prod_error_code=None,
prod_error_info=None,
client_need_profile=False): client_need_profile=False):
''' '''
There are several ways to use it: There are several ways to use it:
1. ChannelData(ChannelDataType.CHANNEL_NPDATA.value, npdata, data_id) 1. ChannelData(ChannelDataType.CHANNEL_NPDATA.value, npdata, data_id, log_id)
2. ChannelData(ChannelDataType.DICT.value, dictdata, data_id) 2. ChannelData(ChannelDataType.DICT.value, dictdata, data_id, log_id)
3. ChannelData(ecode, error_info, data_id) 3. ChannelData(error_code, error_info, prod_error_code, prod_error_info, data_id, log_id)
Protobufs are not pickle-able: Protobufs are not pickle-able:
https://stackoverflow.com/questions/55344376/how-to-import-protobuf-module https://stackoverflow.com/questions/55344376/how-to-import-protobuf-module
''' '''
if ecode is not None: if error_code is not None or prod_error_code is not None:
if data_id is None or error_info is None: if data_id is None or error_info is None:
_LOGGER.critical("Failed to generate ChannelData: data_id" _LOGGER.critical("Failed to generate ChannelData: data_id"
" and error_info cannot be None") " and error_info cannot be None")
...@@ -77,25 +95,30 @@ class ChannelData(object): ...@@ -77,25 +95,30 @@ class ChannelData(object):
datatype = ChannelDataType.ERROR.value datatype = ChannelDataType.ERROR.value
else: else:
if datatype == ChannelDataType.CHANNEL_NPDATA.value: if datatype == ChannelDataType.CHANNEL_NPDATA.value:
ecode, error_info = ChannelData.check_npdata(npdata) error_code, error_info = ChannelData.check_npdata(npdata)
if ecode != ChannelDataEcode.OK.value: if error_code != ChannelDataErrcode.OK.value:
datatype = ChannelDataType.ERROR.value datatype = ChannelDataType.ERROR.value
_LOGGER.error("(logid={}) {}".format(data_id, error_info)) _LOGGER.error("(data_id={} log_id={}) {}".format(
data_id, log_id, error_info))
elif datatype == ChannelDataType.DICT.value: elif datatype == ChannelDataType.DICT.value:
ecode, error_info = ChannelData.check_dictdata(dictdata) error_code, error_info = ChannelData.check_dictdata(dictdata)
if ecode != ChannelDataEcode.OK.value: if error_code != ChannelDataErrcode.OK.value:
datatype = ChannelDataType.ERROR.value datatype = ChannelDataType.ERROR.value
_LOGGER.error("(logid={}) {}".format(data_id, error_info)) _LOGGER.error("(data_id={} log_id={}) {}".format(
data_id, log_id, error_info))
else: else:
_LOGGER.critical("(logid={}) datatype not match".format( _LOGGER.critical("(data_id={} log_id={}) datatype not match".
data_id)) format(data_id, log_id))
os._exit(-1) os._exit(-1)
self.datatype = datatype self.datatype = datatype
self.npdata = npdata self.npdata = npdata
self.dictdata = dictdata self.dictdata = dictdata
self.id = data_id self.id = data_id
self.ecode = ecode self.log_id = log_id
self.error_code = error_code
self.error_info = error_info self.error_info = error_info
self.prod_error_code = prod_error_code
self.prod_error_info = prod_error_info
self.client_need_profile = client_need_profile self.client_need_profile = client_need_profile
self.profile_data_set = set() self.profile_data_set = set()
...@@ -106,67 +129,67 @@ class ChannelData(object): ...@@ -106,67 +129,67 @@ class ChannelData(object):
@staticmethod @staticmethod
def check_dictdata(dictdata): def check_dictdata(dictdata):
ecode = ChannelDataEcode.OK.value error_code = ChannelDataErrcode.OK.value
error_info = None error_info = None
if isinstance(dictdata, list): if isinstance(dictdata, list):
# batch data # batch data
for sample in dictdata: for sample in dictdata:
if not isinstance(sample, dict): if not isinstance(sample, dict):
ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
error_info = "Failed to check data: the type of " \ error_info = "Failed to check data: the type of " \
"data must be dict, but get {}.".format(type(sample)) "data must be dict, but get {}.".format(type(sample))
break break
elif not isinstance(dictdata, dict): elif not isinstance(dictdata, dict):
# batch size = 1 # batch size = 1
ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
error_info = "Failed to check data: the type of data must " \ error_info = "Failed to check data: the type of data must " \
"be dict, but get {}.".format(type(dictdata)) "be dict, but get {}.".format(type(dictdata))
return ecode, error_info return error_code, error_info
@staticmethod @staticmethod
def check_batch_npdata(batch): def check_batch_npdata(batch):
ecode = ChannelDataEcode.OK.value error_code = ChannelDataErrcode.OK.value
error_info = None error_info = None
for npdata in batch: for npdata in batch:
ecode, error_info = ChannelData.check_npdata(npdata) error_code, error_info = ChannelData.check_npdata(npdata)
if ecode != ChannelDataEcode.OK.value: if error_code != ChannelDataErrcode.OK.value:
break break
return ecode, error_info return error_code, error_info
@staticmethod @staticmethod
def check_npdata(npdata): def check_npdata(npdata):
ecode = ChannelDataEcode.OK.value error_code = ChannelDataErrcode.OK.value
error_info = None error_info = None
if isinstance(npdata, list): if isinstance(npdata, list):
# batch data # batch data
for sample in npdata: for sample in npdata:
if not isinstance(sample, dict): if not isinstance(sample, dict):
ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
error_info = "Failed to check data: the " \ error_info = "Failed to check data: the " \
"value of data must be dict, but get {}.".format( "value of data must be dict, but get {}.".format(
type(sample)) type(sample))
break break
for _, value in sample.items(): for _, value in sample.items():
if not isinstance(value, np.ndarray): if not isinstance(value, np.ndarray):
ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
error_info = "Failed to check data: the" \ error_info = "Failed to check data: the" \
" value of data must be np.ndarray, but get {}.".format( " value of data must be np.ndarray, but get {}.".format(
type(value)) type(value))
return ecode, error_info return error_code, error_info
elif isinstance(npdata, dict): elif isinstance(npdata, dict):
# batch_size = 1 # batch_size = 1
for _, value in npdata.items(): for _, value in npdata.items():
if not isinstance(value, np.ndarray): if not isinstance(value, np.ndarray):
ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
error_info = "Failed to check data: the value " \ error_info = "Failed to check data: the value " \
"of data must be np.ndarray, but get {}.".format( "of data must be np.ndarray, but get {}.".format(
type(value)) type(value))
break break
else: else:
ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
error_info = "Failed to check data: the value of data " \ error_info = "Failed to check data: the value of data " \
"must be dict, but get {}.".format(type(npdata)) "must be dict, but get {}.".format(type(npdata))
return ecode, error_info return error_code, error_info
def parse(self): def parse(self):
feed = None feed = None
...@@ -191,8 +214,9 @@ class ChannelData(object): ...@@ -191,8 +214,9 @@ class ChannelData(object):
return 1 return 1
def __str__(self): def __str__(self):
return "type[{}], ecode[{}], id[{}]".format( return "type[{}], error_code[{}], data_id[{}], log_id[{}], dict_data[{}]".format(
ChannelDataType(self.datatype).name, self.ecode, self.id) ChannelDataType(self.datatype).name, self.error_code, self.id,
self.log_id, str(self.dictdata))
class ProcessChannel(object): class ProcessChannel(object):
...@@ -289,14 +313,14 @@ class ProcessChannel(object): ...@@ -289,14 +313,14 @@ class ProcessChannel(object):
def push(self, channeldata, op_name=None): def push(self, channeldata, op_name=None):
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Pushing data".format(channeldata.id, self._log("(data_id={} log_id={}) Op({}) Enter channel::push".
op_name))) format(channeldata.id, channeldata.log_id, op_name)))
if len(self._producers) == 0: if len(self._producers) == 0:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
"(logid={}) Op({}) Failed to push data: expected number" "(data_id={} log_id={}) Op({}) Failed to push data: expected number"
" of producers to be greater than 0, but the it is 0.". " of producers to be greater than 0, but the it is 0.".
format(channeldata.id, op_name))) format(channeldata.id, channeldata.log_id, op_name)))
os._exit(-1) os._exit(-1)
elif len(self._producers) == 1: elif len(self._producers) == 1:
with self._cv: with self._cv:
...@@ -310,19 +334,21 @@ class ProcessChannel(object): ...@@ -310,19 +334,21 @@ class ProcessChannel(object):
raise ChannelStopError() raise ChannelStopError()
self._cv.notify_all() self._cv.notify_all()
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Pushed data into internal queue.". self._log(
format(channeldata.id, op_name))) "(data_id={} log_id={}) Op({}) Pushed data into internal queue.".
format(channeldata.id, channeldata.log_id, op_name)))
return True return True
elif op_name is None: elif op_name is None:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
"(logid={}) Op({}) Failed to push data: there are multiple " "(data_id={} log_id={}) Op({}) Failed to push data: there are multiple "
"producers, so op_name cannot be None.".format( "producers, so op_name cannot be None.".format(
channeldata.id, op_name))) channeldata.id, channeldata.log_id, op_name)))
os._exit(-1) os._exit(-1)
producer_num = len(self._producers) producer_num = len(self._producers)
data_id = channeldata.id data_id = channeldata.id
log_id = channeldata.log_id
put_data = None put_data = None
with self._cv: with self._cv:
if data_id not in self._input_buf: if data_id not in self._input_buf:
...@@ -347,8 +373,8 @@ class ProcessChannel(object): ...@@ -347,8 +373,8 @@ class ProcessChannel(object):
if put_data is None: if put_data is None:
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(logid={}) Op({}) Pushed data into input_buffer.". "(data_id={} log_id={}) Op({}) Pushed data into input_buffer.".
format(data_id, op_name))) format(data_id, log_id, op_name)))
else: else:
while self._stop.value == 0: while self._stop.value == 0:
try: try:
...@@ -361,8 +387,8 @@ class ProcessChannel(object): ...@@ -361,8 +387,8 @@ class ProcessChannel(object):
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(logid={}) Op({}) Pushed data into internal_queue.". "(data_id={} log_id={}) Op({}) Pushed data into internal_queue.".
format(data_id, op_name))) format(data_id, log_id, op_name)))
self._cv.notify_all() self._cv.notify_all()
return True return True
...@@ -403,9 +429,12 @@ class ProcessChannel(object): ...@@ -403,9 +429,12 @@ class ProcessChannel(object):
self._cv.wait() self._cv.wait()
if self._stop.value == 1: if self._stop.value == 1:
raise ChannelStopError() raise ChannelStopError()
if resp is not None:
list_values = list(resp.values())
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Got data".format(resp.values()[0] self._log("(data_id={} log_id={}) Op({}) Got data".format(
.id, op_name))) list_values[0].id, list_values[0].log_id, op_name)))
return resp return resp
elif op_name is None: elif op_name is None:
_LOGGER.critical( _LOGGER.critical(
...@@ -432,10 +461,12 @@ class ProcessChannel(object): ...@@ -432,10 +461,12 @@ class ProcessChannel(object):
try: try:
channeldata = self._que.get(timeout=0) channeldata = self._que.get(timeout=0)
self._output_buf.append(channeldata) self._output_buf.append(channeldata)
list_values = list(channeldata.values())
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(logid={}) Op({}) Pop ready item into output_buffer". "(data_id={} log_id={}) Op({}) Pop ready item into output_buffer".
format(channeldata.values()[0].id, op_name))) format(list_values[0].id, list_values[0].log_id,
op_name)))
break break
except Queue.Empty: except Queue.Empty:
if timeout is not None: if timeout is not None:
...@@ -486,9 +517,12 @@ class ProcessChannel(object): ...@@ -486,9 +517,12 @@ class ProcessChannel(object):
self._cv.notify_all() self._cv.notify_all()
if resp is not None:
list_values = list(resp.values())
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Got data from output_buffer".format( self._log(
resp.values()[0].id, op_name))) "(data_id={} log_id={}) Op({}) Got data from output_buffer".
format(list_values[0].id, list_values[0].log_id, op_name)))
return resp return resp
def stop(self): def stop(self):
...@@ -586,14 +620,14 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -586,14 +620,14 @@ class ThreadChannel(Queue.PriorityQueue):
def push(self, channeldata, op_name=None): def push(self, channeldata, op_name=None):
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Pushing data".format(channeldata.id, self._log("(data_id={} log_id={}) Op({}) Pushing data".format(
op_name))) channeldata.id, channeldata.log_id, op_name)))
if len(self._producers) == 0: if len(self._producers) == 0:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
"(logid={}) Op({}) Failed to push data: expected number of " "(data_id={} log_id={}) Op({}) Failed to push data: expected number of "
"producers to be greater than 0, but the it is 0.".format( "producers to be greater than 0, but the it is 0.".format(
channeldata.id, op_name))) channeldata.id, channeldata.log_id, op_name)))
os._exit(-1) os._exit(-1)
elif len(self._producers) == 1: elif len(self._producers) == 1:
with self._cv: with self._cv:
...@@ -607,19 +641,21 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -607,19 +641,21 @@ class ThreadChannel(Queue.PriorityQueue):
raise ChannelStopError() raise ChannelStopError()
self._cv.notify_all() self._cv.notify_all()
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Pushed data into internal_queue.". self._log(
format(channeldata.id, op_name))) "(data_id={} log_id={}) Op({}) Pushed data into internal_queue.".
format(channeldata.id, channeldata.log_id, op_name)))
return True return True
elif op_name is None: elif op_name is None:
_LOGGER.critical( _LOGGER.critical(
self._log( self._log(
"(logid={}) Op({}) Failed to push data: there are multiple" "(data_id={} log_id={}) Op({}) Failed to push data: there are multiple"
" producers, so op_name cannot be None.".format( " producers, so op_name cannot be None.".format(
channeldata.id, op_name))) channeldata.id, channeldata.log_id, op_name)))
os._exit(-1) os._exit(-1)
producer_num = len(self._producers) producer_num = len(self._producers)
data_id = channeldata.id data_id = channeldata.id
log_id = channeldata.log_id
put_data = None put_data = None
with self._cv: with self._cv:
if data_id not in self._input_buf: if data_id not in self._input_buf:
...@@ -639,8 +675,8 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -639,8 +675,8 @@ class ThreadChannel(Queue.PriorityQueue):
if put_data is None: if put_data is None:
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(logid={}) Op({}) Pushed data into input_buffer.". "(data_id={} log_id={}) Op({}) Pushed data into input_buffer.".
format(data_id, op_name))) format(data_id, log_id, op_name)))
else: else:
while self._stop is False: while self._stop is False:
try: try:
...@@ -653,8 +689,8 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -653,8 +689,8 @@ class ThreadChannel(Queue.PriorityQueue):
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(logid={}) Op({}) Pushed data into internal_queue.". "(data_id={} log_id={}) Op({}) Pushed data into internal_queue.".
format(data_id, op_name))) format(data_id, log_id, op_name)))
self._cv.notify_all() self._cv.notify_all()
return True return True
...@@ -696,9 +732,11 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -696,9 +732,11 @@ class ThreadChannel(Queue.PriorityQueue):
self._cv.wait() self._cv.wait()
if self._stop: if self._stop:
raise ChannelStopError() raise ChannelStopError()
if resp is not None:
list_values = list(resp.values())
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Got data".format(resp.values()[0] self._log("(data_id={} log_id={}) Op({}) Got data".format(
.id, op_name))) list_values[0].id, list_values[0].log_id, op_name)))
return resp return resp
elif op_name is None: elif op_name is None:
_LOGGER.critical( _LOGGER.critical(
...@@ -725,10 +763,12 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -725,10 +763,12 @@ class ThreadChannel(Queue.PriorityQueue):
try: try:
channeldata = self.get(timeout=0) channeldata = self.get(timeout=0)
self._output_buf.append(channeldata) self._output_buf.append(channeldata)
list_values = list(channeldata.values())
_LOGGER.debug( _LOGGER.debug(
self._log( self._log(
"(logid={}) Op({}) Pop ready item into output_buffer". "(data_id={} log_id={}) Op({}) Pop ready item into output_buffer".
format(channeldata.values()[0].id, op_name))) format(list_values[0].id, list_values[0].log_id,
op_name)))
break break
except Queue.Empty: except Queue.Empty:
if timeout is not None: if timeout is not None:
...@@ -779,9 +819,12 @@ class ThreadChannel(Queue.PriorityQueue): ...@@ -779,9 +819,12 @@ class ThreadChannel(Queue.PriorityQueue):
self._cv.notify_all() self._cv.notify_all()
if resp is not None:
list_values = list(resp.values())
_LOGGER.debug( _LOGGER.debug(
self._log("(logid={}) Op({}) Got data from output_buffer".format( self._log(
resp.values()[0].id, op_name))) "(data_id={} log_id={}) Op({}) Got data from output_buffer".
format(list_values[0].id, list_values[0].log_id, op_name)))
return resp return resp
def stop(self): def stop(self):
......
...@@ -25,10 +25,12 @@ else: ...@@ -25,10 +25,12 @@ else:
import os import os
import logging import logging
import collections import collections
import json
from .operator import Op, RequestOp, ResponseOp, VirtualOp from .operator import Op, RequestOp, ResponseOp, VirtualOp
from .channel import (ThreadChannel, ProcessChannel, ChannelData, from .channel import (ThreadChannel, ProcessChannel, ChannelData,
ChannelDataEcode, ChannelDataType, ChannelStopError) ChannelDataErrcode, ChannelDataType, ChannelStopError,
ProductErrCode)
from .profiler import TimeProfiler, PerformanceTracer from .profiler import TimeProfiler, PerformanceTracer
from .util import NameGenerator, ThreadIdGenerator, PipelineProcSyncManager from .util import NameGenerator, ThreadIdGenerator, PipelineProcSyncManager
from .proto import pipeline_service_pb2 from .proto import pipeline_service_pb2
...@@ -37,13 +39,28 @@ _LOGGER = logging.getLogger(__name__) ...@@ -37,13 +39,28 @@ _LOGGER = logging.getLogger(__name__)
class DAGExecutor(object): class DAGExecutor(object):
"""
DAG Executor, the service entrance of DAG.
"""
def __init__(self, response_op, server_conf, worker_idx): def __init__(self, response_op, server_conf, worker_idx):
"""
Initialize DAGExecutor.
Args:
response_op: Response OP
server_conf: server config dict loaded from config.yaml
worker_idx: DAGExecutor index, PipelineServer creates many
DAGExecutors when _build_dag_each_worker is true.
Returns:
None.
"""
build_dag_each_worker = server_conf["build_dag_each_worker"] build_dag_each_worker = server_conf["build_dag_each_worker"]
server_worker_num = server_conf["worker_num"] server_worker_num = server_conf["worker_num"]
dag_conf = server_conf["dag"] dag_conf = server_conf["dag"]
self._retry = dag_conf["retry"] self._retry = dag_conf["retry"]
client_type = dag_conf["client_type"]
self._server_use_profile = dag_conf["use_profile"] self._server_use_profile = dag_conf["use_profile"]
channel_size = dag_conf["channel_size"] channel_size = dag_conf["channel_size"]
self._is_thread_op = dag_conf["is_thread_op"] self._is_thread_op = dag_conf["is_thread_op"]
...@@ -61,8 +78,8 @@ class DAGExecutor(object): ...@@ -61,8 +78,8 @@ class DAGExecutor(object):
self._is_thread_op, tracer_interval_s, server_worker_num) self._is_thread_op, tracer_interval_s, server_worker_num)
self._dag = DAG(self.name, response_op, self._server_use_profile, self._dag = DAG(self.name, response_op, self._server_use_profile,
self._is_thread_op, client_type, channel_size, self._is_thread_op, channel_size, build_dag_each_worker,
build_dag_each_worker, self._tracer) self._tracer)
(in_channel, out_channel, pack_rpc_func, (in_channel, out_channel, pack_rpc_func,
unpack_rpc_func) = self._dag.build() unpack_rpc_func) = self._dag.build()
self._dag.start() self._dag.start()
...@@ -75,7 +92,9 @@ class DAGExecutor(object): ...@@ -75,7 +92,9 @@ class DAGExecutor(object):
if self._tracer is not None: if self._tracer is not None:
self._tracer.start() self._tracer.start()
# generate id: data_id == request_id == log_id # generate id
# data_id: unique server-side ID, automatically generated by the framework
# log_id: traces one product request; may be empty and is not unique.
base_counter = 0 base_counter = 0
gen_id_step = 1 gen_id_step = 1
if build_dag_each_worker: if build_dag_each_worker:
...@@ -95,6 +114,15 @@ class DAGExecutor(object): ...@@ -95,6 +114,15 @@ class DAGExecutor(object):
self._client_profile_value = "1" self._client_profile_value = "1"
def start(self): def start(self):
"""
Start one background thread that receives data from the last channel.
Args:
None
Returns:
None
"""
self._recive_func = threading.Thread( self._recive_func = threading.Thread(
target=DAGExecutor._recive_out_channel_func, args=(self, )) target=DAGExecutor._recive_out_channel_func, args=(self, ))
self._recive_func.daemon = True self._recive_func.daemon = True
...@@ -102,11 +130,30 @@ class DAGExecutor(object): ...@@ -102,11 +130,30 @@ class DAGExecutor(object):
_LOGGER.debug("[DAG Executor] Start recive thread") _LOGGER.debug("[DAG Executor] Start recive thread")
def stop(self): def stop(self):
"""
Stopping DAG
Args:
None
Returns:
None
"""
self._dag.stop() self._dag.stop()
self._dag.join() self._dag.join()
_LOGGER.info("[DAG Executor] Stop") _LOGGER.info("[DAG Executor] Stop")
def _get_next_data_id(self): def _get_next_data_id(self):
"""
Generate data_id incrementally and uniquely
Args:
None
Returns:
data_id: unique id
cond_v: condition variable
"""
data_id = self._id_generator.next() data_id = self._id_generator.next()
cond_v = threading.Condition() cond_v = threading.Condition()
with self._cv_for_cv_pool: with self._cv_for_cv_pool:
...@@ -115,6 +162,15 @@ class DAGExecutor(object): ...@@ -115,6 +162,15 @@ class DAGExecutor(object):
return data_id, cond_v return data_id, cond_v
def _set_in_channel(self, in_channel): def _set_in_channel(self, in_channel):
"""
Set in_channel of DAG
Args:
in_channel: input channel of DAG
Returns:
None
"""
if not isinstance(in_channel, (ThreadChannel, ProcessChannel)): if not isinstance(in_channel, (ThreadChannel, ProcessChannel)):
_LOGGER.critical("[DAG Executor] Failed to set in_channel: " _LOGGER.critical("[DAG Executor] Failed to set in_channel: "
"in_channel must be Channel type, but get {}". "in_channel must be Channel type, but get {}".
...@@ -122,8 +178,18 @@ class DAGExecutor(object): ...@@ -122,8 +178,18 @@ class DAGExecutor(object):
os._exit(-1) os._exit(-1)
in_channel.add_producer(self.name) in_channel.add_producer(self.name)
self._in_channel = in_channel self._in_channel = in_channel
_LOGGER.info("[DAG] set in channel succ, name [{}]".format(self.name))
def _set_out_channel(self, out_channel): def _set_out_channel(self, out_channel):
"""
Set out_channel of DAG
Args:
out_channel: output channel of DAG
Returns:
None
"""
if not isinstance(out_channel, (ThreadChannel, ProcessChannel)): if not isinstance(out_channel, (ThreadChannel, ProcessChannel)):
_LOGGER.critical("[DAG Executor] Failed to set out_channel: " _LOGGER.critical("[DAG Executor] Failed to set out_channel: "
"must be Channel type, but get {}".format( "must be Channel type, but get {}".format(
...@@ -133,6 +199,17 @@ class DAGExecutor(object): ...@@ -133,6 +199,17 @@ class DAGExecutor(object):
self._out_channel = out_channel self._out_channel = out_channel
def _recive_out_channel_func(self): def _recive_out_channel_func(self):
"""
Receive data from the output channel and push it into _fetch_buffer,
where _get_channeldata_from_fetch_buffer later picks it up, retrying
until the data arrives.
Args:
None
Returns:
None
"""
cv = None cv = None
while True: while True:
try: try:
...@@ -142,14 +219,13 @@ class DAGExecutor(object): ...@@ -142,14 +219,13 @@ class DAGExecutor(object):
with self._cv_for_cv_pool: with self._cv_for_cv_pool:
for data_id, cv in self._cv_pool.items(): for data_id, cv in self._cv_pool.items():
closed_errror_data = ChannelData( closed_errror_data = ChannelData(
ecode=ChannelDataEcode.CLOSED_ERROR.value, error_code=ChannelDataErrcode.CLOSED_ERROR.value,
error_info="dag closed.", error_info="dag closed.",
data_id=data_id) data_id=data_id)
with cv: with cv:
self._fetch_buffer[data_id] = closed_errror_data self._fetch_buffer[data_id] = closed_errror_data
cv.notify_all() cv.notify_all()
break break
if len(channeldata_dict) != 1: if len(channeldata_dict) != 1:
_LOGGER.critical( _LOGGER.critical(
"[DAG Executor] Failed to fetch result: out_channel " "[DAG Executor] Failed to fetch result: out_channel "
...@@ -173,6 +249,16 @@ class DAGExecutor(object): ...@@ -173,6 +249,16 @@ class DAGExecutor(object):
cond_v.notify_all() cond_v.notify_all()
def _get_channeldata_from_fetch_buffer(self, data_id, cond_v): def _get_channeldata_from_fetch_buffer(self, data_id, cond_v):
"""
Getting the channel data from _fetch_buffer.
Args:
data_id: search key
cond_v: conditional variable
Returns:
ready_data: one channel data processed
"""
ready_data = None ready_data = None
with cond_v: with cond_v:
...@@ -189,45 +275,82 @@ class DAGExecutor(object): ...@@ -189,45 +275,82 @@ class DAGExecutor(object):
ready_data = self._fetch_buffer[data_id] ready_data = self._fetch_buffer[data_id]
self._cv_pool.pop(data_id) self._cv_pool.pop(data_id)
self._fetch_buffer.pop(data_id) self._fetch_buffer.pop(data_id)
_LOGGER.debug("(logid={}) [resp thread] Got data".format(data_id)) _LOGGER.debug("(data_id={}) [resp thread] Got data".format(data_id))
return ready_data return ready_data
def _pack_channeldata(self, rpc_request, data_id): def _pack_channeldata(self, rpc_request, data_id):
"""
Unpack data from the RPC request and create one ChannelData.
Args:
rpc_request: one RPC request
data_id: data id, unique
Returns:
ChannelData: one channel data to be processed
"""
dictdata = None dictdata = None
log_id = None
try: try:
dictdata = self._unpack_rpc_func(rpc_request) dictdata, log_id, prod_errcode, prod_errinfo = self._unpack_rpc_func(
rpc_request)
except Exception as e: except Exception as e:
_LOGGER.error( _LOGGER.error(
"(logid={}) Failed to parse RPC request package: {}" "(logid={}) Failed to parse RPC request package: {}"
.format(data_id, e), .format(data_id, e),
exc_info=True) exc_info=True)
return ChannelData( return ChannelData(
ecode=ChannelDataEcode.RPC_PACKAGE_ERROR.value, error_code=ChannelDataErrcode.RPC_PACKAGE_ERROR.value,
error_info="rpc package error: {}".format(e), error_info="rpc package error: {}".format(e),
data_id=data_id) data_id=data_id,
log_id=log_id)
else: else:
# because unpack_rpc_func is rewritten by user, we need # because unpack_rpc_func is rewritten by user, we need to look
# to look for client_profile_key field in rpc_request # for product_errcode in returns, and client_profile_key field
# in rpc_request
if prod_errcode is not None:
# product errors occurred
_LOGGER.error("unpack_rpc_func prod_errcode:{}".format(
prod_errcode))
return ChannelData(
error_code=ChannelDataErrcode.PRODUCT_ERROR.value,
error_info="",
prod_error_code=prod_errcode,
prod_error_info=prod_errinfo,
data_id=data_id,
log_id=log_id)
profile_value = None profile_value = None
for idx, key in enumerate(rpc_request.key): profile_value = dictdata.get(self._client_profile_key)
if key == self._client_profile_key:
profile_value = rpc_request.value[idx]
break
client_need_profile = (profile_value == self._client_profile_value) client_need_profile = (profile_value == self._client_profile_value)
_LOGGER.debug("(logid={}) Need profile in client: {}".format(
data_id, client_need_profile))
return ChannelData( return ChannelData(
datatype=ChannelDataType.DICT.value, datatype=ChannelDataType.DICT.value,
dictdata=dictdata, dictdata=dictdata,
data_id=data_id, data_id=data_id,
log_id=log_id,
client_need_profile=client_need_profile) client_need_profile=client_need_profile)
def call(self, rpc_request): def call(self, rpc_request):
"""
DAGExecutor entrance function. There are 5 steps:
1._get_next_data_id: Generate an incremental ID
2._pack_channeldata: pack the channel data from request.
3.retry loop:
a. push channel_data into _in_channel
b. get_channeldata_from_fetch_buffer: get results.
4._pack_for_rpc_resp: pack RPC responses
5.profile: generate profile string and pack into response.
Args:
rpc_request: one RPC request
Returns:
rpc_resp: one RPC response
"""
if self._tracer is not None: if self._tracer is not None:
trace_buffer = self._tracer.data_buffer() trace_buffer = self._tracer.data_buffer()
data_id, cond_v = self._get_next_data_id() data_id, cond_v = self._get_next_data_id()
_LOGGER.info("(logid={}) Succ generate id".format(data_id))
start_call, end_call = None, None start_call, end_call = None, None
if not self._is_thread_op: if not self._is_thread_op:
...@@ -236,45 +359,64 @@ class DAGExecutor(object): ...@@ -236,45 +359,64 @@ class DAGExecutor(object):
else: else:
start_call = self._profiler.record("call_{}#DAG_0".format(data_id)) start_call = self._profiler.record("call_{}#DAG_0".format(data_id))
_LOGGER.debug("(logid={}) Parsing RPC request package".format(data_id))
self._profiler.record("prepack_{}#{}_0".format(data_id, self.name)) self._profiler.record("prepack_{}#{}_0".format(data_id, self.name))
req_channeldata = self._pack_channeldata(rpc_request, data_id) req_channeldata = self._pack_channeldata(rpc_request, data_id)
self._profiler.record("prepack_{}#{}_1".format(data_id, self.name)) self._profiler.record("prepack_{}#{}_1".format(data_id, self.name))
log_id = req_channeldata.log_id
_LOGGER.info("(data_id={} log_id={}) Succ Generate ID ".format(data_id,
log_id))
resp_channeldata = None resp_channeldata = None
for i in range(self._retry): for i in range(self._retry):
_LOGGER.debug("(logid={}) Pushing data into Graph engine".format( _LOGGER.debug("(data_id={}) Pushing data into Graph engine".format(
data_id)) data_id))
try: try:
if req_channeldata is None:
_LOGGER.critical(
"(data_id={} log_id={}) req_channeldata is None"
.format(data_id, log_id))
if not isinstance(self._in_channel,
(ThreadChannel, ProcessChannel)):
_LOGGER.critical(
"(data_id={} log_id={})[DAG Executor] Failed to "
"set in_channel: in_channel must be Channel type, but get {}".
format(data_id, log_id, type(self._in_channel)))
self._in_channel.push(req_channeldata, self.name) self._in_channel.push(req_channeldata, self.name)
except ChannelStopError: except ChannelStopError:
_LOGGER.debug("[DAG Executor] Stop") _LOGGER.error("(data_id:{} log_id={})[DAG Executor] Stop".
format(data_id, log_id))
with self._cv_for_cv_pool: with self._cv_for_cv_pool:
self._cv_pool.pop(data_id) self._cv_pool.pop(data_id)
return self._pack_for_rpc_resp( return self._pack_for_rpc_resp(
ChannelData( ChannelData(
ecode=ChannelDataEcode.CLOSED_ERROR.value, error_code=ChannelDataErrcode.CLOSED_ERROR.value,
error_info="dag closed.", error_info="dag closed.",
data_id=data_id)) data_id=data_id))
_LOGGER.debug("(logid={}) Wait for Graph engine...".format(data_id)) _LOGGER.debug("(data_id={} log_id={}) Wait for Graph engine...".
format(data_id, log_id))
resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id, resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id,
cond_v) cond_v)
if resp_channeldata.ecode == ChannelDataEcode.OK.value: if resp_channeldata.error_code == ChannelDataErrcode.OK.value:
_LOGGER.info("(logid={}) Succ predict".format(data_id)) _LOGGER.info("(data_id={} log_id={}) Succ predict".format(
data_id, log_id))
break break
else: else:
_LOGGER.error("(logid={}) Failed to predict: {}" _LOGGER.error("(data_id={} log_id={}) Failed to predict: {}"
.format(data_id, resp_channeldata.error_info)) .format(data_id, log_id,
if resp_channeldata.ecode != ChannelDataEcode.TIMEOUT.value: resp_channeldata.error_info))
if resp_channeldata.error_code != ChannelDataErrcode.TIMEOUT.value:
break break
if i + 1 < self._retry: if i + 1 < self._retry:
_LOGGER.warning("(logid={}) DAGExecutor retry({}/{})".format( _LOGGER.warning(
data_id, i + 1, self._retry)) "(data_id={} log_id={}) DAGExecutor retry({}/{})"
.format(data_id, log_id, i + 1, self._retry))
_LOGGER.debug("(logid={}) Packing RPC response package".format(data_id)) _LOGGER.debug("(data_id={} log_id={}) Packing RPC response package"
.format(data_id, log_id))
self._profiler.record("postpack_{}#{}_0".format(data_id, self.name)) self._profiler.record("postpack_{}#{}_0".format(data_id, self.name))
rpc_resp = self._pack_for_rpc_resp(resp_channeldata) rpc_resp = self._pack_for_rpc_resp(resp_channeldata)
self._profiler.record("postpack_{}#{}_1".format(data_id, self.name)) self._profiler.record("postpack_{}#{}_1".format(data_id, self.name))
...@@ -288,7 +430,8 @@ class DAGExecutor(object): ...@@ -288,7 +430,8 @@ class DAGExecutor(object):
trace_buffer.put({ trace_buffer.put({
"name": "DAG", "name": "DAG",
"id": data_id, "id": data_id,
"succ": resp_channeldata.ecode == ChannelDataEcode.OK.value, "succ":
resp_channeldata.error_code == ChannelDataErrcode.OK.value,
"actions": { "actions": {
"call_{}".format(data_id): end_call - start_call, "call_{}".format(data_id): end_call - start_call,
}, },
...@@ -309,6 +452,15 @@ class DAGExecutor(object): ...@@ -309,6 +452,15 @@ class DAGExecutor(object):
return rpc_resp return rpc_resp
def _pack_for_rpc_resp(self, channeldata): def _pack_for_rpc_resp(self, channeldata):
"""
Packing one RPC response
Args:
channeldata: one channel data to be packed
Returns:
resp: one RPC response
"""
try: try:
return self._pack_rpc_func(channeldata) return self._pack_rpc_func(channeldata)
except Exception as e: except Exception as e:
...@@ -317,20 +469,23 @@ class DAGExecutor(object): ...@@ -317,20 +469,23 @@ class DAGExecutor(object):
.format(channeldata.id, e), .format(channeldata.id, e),
exc_info=True) exc_info=True)
resp = pipeline_service_pb2.Response() resp = pipeline_service_pb2.Response()
resp.ecode = ChannelDataEcode.RPC_PACKAGE_ERROR.value resp.err_no = ChannelDataErrcode.RPC_PACKAGE_ERROR.value
resp.error_info = "rpc package error: {}".format(e) resp.err_msg = "rpc package error: {}".format(e)
return resp return resp
class DAG(object): class DAG(object):
"""
Directed Acyclic Graph(DAG) engine, builds one DAG topology.
"""
def __init__(self, request_name, response_op, use_profile, is_thread_op, def __init__(self, request_name, response_op, use_profile, is_thread_op,
client_type, channel_size, build_dag_each_worker, tracer): channel_size, build_dag_each_worker, tracer):
self._request_name = request_name self._request_name = request_name
self._response_op = response_op self._response_op = response_op
self._use_profile = use_profile self._use_profile = use_profile
self._is_thread_op = is_thread_op self._is_thread_op = is_thread_op
self._channel_size = channel_size self._channel_size = channel_size
self._client_type = client_type
self._build_dag_each_worker = build_dag_each_worker self._build_dag_each_worker = build_dag_each_worker
self._tracer = tracer self._tracer = tracer
if not self._is_thread_op: if not self._is_thread_op:
...@@ -339,6 +494,18 @@ class DAG(object): ...@@ -339,6 +494,18 @@ class DAG(object):
@staticmethod @staticmethod
def get_use_ops(response_op): def get_use_ops(response_op):
"""
Starting from ResponseOp, recursively traverse the upstream OPs, collecting
all used ops and the successor-op list of each op (excluding ResponseOp)
Args:
response_op: ResponseOp
Returns:
used_ops: used ops, set
succ_ops_of_use_op: op and the next op list, dict.
"""
unique_names = set() unique_names = set()
used_ops = set() used_ops = set()
succ_ops_of_use_op = {} # {op_name: succ_ops} succ_ops_of_use_op = {} # {op_name: succ_ops}
...@@ -364,6 +531,15 @@ class DAG(object): ...@@ -364,6 +531,15 @@ class DAG(object):
return used_ops, succ_ops_of_use_op return used_ops, succ_ops_of_use_op
def _gen_channel(self, name_gen): def _gen_channel(self, name_gen):
"""
Generate one ThreadChannel or ProcessChannel.
Args:
name_gen: channel name
Returns:
channel: one channel generated
"""
channel = None channel = None
if self._is_thread_op: if self._is_thread_op:
channel = ThreadChannel( channel = ThreadChannel(
...@@ -375,11 +551,37 @@ class DAG(object): ...@@ -375,11 +551,37 @@ class DAG(object):
return channel return channel
def _gen_virtual_op(self, name_gen): def _gen_virtual_op(self, name_gen):
"""
Generate one virtual Op
Args:
name_gen: Op name
Returns:
vir_op: one virtual Op object.
"""
vir_op = VirtualOp(name=name_gen.next()) vir_op = VirtualOp(name=name_gen.next())
_LOGGER.debug("[DAG] Generate virtual_op: {}".format(vir_op.name)) _LOGGER.debug("[DAG] Generate virtual_op: {}".format(vir_op.name))
return vir_op return vir_op
def _topo_sort(self, used_ops, response_op, out_degree_ops): def _topo_sort(self, used_ops, response_op, out_degree_ops):
"""
Topological sort of DAG, creates inverted multi-layer views.
Args:
used_ops: op used in DAG
response_op: response op
out_degree_ops: Next op list for each op, dict. the output of
get_use_ops()
Returns:
dag_views: the inverted hierarchical topology list. examples:
DAG :[A -> B -> C -> E]
\-> D /
dag_views: [[E], [C, D], [B], [A]]
last_op: the last op in front of ResponseOp
"""
out_degree_num = { out_degree_num = {
name: len(ops) name: len(ops)
for name, ops in out_degree_ops.items() for name, ops in out_degree_ops.items()
...@@ -423,6 +625,23 @@ class DAG(object): ...@@ -423,6 +625,23 @@ class DAG(object):
return dag_views, last_op return dag_views, last_op
def _build_dag(self, response_op): def _build_dag(self, response_op):
"""
Building DAG, the most important function in class DAG. Core steps:
1.get_use_ops: Getting used ops, and out degree op list for each op.
2._topo_sort: Topological sort creates inverted multi-layer views.
3.create channels and virtual ops.
Args:
response_op: ResponseOp
Returns:
actual_ops: all OPs used in DAG, including virtual OPs
channels: all channels used in DAG
input_channel: the channel of first OP
output_channel: the channel of last OP
pack_func: pack_response_package function of response_op
unpack_func: unpack_request_package function of request_op
"""
if response_op is None: if response_op is None:
_LOGGER.critical("Failed to build DAG: ResponseOp" _LOGGER.critical("Failed to build DAG: ResponseOp"
" has not been set.") " has not been set.")
...@@ -548,6 +767,18 @@ class DAG(object): ...@@ -548,6 +767,18 @@ class DAG(object):
return self._channels return self._channels
def build(self): def build(self):
"""
Interface for building one DAG outside.
Args:
None
Returns:
_input_channel: the channel of first OP
_output_channel: the channel of last OP
_pack_func: pack_response_package function of response_op
_unpack_func: unpack_request_package function of request_op
"""
(actual_ops, channels, input_channel, output_channel, pack_func, (actual_ops, channels, input_channel, output_channel, pack_func,
unpack_func) = self._build_dag(self._response_op) unpack_func) = self._build_dag(self._response_op)
_LOGGER.info("[DAG] Succ build DAG") _LOGGER.info("[DAG] Succ build DAG")
...@@ -565,26 +796,52 @@ class DAG(object): ...@@ -565,26 +796,52 @@ class DAG(object):
return self._input_channel, self._output_channel, self._pack_func, self._unpack_func return self._input_channel, self._output_channel, self._pack_func, self._unpack_func
def start(self): def start(self):
"""
Each OP starts a thread or a process, depending on _is_thread_op
Args:
None
Returns:
_threads_or_proces: the list of started threads or processes.
"""
self._threads_or_proces = [] self._threads_or_proces = []
for op in self._actual_ops: for op in self._actual_ops:
op.use_profiler(self._use_profile) op.use_profiler(self._use_profile)
op.set_tracer(self._tracer) op.set_tracer(self._tracer)
if self._is_thread_op: if self._is_thread_op:
self._threads_or_proces.extend( self._threads_or_proces.extend(op.start_with_thread())
op.start_with_thread(self._client_type))
else: else:
self._threads_or_proces.extend( self._threads_or_proces.extend(op.start_with_process())
op.start_with_process(self._client_type))
_LOGGER.info("[DAG] start") _LOGGER.info("[DAG] start")
# not join yet # not join yet
return self._threads_or_proces return self._threads_or_proces
def join(self): def join(self):
"""
All threads or processes join.
Args:
None
Returns:
None
"""
for x in self._threads_or_proces: for x in self._threads_or_proces:
if x is not None:
x.join() x.join()
def stop(self): def stop(self):
"""
Stop and clean up all channels.
Args:
None
Returns:
None
"""
for chl in self._channels: for chl in self._channels:
chl.stop() chl.stop()
for op in self._actual_ops: for op in self._actual_ops:
......
...@@ -19,22 +19,25 @@ option go_package = ".;pipeline_serving"; ...@@ -19,22 +19,25 @@ option go_package = ".;pipeline_serving";
import "google/api/annotations.proto"; import "google/api/annotations.proto";
message Response { message Response {
repeated string key = 1; int32 err_no = 1;
repeated string value = 2; string err_msg = 2;
int32 ecode = 3; repeated string key = 3;
string error_info = 4; repeated string value = 4;
}; };
message Request { message Request {
repeated string key = 1; repeated string key = 1;
repeated string value = 2; repeated string value = 2;
string name = 3; string name = 3;
} string method = 4;
int64 logid = 5;
string clientip = 6;
};
service PipelineService { service PipelineService {
rpc inference(Request) returns (Response) { rpc inference(Request) returns (Response) {
option (google.api.http) = { option (google.api.http) = {
post : "/{name=*}/prediction" post : "/{name=*}/{method=*}"
body : "*" body : "*"
}; };
} }
......
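For reference, a hedged Python sketch of how the revised messages are filled in; the generated-module path is an assumption, while the field names come from the .proto above.

```python
from paddle_serving_server.pipeline.proto import pipeline_service_pb2  # assumed path

req = pipeline_service_pb2.Request()
req.name = "ocr"            # pipeline name, matches the HTTP route /{name=*}/{method=*}
req.method = "prediction"   # new field in this commit
req.logid = 10000           # caller-supplied trace id, may be left as 0
req.clientip = "127.0.0.1"
req.key.append("image")
req.value.append("<base64-encoded image>")

resp = pipeline_service_pb2.Response()
print(resp.err_no, resp.err_msg)  # defaults: 0 and "" (renamed from ecode/error_info)
```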
...@@ -25,7 +25,7 @@ import ( ...@@ -25,7 +25,7 @@ import (
"github.com/grpc-ecosystem/grpc-gateway/runtime" "github.com/grpc-ecosystem/grpc-gateway/runtime"
"google.golang.org/grpc" "google.golang.org/grpc"
gw "./proto" gw "serving-gateway/proto"
) )
//export run_proxy_server //export run_proxy_server
...@@ -38,7 +38,8 @@ func run_proxy_server(grpc_port int, http_port int) error { ...@@ -38,7 +38,8 @@ func run_proxy_server(grpc_port int, http_port int) error {
ctx, cancel := context.WithCancel(ctx) ctx, cancel := context.WithCancel(ctx)
defer cancel() defer cancel()
mux := runtime.NewServeMux() //EmitDefaults=true: do not filter out fields that hold default values
mux := runtime.NewServeMux(runtime.WithMarshalerOption(runtime.MIMEWildcard, &runtime.JSONPb{OrigName: true, EmitDefaults: true}))
opts := []grpc.DialOption{grpc.WithInsecure()} opts := []grpc.DialOption{grpc.WithInsecure()}
err := gw.RegisterPipelineServiceHandlerFromEndpoint(ctx, mux, *pipelineEndpoint, opts) err := gw.RegisterPipelineServiceHandlerFromEndpoint(ctx, mux, *pipelineEndpoint, opts)
if err != nil { if err != nil {
......
...@@ -15,73 +15,164 @@ ...@@ -15,73 +15,164 @@
import os import os
import logging import logging
import multiprocessing import multiprocessing
try: #from paddle_serving_server_gpu import OpMaker, OpSeqMaker
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server #from paddle_serving_server_gpu import Server as GpuServer
PACKAGE_VERSION = "GPU" #from paddle_serving_server import Server as CpuServer
except ImportError:
from paddle_serving_server import OpMaker, OpSeqMaker, Server
PACKAGE_VERSION = "CPU"
from . import util from . import util
#from paddle_serving_app.local_predict import LocalPredictor
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
_workdir_name_gen = util.NameGenerator("workdir_") _workdir_name_gen = util.NameGenerator("workdir_")
class LocalRpcServiceHandler(object): class LocalServiceHandler(object):
"""
LocalServiceHandler is the processor of the local service. It supports
three client types: brpc, grpc and local_predictor. For brpc or grpc it
can also start the serving processes; for local_predictor, local
prediction is provided by paddle_serving_app.
"""
def __init__(self, def __init__(self,
model_config, model_config,
client_type='local_predictor',
workdir="", workdir="",
thread_num=2, thread_num=2,
devices="", devices="",
fetch_names=None,
mem_optim=True, mem_optim=True,
ir_optim=False, ir_optim=False,
available_port_generator=None): available_port_generator=None,
use_trt=False,
use_profile=False):
"""
Initialization of LocalServiceHandler.
Args:
model_config: model config path
client_type: brpc, grpc or local_predictor [default]
workdir: work directory
thread_num: number of threads, i.e. the concurrency.
devices: gpu id list [gpu]; "" by default [cpu]
fetch_names: fetch names used by LocalServiceHandler in
local_predictor mode; fetch_names_ keeps compatibility with Client().
mem_optim: use memory/graphics memory optimization, True default.
ir_optim: use computation graph optimization, False default.
available_port_generator: generate available ports
use_trt: use nvidia tensorRt engine, False default.
use_profile: use profiling, False default.
Returns:
None
"""
if available_port_generator is None: if available_port_generator is None:
available_port_generator = util.GetAvailablePortGenerator() available_port_generator = util.GetAvailablePortGenerator()
self._model_config = model_config self._model_config = model_config
self._port_list = [] self._port_list = []
self._device_type = "cpu"
if devices == "": if devices == "":
# cpu # cpu
devices = [-1] devices = [-1]
self._device_type = "cpu"
self._port_list.append(available_port_generator.next()) self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in cpu device. Port({})" _LOGGER.info("Model({}) will be launch in cpu device. Port({})"
.format(model_config, self._port_list)) .format(model_config, self._port_list))
else: else:
# gpu # gpu
if PACKAGE_VERSION == "CPU": self._device_type = "gpu"
raise ValueError(
"You are using the CPU version package("
"paddle-serving-server), unable to set devices")
devices = [int(x) for x in devices.split(",")] devices = [int(x) for x in devices.split(",")]
for _ in devices: for _ in devices:
self._port_list.append(available_port_generator.next()) self._port_list.append(available_port_generator.next())
_LOGGER.info("Model({}) will be launch in gpu device: {}. Port({})" _LOGGER.info("Model({}) will be launch in gpu device: {}. Port({})"
.format(model_config, devices, self._port_list)) .format(model_config, devices, self._port_list))
self._client_type = client_type
self._workdir = workdir self._workdir = workdir
self._devices = devices self._devices = devices
self._thread_num = thread_num self._thread_num = thread_num
self._mem_optim = mem_optim self._mem_optim = mem_optim
self._ir_optim = ir_optim self._ir_optim = ir_optim
self._local_predictor_client = None
self._rpc_service_list = [] self._rpc_service_list = []
self._server_pros = [] self._server_pros = []
self._fetch_vars = None self._use_trt = use_trt
self._use_profile = use_profile
self.fetch_names_ = fetch_names
def get_fetch_list(self): def get_fetch_list(self):
return self._fetch_vars return self.fetch_names_
def get_port_list(self): def get_port_list(self):
return self._port_list return self._port_list
def get_client(self, concurrency_idx):
"""
Function get_client is only used in the local_predictor case. It creates
one LocalPredictor object and initializes the paddle predictor through
load_model_config. The concurrency_idx is used to select the running device.
Args:
concurrency_idx: process/thread index
Returns:
_local_predictor_client
"""
#checking the legality of concurrency_idx.
device_num = len(self._devices)
if device_num <= 0:
_LOGGER.error("device_num must be not greater than 0. devices({})".
format(self._devices))
raise ValueError("The number of self._devices error")
if concurrency_idx < 0:
_LOGGER.error("concurrency_idx({}) must be one positive number".
format(concurrency_idx))
concurrency_idx = 0
elif concurrency_idx >= device_num:
concurrency_idx = concurrency_idx % device_num
_LOGGER.info("GET_CLIENT : concurrency_idx={}, device_num={}".format(
concurrency_idx, device_num))
from paddle_serving_app.local_predict import LocalPredictor
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()
use_gpu = False
if self._device_type == "gpu":
use_gpu = True
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=use_gpu,
gpu_id=self._devices[concurrency_idx],
use_profile=self._use_profile,
thread_num=self._thread_num,
mem_optim=self._mem_optim,
ir_optim=self._ir_optim,
use_trt=self._use_trt)
return self._local_predictor_client
def get_client_config(self): def get_client_config(self):
return os.path.join(self._model_config, "serving_server_conf.prototxt") return os.path.join(self._model_config, "serving_server_conf.prototxt")
def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim, def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim,
ir_optim): ir_optim):
device = "gpu" """
if gpuid == -1: According to _device_type, generating one CpuServer or GpuServer, and
device = "cpu" setting the model config amd startup params.
Args:
workdir: work directory
port: network port
gpuid: gpu id
thread_num: thread num
mem_optim: use memory/graphics memory optimization
ir_optim: use computation graph optimization
Returns:
server: CpuServer/GpuServer
"""
if self._device_type == "cpu":
from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker() op_maker = OpMaker()
read_op = op_maker.create('general_reader') read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer') general_infer_op = op_maker.create('general_infer')
...@@ -93,23 +184,51 @@ class LocalRpcServiceHandler(object): ...@@ -93,23 +184,51 @@ class LocalRpcServiceHandler(object):
op_seq_maker.add_op(general_response_op) op_seq_maker.add_op(general_response_op)
server = Server() server = Server()
else:
#gpu
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = Server()
if gpuid >= 0:
server.set_gpuid(gpuid)
server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num) server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim) server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim) server.set_ir_optimize(ir_optim)
server.load_model_config(self._model_config) server.load_model_config(self._model_config)
if gpuid >= 0: server.prepare_server(
server.set_gpuid(gpuid) workdir=workdir, port=port, device=self._device_type)
server.prepare_server(workdir=workdir, port=port, device=device) if self.fetch_names_ is None:
if self._fetch_vars is None: self.fetch_names_ = server.get_fetch_list()
self._fetch_vars = server.get_fetch_list()
return server return server
def _start_one_server(self, service_idx): def _start_one_server(self, service_idx):
"""
Start one server
Args:
service_idx: server index
Returns:
None
"""
self._rpc_service_list[service_idx].run_server() self._rpc_service_list[service_idx].run_server()
def prepare_server(self): def prepare_server(self):
"""
Prepare all servers to be started and append them to the list.
"""
for i, device_id in enumerate(self._devices): for i, device_id in enumerate(self._devices):
if self._workdir != "": if self._workdir != "":
workdir = "{}_{}".format(self._workdir, i) workdir = "{}_{}".format(self._workdir, i)
...@@ -125,6 +244,9 @@ class LocalRpcServiceHandler(object): ...@@ -125,6 +244,9 @@ class LocalRpcServiceHandler(object):
ir_optim=self._ir_optim)) ir_optim=self._ir_optim))
def start_server(self): def start_server(self):
"""
Start multiple processes and start one server in each process
"""
for i, service in enumerate(self._rpc_service_list): for i, service in enumerate(self._rpc_service_list):
p = multiprocessing.Process( p = multiprocessing.Process(
target=self._start_one_server, args=(i, )) target=self._start_one_server, args=(i, ))
......
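And a hedged sketch of the brpc/grpc path just shown, where the handler does launch serving processes; again the import path and model directory are placeholders, and paddle_serving_server must be installed.

```python
from paddle_serving_server.pipeline.local_service_handler import LocalServiceHandler  # assumed path

handler = LocalServiceHandler(
    model_config="uci_housing_model", client_type="brpc", devices="")  # "" -> CPU
handler.prepare_server()                  # builds one server per device, fills fetch list
endpoints = ["127.0.0.1:{}".format(p) for p in handler.get_port_list()]
handler.start_server()                    # one process per prepared server
print(endpoints, handler.get_fetch_list())
```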
...@@ -24,6 +24,7 @@ import os ...@@ -24,6 +24,7 @@ import os
import sys import sys
import collections import collections
import numpy as np import numpy as np
import json
from numpy import * from numpy import *
if sys.version_info.major == 2: if sys.version_info.major == 2:
import Queue import Queue
...@@ -33,12 +34,12 @@ else: ...@@ -33,12 +34,12 @@ else:
raise Exception("Error Python version") raise Exception("Error Python version")
from .proto import pipeline_service_pb2 from .proto import pipeline_service_pb2
from .channel import (ThreadChannel, ProcessChannel, ChannelDataEcode, from .channel import (ThreadChannel, ProcessChannel, ChannelDataErrcode,
ChannelData, ChannelDataType, ChannelStopError, ChannelData, ChannelDataType, ChannelStopError,
ChannelTimeoutError) ChannelTimeoutError, ProductErrCode)
from .util import NameGenerator from .util import NameGenerator
from .profiler import UnsafeTimeProfiler as TimeProfiler from .profiler import UnsafeTimeProfiler as TimeProfiler
from . import local_rpc_service_handler from . import local_service_handler
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
_op_name_gen = NameGenerator("Op") _op_name_gen = NameGenerator("Op")
...@@ -51,12 +52,13 @@ class Op(object): ...@@ -51,12 +52,13 @@ class Op(object):
server_endpoints=None, server_endpoints=None,
fetch_list=None, fetch_list=None,
client_config=None, client_config=None,
client_type=None,
concurrency=None, concurrency=None,
timeout=None, timeout=None,
retry=None, retry=0,
batch_size=None, batch_size=None,
auto_batching_timeout=None, auto_batching_timeout=None,
local_rpc_service_handler=None): local_service_handler=None):
# In __init__, all the parameters are just saved and Op is not initialized # In __init__, all the parameters are just saved and Op is not initialized
if name is None: if name is None:
name = _op_name_gen.next() name = _op_name_gen.next()
...@@ -64,10 +66,11 @@ class Op(object): ...@@ -64,10 +66,11 @@ class Op(object):
self.concurrency = concurrency # amount of concurrency self.concurrency = concurrency # amount of concurrency
self.set_input_ops(input_ops) self.set_input_ops(input_ops)
self._local_rpc_service_handler = local_rpc_service_handler self._local_service_handler = local_service_handler
self._server_endpoints = server_endpoints self._server_endpoints = server_endpoints
self._fetch_names = fetch_list self._fetch_names = fetch_list
self._client_config = client_config self._client_config = client_config
self.client_type = client_type
self._timeout = timeout self._timeout = timeout
self._retry = max(1, retry) self._retry = max(1, retry)
self._batch_size = batch_size self._batch_size = batch_size
...@@ -86,6 +89,18 @@ class Op(object): ...@@ -86,6 +89,18 @@ class Op(object):
self._succ_close_op = False self._succ_close_op = False
def init_from_dict(self, conf): def init_from_dict(self, conf):
"""
Initialize one Op from config.yaml. If server_endpoints exist, it is
remote RPC mode; otherwise it is local RPC mode. There are three types
of prediction clients in local RPC mode: brpc, grpc and
local_predictor.
Args:
conf: config.yaml
Returns:
None
"""
# init op # init op
if self.concurrency is None: if self.concurrency is None:
self.concurrency = conf["concurrency"] self.concurrency = conf["concurrency"]
...@@ -116,31 +131,46 @@ class Op(object): ...@@ -116,31 +131,46 @@ class Op(object):
else: else:
self._auto_batching_timeout = self._auto_batching_timeout / 1000.0 self._auto_batching_timeout = self._auto_batching_timeout / 1000.0
self.model_config = None
self.workdir = None
self.thread_num = self.concurrency
self.devices = ""
self.mem_optim = False
self.ir_optim = False
if self._server_endpoints is None: if self._server_endpoints is None:
server_endpoints = conf.get("server_endpoints", []) server_endpoints = conf.get("server_endpoints", [])
if len(server_endpoints) != 0: if len(server_endpoints) != 0:
# remote service # remote service
self.with_serving = True self.with_serving = True
self._server_endpoints = server_endpoints self._server_endpoints = server_endpoints
self.client_type = conf["client_type"]
else: else:
if self._local_rpc_service_handler is None: if self._local_service_handler is None:
local_service_conf = conf.get("local_service_conf") local_service_conf = conf.get("local_service_conf")
_LOGGER.info("local_service_conf: {}".format( _LOGGER.info("local_service_conf: {}".format(
local_service_conf)) local_service_conf))
model_config = local_service_conf.get("model_config") self.model_config = local_service_conf.get("model_config")
_LOGGER.info("model_config: {}".format(model_config)) self.client_type = local_service_conf.get("client_type")
if model_config is None: self.workdir = local_service_conf.get("workdir")
self.thread_num = local_service_conf.get("thread_num")
self.devices = local_service_conf.get("devices")
self.mem_optim = local_service_conf.get("mem_optim")
self.ir_optim = local_service_conf.get("ir_optim")
self._fetch_names = local_service_conf.get("fetch_list")
if self.model_config is None:
self.with_serving = False self.with_serving = False
else: else:
# local rpc service # local rpc service
self.with_serving = True self.with_serving = True
service_handler = local_rpc_service_handler.LocalRpcServiceHandler( if self.client_type == "brpc" or self.client_type == "grpc":
model_config=model_config, service_handler = local_service_handler.LocalServiceHandler(
workdir=local_service_conf["workdir"], model_config=self.model_config,
thread_num=local_service_conf["thread_num"], client_type=self.client_type,
devices=local_service_conf["devices"], workdir=self.workdir,
mem_optim=local_service_conf["mem_optim"], thread_num=self.thread_num,
ir_optim=local_service_conf["ir_optim"]) devices=self.devices,
mem_optim=self.mem_optim,
ir_optim=self.ir_optim)
service_handler.prepare_server() # get fetch_list service_handler.prepare_server() # get fetch_list
serivce_ports = service_handler.get_port_list() serivce_ports = service_handler.get_port_list()
self._server_endpoints = [ self._server_endpoints = [
...@@ -150,22 +180,33 @@ class Op(object): ...@@ -150,22 +180,33 @@ class Op(object):
self._client_config = service_handler.get_client_config( self._client_config = service_handler.get_client_config(
) )
if self._fetch_names is None: if self._fetch_names is None:
self._fetch_names = service_handler.get_fetch_list() self._fetch_names = service_handler.get_fetch_list(
self._local_rpc_service_handler = service_handler )
elif self.client_type == "local_predictor":
service_handler = local_service_handler.LocalServiceHandler(
model_config=self.model_config,
client_type=self.client_type,
workdir=self.workdir,
thread_num=self.thread_num,
devices=self.devices,
fetch_names=self._fetch_names)
if self._client_config is None:
self._client_config = service_handler.get_client_config(
)
self._local_service_handler = service_handler
else: else:
self.with_serving = True self.with_serving = True
self._local_rpc_service_handler.prepare_server( self._local_service_handler.prepare_server(
) # get fetch_list ) # get fetch_list
serivce_ports = self._local_rpc_service_handler.get_port_list( serivce_ports = self._local_service_handler.get_port_list()
)
self._server_endpoints = [ self._server_endpoints = [
"127.0.0.1:{}".format(p) for p in serivce_ports "127.0.0.1:{}".format(p) for p in serivce_ports
] ]
if self._client_config is None: if self._client_config is None:
self._client_config = self._local_rpc_service_handler.get_client_config( self._client_config = self._local_service_handler.get_client_config(
) )
if self._fetch_names is None: if self._fetch_names is None:
self._fetch_names = self._local_rpc_service_handler.get_fetch_list( self._fetch_names = self._local_service_handler.get_fetch_list(
) )
else: else:
self.with_serving = True self.with_serving = True
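For reference, below is a minimal sketch (not taken from the repository) of the per-Op conf dict that init_from_dict() consumes in local RPC mode; the key names mirror the local_service_conf fields read above, while the concrete values (model directory, device id, fetch name) are illustrative assumptions.

```python
# Hypothetical per-Op conf dict for init_from_dict() in local RPC mode.
# Key names follow the local_service_conf fields read above; values are assumptions.
op_conf = {
    "concurrency": 2,
    "local_service_conf": {
        "model_config": "uci_housing_model",  # hypothetical model directory
        "client_type": "local_predictor",     # or "brpc" / "grpc"
        "workdir": "workdir_0",
        "thread_num": 2,
        "devices": "0",                       # "" means CPU
        "mem_optim": True,
        "ir_optim": False,
        "fetch_list": ["price"],              # hypothetical fetch variable
    },
}
# op.init_from_dict(op_conf) would then fill model_config, devices, etc.
```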
...@@ -188,17 +229,38 @@ class Op(object): ...@@ -188,17 +229,38 @@ class Op(object):
self._batch_size, self._auto_batching_timeout))) self._batch_size, self._auto_batching_timeout)))
def launch_local_rpc_service(self): def launch_local_rpc_service(self):
if self._local_rpc_service_handler is None: """
Launching multiple local rpc servers.
Args:
None
Returns:
None
"""
if self._local_service_handler is None:
_LOGGER.warning( _LOGGER.warning(
self._log("Failed to launch local rpc" self._log("Failed to launch local rpc"
" service: local_rpc_service_handler is None.")) " service: local_service_handler is None."))
return return
port = self._local_rpc_service_handler.get_port_list() port = self._local_service_handler.get_port_list()
self._local_rpc_service_handler.start_server() #if self._local_service_handler.client_type == "local_predictor":
# _LOGGER.info("Op({}) use local predictor.")
# return
self._local_service_handler.start_server()
_LOGGER.info("Op({}) use local rpc service at port: {}" _LOGGER.info("Op({}) use local rpc service at port: {}"
.format(self.name, port)) .format(self.name, port))
def use_default_auto_batching_config(self): def use_default_auto_batching_config(self):
"""
Reset the auto batching config to its default values.
Args:
None
Returns:
None
"""
if self._batch_size != 1: if self._batch_size != 1:
_LOGGER.warning("Op({}) reset batch_size=1 (original: {})" _LOGGER.warning("Op({}) reset batch_size=1 (original: {})"
.format(self.name, self._batch_size)) .format(self.name, self._batch_size))
...@@ -215,28 +277,56 @@ class Op(object): ...@@ -215,28 +277,56 @@ class Op(object):
def set_tracer(self, tracer): def set_tracer(self, tracer):
self._tracer = tracer self._tracer = tracer
def init_client(self, client_type, client_config, server_endpoints, def init_client(self, client_config, server_endpoints):
fetch_names): """
Initialize the client object. There are three types of clients: brpc,
grpc and local_predictor. In grpc or brpc mode, the client connects to
the server endpoints.
Args:
client_config: client config info
server_endpoints: server IP/Port list.
Returns:
client: client object.
"""
if self.with_serving == False: if self.with_serving == False:
_LOGGER.info("Op({}) has no client (and it also do not " _LOGGER.info("Op({}) has no client (and it also do not "
"run the process function)".format(self.name)) "run the process function)".format(self.name))
return None return None
if client_type == 'brpc': if self.client_type == 'brpc':
client = Client() client = Client()
client.load_client_config(client_config) client.load_client_config(client_config)
elif client_type == 'grpc': elif self.client_type == 'grpc':
client = MultiLangClient() client = MultiLangClient()
elif self.client_type == 'local_predictor':
if self.local_predictor is None:
raise ValueError("local predictor not yet created")
client = self.local_predictor
else: else:
raise ValueError("Failed to init client: unknow client " raise ValueError("Failed to init client: unknow client "
"type {}".format(client_type)) "type {}".format(self.client_type))
if self._fetch_names is None:
self._fetch_names = client.fetch_names_
_LOGGER.info("Op({}) has no fetch name set. So fetch all vars")
if self.client_type != "local_predictor":
client.connect(server_endpoints) client.connect(server_endpoints)
self._fetch_names = fetch_names
return client return client
def get_input_ops(self): def get_input_ops(self):
return self._input_ops return self._input_ops
def set_input_ops(self, ops): def set_input_ops(self, ops):
"""
Set input ops. Each op may have many input ops, but only one input
channel.
Args:
ops: op list
Returns:
None.
"""
if not isinstance(ops, list): if not isinstance(ops, list):
ops = [] if ops is None else [ops] ops = [] if ops is None else [ops]
self._input_ops = [] self._input_ops = []
...@@ -249,6 +339,10 @@ class Op(object): ...@@ -249,6 +339,10 @@ class Op(object):
self._input_ops.append(op) self._input_ops.append(op)
def add_input_channel(self, channel): def add_input_channel(self, channel):
"""
Add one input channel to the Op. Each op may have many front ops,
but only one input channel.
"""
if not isinstance(channel, (ThreadChannel, ProcessChannel)): if not isinstance(channel, (ThreadChannel, ProcessChannel)):
_LOGGER.critical( _LOGGER.critical(
self._log("Failed to set input_channel: input " self._log("Failed to set input_channel: input "
...@@ -265,6 +359,16 @@ class Op(object): ...@@ -265,6 +359,16 @@ class Op(object):
return self._input return self._input
def add_output_channel(self, channel): def add_output_channel(self, channel):
"""
Add one output channel to the Op. Each op may have many output channels,
but only one front channel.
Args:
channel: an output channel object.
Returns:
None
"""
if not isinstance(channel, (ThreadChannel, ProcessChannel)): if not isinstance(channel, (ThreadChannel, ProcessChannel)):
_LOGGER.critical( _LOGGER.critical(
self._log("Failed to add output_channel: output channel " self._log("Failed to add output_channel: output channel "
...@@ -279,7 +383,23 @@ class Op(object): ...@@ -279,7 +383,23 @@ class Op(object):
def _get_output_channels(self): def _get_output_channels(self):
return self._outputs return self._outputs
def preprocess(self, input_dicts): def preprocess(self, input_dicts, data_id=0, log_id=0):
"""
In the preprocess stage, assemble data for the process stage. Users can
override this function to build the model feed features.
Args:
input_dicts: input data to be preprocessed
data_id: inner unique id, 0 default
log_id: global unique id for RTT, 0 default
Return:
input_dict: data for process stage
is_skip_process: skip process stage or not, False default
prod_errcode: None by default; otherwise a product error occurred.
It is handled in the same way as an exception.
prod_errinfo: "" by default
"""
# multiple previous Op # multiple previous Op
if len(input_dicts) != 1: if len(input_dicts) != 1:
_LOGGER.critical( _LOGGER.critical(
...@@ -289,44 +409,92 @@ class Op(object): ...@@ -289,44 +409,92 @@ class Op(object):
os._exit(-1) os._exit(-1)
(_, input_dict), = input_dicts.items() (_, input_dict), = input_dicts.items()
return input_dict return input_dict, False, None, ""
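As the docstring above notes, preprocess now returns four values instead of one. A minimal sketch of a user Op overriding it under that contract follows; the import path and the feed name "x" are assumptions for illustration, not the project's own example.

```python
import numpy as np

from paddle_serving_server.pipeline import Op  # import path may differ between releases


class ExampleOp(Op):
    def preprocess(self, input_dicts, data_id=0, log_id=0):
        # exactly one upstream op, matching the default implementation above
        (_, input_dict), = input_dicts.items()
        feed = {"x": np.array(input_dict["x"], dtype="float32")}  # "x" is a hypothetical feed name
        is_skip_process = False                # set True to bypass the process stage
        prod_errcode, prod_errinfo = None, ""  # a non-None errcode marks a product-level error
        return feed, is_skip_process, prod_errcode, prod_errinfo
```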
def process(self, feed_batch, typical_logid): def process(self, feed_batch, typical_logid=0):
"""
In the process stage, send requests to the inference server or predict locally.
Users do not need to override this function.
Args:
feed_batch: data to be fed to inference server
typical_logid: marks a batch of predictions, usually the first logid
in the batch, 0 by default.
Returns:
call_result: predict result
"""
err, err_info = ChannelData.check_batch_npdata(feed_batch) err, err_info = ChannelData.check_batch_npdata(feed_batch)
if err != 0: if err != 0:
_LOGGER.critical( _LOGGER.critical(
self._log("Failed to run process: {}. Please override " self._log("Failed to run process: {}. Please override "
"preprocess func.".format(err_info))) "preprocess func.".format(err_info)))
os._exit(-1) os._exit(-1)
if self.client_type == "local_predictor":
call_result = self.client.predict( call_result = self.client.predict(
feed=feed_batch, fetch=self._fetch_names, log_id=typical_logid) feed=feed_batch[0],
fetch=self._fetch_names,
batch=True,
log_id=typical_logid)
else:
call_result = self.client.predict(
feed=feed_batch,
fetch=self._fetch_names,
batch=True,
log_id=typical_logid)
if isinstance(self.client, MultiLangClient): if isinstance(self.client, MultiLangClient):
if call_result is None or call_result["serving_status_code"] != 0: if call_result is None or call_result["serving_status_code"] != 0:
return None return None
call_result.pop("serving_status_code") call_result.pop("serving_status_code")
return call_result return call_result
def postprocess(self, input_dict, fetch_dict): def postprocess(self, input_dict, fetch_dict, log_id=0):
return fetch_dict """
In the postprocess stage, assemble data for the next op or for the output.
Args:
input_dict: data returned in preprocess stage.
fetch_dict: data returned in process stage.
log_id: logid, 0 default
Returns:
fetch_dict: return fetch_dict default
prod_errcode: None by default; otherwise a product error occurred.
It is handled in the same way as an exception.
prod_errinfo: "" default
"""
return fetch_dict, None, ""
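Similarly, postprocess now returns (fetch_dict, prod_errcode, prod_errinfo). A minimal hedged sketch of an override follows; the fetch name "price" and the import path are assumptions.

```python
from paddle_serving_server.pipeline import Op  # import path may differ between releases


class ExampleOp(Op):
    def postprocess(self, input_dict, fetch_dict, log_id=0):
        # keep only string-serializable values for the next op / the response
        result = {"price": str(fetch_dict["price"])}  # "price" is a hypothetical fetch name
        return result, None, ""  # no product-level error
```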
def _parse_channeldata(self, channeldata_dict): def _parse_channeldata(self, channeldata_dict):
"""
Parse one channeldata
Args:
channeldata_dict : channel data to be parsed, dict type
Return:
data_id: created by dag._id_generator, unique
error_channeldata: error channeldata
parsed_data: get np/dict data from channeldata
client_need_profile: need profile info
profile_set: profile info
log_id: logid for tracing a request
"""
data_id, error_channeldata = None, None data_id, error_channeldata = None, None
client_need_profile, profile_set = False, set() client_need_profile, profile_set = False, set()
parsed_data = {} parsed_data = {}
key = list(channeldata_dict.keys())[0] key = list(channeldata_dict.keys())[0]
data_id = channeldata_dict[key].id data_id = channeldata_dict[key].id
log_id = channeldata_dict[key].log_id
client_need_profile = channeldata_dict[key].client_need_profile client_need_profile = channeldata_dict[key].client_need_profile
for name, data in channeldata_dict.items(): for name, data in channeldata_dict.items():
if data.ecode != ChannelDataEcode.OK.value: if data.error_code != ChannelDataErrcode.OK.value:
error_channeldata = data error_channeldata = data
break break
parsed_data[name] = data.parse() parsed_data[name] = data.parse()
if client_need_profile: if client_need_profile:
profile_set |= data.profile_data_set profile_set |= data.profile_data_set
return (data_id, error_channeldata, parsed_data, client_need_profile, return (data_id, error_channeldata, parsed_data, client_need_profile,
profile_set) profile_set, log_id)
def _push_to_output_channels(self, def _push_to_output_channels(self,
data, data,
...@@ -335,6 +503,20 @@ class Op(object): ...@@ -335,6 +503,20 @@ class Op(object):
profile_str=None, profile_str=None,
client_need_profile=False, client_need_profile=False,
profile_set=None): profile_set=None):
"""
Push data to output channels without running the later stages
(preprocess, process, postprocess).
Args:
data: channeldata, to be pushed
channels: output channels
name: op name
profile_str: one profile message
client_need_profile: False default
profile_set: profile message collections
Returns:
None
"""
if name is None: if name is None:
name = self.name name = self.name
...@@ -347,33 +529,61 @@ class Op(object): ...@@ -347,33 +529,61 @@ class Op(object):
for channel in channels: for channel in channels:
channel.push(data, name) channel.push(data, name)
def start_with_process(self, client_type): def start_with_process(self):
"""
Each OP creates a process to run its main loop; the CUDA environment
is initialized in each individual process.
Args:
None
Returns:
process array
"""
trace_buffer = None trace_buffer = None
if self._tracer is not None: if self._tracer is not None:
trace_buffer = self._tracer.data_buffer() trace_buffer = self._tracer.data_buffer()
proces = [] process = []
for concurrency_idx in range(self.concurrency): for concurrency_idx in range(self.concurrency):
p = multiprocessing.Process( p = multiprocessing.Process(
target=self._run, target=self._run,
args=(concurrency_idx, self._get_input_channel(), args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), client_type, False, self._get_output_channels(), False, trace_buffer,
trace_buffer)) self.model_config, self.workdir, self.thread_num,
self.devices, self.mem_optim, self.ir_optim))
p.daemon = True p.daemon = True
p.start() p.start()
proces.append(p) process.append(p)
return proces return process
def start_with_thread(self):
"""
Each OP creates a thread to run its main loop; the CUDA environment
is initialized in the main thread.
def start_with_thread(self, client_type): Args:
None
Returns:
thread array
"""
trace_buffer = None trace_buffer = None
if self._tracer is not None: if self._tracer is not None:
trace_buffer = self._tracer.data_buffer() trace_buffer = self._tracer.data_buffer()
#Init cuda env in main thread
if self.client_type == "local_predictor":
_LOGGER.info("Init cuda env in main thread")
self.local_predictor = self._local_service_handler.get_client(0)
threads = [] threads = []
for concurrency_idx in range(self.concurrency): for concurrency_idx in range(self.concurrency):
t = threading.Thread( t = threading.Thread(
target=self._run, target=self._run,
args=(concurrency_idx, self._get_input_channel(), args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), client_type, True, self._get_output_channels(), True, trace_buffer,
trace_buffer)) self.model_config, self.workdir, self.thread_num,
self.devices, self.mem_optim, self.ir_optim))
# When a process exits, it attempts to terminate # When a process exits, it attempts to terminate
# all of its daemonic child processes. # all of its daemonic child processes.
t.daemon = True t.daemon = True
...@@ -384,52 +594,109 @@ class Op(object): ...@@ -384,52 +594,109 @@ class Op(object):
def init_op(self): def init_op(self):
pass pass
def _run_preprocess(self, parsed_data_dict, op_info_prefix): def _run_preprocess(self, parsed_data_dict, op_info_prefix, logid_dict):
"""
Run preprocess stage
Args:
parsed_data_dict: data to be pre-processed
op_info_prefix: input op info
logid_dict: logid dict
Returns:
preped_data_dict: data preprocessed, to be processed
err_channeldata_dict: stores error ChannelData when an exception occurs.
skip_process_dict: skip process stage or not
"""
_LOGGER.debug("{} Running preprocess".format(op_info_prefix)) _LOGGER.debug("{} Running preprocess".format(op_info_prefix))
preped_data_dict = collections.OrderedDict() preped_data_dict = collections.OrderedDict()
err_channeldata_dict = collections.OrderedDict() err_channeldata_dict = collections.OrderedDict()
skip_process_dict = {}
for data_id, parsed_data in parsed_data_dict.items(): for data_id, parsed_data in parsed_data_dict.items():
preped_data, error_channeldata = None, None preped_data, error_channeldata = None, None
is_skip_process = False
prod_errcode, prod_errinfo = None, None
log_id = logid_dict.get(data_id)
try: try:
preped_data = self.preprocess(parsed_data) preped_data, is_skip_process, prod_errcode, prod_errinfo = self.preprocess(
parsed_data, data_id, logid_dict.get(data_id))
# Set skip_process_dict
if is_skip_process is True:
skip_process_dict[data_id] = True
except TypeError as e: except TypeError as e:
# Error type in channeldata.datatype # Error type in channeldata.datatype
error_info = "(logid={}) {} Failed to preprocess: {}".format( error_info = "(data_id={} log_id={}) {} Failed to preprocess: {}".format(
data_id, op_info_prefix, e) data_id, log_id, op_info_prefix, e)
_LOGGER.error(error_info, exc_info=True) _LOGGER.error(error_info, exc_info=True)
error_channeldata = ChannelData( error_channeldata = ChannelData(
ecode=ChannelDataEcode.TYPE_ERROR.value, error_code=ChannelDataErrcode.TYPE_ERROR.value,
error_info=error_info, error_info=error_info,
data_id=data_id) data_id=data_id,
log_id=log_id)
except Exception as e: except Exception as e:
error_info = "(logid={}) {} Failed to preprocess: {}".format( error_info = "(data_id={} log_id={}) {} Failed to preprocess: {}".format(
data_id, op_info_prefix, e) data_id, log_id, op_info_prefix, e)
_LOGGER.error(error_info, exc_info=True) _LOGGER.error(error_info, exc_info=True)
error_channeldata = ChannelData( error_channeldata = ChannelData(
ecode=ChannelDataEcode.UNKNOW.value, error_code=ChannelDataErrcode.UNKNOW.value,
error_info=error_info, error_info=error_info,
data_id=data_id) data_id=data_id,
log_id=log_id)
if prod_errcode is not None:
# product errors occurred
error_channeldata = ChannelData(
error_code=ChannelDataErrcode.PRODUCT_ERROR.value,
error_info="",
prod_error_code=prod_errcode,
prod_error_info=prod_errinfo,
data_id=data_id,
log_id=log_id)
if error_channeldata is not None: if error_channeldata is not None:
err_channeldata_dict[data_id] = error_channeldata err_channeldata_dict[data_id] = error_channeldata
else: else:
preped_data_dict[data_id] = preped_data preped_data_dict[data_id] = preped_data
_LOGGER.debug("{} Succ preprocess".format(op_info_prefix)) _LOGGER.debug("{} Succ preprocess".format(op_info_prefix))
return preped_data_dict, err_channeldata_dict return preped_data_dict, err_channeldata_dict, skip_process_dict
def _run_process(self, preped_data_dict, op_info_prefix): def _run_process(self, preped_data_dict, op_info_prefix, skip_process_dict,
logid_dict):
"""
Run process stage
Args:
preped_data_dict: data to be fed to the model for prediction.
op_info_prefix: prefix op info
skip_process_dict: skip process stage or not
logid_dict: logid dict
Returns:
midped_data_dict: data midprocessed, to be post-processed
err_channeldata_dict: stores error ChannelData when an exception occurs
"""
_LOGGER.debug("{} Running process".format(op_info_prefix)) _LOGGER.debug("{} Running process".format(op_info_prefix))
midped_data_dict = collections.OrderedDict() midped_data_dict = collections.OrderedDict()
err_channeldata_dict = collections.OrderedDict() err_channeldata_dict = collections.OrderedDict()
if self.with_serving: ### if (batch_num == 1 && skip == True) ,then skip the process stage.
data_ids = preped_data_dict.keys() is_skip_process = False
data_ids = list(preped_data_dict.keys())
if len(data_ids) == 1 and skip_process_dict.get(data_ids[0]) == True:
is_skip_process = True
_LOGGER.info("(data_id={} log_id={}) skip process stage".format(
data_ids[0], logid_dict.get(data_ids[0])))
if self.with_serving is True and is_skip_process is False:
# use typical_logid to mark batch data
typical_logid = data_ids[0] typical_logid = data_ids[0]
if len(data_ids) != 1: if len(data_ids) != 1:
for data_id in data_ids: for data_id in data_ids:
_LOGGER.info( _LOGGER.info(
"(logid={}) {} During access to PaddleServingService," "(data_id={} logid={}) {} During access to PaddleServingService,"
" we selected logid={} (from batch: {}) as a " " we selected logid={} (from batch: {}) as a "
"representative for logging.".format( "representative for logging.".format(
data_id, op_info_prefix, typical_logid, data_ids)) data_id,
logid_dict.get(data_id), op_info_prefix,
typical_logid, data_ids))
# combine samples to batch # combine samples to batch
one_input = preped_data_dict[data_ids[0]] one_input = preped_data_dict[data_ids[0]]
...@@ -449,64 +716,70 @@ class Op(object): ...@@ -449,64 +716,70 @@ class Op(object):
input_offset.append(offset) input_offset.append(offset)
else: else:
_LOGGER.critical( _LOGGER.critical(
"{} Failed to process: expect input type is dict(sample" "(data_id={} log_id={}){} Failed to process: expect input type is dict(sample"
" input) or list(batch input), but get {}".format( " input) or list(batch input), but get {}".format(data_ids[
op_info_prefix, type(one_input))) 0], typical_logid, op_info_prefix, type(one_input)))
os._exit(-1) os._exit(-1)
midped_batch = None midped_batch = None
ecode = ChannelDataEcode.OK.value error_code = ChannelDataErrcode.OK.value
if self._timeout <= 0: if self._timeout <= 0:
try: try:
midped_batch = self.process(feed_batch, typical_logid) midped_batch = self.process(feed_batch, typical_logid)
except Exception as e: except Exception as e:
ecode = ChannelDataEcode.UNKNOW.value error_code = ChannelDataErrcode.UNKNOW.value
error_info = "(logid={}) {} Failed to process(batch: {}): {}".format( error_info = "(data_id={} log_id={}) {} Failed to process(batch: {}): {}".format(
typical_logid, op_info_prefix, data_ids, e) data_ids[0], typical_logid, op_info_prefix, data_ids, e)
_LOGGER.error(error_info, exc_info=True) _LOGGER.error(error_info, exc_info=True)
else: else:
# retry N times, as configured in the yaml file.
for i in range(self._retry): for i in range(self._retry):
try: try:
# time out for each process
midped_batch = func_timeout.func_timeout( midped_batch = func_timeout.func_timeout(
self._timeout, self._timeout,
self.process, self.process,
args=(feed_batch, typical_logid)) args=(feed_batch, typical_logid))
except func_timeout.FunctionTimedOut as e: except func_timeout.FunctionTimedOut as e:
if i + 1 >= self._retry: if i + 1 >= self._retry:
ecode = ChannelDataEcode.TIMEOUT.value error_code = ChannelDataErrcode.TIMEOUT.value
error_info = "(logid={}) {} Failed to process(batch: {}): " \ error_info = "(log_id={}) {} Failed to process(batch: {}): " \
"exceeded retry count.".format( "exceeded retry count.".format(
typical_logid, op_info_prefix, data_ids) typical_logid, op_info_prefix, data_ids)
_LOGGER.error(error_info) _LOGGER.error(error_info)
else: else:
_LOGGER.warning( _LOGGER.warning(
"(logid={}) {} Failed to process(batch: {}): timeout," "(log_id={}) {} Failed to process(batch: {}): timeout,"
" and retrying({}/{})...".format( " and retrying({}/{})...".format(
typical_logid, op_info_prefix, data_ids, i + typical_logid, op_info_prefix, data_ids, i +
1, self._retry)) 1, self._retry))
except Exception as e: except Exception as e:
ecode = ChannelDataEcode.UNKNOW.value error_code = ChannelDataErrcode.UNKNOW.value
error_info = "(logid={}) {} Failed to process(batch: {}): {}".format( error_info = "(log_id={}) {} Failed to process(batch: {}): {}".format(
typical_logid, op_info_prefix, data_ids, e) typical_logid, op_info_prefix, data_ids, e)
_LOGGER.error(error_info, exc_info=True) _LOGGER.error(error_info, exc_info=True)
break break
else: else:
break break
if ecode != ChannelDataEcode.OK.value: if error_code != ChannelDataErrcode.OK.value:
for data_id in data_ids: for data_id in data_ids:
err_channeldata_dict[data_id] = ChannelData( err_channeldata_dict[data_id] = ChannelData(
ecode=ecode, error_info=error_info, data_id=data_id) error_code=error_code,
error_info=error_info,
data_id=data_id,
log_id=logid_dict.get(data_id))
elif midped_batch is None: elif midped_batch is None:
# op client return None # op client return None
error_info = "(logid={}) {} Failed to predict, please check if " \ error_info = "(log_id={}) {} Failed to predict, please check if " \
"PaddleServingService is working properly.".format( "PaddleServingService is working properly.".format(
typical_logid, op_info_prefix) typical_logid, op_info_prefix)
_LOGGER.error(error_info) _LOGGER.error(error_info)
for data_id in data_ids: for data_id in data_ids:
err_channeldata_dict[data_id] = ChannelData( err_channeldata_dict[data_id] = ChannelData(
ecode=ChannelDataEcode.CLIENT_ERROR.value, error_code=ChannelDataErrcode.CLIENT_ERROR.value,
error_info=error_info, error_info=error_info,
data_id=data_id) data_id=data_id,
log_id=logid_dict.get(data_id))
else: else:
# transform np format to dict format # transform np format to dict format
var_names = midped_batch.keys() var_names = midped_batch.keys()
...@@ -515,7 +788,7 @@ class Op(object): ...@@ -515,7 +788,7 @@ class Op(object):
for name in var_names: for name in var_names:
lod_offset_name = "{}.lod".format(name) lod_offset_name = "{}.lod".format(name)
if lod_offset_name in var_names: if lod_offset_name in var_names:
_LOGGER.debug("(logid={}) {} {} is LodTensor".format( _LOGGER.debug("(log_id={}) {} {} is LodTensor".format(
typical_logid, op_info_prefix, name)) typical_logid, op_info_prefix, name))
lod_var_names.add(name) lod_var_names.add(name)
lod_offset_names.add(lod_offset_name) lod_offset_names.add(lod_offset_name)
...@@ -551,38 +824,67 @@ class Op(object): ...@@ -551,38 +824,67 @@ class Op(object):
return midped_data_dict, err_channeldata_dict return midped_data_dict, err_channeldata_dict
def _run_postprocess(self, parsed_data_dict, midped_data_dict, def _run_postprocess(self, parsed_data_dict, midped_data_dict,
op_info_prefix): op_info_prefix, logid_dict):
"""
Run postprocess stage.
Args:
parsed_data_dict: data returned in preprocess stage
midped_data_dict: data returned in process stage
op_info_prefix: prefix op info
logid_dict: logid dict
Returns:
postped_data_dict: data postprocessed
err_channeldata_dict: stores error ChannelData when an exception occurs
"""
_LOGGER.debug("{} Running postprocess".format(op_info_prefix)) _LOGGER.debug("{} Running postprocess".format(op_info_prefix))
postped_data_dict = collections.OrderedDict() postped_data_dict = collections.OrderedDict()
err_channeldata_dict = collections.OrderedDict() err_channeldata_dict = collections.OrderedDict()
for data_id, midped_data in midped_data_dict.items(): for data_id, midped_data in midped_data_dict.items():
log_id = logid_dict.get(data_id)
postped_data, err_channeldata = None, None postped_data, err_channeldata = None, None
prod_errcode, prod_errinfo = None, None
try: try:
postped_data = self.postprocess(parsed_data_dict[data_id], postped_data, prod_errcode, prod_errinfo = self.postprocess(
midped_data) parsed_data_dict[data_id], midped_data,
logid_dict.get(data_id))
except Exception as e: except Exception as e:
error_info = "(logid={}) {} Failed to postprocess: {}".format( error_info = "(data_id={} log_id={}) {} Failed to postprocess: {}".format(
data_id, op_info_prefix, e) data_id, log_id, op_info_prefix, e)
_LOGGER.error(error_info, exc_info=True) _LOGGER.error(error_info, exc_info=True)
err_channeldata = ChannelData( err_channeldata = ChannelData(
ecode=ChannelDataEcode.UNKNOW.value, error_code=ChannelDataErrcode.UNKNOW.value,
error_info=error_info, error_info=error_info,
data_id=data_id) data_id=data_id,
log_id=log_id)
if prod_errcode is not None:
# product errors occurred
err_channeldata = ChannelData(
error_code=ChannelDataErrcode.PRODUCT_ERROR.value,
error_info="",
prod_error_code=prod_errcode,
prod_error_info=prod_errinfo,
data_id=data_id,
log_id=log_id)
if err_channeldata is not None: if err_channeldata is not None:
err_channeldata_dict[data_id] = err_channeldata err_channeldata_dict[data_id] = err_channeldata
continue continue
else: else:
if not isinstance(postped_data, dict): if not isinstance(postped_data, dict):
error_info = "(logid={}) {} Failed to postprocess: " \ error_info = "(log_id={} log_id={}) {} Failed to postprocess: " \
"output of postprocess funticon must be " \ "output of postprocess funticon must be " \
"dict type, but get {}".format( "dict type, but get {}".format(
data_id, op_info_prefix, data_id, log_id, op_info_prefix,
type(postped_data)) type(postped_data))
_LOGGER.error(error_info) _LOGGER.error(error_info)
err_channeldata = ChannelData( err_channeldata = ChannelData(
ecode=ChannelDataEcode.UNKNOW.value, error_code=ChannelDataErrcode.UNKNOW.value,
error_info=error_info, error_info=error_info,
data_id=data_id) data_id=data_id,
log_id=log_id)
err_channeldata_dict[data_id] = err_channeldata err_channeldata_dict[data_id] = err_channeldata
continue continue
...@@ -592,18 +894,36 @@ class Op(object): ...@@ -592,18 +894,36 @@ class Op(object):
output_data = ChannelData( output_data = ChannelData(
ChannelDataType.CHANNEL_NPDATA.value, ChannelDataType.CHANNEL_NPDATA.value,
npdata=postped_data, npdata=postped_data,
data_id=data_id) data_id=data_id,
log_id=log_id)
else: else:
output_data = ChannelData( output_data = ChannelData(
ChannelDataType.DICT.value, ChannelDataType.DICT.value,
dictdata=postped_data, dictdata=postped_data,
data_id=data_id) data_id=data_id,
log_id=log_id)
postped_data_dict[data_id] = output_data postped_data_dict[data_id] = output_data
_LOGGER.debug("{} Succ postprocess".format(op_info_prefix)) _LOGGER.debug("{} Succ postprocess".format(op_info_prefix))
return postped_data_dict, err_channeldata_dict return postped_data_dict, err_channeldata_dict
def _auto_batching_generator(self, input_channel, op_name, batch_size, def _auto_batching_generator(self, input_channel, op_name, batch_size,
timeout, op_info_prefix): timeout, op_info_prefix):
"""
Merge up to batch_size requests into one prediction. Take one piece of
data from the input channel at a time until the batch reaches batch_size
or the waiting time exceeds auto_batching_timeout.
Args:
input_channel: the input channel of Op
op_name: op name
batch_size: batch size, less than worker_num
timeout: batch timeout in seconds. If timeout is None and the quantity
taken from the front is less than batch_size, the generator blocks.
op_info_prefix: op link info.
Returns:
None
"""
while True: while True:
batch = [] batch = []
while len(batch) == 0: while len(batch) == 0:
...@@ -624,6 +944,9 @@ class Op(object): ...@@ -624,6 +944,9 @@ class Op(object):
else: else:
channeldata_dict = input_channel.front(op_name) channeldata_dict = input_channel.front(op_name)
batch.append(channeldata_dict) batch.append(channeldata_dict)
_LOGGER.debug(
"_auto_batching_generator get {} channeldata from op:{} into batch, batch_size:{}".
format(idx, op_name, batch_size))
except ChannelTimeoutError: except ChannelTimeoutError:
_LOGGER.debug("{} Failed to generate batch: " _LOGGER.debug("{} Failed to generate batch: "
"timeout".format(op_info_prefix)) "timeout".format(op_info_prefix))
...@@ -633,38 +956,92 @@ class Op(object): ...@@ -633,38 +956,92 @@ class Op(object):
yield batch yield batch
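The batching loop above is hard to read inside the diff, so here is a stripped-down, self-contained sketch of the same idea (collect up to batch_size items, or yield early once the per-batch timeout expires), using a plain queue.Queue instead of the pipeline channel; it mirrors the generator in spirit only.

```python
import queue
import time


def auto_batching(q, batch_size, timeout):
    """Yield batches of up to batch_size items taken from q.

    timeout is the per-batch time budget in seconds; None blocks until
    the batch is full.
    """
    while True:
        batch = []
        while len(batch) == 0:
            deadline = None if timeout is None else time.time() + timeout
            for _ in range(batch_size):
                try:
                    if deadline is None:
                        item = q.get()
                    else:
                        remaining = deadline - time.time()
                        if remaining <= 0:
                            break  # budget spent: stop filling this batch
                        item = q.get(timeout=remaining)
                except queue.Empty:
                    break  # timed out: yield whatever has been collected
                batch.append(item)
        yield batch
```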
def _parse_channeldata_batch(self, batch, output_channels): def _parse_channeldata_batch(self, batch, output_channels):
"""
Parse a batch of channeldata
Args:
batch: a batch of channeldata produced by auto-batching
output_channels: output channels
Returns:
parsed_data_dict: parsed from channeldata in batch
need_profile_dict: need profile dict in batch
profile_dict: profile info dict in batch
logid_dict: trace each request in batch
"""
parsed_data_dict = collections.OrderedDict() parsed_data_dict = collections.OrderedDict()
need_profile_dict = {} need_profile_dict = {}
profile_dict = {} profile_dict = {}
logid_dict = {}
for channeldata_dict in batch: for channeldata_dict in batch:
(data_id, error_channeldata, parsed_data, (data_id, error_channeldata, parsed_data,
client_need_profile, profile_set) = \ client_need_profile, profile_set, log_id) = \
self._parse_channeldata(channeldata_dict) self._parse_channeldata(channeldata_dict)
if error_channeldata is None: if error_channeldata is None:
parsed_data_dict[data_id] = parsed_data parsed_data_dict[data_id] = parsed_data
need_profile_dict[data_id] = client_need_profile need_profile_dict[data_id] = client_need_profile
profile_dict[data_id] = profile_set profile_dict[data_id] = profile_set
logid_dict[data_id] = log_id
else: else:
# error data in predecessor Op # error data in predecessor Op
# (error_channeldata with profile info) # (error_channeldata with profile info)
self._push_to_output_channels(error_channeldata, self._push_to_output_channels(error_channeldata,
output_channels) output_channels)
return parsed_data_dict, need_profile_dict, profile_dict return parsed_data_dict, need_profile_dict, profile_dict, logid_dict
def _run(self, concurrency_idx, input_channel, output_channels, client_type, def _run(self, concurrency_idx, input_channel, output_channels,
is_thread_op, trace_buffer): is_thread_op, trace_buffer, model_config, workdir, thread_num,
devices, mem_optim, ir_optim):
"""
_run() is the entry function of the OP process / thread model. When the
client type is local_predictor in process mode, the CUDA environment
needs to be initialized by LocalServiceHandler in the child process,
otherwise CUDA error(3), an initialization error, occurs. Preprocess,
process and postprocess are executed in the main loop. The preprocess and
postprocess functions are usually overridden by users. Trace data is
recorded in trace_buffer.
Args:
concurrency_idx: thread/process index
input_channel: input channel, take the data to be processed
output_channels: output channel, store processed data
is_thread_op: False, It's process op; True, It's thread op
trace_buffer: stores trace information
model_config: model config path
workdir: work directory
thread_num: number of threads, concurrent quantity
devices: gpu id list for GPU, "" by default (CPU)
mem_optim: use memory/graphics memory optimization, True by default.
ir_optim: use computation graph (IR) optimization, False by default.
Returns:
None
"""
op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx) op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
tid = threading.current_thread().ident tid = threading.current_thread().ident
# init op # init ops
profiler = None profiler = None
try: try:
profiler = self._initialize(is_thread_op, client_type, if is_thread_op == False and self.client_type == "local_predictor":
self.service_handler = local_service_handler.LocalServiceHandler(
model_config=model_config,
client_type="local_predictor",
workdir=workdir,
thread_num=thread_num,
devices=devices,
mem_optim=mem_optim,
ir_optim=ir_optim)
_LOGGER.info("Init cuda env in process {}".format(
concurrency_idx))
self.local_predictor = self.service_handler.get_client(
concurrency_idx) concurrency_idx)
# check all ops initialized successfully.
profiler = self._initialize(is_thread_op, concurrency_idx)
except Exception as e: except Exception as e:
_LOGGER.critical( _LOGGER.critical(
"{} Failed to init op: {}".format(op_info_prefix, e), "{} failed to init op: {}".format(op_info_prefix, e),
exc_info=True) exc_info=True)
os._exit(-1) os._exit(-1)
_LOGGER.info("{} Succ init".format(op_info_prefix)) _LOGGER.info("{} Succ init".format(op_info_prefix))
...@@ -691,7 +1068,7 @@ class Op(object): ...@@ -691,7 +1068,7 @@ class Op(object):
# parse channeldata batch # parse channeldata batch
try: try:
parsed_data_dict, need_profile_dict, profile_dict \ parsed_data_dict, need_profile_dict, profile_dict, logid_dict\
= self._parse_channeldata_batch( = self._parse_channeldata_batch(
channeldata_dict_batch, output_channels) channeldata_dict_batch, output_channels)
except ChannelStopError: except ChannelStopError:
...@@ -704,11 +1081,12 @@ class Op(object): ...@@ -704,11 +1081,12 @@ class Op(object):
# preprecess # preprecess
start = profiler.record("prep#{}_0".format(op_info_prefix)) start = profiler.record("prep#{}_0".format(op_info_prefix))
preped_data_dict, err_channeldata_dict \ preped_data_dict, err_channeldata_dict, skip_process_dict \
= self._run_preprocess(parsed_data_dict, op_info_prefix) = self._run_preprocess(parsed_data_dict, op_info_prefix, logid_dict)
end = profiler.record("prep#{}_1".format(op_info_prefix)) end = profiler.record("prep#{}_1".format(op_info_prefix))
prep_time = end - start prep_time = end - start
try: try:
# put error requests into output channel, skip process and postprocess stage
for data_id, err_channeldata in err_channeldata_dict.items(): for data_id, err_channeldata in err_channeldata_dict.items():
self._push_to_output_channels( self._push_to_output_channels(
data=err_channeldata, data=err_channeldata,
...@@ -725,7 +1103,7 @@ class Op(object): ...@@ -725,7 +1103,7 @@ class Op(object):
# process # process
start = profiler.record("midp#{}_0".format(op_info_prefix)) start = profiler.record("midp#{}_0".format(op_info_prefix))
midped_data_dict, err_channeldata_dict \ midped_data_dict, err_channeldata_dict \
= self._run_process(preped_data_dict, op_info_prefix) = self._run_process(preped_data_dict, op_info_prefix, skip_process_dict, logid_dict)
end = profiler.record("midp#{}_1".format(op_info_prefix)) end = profiler.record("midp#{}_1".format(op_info_prefix))
midp_time = end - start midp_time = end - start
try: try:
...@@ -745,8 +1123,7 @@ class Op(object): ...@@ -745,8 +1123,7 @@ class Op(object):
# postprocess # postprocess
start = profiler.record("postp#{}_0".format(op_info_prefix)) start = profiler.record("postp#{}_0".format(op_info_prefix))
postped_data_dict, err_channeldata_dict \ postped_data_dict, err_channeldata_dict \
= self._run_postprocess( = self._run_postprocess(parsed_data_dict, midped_data_dict, op_info_prefix, logid_dict)
parsed_data_dict, midped_data_dict, op_info_prefix)
end = profiler.record("postp#{}_1".format(op_info_prefix)) end = profiler.record("postp#{}_1".format(op_info_prefix))
postp_time = end - start postp_time = end - start
try: try:
...@@ -801,16 +1178,28 @@ class Op(object): ...@@ -801,16 +1178,28 @@ class Op(object):
except Queue.Full: except Queue.Full:
break break
def _initialize(self, is_thread_op, client_type, concurrency_idx): def _initialize(self, is_thread_op, concurrency_idx):
"""
Initialize one OP object in the target function of a thread or process.
Initialize the client object with _client_config and _server_endpoints.
Create a TimeProfiler per thread or process for recording profiler info.
Args:
is_thread_op: True, one op runs in one thread; False, one op runs
in one process.
concurrency_idx: process id; thread mode does not use this param.
Returns:
TimeProfiler
"""
if is_thread_op: if is_thread_op:
with self._for_init_op_lock: with self._for_init_op_lock:
if not self._succ_init_op: if not self._succ_init_op:
# for the threaded version of Op, each thread cannot get its concurrency_idx # for the threaded version of Op, each thread cannot get its concurrency_idx
self.concurrency_idx = None self.concurrency_idx = None
# init client # init client
self.client = self.init_client( self.client = self.init_client(self._client_config,
client_type, self._client_config, self._server_endpoints)
self._server_endpoints, self._fetch_names)
# user defined # user defined
self.init_op() self.init_op()
self._succ_init_op = True self._succ_init_op = True
...@@ -818,9 +1207,8 @@ class Op(object): ...@@ -818,9 +1207,8 @@ class Op(object):
else: else:
self.concurrency_idx = concurrency_idx self.concurrency_idx = concurrency_idx
# init client # init client
self.client = self.init_client(client_type, self._client_config, self.client = self.init_client(self._client_config,
self._server_endpoints, self._server_endpoints)
self._fetch_names)
# user defined # user defined
self.init_op() self.init_op()
...@@ -843,9 +1231,17 @@ class Op(object): ...@@ -843,9 +1231,17 @@ class Op(object):
class RequestOp(Op): class RequestOp(Op):
""" RequestOp do not run preprocess, process, postprocess. """ """
RequestOp is a special Op for unpacking one request package. If the
request needs a special unpacking method, you need to inherit class
RequestOp and override unpack_request_package. Note that class
RequestOp does not run preprocess, process or postprocess.
"""
def __init__(self): def __init__(self):
"""
Initialize the RequestOp
"""
# PipelineService.name = "@DAGExecutor" # PipelineService.name = "@DAGExecutor"
super(RequestOp, self).__init__(name="@DAGExecutor", input_ops=[]) super(RequestOp, self).__init__(name="@DAGExecutor", input_ops=[])
# init op # init op
...@@ -856,7 +1252,25 @@ class RequestOp(Op): ...@@ -856,7 +1252,25 @@ class RequestOp(Op):
os._exit(-1) os._exit(-1)
def unpack_request_package(self, request): def unpack_request_package(self, request):
dictdata = {} """
Unpack request package by gateway.proto
Args:
request: HTTP body, JSON format
Returns:
dict_data: json fields in HTTP body
log_id: log_id
prod_errcode: None or ProductErrCode.SUCC.value by default; otherwise
a product error occurred. It is handled in the same way
as an exception.
prod_errinfo: "" by default
"""
dict_data = {}
log_id = None
if request is None:
_LOGGER.critical("request is None")
raise ValueError("request is None")
for idx, key in enumerate(request.key): for idx, key in enumerate(request.key):
data = request.value[idx] data = request.value[idx]
try: try:
...@@ -865,14 +1279,27 @@ class RequestOp(Op): ...@@ -865,14 +1279,27 @@ class RequestOp(Op):
data = evaled_data data = evaled_data
except Exception as e: except Exception as e:
pass pass
dictdata[key] = data dict_data[key] = data
return dictdata log_id = request.logid
_LOGGER.info("RequestOp unpack one request. log_id:{}, clientip:{} \
name:{}, method:{}".format(log_id, request.clientip, request.name,
request.method))
return dict_data, log_id, None, ""
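As the class docstring explains, a custom request format is handled by subclassing RequestOp and overriding unpack_request_package with the same four-value return. Below is a minimal sketch under that contract; treating every value field as a JSON string is an illustrative choice, and the import path may differ.

```python
import json

from paddle_serving_server.pipeline import RequestOp  # import path may differ between releases


class JsonRequestOp(RequestOp):
    def unpack_request_package(self, request):
        dict_data = {}
        for idx, key in enumerate(request.key):
            # assume each value field carries a JSON string (illustrative choice)
            dict_data[key] = json.loads(request.value[idx])
        log_id = request.logid
        return dict_data, log_id, None, ""  # prod_errcode=None, prod_errinfo=""
```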
class ResponseOp(Op): class ResponseOp(Op):
""" ResponseOp do not run preprocess, process, postprocess. """ """
ResponseOp is a special Op for packing one response package. If the channeldata
needs a special packing method, you need to inherit class ResponseOp and override
the pack_response_package function. Note that class ResponseOp does not run
preprocess, process or postprocess.
"""
def __init__(self, input_ops): def __init__(self, input_ops):
"""
Initialize the ResponseOp
"""
super(ResponseOp, self).__init__( super(ResponseOp, self).__init__(
name="@DAGExecutor", input_ops=input_ops) name="@DAGExecutor", input_ops=input_ops)
# init op # init op
...@@ -884,9 +1311,21 @@ class ResponseOp(Op): ...@@ -884,9 +1311,21 @@ class ResponseOp(Op):
os._exit(-1) os._exit(-1)
def pack_response_package(self, channeldata): def pack_response_package(self, channeldata):
"""
Get channeldata from the last channel and pack the response
package serialized by protobuf.
Args:
channeldata: Type ChannelData
Returns:
resp: pipeline_service_pb2.Response()
"""
resp = pipeline_service_pb2.Response() resp = pipeline_service_pb2.Response()
resp.ecode = channeldata.ecode error_code = channeldata.error_code
if resp.ecode == ChannelDataEcode.OK.value: error_info = ""
if error_code == ChannelDataErrcode.OK.value:
# Framework level errors
if channeldata.datatype == ChannelDataType.CHANNEL_NPDATA.value: if channeldata.datatype == ChannelDataType.CHANNEL_NPDATA.value:
feed = channeldata.parse() feed = channeldata.parse()
# ndarray to string: # ndarray to string:
...@@ -899,30 +1338,57 @@ class ResponseOp(Op): ...@@ -899,30 +1338,57 @@ class ResponseOp(Op):
feed = channeldata.parse() feed = channeldata.parse()
for name, var in feed.items(): for name, var in feed.items():
if not isinstance(var, str): if not isinstance(var, str):
resp.ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
resp.error_info = self._log( error_info = self._log(
"fetch var type must be str({}).".format( "fetch var type must be str({}).".format(
type(var))) type(var)))
_LOGGER.error("(logid={}) Failed to pack RPC " _LOGGER.error("(logid={}) Failed to pack RPC "
"response package: {}".format( "response package: {}".format(
channeldata.id, resp.error_info)) channeldata.id, resp.err_msg))
break break
resp.value.append(var) resp.value.append(var)
resp.key.append(name) resp.key.append(name)
else: else:
resp.ecode = ChannelDataEcode.TYPE_ERROR.value error_code = ChannelDataErrcode.TYPE_ERROR.value
resp.error_info = self._log( error_info = self._log("error type({}) in datatype.".format(
"error type({}) in datatype.".format(channeldata.datatype)) channeldata.datatype))
_LOGGER.error("(logid={}) Failed to pack RPC response" _LOGGER.error("(logid={}) Failed to pack RPC response"
" package: {}".format(channeldata.id, " package: {}".format(channeldata.id, error_info))
resp.error_info))
else: else:
resp.error_info = channeldata.error_info # Product level errors
error_info = channeldata.error_info
if error_code == ChannelDataErrcode.PRODUCT_ERROR.value:
# rewrite error_code when product errors occurred
error_code = channeldata.prod_error_code
error_info = channeldata.prod_error_info
# pack results
if error_code is None:
error_code = 0
resp.err_no = error_code
resp.err_msg = error_info
return resp return resp
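Correspondingly, the response format can be customized by subclassing ResponseOp and overriding pack_response_package to fill the new err_no/err_msg/key/value fields. The sketch below is an illustration only; the proto import path is an assumption and serializing the whole result as one JSON value is an arbitrary choice.

```python
import json

from paddle_serving_server.pipeline import ResponseOp  # import path may differ between releases
from paddle_serving_server.pipeline.proto import pipeline_service_pb2  # assumed module path


class JsonResponseOp(ResponseOp):
    def pack_response_package(self, channeldata):
        resp = pipeline_service_pb2.Response()
        resp.err_no = channeldata.error_code or 0
        resp.err_msg = channeldata.error_info or ""
        if resp.err_no == 0:
            # pack the whole parsed result into a single key/value pair
            resp.key.append("result")
            resp.value.append(json.dumps(channeldata.parse(), default=str))
        return resp
```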
class VirtualOp(Op): class VirtualOp(Op):
''' For connecting two channels. ''' """
To connect two ops across levels in the dag view, we create virtual ops
between non-virtual ops and only transfer data through them. For example,
the pred ops of F are D & E. In the process of building the DAG, we
create channels layer by layer according to the dag views. Op F is not
in the next layer view of [B, E], so we create a virtual op 'V1' whose
pred op is E. And so on, we create two more virtual ops 'V2' and 'V3'
until we reach the non-virtual op F. We create 4 channels among E, V1,
V2, V3 and F; the producer of V1, V2, V3 and F is E.
DAG: [A -> B -> C -> D -> F]
\-> E ----------/
DAG view: [[A], [B, E], [C], [D], [F]]
BUILD DAG: [A -> B -> C -> D -> E -> F]
\-> E -> V1-> V2-> V3/
"""
def __init__(self, name, concurrency=1): def __init__(self, name, concurrency=1):
super(VirtualOp, self).__init__( super(VirtualOp, self).__init__(
...@@ -930,9 +1396,27 @@ class VirtualOp(Op): ...@@ -930,9 +1396,27 @@ class VirtualOp(Op):
self._virtual_pred_ops = [] self._virtual_pred_ops = []
def add_virtual_pred_op(self, op): def add_virtual_pred_op(self, op):
"""
Add the front op of the current virtual op.
Args:
op: one op object, may be a virtual op or not.
Returns:
None
"""
self._virtual_pred_ops.append(op) self._virtual_pred_ops.append(op)
def _actual_pred_op_names(self, op): def _actual_pred_op_names(self, op):
"""
Recursively find the front op which is a non-virtual op.
Args:
op: one op object
Returns:
names: the names of the non-virtual pred ops.
"""
# can use disjoint-set, but it's not necessary # can use disjoint-set, but it's not necessary
if not isinstance(op, VirtualOp): if not isinstance(op, VirtualOp):
return [op.name] return [op.name]
...@@ -942,6 +1426,15 @@ class VirtualOp(Op): ...@@ -942,6 +1426,15 @@ class VirtualOp(Op):
return names return names
def add_output_channel(self, channel): def add_output_channel(self, channel):
"""
Add the output channel of the non-virtual pred ops.
Args:
channel: one channel.
Returns:
None.
"""
if not isinstance(channel, (ThreadChannel, ProcessChannel)): if not isinstance(channel, (ThreadChannel, ProcessChannel)):
_LOGGER.critical( _LOGGER.critical(
self._log("Failed to add output_channel: output_channel" self._log("Failed to add output_channel: output_channel"
...@@ -955,6 +1448,20 @@ class VirtualOp(Op): ...@@ -955,6 +1448,20 @@ class VirtualOp(Op):
def _run(self, concurrency_idx, input_channel, output_channels, client_type, def _run(self, concurrency_idx, input_channel, output_channels, client_type,
is_thread_op): is_thread_op):
"""
The target function _run() only transfers data between OPs in one thread
or process.
Args:
concurrency_idx: process id, not available in thread mode.
input_channel: input channel
output_channels: output channels
client_type: unused
is_thread_op: True, thread mode; False, process mode
Returns:
None
"""
op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx) op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
log = get_log_func(op_info_prefix) log = get_log_func(op_info_prefix)
tid = threading.current_thread().ident tid = threading.current_thread().ident
......
...@@ -18,14 +18,20 @@ import numpy as np ...@@ -18,14 +18,20 @@ import numpy as np
from numpy import * from numpy import *
import logging import logging
import functools import functools
from .channel import ChannelDataEcode import json
import socket
from .channel import ChannelDataErrcode
from .proto import pipeline_service_pb2 from .proto import pipeline_service_pb2
from .proto import pipeline_service_pb2_grpc from .proto import pipeline_service_pb2_grpc
import six
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
class PipelineClient(object): class PipelineClient(object):
"""
PipelineClient provides the basic capabilities of the pipeline SDK
"""
def __init__(self): def __init__(self):
self._channel = None self._channel = None
self._profile_key = "pipeline.profile" self._profile_key = "pipeline.profile"
...@@ -42,13 +48,38 @@ class PipelineClient(object): ...@@ -42,13 +48,38 @@ class PipelineClient(object):
def _pack_request_package(self, feed_dict, profile): def _pack_request_package(self, feed_dict, profile):
req = pipeline_service_pb2.Request() req = pipeline_service_pb2.Request()
logid = feed_dict.get("logid")
if logid is None:
req.logid = 0
else:
if sys.version_info.major == 2:
req.logid = long(logid)
elif sys.version_info.major == 3:
req.logid = int(logid)
feed_dict.pop("logid")
clientip = feed_dict.get("clientip")
if clientip is None:
hostname = socket.gethostname()
ip = socket.gethostbyname(hostname)
req.clientip = ip
else:
req.clientip = clientip
feed_dict.pop("clientip")
np.set_printoptions(threshold=sys.maxsize) np.set_printoptions(threshold=sys.maxsize)
for key, value in feed_dict.items(): for key, value in feed_dict.items():
req.key.append(key) req.key.append(key)
if (sys.version_info.major == 2 and isinstance(value,
(str, unicode)) or
((sys.version_info.major == 3) and isinstance(value, str))):
req.value.append(value)
continue
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
req.value.append(value.__repr__()) req.value.append(value.__repr__())
elif isinstance(value, (str, unicode)):
req.value.append(value)
elif isinstance(value, list): elif isinstance(value, list):
req.value.append(np.array(value).__repr__()) req.value.append(np.array(value).__repr__())
else: else:
...@@ -60,29 +91,7 @@ class PipelineClient(object): ...@@ -60,29 +91,7 @@ class PipelineClient(object):
return req return req
def _unpack_response_package(self, resp, fetch): def _unpack_response_package(self, resp, fetch):
if resp.ecode != 0: return resp
return {
"ecode": resp.ecode,
"ecode_desc": ChannelDataEcode(resp.ecode),
"error_info": resp.error_info,
}
fetch_map = {"ecode": resp.ecode}
for idx, key in enumerate(resp.key):
if key == self._profile_key:
if resp.value[idx] != "":
sys.stderr.write(resp.value[idx])
continue
if fetch is not None and key not in fetch:
continue
data = resp.value[idx]
try:
evaled_data = eval(data)
if isinstance(evaled_data, np.ndarray):
data = evaled_data
except Exception as e:
pass
fetch_map[key] = data
return fetch_map
def predict(self, feed_dict, fetch=None, asyn=False, profile=False): def predict(self, feed_dict, fetch=None, asyn=False, profile=False):
if not isinstance(feed_dict, dict): if not isinstance(feed_dict, dict):
......
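For reference, a minimal sketch of calling the pipeline service with PipelineClient, including the new optional logid/clientip keys packed by _pack_request_package above; the endpoint, port and feed/fetch names are assumptions, and after this change predict() returns the raw Response message carrying err_no/err_msg.

```python
import numpy as np

from paddle_serving_server.pipeline import PipelineClient  # import path may differ between releases

client = PipelineClient()
client.connect(["127.0.0.1:9998"])  # hypothetical rpc_port from config.yaml

feed = {
    "x": np.array([0.1, 0.2, 0.3], dtype="float32"),  # "x" is a hypothetical feed key
    "logid": 1001,          # optional: traced end to end as log_id
    "clientip": "1.2.3.4",  # optional: defaults to the local ip
}
resp = client.predict(feed_dict=feed, fetch=["price"])  # "price" is a hypothetical fetch name
print(resp.err_no, resp.err_msg)  # the response now carries err_no/err_msg
```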
...@@ -32,6 +32,10 @@ _LOGGER = logging.getLogger(__name__) ...@@ -32,6 +32,10 @@ _LOGGER = logging.getLogger(__name__)
class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer): class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer):
"""
Pipeline Servicer entrance.
"""
def __init__(self, name, response_op, dag_conf, worker_idx=-1): def __init__(self, name, response_op, dag_conf, worker_idx=-1):
super(PipelineServicer, self).__init__() super(PipelineServicer, self).__init__()
self._name = name self._name = name
...@@ -42,10 +46,16 @@ class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer): ...@@ -42,10 +46,16 @@ class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer):
_LOGGER.info("[PipelineServicer] succ init") _LOGGER.info("[PipelineServicer] succ init")
def inference(self, request, context): def inference(self, request, context):
_LOGGER.info("(log_id={}) inference request name:{} self.name:{}".
format(request.logid, request.name, self._name))
if request.name != "" and request.name != self._name: if request.name != "" and request.name != self._name:
_LOGGER.error("(log_id={}) name dismatch error. request.name:{},"
"server.name={}".format(request.logid, request.name,
self._name))
resp = pipeline_service_pb2.Response() resp = pipeline_service_pb2.Response()
resp.ecode = channel.ChannelDataEcode.NO_SERVICE.value resp.err_no = channel.ChannelDataErrcode.NO_SERVICE.value
resp.error_info = "Failed to inference: Service name error." resp.err_msg = "Failed to inference: Service name error."
resp.result = ""
return resp return resp
resp = self._dag_executor.call(request) resp = self._dag_executor.call(request)
return resp return resp
...@@ -53,7 +63,9 @@ class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer): ...@@ -53,7 +63,9 @@ class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer):
@contextlib.contextmanager @contextlib.contextmanager
def _reserve_port(port): def _reserve_port(port):
"""Find and reserve a port for all subprocesses to use.""" """
Find and reserve a port for all subprocesses to use.
"""
sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0: if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
...@@ -66,6 +78,10 @@ def _reserve_port(port): ...@@ -66,6 +78,10 @@ def _reserve_port(port):
class PipelineServer(object): class PipelineServer(object):
"""
Pipeline Server : grpc gateway + grpc server.
"""
def __init__(self, name=None): def __init__(self, name=None):
self._name = name # for grpc-gateway path self._name = name # for grpc-gateway path
self._rpc_port = None self._rpc_port = None
...@@ -74,6 +90,16 @@ class PipelineServer(object): ...@@ -74,6 +90,16 @@ class PipelineServer(object):
self._proxy_server = None self._proxy_server = None
def _grpc_gateway(self, grpc_port, http_port): def _grpc_gateway(self, grpc_port, http_port):
"""
Running a gateway server, linking libproxy_server.so
Args:
grpc_port: GRPC port
http_port: HTTP port
Returns:
None
"""
import os import os
from ctypes import cdll from ctypes import cdll
from . import gateway from . import gateway
...@@ -83,6 +109,17 @@ class PipelineServer(object): ...@@ -83,6 +109,17 @@ class PipelineServer(object):
proxy_server.run_proxy_server(grpc_port, http_port) proxy_server.run_proxy_server(grpc_port, http_port)
def _run_grpc_gateway(self, grpc_port, http_port): def _run_grpc_gateway(self, grpc_port, http_port):
"""
Start the GRPC gateway in a new process, exposing one available
HTTP port outside and forwarding the data to the RPC port.
Args:
grpc_port: GRPC port
http_port: HTTP port
Returns:
None
"""
if http_port <= 0: if http_port <= 0:
_LOGGER.info("Ignore grpc_gateway configuration.") _LOGGER.info("Ignore grpc_gateway configuration.")
return return
...@@ -99,6 +136,15 @@ class PipelineServer(object): ...@@ -99,6 +136,15 @@ class PipelineServer(object):
self._proxy_server.start() self._proxy_server.start()
def set_response_op(self, response_op): def set_response_op(self, response_op):
"""
Set the response OP.
Args:
response_op: ResponseOp or its subclass object
Returns:
None
"""
if not isinstance(response_op, operator.ResponseOp): if not isinstance(response_op, operator.ResponseOp):
raise Exception("Failed to set response_op: response_op " raise Exception("Failed to set response_op: response_op "
"must be ResponseOp type.") "must be ResponseOp type.")
...@@ -109,6 +155,17 @@ class PipelineServer(object): ...@@ -109,6 +155,17 @@ class PipelineServer(object):
self._used_op, _ = dag.DAG.get_use_ops(self._response_op) self._used_op, _ = dag.DAG.get_use_ops(self._response_op)
def prepare_server(self, yml_file=None, yml_dict=None): def prepare_server(self, yml_file=None, yml_dict=None):
"""
Read the configuration from the yml file (config.yaml) and launch
local services.
Args:
yml_file: path of the yaml config file
yml_dict: configuration as a yaml dict
Returns:
None
"""
conf = ServerYamlConfChecker.load_server_yaml_conf( conf = ServerYamlConfChecker.load_server_yaml_conf(
yml_file=yml_file, yml_dict=yml_dict) yml_file=yml_file, yml_dict=yml_dict)
...@@ -158,6 +215,15 @@ class PipelineServer(object): ...@@ -158,6 +215,15 @@ class PipelineServer(object):
self._start_local_rpc_service() self._start_local_rpc_service()
def _init_ops(self, op_conf): def _init_ops(self, op_conf):
"""
Initialize all OPs from the op config dictionary.
Args:
op_conf: the op config in the yaml dict.
Returns:
None.
"""
default_conf = { default_conf = {
"concurrency": 1, "concurrency": 1,
"timeout": -1, "timeout": -1,
...@@ -187,12 +253,22 @@ class PipelineServer(object): ...@@ -187,12 +253,22 @@ class PipelineServer(object):
op.launch_local_rpc_service() op.launch_local_rpc_service()
def run_server(self): def run_server(self):
"""
If _build_dag_each_worker is True, starting _worker_num processes, each
running its own GRPC server. Otherwise, starting a single GRPC
server.
Args:
None
Returns:
None
"""
if self._build_dag_each_worker: if self._build_dag_each_worker:
with _reserve_port(self._rpc_port) as port: with _reserve_port(self._rpc_port) as port:
bind_address = 'localhost:{}'.format(port) bind_address = 'localhost:{}'.format(port)
workers = [] workers = []
for i in range(self._worker_num): for i in range(self._worker_num):
show_info = (i == 0)
worker = multiprocessing.Process( worker = multiprocessing.Process(
target=self._run_server_func, target=self._run_server_func,
args=(bind_address, self._response_op, self._conf, i)) args=(bind_address, self._response_op, self._conf, i))
...@@ -220,6 +296,15 @@ class PipelineServer(object): ...@@ -220,6 +296,15 @@ class PipelineServer(object):
server.wait_for_termination() server.wait_for_termination()
def _run_server_func(self, bind_address, response_op, dag_conf, worker_idx): def _run_server_func(self, bind_address, response_op, dag_conf, worker_idx):
"""
Running one GRPC server with PipelineServicer.
Args:
bind_address: binding IP/Port
response_op: ResponseOp or its subclass object
dag_conf: DAG config
worker_idx: Process index.
"""
options = [('grpc.so_reuseport', 1), options = [('grpc.so_reuseport', 1),
('grpc.max_send_message_length', 256 * 1024 * 1024), ('grpc.max_send_message_length', 256 * 1024 * 1024),
('grpc.max_send_message_length', 256 * 1024 * 1024)] ('grpc.max_send_message_length', 256 * 1024 * 1024)]
...@@ -235,6 +320,10 @@ class PipelineServer(object): ...@@ -235,6 +320,10 @@ class PipelineServer(object):
class ServerYamlConfChecker(object): class ServerYamlConfChecker(object):
"""
Checking the validity of the server yaml configuration.
"""
def __init__(self): def __init__(self):
pass pass
...@@ -244,7 +333,7 @@ class ServerYamlConfChecker(object): ...@@ -244,7 +333,7 @@ class ServerYamlConfChecker(object):
raise SystemExit("Failed to prepare_server: only one of yml_file" raise SystemExit("Failed to prepare_server: only one of yml_file"
" or yml_dict can be selected as the parameter.") " or yml_dict can be selected as the parameter.")
if yml_file is not None: if yml_file is not None:
- with open(yml_file) as f:
+ with open(yml_file, encoding='utf-8') as f:
conf = yaml.load(f.read()) conf = yaml.load(f.read())
elif yml_dict is not None: elif yml_dict is not None:
conf = yml_dict conf = yml_dict
......
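Taken together, the docstrings above describe the server-side flow: build a DAG of OPs ending in a ResponseOp, hand that op to the server, load the yaml configuration, and start serving. A minimal usage sketch follows; only `set_response_op`, `prepare_server` and `run_server` appear in this diff, while the import path and the `RequestOp`/`ResponseOp` wiring follow the pipeline examples shipped with the package, and the yml keys are illustrative.

```python
# Sketch only. A real pipeline would insert user-defined Op subclasses between
# the RequestOp and the ResponseOp; this degenerate DAG simply echoes the input.
from paddle_serving_server.pipeline import PipelineServer, RequestOp, ResponseOp

read_op = RequestOp()                          # unpacks the incoming Request
response_op = ResponseOp(input_ops=[read_op])  # packs err_no/err_msg/key/value

server = PipelineServer(name="demo")           # `name` is used for the grpc-gateway path
server.set_response_op(response_op)            # must be a ResponseOp, per the check above
server.prepare_server(yml_dict={               # exactly one of yml_file / yml_dict
    "rpc_port": 18080,                         # values mirror the config.yml in the CI script below
    "worker_num": 4,
    "build_dag_each_worker": False,            # True forks worker_num servers sharing the RPC port
})
server.run_server()
```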
...@@ -19,13 +19,16 @@ message Request { ...@@ -19,13 +19,16 @@ message Request {
repeated string key = 1; repeated string key = 1;
repeated string value = 2; repeated string value = 2;
optional string name = 3; optional string name = 3;
optional string method = 4;
optional int64 logid = 5;
optional string clientip = 6;
}; };
message Response { message Response {
- repeated string key = 1;
- repeated string value = 2;
- required int32 ecode = 3;
- optional string error_info = 4;
+ optional int32 err_no = 1;
+ optional string err_msg = 2;
+ repeated string key = 3;
+ repeated string value = 4;
}; };
service PipelineService { service PipelineService {
......
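The reworked messages move error reporting into dedicated `err_no`/`err_msg` fields and let a caller attach a method name, a log id and the client IP to every request. A hedged sketch of filling these fields through the generated Python classes; the module path `paddle_serving_server.pipeline.proto.pipeline_service_pb2` is an assumption, so substitute whatever name protoc produced in your build.

```python
# Illustrative only; field names are taken from the .proto diff above.
from paddle_serving_server.pipeline.proto import pipeline_service_pb2 as pb

req = pb.Request()
req.key.extend(["words"])                 # parallel key/value arrays carry the feed data
req.value.extend(["i am very sad | 0"])
req.name = "demo"                         # pipeline name
req.method = "prediction"                 # new in this commit
req.logid = 10000                         # new: request-scoped id for log correlation
req.clientip = "127.0.0.1"                # new: original client address

resp = pb.Response()
resp.err_no = 0                           # 0 is treated as success
resp.err_msg = ""                         # human-readable error description
resp.key.append("prediction")             # fetch keys ...
resp.value.append("[0.99]")               # ... and their serialized values
```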
numpy>=1.12, <=1.16.4 ; python_version<"3.5"
shapely==1.7.0
wheel>=0.34.0, <0.35.0
setuptools>=44.1.0
opencv-python==4.2.0.32
google>=2.0.3
opencv-python==4.2.0.32
protobuf>=3.12.2
grpcio-tools>=1.33.2
grpcio>=1.33.2
func-timeout>=4.3.5
pyyaml>=1.3.0
sentencepiece==0.1.83
flask>=1.1.2
ujson>=2.0.3
sphinx==2.1.0 sphinx==2.1.0
mistune mistune
sphinx_rtd_theme sphinx_rtd_theme
- paddlepaddle>=1.6
+ paddlepaddle>=1.8.4
shapely
...@@ -32,8 +32,8 @@ if '${PACK}' == 'ON': ...@@ -32,8 +32,8 @@ if '${PACK}' == 'ON':
REQUIRED_PACKAGES = [ REQUIRED_PACKAGES = [
- 'six >= 1.10.0', 'sentencepiece', 'opencv-python<=4.2.0.32', 'pillow',
- 'shapely<=1.6.1', 'pyclipper'
+ 'six >= 1.10.0', 'sentencepiece<=0.1.92', 'opencv-python<=4.2.0.32', 'pillow',
+ 'pyclipper'
] ]
packages=['paddle_serving_app', packages=['paddle_serving_app',
......
...@@ -28,17 +28,11 @@ import util ...@@ -28,17 +28,11 @@ import util
py_version = sys.version_info py_version = sys.version_info
def copy_lib(): def copy_lib():
if py_version[0] == 2:
lib_list = ['libpython2.7.so.1.0', 'libssl.so.10', 'libcrypto.so.10']
elif py_version[1] == 6:
lib_list = ['libpython3.6m.so.1.0', 'libssl.so.10', 'libcrypto.so.10']
elif py_version[1] == 7:
lib_list = ['libpython3.7m.so.1.0', 'libssl.so.10', 'libcrypto.so.10']
os.popen('mkdir -p paddle_serving_client/lib') os.popen('mkdir -p paddle_serving_client/lib')
lib_list = ['${OPENSSL_CRYPTO_LIBRARY}', '${OPENSSL_SSL_LIBRARY}',
'${PYTHON_LIBRARY}']
for lib in lib_list: for lib in lib_list:
- r = os.popen('whereis {}'.format(lib))
- text = r.read()
- os.popen('cp {} ./paddle_serving_client/lib'.format(text.strip().split(' ')[1]))
+ os.popen('cp {} ./paddle_serving_client/lib'.format(lib))
max_version, mid_version, min_version = util.python_version() max_version, mid_version, min_version = util.python_version()
...@@ -49,13 +43,10 @@ if '${PACK}' == 'ON': ...@@ -49,13 +43,10 @@ if '${PACK}' == 'ON':
copy_lib() copy_lib()
REQUIRED_PACKAGES = [ REQUIRED_PACKAGES = [
- 'six >= 1.10.0', 'protobuf >= 3.11.0', 'numpy >= 1.12', 'grpcio >= 1.28.1',
- 'grpcio-tools >= 1.28.1'
+ 'six >= 1.10.0', 'protobuf >= 3.11.0', 'numpy >= 1.12', 'grpcio <= 1.33.2',
+ 'grpcio-tools <= 1.33.2'
] ]
if not util.find_package("paddlepaddle") and not util.find_package("paddlepaddle-gpu"):
REQUIRED_PACKAGES.append("paddlepaddle")
packages=['paddle_serving_client', packages=['paddle_serving_client',
'paddle_serving_client.proto', 'paddle_serving_client.proto',
......
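For readability, here is the post-change `copy_lib` reassembled from the fragments above. Instead of probing for per-Python-version library names with `whereis`, CMake now substitutes the exact OpenSSL and Python library paths at configure time and the function simply copies them into the package:

```python
import os  # already imported at the top of the real setup script

def copy_lib():
    os.popen('mkdir -p paddle_serving_client/lib')
    # The ${...} placeholders are filled in by CMake with absolute paths,
    # e.g. /usr/lib64/libcrypto.so and the interpreter's libpython*.so.
    lib_list = ['${OPENSSL_CRYPTO_LIBRARY}', '${OPENSSL_SSL_LIBRARY}',
                '${PYTHON_LIBRARY}']
    for lib in lib_list:
        os.popen('cp {} ./paddle_serving_client/lib'.format(lib))
```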
...@@ -28,8 +28,8 @@ max_version, mid_version, min_version = util.python_version() ...@@ -28,8 +28,8 @@ max_version, mid_version, min_version = util.python_version()
util.gen_pipeline_code("paddle_serving_server") util.gen_pipeline_code("paddle_serving_server")
REQUIRED_PACKAGES = [ REQUIRED_PACKAGES = [
- 'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1',
- 'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app'
+ 'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
+ 'flask >= 1.1.1', 'func_timeout', 'pyyaml'
] ]
packages=['paddle_serving_server', packages=['paddle_serving_server',
......
...@@ -19,17 +19,19 @@ from __future__ import print_function ...@@ -19,17 +19,19 @@ from __future__ import print_function
from setuptools import setup, Distribution, Extension from setuptools import setup, Distribution, Extension
from setuptools import find_packages from setuptools import find_packages
from setuptools import setup from setuptools import setup
- from paddle_serving_server_gpu.version import serving_server_version
+ from paddle_serving_server_gpu.version import serving_server_version, cuda_version
import util import util
- max_version, mid_version, min_version = util.python_version()
+ if cuda_version != "trt":
+     cuda_version = "post" + cuda_version
+ max_version, mid_version, min_version = util.python_version()
# gen pipeline proto code # gen pipeline proto code
util.gen_pipeline_code("paddle_serving_server_gpu") util.gen_pipeline_code("paddle_serving_server_gpu")
REQUIRED_PACKAGES = [ REQUIRED_PACKAGES = [
- 'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1',
- 'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app'
+ 'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
+ 'flask >= 1.1.1', 'func_timeout', 'pyyaml'
] ]
packages=['paddle_serving_server_gpu', packages=['paddle_serving_server_gpu',
...@@ -56,7 +58,7 @@ package_data={'paddle_serving_server_gpu': ['pipeline/gateway/libproxy_server.so ...@@ -56,7 +58,7 @@ package_data={'paddle_serving_server_gpu': ['pipeline/gateway/libproxy_server.so
setup( setup(
name='paddle-serving-server-gpu', name='paddle-serving-server-gpu',
- version=serving_server_version.replace('-', '') + '.post@CUDA_VERSION_MAJOR@',
+ version=serving_server_version.replace('-', '') + "." + cuda_version,
description= description=
('Paddle Serving Package for saved model with PaddlePaddle'), ('Paddle Serving Package for saved model with PaddlePaddle'),
url='https://github.com/PaddlePaddle/Serving', url='https://github.com/PaddlePaddle/Serving',
......
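The effect of the new `cuda_version` handling on the published wheel version, as a small illustration of the string it produces (the `0.4.0` base value is only an example; the real value comes from the generated version module):

```python
# Illustration of the suffix logic above, not code from the repository.
def wheel_version(serving_server_version, cuda_version):
    if cuda_version != "trt":
        cuda_version = "post" + cuda_version
    return serving_server_version.replace('-', '') + "." + cuda_version

print(wheel_version("0.4.0", "10"))   # -> 0.4.0.post10 (CUDA 10.0 build)
print(wheel_version("0.4.0", "trt"))  # -> 0.4.0.trt    (TensorRT build)
```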
...@@ -44,8 +44,8 @@ def gen_pipeline_code(package_name): ...@@ -44,8 +44,8 @@ def gen_pipeline_code(package_name):
ret = os.system( ret = os.system(
"cd {}/pipeline/gateway/proto/ && " "cd {}/pipeline/gateway/proto/ && "
"../../../../../third_party/install/protobuf/bin/protoc -I. " "../../../../../third_party/install/protobuf/bin/protoc -I. "
"-I$GOPATH/src " "-I$GOPATH/pkg/mod "
"-I$GOPATH/src/github.com/grpc-ecosystem/grpc-gateway/third_party/googleapis " "-I$GOPATH/pkg/mod/github.com/grpc-ecosystem/grpc-gateway\@v1.15.2/third_party/googleapis "
"--go_out=plugins=grpc:. " "--go_out=plugins=grpc:. "
"gateway.proto".format(package_name)) "gateway.proto".format(package_name))
if ret != 0: if ret != 0:
...@@ -54,14 +54,18 @@ def gen_pipeline_code(package_name): ...@@ -54,14 +54,18 @@ def gen_pipeline_code(package_name):
ret = os.system( ret = os.system(
"cd {}/pipeline/gateway/proto/ && " "cd {}/pipeline/gateway/proto/ && "
"../../../../../third_party/install/protobuf/bin/protoc -I. " "../../../../../third_party/install/protobuf/bin/protoc -I. "
"-I$GOPATH/src " "-I$GOPATH/pkg/mod "
"-I$GOPATH/src/github.com/grpc-ecosystem/grpc-gateway/third_party/googleapis " "-I$GOPATH/pkg/mod/github.com/grpc-ecosystem/grpc-gateway\@v1.15.2/third_party/googleapis "
"--grpc-gateway_out=logtostderr=true:. " "--grpc-gateway_out=logtostderr=true:. "
"gateway.proto".format(package_name)) "gateway.proto".format(package_name))
if ret != 0: if ret != 0:
exit(1) exit(1)
# pipeline grpc-gateway shared-lib # pipeline grpc-gateway shared-lib
ret = os.system("cd {}/pipeline/gateway/ && go mod init serving-gateway".
format(package_name))
ret = os.system("cd {}/pipeline/gateway/ && go mod vendor && go mod tidy".
format(package_name))
ret = os.system( ret = os.system(
"cd {}/pipeline/gateway && " "cd {}/pipeline/gateway && "
"go build -buildmode=c-shared -o libproxy_server.so proxy_server.go". "go build -buildmode=c-shared -o libproxy_server.so proxy_server.go".
......
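With GOPATH-style imports gone, stub generation and the gateway build now run against the Go module cache. A condensed, commented sketch of the same sequence the function executes; the relative protoc path and the pinned grpc-gateway version are copied from the diff, and this is illustrative rather than a supported entry point:

```python
import os

def build_pipeline_gateway(package_name):
    proto_dir = "{}/pipeline/gateway/proto".format(package_name)
    protoc = "../../../../../third_party/install/protobuf/bin/protoc"
    # Includes are resolved from the module cache ($GOPATH/pkg/mod) instead of $GOPATH/src.
    includes = ("-I. -I$GOPATH/pkg/mod "
                "-I$GOPATH/pkg/mod/github.com/grpc-ecosystem/grpc-gateway@v1.15.2/third_party/googleapis")

    # 1) Go gRPC stubs, then 2) grpc-gateway reverse-proxy code, both from gateway.proto.
    os.system("cd {} && {} {} --go_out=plugins=grpc:. gateway.proto"
              .format(proto_dir, protoc, includes))
    os.system("cd {} && {} {} --grpc-gateway_out=logtostderr=true:. gateway.proto"
              .format(proto_dir, protoc, includes))

    # 3) Create a throwaway module so the pinned dependencies resolve, then build
    #    proxy_server.go as a C shared library; the Python side loads it via ctypes.
    gateway_dir = "{}/pipeline/gateway".format(package_name)
    os.system("cd {} && go mod init serving-gateway".format(gateway_dir))
    os.system("cd {} && go mod vendor && go mod tidy".format(gateway_dir))
    os.system("cd {} && go build -buildmode=c-shared -o libproxy_server.so proxy_server.go"
              .format(gateway_dir))
```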
...@@ -41,6 +41,12 @@ RUN yum -y install wget && \ ...@@ -41,6 +41,12 @@ RUN yum -y install wget && \
echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \ echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \
source /root/.bashrc && \ source /root/.bashrc && \
cd .. && rm -rf Python-3.6.8* && \ cd .. && rm -rf Python-3.6.8* && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
tar zxf protobuf-all-3.11.2.tar.gz && \
cd protobuf-3.11.2 && \
./configure && make -j4 && make install && \
make clean && \
cd .. && rm -rf protobuf-* &&\
yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \ yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \
yum clean all && \ yum clean all && \
echo "export LANG=en_US.utf8" >> /root/.bashrc && \ echo "export LANG=en_US.utf8" >> /root/.bashrc && \
......
...@@ -41,6 +41,12 @@ RUN yum -y install wget && \ ...@@ -41,6 +41,12 @@ RUN yum -y install wget && \
echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \ echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \
source /root/.bashrc && \ source /root/.bashrc && \
cd .. && rm -rf Python-3.6.8* && \ cd .. && rm -rf Python-3.6.8* && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
tar zxf protobuf-all-3.11.2.tar.gz && \
cd protobuf-3.11.2 && \
./configure && make -j4 && make install && \
make clean && \
cd .. && rm -rf protobuf-* && \
yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \ yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \
yum clean all && \ yum clean all && \
localedef -c -i en_US -f UTF-8 en_US.UTF-8 && \ localedef -c -i en_US -f UTF-8 en_US.UTF-8 && \
......
...@@ -34,6 +34,13 @@ RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2 ...@@ -34,6 +34,13 @@ RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2
&& cd .. \ && cd .. \
&& rm -rf patchelf-0.10* && rm -rf patchelf-0.10*
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
tar zxf protobuf-all-3.11.2.tar.gz && \
cd protobuf-3.11.2 && \
./configure && make -j4 && make install && \
make clean && \
cd .. && rm -rf protobuf-*
RUN yum install -y python3 python3-devel RUN yum install -y python3 python3-devel
RUN yum -y update >/dev/null \ RUN yum -y update >/dev/null \
......
...@@ -7,6 +7,13 @@ RUN yum -y install wget >/dev/null \ ...@@ -7,6 +7,13 @@ RUN yum -y install wget >/dev/null \
&& yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \ && yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \
&& yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
tar zxf protobuf-all-3.11.2.tar.gz && \
cd protobuf-3.11.2 && \
./configure && make -j4 && make install && \
make clean && \
cd .. && rm -rf protobuf-*
RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \ RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
&& tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \ && tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \
&& mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \ && mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \
...@@ -32,3 +39,5 @@ RUN yum install -y python3 python3-devel \ ...@@ -32,3 +39,5 @@ RUN yum install -y python3 python3-devel \
RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \ RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
&& echo "export LANG=en_US.utf8" >> /root/.bashrc \ && echo "export LANG=en_US.utf8" >> /root/.bashrc \
&& echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc && echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc
FROM nvidia/cuda:10.1-cudnn7-devel-centos7
RUN export http_proxy="http://172.19.56.199:3128" \
&& export https_proxy="http://172.19.56.199:3128" \
&& yum -y install wget >/dev/null \
&& yum -y install gcc gcc-c++ make glibc-static which \
&& yum -y install git openssl-devel curl-devel bzip2-devel python-devel \
&& yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false \
&& yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \
&& yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false
RUN export http_proxy="http://172.19.56.199:3128" \
&& export https_proxy="http://172.19.56.199:3128" && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
tar zxf protobuf-all-3.11.2.tar.gz && \
cd protobuf-3.11.2 && \
./configure && make -j4 && make install && \
make clean && \
cd .. && rm -rf protobuf-*
RUN export http_proxy="http://172.19.56.199:3128" \
&& export https_proxy="http://172.19.56.199:3128" && \
wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
&& tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \
&& mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \
&& echo 'export PATH=/usr/local/cmake3.2.0/bin:$PATH' >> /root/.bashrc \
&& rm cmake-3.2.0-Linux-x86_64.tar.gz
RUN export http_proxy="http://172.19.56.199:3128" \
&& export https_proxy="http://172.19.56.199:3128" && \
wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
&& tar xzf go1.14.linux-amd64.tar.gz \
&& mv go /usr/local/go \
&& echo 'export GOROOT=/usr/local/go' >> /root/.bashrc \
&& echo 'export PATH=/usr/local/go/bin:$PATH' >> /root/.bashrc \
&& rm go1.14.linux-amd64.tar.gz
RUN export http_proxy="http://172.19.56.199:3128" \
&& export https_proxy="http://172.19.56.199:3128" && \
yum -y install python-devel sqlite-devel \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
&& python get-pip.py >/dev/null \
&& rm get-pip.py
RUN export http_proxy="http://172.19.56.199:3128" \
&& export https_proxy="http://172.19.56.199:3128" && \
yum install -y python3 python3-devel \
&& yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
&& yum clean all
RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
&& echo "export LANG=en_US.utf8" >> /root/.bashrc \
&& echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc
RUN wget https://paddle-serving.bj.bcebos.com/tools/TensorRT-6.0.1.5.CentOS-7.6.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz \
&& tar -xzf TensorRT-6.0.1.5.CentOS-7.6.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz \
&& mv TensorRT-6.0.1.5 /usr/local/ \
&& rm TensorRT-6.0.1.5.CentOS-7.6.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz \
&& echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/TensorRT-6.0.1.5/lib/' >> /root/.bashrc
...@@ -6,6 +6,13 @@ RUN yum -y install wget >/dev/null \ ...@@ -6,6 +6,13 @@ RUN yum -y install wget >/dev/null \
&& yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \ && yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \
&& yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
tar zxf protobuf-all-3.11.2.tar.gz && \
cd protobuf-3.11.2 && \
./configure && make -j4 && make install && \
make clean && \
cd .. && rm -rf protobuf-*
RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \ RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
&& tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \ && tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \
&& mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \ && mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \
......
...@@ -18,14 +18,20 @@ function init() { ...@@ -18,14 +18,20 @@ function init() {
export PYTHONROOT=/usr export PYTHONROOT=/usr
cd Serving cd Serving
export SERVING_WORKDIR=$PWD export SERVING_WORKDIR=$PWD
$PYTHONROOT/bin/python -m pip install -r python/requirements.txt $PYTHONROOT/bin/python -m pip install -r python/requirements.txt
$PYTHONROOT/bin/python -m pip install paddlepaddle
export GOPATH=$HOME/go export GOPATH=$HOME/go
export PATH=$PATH:$GOPATH/bin export PATH=$PATH:$GOPATH/bin
- go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway
- go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger
- go get -u github.com/golang/protobuf/protoc-gen-go
- go get -u google.golang.org/grpc
+ go env -w GO111MODULE=on
+ go env -w GOPROXY=https://goproxy.cn,direct
+ go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
+ go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
+ go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
+ go get -u google.golang.org/grpc@v1.33.0
} }
function check_cmd() { function check_cmd() {
...@@ -605,7 +611,7 @@ function python_test_grpc_impl() { ...@@ -605,7 +611,7 @@ function python_test_grpc_impl() {
# test load server config and client config in Server side # test load server config and client config in Server side
cd criteo_ctr_with_cube # pwd: /Serving/python/examples/grpc_impl_example/criteo_ctr_with_cube cd criteo_ctr_with_cube # pwd: /Serving/python/examples/grpc_impl_example/criteo_ctr_with_cube
<<COMMENT #comment for compile bug, todo fix conflict between grpc-gateway and cube-agent
check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz > /dev/null" check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz > /dev/null"
check_cmd "tar xf ctr_cube_unittest.tar.gz" check_cmd "tar xf ctr_cube_unittest.tar.gz"
check_cmd "mv models/ctr_client_conf ./" check_cmd "mv models/ctr_client_conf ./"
...@@ -626,9 +632,11 @@ function python_test_grpc_impl() { ...@@ -626,9 +632,11 @@ function python_test_grpc_impl() {
echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.67" echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.67"
exit 1 exit 1
fi fi
COMMENT
echo "grpc impl test success" echo "grpc impl test success"
kill_server_process kill_server_process
ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill #ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill
cd .. # pwd: /Serving/python/examples/grpc_impl_example cd .. # pwd: /Serving/python/examples/grpc_impl_example
;; ;;
...@@ -665,6 +673,7 @@ function python_test_grpc_impl() { ...@@ -665,6 +673,7 @@ function python_test_grpc_impl() {
cd .. # pwd: /Serving/python/examples/grpc_impl_example cd .. # pwd: /Serving/python/examples/grpc_impl_example
# test load server config and client config in Server side # test load server config and client config in Server side
<<COMMENT #comment for compile bug, todo fix conflict between grpc-gateway and cube-agent
cd criteo_ctr_with_cube # pwd: /Serving/python/examples/grpc_impl_example/criteo_ctr_with_cube cd criteo_ctr_with_cube # pwd: /Serving/python/examples/grpc_impl_example/criteo_ctr_with_cube
check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz" check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz"
...@@ -689,10 +698,11 @@ function python_test_grpc_impl() { ...@@ -689,10 +698,11 @@ function python_test_grpc_impl() {
echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.67" echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.67"
exit 1 exit 1
fi fi
COMMENT
echo "grpc impl test success" echo "grpc impl test success"
kill_server_process kill_server_process
ps -ef | grep "test_server_gpu" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill ps -ef | grep "test_server_gpu" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill
ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill #ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill
cd .. # pwd: /Serving/python/examples/grpc_impl_example cd .. # pwd: /Serving/python/examples/grpc_impl_example
;; ;;
*) *)
...@@ -829,8 +839,8 @@ EOF ...@@ -829,8 +839,8 @@ EOF
kill_process_by_port 18080 kill_process_by_port 18080
# test: process servicer & thread op # test: process servicer & thread op
- pip uninstall grpcio -y
- pip install grpcio --no-binary=grpcio
+ #pip uninstall grpcio -y
+ #pip install grpcio --no-binary=grpcio
cat << EOF > config.yml cat << EOF > config.yml
rpc_port: 18080 rpc_port: 18080
worker_num: 4 worker_num: 4
...@@ -944,7 +954,7 @@ function python_run_test() { ...@@ -944,7 +954,7 @@ function python_run_test() {
local TYPE=$1 # pwd: /Serving local TYPE=$1 # pwd: /Serving
cd python/examples # pwd: /Serving/python/examples cd python/examples # pwd: /Serving/python/examples
python_test_fit_a_line $TYPE # pwd: /Serving/python/examples python_test_fit_a_line $TYPE # pwd: /Serving/python/examples
- python_run_criteo_ctr_with_cube $TYPE # pwd: /Serving/python/examples
+ #python_run_criteo_ctr_with_cube $TYPE # pwd: /Serving/python/examples
python_test_bert $TYPE # pwd: /Serving/python/examples python_test_bert $TYPE # pwd: /Serving/python/examples
python_test_imdb $TYPE # pwd: /Serving/python/examples python_test_imdb $TYPE # pwd: /Serving/python/examples
python_test_lac $TYPE # pwd: /Serving/python/examples python_test_lac $TYPE # pwd: /Serving/python/examples
...@@ -953,7 +963,7 @@ function python_run_test() { ...@@ -953,7 +963,7 @@ function python_run_test() {
python_test_yolov4 $TYPE # pwd: /Serving/python/examples python_test_yolov4 $TYPE # pwd: /Serving/python/examples
python_test_grpc_impl $TYPE # pwd: /Serving/python/examples python_test_grpc_impl $TYPE # pwd: /Serving/python/examples
python_test_resnet50 $TYPE # pwd: /Serving/python/examples python_test_resnet50 $TYPE # pwd: /Serving/python/examples
- python_test_pipeline $TYPE # pwd: /Serving/python/examples
+ #python_test_pipeline $TYPE # pwd: /Serving/python/examples
echo "test python $TYPE part finished as expected." echo "test python $TYPE part finished as expected."
cd ../.. # pwd: /Serving cd ../.. # pwd: /Serving
} }
...@@ -1098,7 +1108,7 @@ function main() { ...@@ -1098,7 +1108,7 @@ function main() {
build_app $TYPE # pwd: /Serving build_app $TYPE # pwd: /Serving
java_run_test $TYPE # pwd: /Serving java_run_test $TYPE # pwd: /Serving
python_run_test $TYPE # pwd: /Serving python_run_test $TYPE # pwd: /Serving
- monitor_test $TYPE # pwd: /Serving
+ #monitor_test $TYPE # pwd: /Serving
echo "serving $TYPE part finished as expected." echo "serving $TYPE part finished as expected."
} }
......