Unverified commit 5857b506, authored by Dong Daxiang, committed by GitHub

Merge branch 'develop' into doc

......@@ -49,7 +49,9 @@ set(THIRD_PARTY_BUILD_TYPE Release)
option(WITH_AVX "Compile Paddle Serving with AVX intrinsics" OFF)
option(WITH_MKL "Compile Paddle Serving with MKL support." OFF)
option(WITH_GPU "Compile Paddle Serving with NVIDIA GPU" OFF)
option(CLIENT_ONLY "Compile client libraries and demos only" OFF)
option(CLIENT "Compile Paddle Serving Client" OFF)
option(SERVER "Compile Paddle Serving Server" OFF)
option(APP "Compile Paddle Serving App package" OFF)
option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution" OFF)
option(PACK "Compile for whl" OFF)
......@@ -63,12 +65,12 @@ if (NOT DEFINED WITH_MKLDNN)
endif()
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
include(external/jsoncpp)
#include(external/rocksdb)
endif()
#include(external/gtest)
if (SERVER OR CLIENT)
include(external/snappy)
include(external/leveldb)
include(external/zlib)
......@@ -81,8 +83,9 @@ include(external/pybind11)
include(external/python)
include(generic)
include(flags)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
include(external/cudnn)
include(paddlepaddle)
endif()
......@@ -91,7 +94,7 @@ message("paddle serving source dir: " ${PADDLE_SERVING_SOURCE_DIR})
include_directories(${PADDLE_SERVING_SOURCE_DIR})
include_directories(${PADDLE_SERVING_BINARY_DIR})
if(NOT CLIENT_ONLY)
if(SERVER)
set(EXTERNAL_LIBS
jsoncpp
gflags
......@@ -109,28 +112,27 @@ set(EXTERNAL_LIBS
brpc
)
if(NOT CLIENT_ONLY)
if(SERVER)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
endif()
if(NOT CLIENT_ONLY)
if(SERVER)
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
list(APPEND EXTERNAL_LIBS paddlepaddle)
endif()
add_subdirectory(core)
if(NOT CLIENT_ONLY)
if(SERVER)
add_subdirectory(paddle_inference)
endif()
add_subdirectory(python)
#add_subdirectory(examples)
......@@ -18,12 +18,12 @@
<h2 align="center">Motivation</h2>
Paddle Serving helps deep learning developers deploy an online inference service without much effort. **The goal of this project**: once you have trained a deep neural net with [Paddle](https://github.com/PaddlePaddle/Paddle), you already have a model inference service. A demo of serving is as follows:
We consider deploying deep learning inference services online to be a user-facing application in the future. **The goal of this project**: When you have trained a deep neural net with [Paddle](https://github.com/PaddlePaddle/Paddle), you can put the model online without much effort. A demo of serving is as follows:
<p align="center">
<img src="doc/demo.gif" width="700">
</p>
<h2 align="center">Key Features</h2>
<h2 align="center">Some Key Features</h2>
- Integrates with the Paddle training pipeline seamlessly; most Paddle models can be deployed **with a single command**.
- **Industrial serving features** supported, such as model management, online model loading, online A/B testing, etc.
......@@ -61,7 +61,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
| Argument | Type | Default | Description |
|--------------|------|-----------|--------------------------------|
| `thread` | int | `10` | Concurrency of current service |
| `thread` | int | `4` | Concurrency of current service |
| `port` | int | `9292` | Exposed port of current service to users|
| `name` | str | `""` | Service name, can be used to generate HTTP request url |
| `model` | str | `""` | Path of paddle model directory to be served |
......@@ -93,6 +93,7 @@ fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```
Here, the `client.predict` function takes two arguments. `feed` is a Python dict that maps model input variable alias names to values. `fetch` lists the prediction variables to be returned by the server. In this example, the alias names `"x"` and `"price"` were assigned when the servable model was saved during training.
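For completeness, a minimal client sketch in the same style as the snippets in this README; the `uci_housing_client/serving_client_conf.prototxt` path and the sample values are assumptions based on the quick-start example:
``` python
# Minimal RPC client sketch; assumes a server is already listening on port 9292
# and that the client config was saved next to the servable model.
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0501, -0.2245,
        -0.0957, -0.3384, 0.1187, -0.0396, 0.2673, -0.0181]  # illustrative sample
fetch_map = client.predict(feed={"x": data}, fetch=["price"])
print(fetch_map)
```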
<h2 align="center"> Pre-built services with Paddle Serving</h2>
......@@ -161,7 +162,7 @@ curl -H "Content-Type:application/json" -X POST -d '{"url": "https://paddle-serv
### New to Paddle Serving
- [How to save a servable model?](doc/SAVE.md)
- [An end-to-end tutorial from training to serving](doc/END_TO_END.md)
- [An end-to-end tutorial from training to serving(Chinese)](doc/TRAIN_TO_SERVICE.md)
- [Write Bert-as-Service in 10 minutes](doc/BERT_10_MINS.md)
### Developers
......@@ -177,8 +178,10 @@ curl -H "Content-Type:application/json" -X POST -d '{"url": "https://paddle-serv
### FAQ
- [FAQ(Chinese)](doc/FAQ.md)
### Design
- [Design Doc(Chinese)](doc/DESIGN.md)
- [Design Doc(Chinese)](doc/DESIGN_DOC.md)
- [Design Doc(English)](doc/DESIGN_DOC_EN.md)
<h2 align="center">Community</h2>
......
......@@ -12,19 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License
if(NOT CLIENT_ONLY)
if(SERVER)
add_subdirectory(cube)
#add_subdirectory(kvdb)
endif()
if (CLIENT OR SERVER)
add_subdirectory(configure)
add_subdirectory(pdcodegen)
add_subdirectory(sdk-cpp)
if(CLIENT_ONLY)
endif()
if(CLIENT)
add_subdirectory(general-client)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
add_subdirectory(predictor)
add_subdirectory(general-server)
endif()
if (CLIENT OR SERVER)
add_subdirectory(util)
endif()
......@@ -33,7 +33,7 @@ py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.p
add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
if (CLIENT_ONLY)
if (CLIENT)
py_proto_compile(sdk_configure_py_proto SRCS proto/sdk_configure.proto)
add_custom_target(sdk_configure_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
......@@ -51,7 +51,7 @@ add_custom_command(TARGET general_model_config_py_proto POST_BUILD
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(server_config_py_proto server_config_py_proto_init)
......
if(CLIENT_ONLY)
if(CLIENT)
add_subdirectory(pybind11)
pybind11_add_module(serving_client src/general_model.cpp src/pybind_general_model.cpp)
target_link_libraries(serving_client PRIVATE -Wl,--whole-archive utils sdk-cpp pybind python -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
......
# 如何编译PaddleServing
### 编译环境设置
## 编译环境设置
- os: CentOS 6u3
- gcc: 4.8.2及以上
- go: 1.9.2及以上
......@@ -8,39 +9,116 @@
- cmake:3.2.2及以上
- python:2.7.2及以上
### 获取代码
推荐使用Docker准备Paddle Serving编译环境:[CPU Dockerfile.devel](../tools/Dockerfile.devel),[GPU Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel)
## 获取代码
``` shell
git clone https://github.com/PaddlePaddle/Serving
git submodule update --init --recursive
cd Serving && git submodule update --init --recursive
```
### 编译Server部分
## PYTHONROOT设置
#### PYTHONROOT设置
``` shell
```shell
# 例如python的路径为/usr/bin/python,可以设置PYTHONROOT
export PYTHONROOT=/usr/
```
#### 集成CPU版本Paddle Inference Library
## 编译Server部分
### 集成CPU版本Paddle Inference Library
``` shell
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT_ONLY=OFF ..
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON ..
make -j10
```
#### 集成GPU版本Paddle Inference Library
可以执行`make install`把目标产出放在`./output`目录下,cmake阶段需添加`-DCMAKE_INSTALL_PREFIX=./output`选项来指定存放路径。
### 集成GPU版本Paddle Inference Library
``` shell
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT_ONLY=OFF -DWITH_GPU=ON ..
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON -DWITH_GPU=ON ..
make -j10
```
### 编译Client部分
执行`make install`可以把目标产出放在`./output`目录下。
## 编译Client部分
``` shell
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT_ONLY=ON ..
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT=ON ..
make -j10
```
### 安装wheel包
无论是client端还是server端,编译完成后,安装python/dist/下的whl包即可
执行`make install`可以把目标产出放在`./output`目录下。
## 编译App部分
```bash
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCMAKE_INSTALL_PREFIX=./output -DAPP=ON ..
make
```
## 安装wheel包
无论是Client端,Server端还是App部分,编译完成后,安装`python/dist/`下的whl包即可。
## 注意事项
运行python端Server时,会检查`SERVING_BIN`环境变量,如果想使用自己编译的二进制文件,请将设置该环境变量为对应二进制文件的路径,通常是`export SERVING_BIN=${BUILD_DIR}/core/general-server/serving`
## CMake选项说明
| 编译选项 | 说明 | 默认 |
| :--------------: | :----------------------------------------: | :--: |
| WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF |
| WITH_MKL | Compile Paddle Serving with MKL support | OFF |
| WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF |
| CUDNN_ROOT | Define CuDNN library and header path | |
| CLIENT | Compile Paddle Serving Client | OFF |
| SERVER | Compile Paddle Serving Server | OFF |
| APP | Compile Paddle Serving App package | OFF |
| WITH_ELASTIC_CTR | Compile ELASTIC-CTR solution | OFF |
| PACK | Compile for whl | OFF |
### WITH_GPU选项
Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选项用于检测系统上CUDA/CUDNN等基础库,如检测到合适版本,在编译PaddlePaddle时就会编译出GPU版本的OP Kernel。
在裸机上编译Paddle Serving GPU版本,需要安装这些基础库:
- CUDA
- CuDNN
- NCCL2
这里要注意的是:
1. 编译Serving所在的系统上所安装的CUDA/CUDNN等基础库版本,需要兼容实际的GPU设备。例如,Tesla V100卡至少要CUDA 9.0。如果编译时所用CUDA等基础库版本过低,由于生成的GPU代码和实际硬件设备不兼容,会导致Serving进程无法启动,或出现coredump等严重问题。
2. 运行Paddle Serving的系统上安装与实际GPU设备兼容的CUDA driver,并安装与编译期所用的CUDA/CuDNN等版本兼容的基础库。如运行Paddle Serving的系统上安装的CUDA/CuDNN的版本低于编译时所用版本,可能会导致奇怪的cuda函数调用失败等问题。
以下是PaddlePaddle发布版本所使用的基础库版本匹配关系,供参考:
| | CUDA | CuDNN | NCCL2 |
| :----: | :-----: | :----------------------: | :----: |
| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 |
| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 |
### 如何让Paddle Serving编译系统探测到CuDNN库
从NVIDIA developer官网下载对应版本CuDNN并在本地解压后,在cmake编译命令中增加`-DCUDNN_ROOT`参数,指定CuDNN库所在路径。
### 如何让Paddle Serving编译系统探测到nccl库
从NVIDIA developer官网下载对应版本nccl2库并解压后,增加如下环境变量 (以nccl2.1.4为例):
```shell
export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
```
# Cube: Sparse Parameter Indexing Service (Local Mode)
([简体中文](./CUBE_LOCAL_CN.md)|English)
## Overview
There are two CTR examples under python/examples: criteo_ctr and criteo_ctr_with_cube. The former saves the entire model during training, including the sparse parameters. The latter splits the model into two parts, the sparse parameters and the dense parameters. In industrial cases the scale of sparse parameters is very large, reaching the order of 10^9, so serving large-scale sparse parameters on a single machine is not practical. We therefore introduce Cube, Baidu's industrial-grade sparse parameter indexing product with many years of production use, to provide a distributed sparse parameter service.
The local mode of Cube is a simplified version of distributed Cube, designed to be convenient for developers to use in experiments and demos. If you need a distributed sparse parameter service, please continue with the [Distributed Cube User Guide](./Distributed_Cube) (still under development) after reading this document.
## Example
In the directory python/examples/criteo_ctr_with_cube, run
```
python local_train.py # train model
cp ../../../build_server/core/predictor/seq_generator seq_generator # copy Sequence File generator
cp ../../../build_server/output/bin/cube* ./cube/ # copy Cube tool kits
cp ../../../build_server/core/cube/cube-api/cube-cli ./cube/ # copy Cube Client
cube_prepare.sh & # start deliver script
```
This example covers the whole pipeline, from training the model to delivering its sparse parameters to the Cube server; the following sections walk through each step.
## Components of Cube
### cube-builder
cube-builder is a tool for generating model shard files and managing model versions. Since Cube serves distributed sparse parameters, each node in the cluster loads a different shard, and the generated sparse parameter file, which is often very large, must be split into shards by a hash function. At the same time, industrial scenarios require periodic model delivery and streaming training, so model version management is essential; this is exactly what is missing when the model is trained and saved. Therefore, while cube-builder generates the shards, you can also manually specify version information.
### cube-server
cube-server builds on Cube's key-value capability to provide the sparse parameter service. It offers a high-performance distributed query service through brpc and can be called remotely through a REST API.
### cube-cli
cube-cli is the client of cube-server. It has already been integrated into Paddle Serving: once the cube.conf configuration file is prepared and the kv_infer related op is specified in the Paddle Serving server code, cube-cli is ready on the serving side.
## Serving the Model Step by Step
### Preconditions
We need a trained model, and we copy the tool kits from the build_server folder.
```
python local_train.py # train model
cp ../../../build_server/core/predictor/seq_generator seq_generator # copy Sequence File generator
cp ../../../build_server/output/bin/cube* ./cube/ # copy Cube tool kits
cp ../../../build_server/core/cube/cube-api/cube-cli ./cube/ # copy Cube Client
```
### Generate Sequence File from Sparse Parameter
In order to get the model parameters from the training end to the prediction end, we need to convert the trained model from the Paddle model save format to the Sequence File format.
**Why Sequence File?**
Sequence File is a common format of the Hadoop File System. As mentioned at the beginning of this document, distributed Cube can support ultra-large-scale sparse parameter services, and in production environments such large-scale sparse parameters are stored in distributed file systems. The Hadoop File System is one of the most stable open-source distributed file systems, so the Sequence File format became the file format from which Cube loads models.
```
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
```
### Generating Shards
For the local version of Cube, the number of shards is 1. Run
```
cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
```
### Deliver to Cube-Server
Delivery in the local version of Cube is very simple: just place the generated index files into the ./data folder where the cube binary program is located.
```
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube &
```
### Cube-Client Verification
This step is not required, but it helps you verify whether the model is ready.
```
./cube-cli -dict_name=test_dict -keys keys -conf ./cube/cube.conf
```
If it succeeds, you will see the following output:
<p align="center">
<img src="cube-cli.png" width="700">
</p>
If each key has a corresponding value in the output, the delivery was successful. The same configuration file is also used by Serving to perform cube queries in the general kv infer op.
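For reference, the general kv infer op mentioned above is wired into a serving program roughly as follows; this sketch mirrors the op-sequence example in the design document, and the model directory and port are placeholders taken from the criteo_ctr_with_cube example:
``` python
# Sketch of a server whose op sequence performs the cube lookup before inference.
from paddle_serving_server import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
read_op = op_maker.create('general_reader')
dist_kv_op = op_maker.create('general_dist_kv')       # queries cube for sparse params
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')

op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(dist_kv_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.load_model_config("ctr_serving_model_kv")      # placeholder model directory
server.prepare_server(port=9292, device="cpu")
server.run_server()
```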
## Appendix: Configuration
The config file is cube.conf, located in python/examples/criteo_ctr_with_cube/cube/conf, and it is used by cube-cli. Users of the Cube local mode can simply use it as-is, but it becomes quite important in the Cube distributed mode.
```
[{
"dict_name": "test_dict", //table name
"shard": 1, //shard num
"dup": 1, //duplicates
"timeout": 200,
"retry": 3,
"backup_request": 100,
"type": "ipport_list",
"load_balancer": "rr",
"nodes": [{
"ipport_list": "list://127.0.0.1:8027" //IP list
}]
}]
```
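The snippet below is purely illustrative: it strips the inline `//` comments shown above (only when preceded by whitespace, so the `list://...` value survives) and loads the file, which makes the table structure consumed by cube-cli easy to inspect; the relative path is an assumption:
``` python
# Inspect cube.conf: JSON plus trailing // comments on some lines.
import json
import re

with open("cube/conf/cube.conf") as f:
    raw = f.read()

cleaned = re.sub(r"\s//[^\n]*", "", raw)   # drop whitespace-prefixed // comments
tables = json.loads(cleaned)

for table in tables:
    print(table["dict_name"], table["shard"], table["dup"],
          [node["ipport_list"] for node in table["nodes"]])
```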
# 稀疏参数索引服务Cube单机版使用指南
(简体中文|[English](./CUBE_LOCAL.md))
## 引言
在python/examples下有两个关于CTR的示例,他们分别是criteo_ctr, criteo_ctr_with_cube。前者是在训练时保存整个模型,包括稀疏参数。后者是将稀疏参数裁剪出来,保存成两个部分,一个是稀疏参数,另一个是稠密参数。由于在工业级的场景中,稀疏参数的规模非常大,达到10^9数量级。因此在一台机器上启动大规模稀疏参数预测是不实际的,因此我们引入百度多年来在稀疏参数索引领域的工业级产品Cube,提供分布式的稀疏参数服务。
单机版Cube是分布式Cube的弱化版本,旨在方便开发者做实验和Demo时使用。如果有分布式稀疏参数服务的需求,请在读完此文档之后,继续阅读 [稀疏参数索引服务Cube使用指南](分布式Cube)(正在建设中)。
## 示例
在python/example/criteo_ctr_with_cube下执行
```
python local_train.py # 训练模型
cp ../../../build_server/core/predictor/seq_generator seq_generator #复制Sequence File模型生成工具
cp ../../../build_server/output/bin/cube* ./cube/ #复制Cube应用程序
cp ../../../build_server/core/cube/cube-api/cube-cli ./cube/ # 复制Cube-Cli
cube_prepare.sh & #启动配送脚本
```
此示例是从模型训练到配送给Cube的全套流程,接下来会一一介绍。
## 单机版Cube组件介绍
### cube-builder
cube-builder是把模型生成分片文件和版本管理的工具。由于cube是用于分布式的稀疏参数服务,对于分布式当中的每一个节点,需要加载不同的分片,然而生成的稀疏参数文件往往一个大文件,就需要用哈希函数将其分割为不同的分片。与此同时,工业级的场景需要支持定期模型的配送和流式训练,因此对于模型的版本管理十分重要,这也是在训练保存模型时缺失的部分,因此cube-builder在生成分片的同时,也可以人为指定增加版本信息。
### cube-server
cube-server基于Cube的KV能力,对外提供稀疏参数服务,它通过brpc提供高性能分布式查询服务,通过RestAPI来进行远端调用。
### cube-cli
cube-cli是cube-server的客户端,这部分已经被整合到paddle serving当中,当我们准备好cube.conf配置文件并在paddle serving server的代码中指定kv_infer相关的op时,cube-cli就会在serving端准备就绪。
## 模型配送步骤
### 前序步骤
需要训练出模型文件,并复制相关build_server目录下的应用程序
```
python local_train.py
cp ../../../build_server/core/predictor/seq_generator seq_generator #复制Sequence File模型生成工具
cp ../../../build_server/output/bin/cube* ./cube/ #复制Cube应用程序
cp ../../../build_server/core/cube/cube-api/cube-cli ./cube/ # 复制Cube-Cli
```
### 模型文件生成Sequence File
为了让模型参数从训练端配送到预测端,我们需要把训练好的模型从Paddle 模型保存格式转换成Sequence File格式。
**为什么是 Sequence File?**
Sequence File是Hadoop File System的通用格式。在文章的开头提到了分布式Cube可以为超大规模稀疏参数服务提供支持,而大规模的稀疏参数在实际生产环境中保存在分布式文件系统当中,Hadoop File System是业界开源的最稳定的分布式文件系统之一,因此Sequence File格式成为了Cube加载模型的文件格式。
```
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
```
### 生成分片文件
在单机版的环境下,分片数为1。执行
```
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=./cube/data -shard_num=1 -only_build=false
```
### 配送给Cube-Server
单机版本的配送过程非常简单,只需要在cube二进制程序所在目录下的data文件夹存放index.前缀的文件即可。
```
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube &
```
### Cube-Client 验证配送是否成功
此步非必须,用于测试配送是否成功
```
cd cube
./cube-cli -dict_name=test_dict -keys keys -conf ./cube/cube.conf
```
如果查看到每个key都有对应的value输出,就说明配送成功。此文件也可以被Serving使用,用作Serving中 general kv infer op中进行cube查询。
如果执行成功,会看到如下结果
<p align="center">
<img src="cube-cli.png" width="700">
</p>
## 注: 配置文件
以python/examples/criteo_ctr_with_cube/cube/conf下的cube.conf示例,此文件被上述的cube-cli所使用,单机版用户可以直接使用不用关注此部分,它在分布式部署中更为重要。
```
[{
"dict_name": "test_dict", //表名
"shard": 1, //分片数
"dup": 1, //副本数
"timeout": 200,
"retry": 3,
"backup_request": 100,
"type": "ipport_list",
"load_balancer": "rr",
"nodes": [{
"ipport_list": "list://127.0.0.1:8027" //IP列表
}]
}]
```
# Paddle Serving设计文档
## 1. 整体设计目标
- 长期使命:Paddle Serving是一个PaddlePaddle开源的在线服务框架,长期目标就是围绕着人工智能落地的最后一公里提供越来越专业、可靠、易用的服务。
- 工业级:为了达到工业级深度学习模型在线部署的要求,
Paddle Serving提供很多大规模场景需要的部署功能:1)分布式稀疏参数索引功能;2)高并发底层通信能力;3)模型管理、在线A/B流量测试、模型热加载。
- 简单易用:为了让使用Paddle的用户能够以极低的成本部署模型,PaddleServing设计了一套与Paddle训练框架无缝打通的预测部署API,普通模型可以使用一行命令进行服务部署。
- 功能扩展:当前,Paddle Serving支持C++、Python、Golang的客户端,未来也会面向不同类型的客户新增多种语言的客户端。在Paddle Serving的框架设计方面,尽管当前Paddle Serving以支持Paddle模型的部署为核心功能,
用户可以很容易嵌入其他的机器学习库部署在线预测。
## 2. 模块设计与实现
### 2.1 Python API接口设计
#### 2.1.1 训练模型的保存
Paddle的模型预测需要重点关注的内容:1)模型的输入变量;2)模型的输出变量;3)模型结构和模型参数。Paddle Serving Python API提供用户可以在训练过程中保存模型的接口,并将Paddle Serving在部署阶段需要保存的配置打包保存,一个示例如下:
``` python
import paddle_serving_client.io as serving_io
serving_io.save_model("serving_model", "client_conf",
{"words": data}, {"prediction": prediction},
fluid.default_main_program())
```
代码示例中,`{"words": data}`和`{"prediction": prediction}`分别指定了模型的输入和输出,`"words"`和`"prediction"`是输入和输出变量的别名,设计别名的目的是为了使开发者能够记忆自己训练模型的输入输出对应的字段。`data`和`prediction`则是Paddle训练过程中的[Variable](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Variable_cn.html#variable),通常代表张量([Tensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Tensor_cn.html#tensor))或变长张量([LodTensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor))。调用保存命令后,会按照用户指定的`"serving_model"`和`"client_conf"`生成两个目录,内容如下:
``` shell
.
├── client_conf
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
└── serving_model
├── embedding_0.w_0
├── fc_0.b_0
├── fc_0.w_0
├── fc_1.b_0
├── fc_1.w_0
├── fc_2.b_0
├── fc_2.w_0
├── lstm_0.b_0
├── lstm_0.w_0
├── __model__
├── serving_server_conf.prototxt
└── serving_server_conf.stream.prototxt
```
其中,`"serving_client_conf.prototxt"``"serving_server_conf.prototxt"`是Paddle Serving的Client和Server端需要加载的配置,`"serving_client_conf.stream.prototxt"``"serving_server_conf.stream.prototxt"`是配置文件的二进制形式。`"serving_model"`下保存的其他内容和Paddle保存的模型文件是一致的。我们会考虑未来在Paddle框架中直接保存可服务的配置,实现配置保存对用户无感。
#### 2.1.2 服务端模型加载
服务端的预测逻辑可以通过Paddle Serving Server端的API进行人工定义,一个例子:
``` python
import paddle_serving_server as serving
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
dist_kv_op = op_maker.create('general_dist_kv')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = serving.OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(dist_kv_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
```
当前Paddle Serving在Server端支持的主要Op请参考如下列表:
<center>
| Op 名称 | 描述 |
|--------------|------|
| `general_reader` | 通用数据格式的读取Op |
| `general_infer` | 通用数据格式的Paddle预测Op |
| `general_response` | 通用数据格式的响应Op |
| `general_dist_kv` | 分布式索引Op |
</center>
当前Paddle Serving中的预估引擎支持在CPU/GPU上进行预测,对应的预测服务安装包以及镜像也有两个。但无论是CPU上进行模型预估还是GPU上进行模型预估,普通模型的预测都可用一行命令进行启动。
``` shell
python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292
```
``` shell
python -m paddle_serving_server_gpu.serve --model your_servable_model --thread 10 --port 9292
```
启动命令的选项列表如下:
<center>
| 参数 | 类型 | 默认值 | 描述 |
|--------------|------|-----------|--------------------------------|
| `thread` | int | `4` | 服务端的并发数,通常与CPU核数一致即可 |
| `port` | int | `9292` | 服务暴露给用户的端口 |
| `name` | str | `""` | 服务名称,当用户指定时代表直接启动的是HTTP服务 |
| `model` | str | `""` | 服务端模型文件夹路径 |
| `gpu_ids` | str | `""` | 仅在paddle_serving_server_gpu中可以使用,功能与CUDA_VISIBLE_DEVICES一致 |
</center>
举例`python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292`对应到具体的Server端具体配置如下
``` python
from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)
server.load_model_config(your_servable_model)
server.prepare_server(port=9292, device="cpu")
server.run_server()
```
#### 2.1.3 客户端访问API
Paddle Serving支持远程服务访问的协议一种是基于RPC,另一种是HTTP。用户通过RPC访问,可以使用Paddle Serving提供的Python Client API,通过定制输入数据的格式来实现服务访问。下面的例子解释Paddle Serving Client如何定义输入数据。保存可部署模型时需要指定每个输入的别名,例如`sparse`和`dense`,对应的数据可以是离散的ID序列`[1, 1001, 100001]`,也可以是稠密的向量`[0.2, 0.5, 0.1, 0.4, 0.11, 0.22]`。当前Client的设计,对于离散的ID序列,支持Paddle中的`lod_level=0`和`lod_level=1`的情况,即张量以及一维变长张量。对于稠密的向量,支持`N-D Tensor`。用户不需要显式指定输入数据的形状,Paddle Serving的Client API会通过保存配置时记录的输入形状进行对应的检查。
``` python
feed_dict["sparse"] = [1, 1001, 100001]
feed_dict["dense"] = [0.2, 0.5, 0.1, 0.4, 0.11, 0.22]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
```
Client连接Server的代码,通常只需要加载保存模型时保存的Client端配置,以及指定要去访问的服务端点即可。为了保持内部访问进行数据并行的扩展能力,Paddle Serving Client允许定义多个服务端点。
``` python
client = Client()
client.load_client_config('servable_client_configs')
client.connect(["127.0.0.1:9292"])
```
### 2.2 底层通信机制
Paddle Serving采用[baidu-rpc](https://github.com/apache/incubator-brpc)进行底层的通信。baidu-rpc是百度开源的一款RPC通信库,具有高并发、低延时等特点,已经支持了包括百度在内上百万在线预估实例、上千个在线预估服务,稳定可靠。
### 2.3 核心执行引擎
Paddle Serving的核心执行引擎是一个有向无环图,图中的每个节点代表预估服务的一个环节,例如计算模型预测打分就是其中一个环节。有向无环图有利于可并发节点充分利用部署实例内的计算资源,缩短延时。一个例子,当同一份输入需要送入两个不同的模型进行预估,并将两个模型预估的打分进行加权求和时,两个模型的打分过程即可以通过有向无环图的拓扑关系并发。
<p align="center">
<br>
<img src='design_doc.png'>
<br>
<p>
### 2.4 微服务插件模式
由于Paddle Serving底层采用基于C++的通信组件,并且核心框架也是基于C/C++编写,当用户想要在服务端定义复杂的前处理与后处理逻辑时,一种办法是修改Paddle Serving底层框架,重新编译源码。另一种方式可以通过在服务端嵌入轻量级的Web服务,通过在Web服务中实现更复杂的预处理逻辑,从而搭建一套逻辑完整的服务。当访问量超过了Web服务能够接受的范围,开发者有足够的理由开发一些高性能的C++预处理逻辑,并嵌入到Serving的原生服务库中。Web服务和RPC服务的关系以及他们的组合方式可以参考下文`用户类型`中的说明。
## 3. 工业级特性
### 3.1 分布式稀疏参数索引
分布式稀疏参数索引通常在广告推荐中出现,并与分布式训练配合形成完整的离线-在线一体化部署。下图解释了其中的流程,产品的在线服务接受用户请求后将请求发送给预估服务,同时系统会记录用户的请求以进行相应的训练日志处理和拼接。离线分布式训练系统会针对流式产出的训练日志进行模型增量训练,而增量产生的模型会配送至分布式稀疏参数索引服务,同时对应的稠密的模型参数也会配送至在线的预估服务。在线服务由两部分组成,一部分是针对用户的请求提取特征后,将需要进行模型的稀疏参数索引的特征发送请求给分布式稀疏参数索引服务,针对分布式稀疏参数索引服务返回的稀疏参数再进行后续深度学习模型的计算流程,从而完成预估。
<p align="center">
<br>
<img src='cube_eng.png' width = "450" height = "230">
<br>
<p>
为什么要使用Paddle Serving提供的分布式稀疏参数索引服务?1)在一些推荐场景中,模型的输入特征规模通常可以达到上千亿,单台机器无法支撑T级别模型在内存的保存,因此需要进行分布式存储。2)Paddle Serving提供的分布式稀疏参数索引服务,具有并发请求多个节点的能力,从而以较低的延时完成预估服务。
### 3.2 模型管理、在线A/B流量测试、模型热加载
Paddle Serving的C++引擎支持模型管理、在线A/B流量测试、模型热加载等功能,当前Python API还没有完全开放这部分功能的配置,敬请期待。
## 4. 用户类型
Paddle Serving面向的用户提供RPC和HTTP两种访问协议。对于HTTP协议,我们更倾向于流量中小型的服务使用,并且对延时没有严格要求的AI服务开发者。对于RPC协议,我们面向流量较大,对延时要求更高的用户,此外RPC的客户端可能也处在一个大系统的服务中,这种情况下非常适合使用Paddle Serving提供的RPC服务。对于使用分布式稀疏参数索引服务而言,Paddle Serving的用户不需要关心底层的细节,其调用本质也是通过RPC服务再调用RPC服务。下图给出了当前设计的Paddle Serving可能会使用Serving服务的几种场景。
<p align="center">
<br>
<img src='user_groups.png' width = "700" height = "470">
<br>
<p>
对于普通的模型而言(具体指通过Serving提供的IO保存的模型,并且没有对模型进行后处理),用户使用RPC服务不需要额外的开发即可实现服务启动,但需要开发一些Client端的代码来使用服务。对于Web服务的开发,需要用户在Paddle Serving提供的Web Service框架中进行前后处理的开发,从而实现整个HTTP服务。
### 4.1 Web服务开发
Web服务有很多开源的框架,Paddle Serving当前集成了Flask框架,但这部分对用户不可见,在未来可能会提供性能更好的Web框架作为底层HTTP服务集成引擎。用户需要继承WebService,从而实现对rpc服务的输入输出进行加工的目的。
``` python
from paddle_serving_server.web_service import WebService
from imdb_reader import IMDBDataset
import sys
class IMDBService(WebService):
def prepare_dict(self, args={}):
if len(args) == 0:
exit(-1)
self.dataset = IMDBDataset()
self.dataset.load_resource(args["dict_file_path"])
def preprocess(self, feed={}, fetch=[]):
if "words" not in feed:
exit(-1)
res_feed = {}
res_feed["words"] = self.dataset.get_words_only(feed["words"])[0]
return res_feed, fetch
imdb_service = IMDBService(name="imdb")
imdb_service.load_model_config(sys.argv[1])
imdb_service.prepare_server(
workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu")
imdb_service.prepare_dict({"dict_file_path": sys.argv[4]})
imdb_service.run_server()
```
`WebService`作为基类,提供将接收到的用户HTTP请求转化为RPC输入的接口`preprocess`,同时提供对RPC请求返回的结果进行后处理的接口`postprocess`,继承`WebService`的子类,可以定义各种类型的成员函数。`WebService`的启动命令和普通RPC服务提供的启动API一致。
## 5. 未来计划
### 5.1 有向无环图结构定义开放
当前版本开放的python API仅支持用户定义Sequential类型的执行流,如果想要进行Server进程内复杂的计算,需要增加对应的用户API。
### 5.2 云端自动部署能力
为了方便用户更容易将Paddle的预测模型部署到线上,Paddle Serving在接下来的版本会提供Kubernetes生态下任务编排的工具。
### 5.3 向量检索、树结构检索
在推荐与广告场景的召回系统中,通常需要采用基于向量的快速检索或者基于树结构的快速检索,Paddle Serving会对这方面的检索引擎进行集成或扩展。
# Paddle Serving Design Doc
## 1. Design Objectives
- Long Term Vision: Online deployment of deep learning models will be a user-facing application in the future. Any AI developer will face the problem of deploying an online service for his or her trained model.
Paddle Serving is the official open source online deployment framework. The long term goal of Paddle Serving is to provide professional, reliable and easy-to-use online service to the last mile of AI application.
- Easy-To-Use: To let algorithm developers quickly deploy their models online, Paddle Serving designs APIs that integrate seamlessly with Paddle's training process; most Paddle models can be deployed as a service with a single command.
- Industrial Oriented: To meet industrial deployment requirements, Paddle Serving supports lots of large-scale deployment functions: 1) Distributed Sparse Embedding Indexing. 2) Highly concurrent underlying communications. 3) Model Management, online A/B test, model online loading.
- Extensibility: Paddle Serving supports C++, Python and Golang clients, and will support more clients in other languages. It is easy to extend Paddle Serving to other machine learning inference libraries, although currently the Paddle inference library is the only officially supported inference backend.
## 2. Module design and implementation
### 2.1 Python API interface design
#### 2.1.1 save a servable model
The inference phase of a Paddle model focuses on: 1) the input variables of the model; 2) the output variables of the model; 3) the model structure and parameters. The Paddle Serving Python API provides a `save_model` interface for the trained model and saves the information that Paddle Serving needs during the deployment phase. An example is as follows:
``` python
import paddle_serving_client.io as serving_io
serving_io.save_model("serving_model", "client_conf",
{"words": data}, {"prediction": prediction},
fluid.default_main_program())
```
In the example, `{"words": data}` and `{"prediction": prediction}` assign the inputs and outputs of the model. `"words"` and `"prediction"` are the alias names of the inputs and outputs; aliases are designed to help developers remember the fields that correspond to their model's inputs and outputs. `data` and `prediction` are Paddle [Variable](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Variable_cn.html#variable)s in the training phase, usually representing a [Tensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Tensor_cn.html#tensor) or a [LodTensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor). When the `save_model` API is called, two directories named `"serving_model"` and `"client_conf"` are generated, with the following content:
``` shell
.
├── client_conf
│   ├── serving_client_conf.prototxt
│   └── serving_client_conf.stream.prototxt
└── serving_model
├── embedding_0.w_0
├── fc_0.b_0
├── fc_0.w_0
├── fc_1.b_0
├── fc_1.w_0
├── fc_2.b_0
├── fc_2.w_0
├── lstm_0.b_0
├── lstm_0.w_0
├── __model__
├── serving_server_conf.prototxt
└── serving_server_conf.stream.prototxt
```
`"serving_client_conf.prototxt"` and `"serving_server_conf.prototxt"` are the client side and the server side configurations of Paddle Serving, and `"serving_client_conf.stream.prototxt"` and `"serving_server_conf.stream.prototxt"` are the corresponding parts. Other contents saved in the directory are the same as Paddle saved inference model. We are considering to support `save_model` interface in Paddle training framework so that a user is not aware of the servable configurations.
#### 2.1.2 Model loading on the server side
Prediction logic on the server side can be defined through the Paddle Serving Server API with a few lines of code; an example is as follows:
``` python
import paddle_serving_server as serving
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
dist_kv_op = op_maker.create('general_dist_kv')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = serving.OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(dist_kv_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
```
The main operators currently supported by Paddle Serving on the server side are listed below:
<center>
| Op Name | Description |
|--------------|------|
| `general_reader` | General Data Reading Operator |
| `general_infer` | General Data Inference with Paddle Operator |
| `general_response` | General Data Response Operator |
| `general_dist_kv` | Distributed Sparse Embedding Indexing |
</center>
Paddle Serving supports inference engines on multiple devices; CPU and GPU engines are currently supported, and official Docker images and packages are provided for both. In either case, an ordinary model can be served with a single command.
``` shell
python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292
```
``` shell
python -m paddle_serving_server_gpu.serve --model your_servable_model --thread 10 --port 9292
```
Options of startup command are listed below:
<center>
| Arguments | Types | Defaults | Descriptions |
|--------------|------|-----------|--------------------------------|
| `thread` | int | `4` | Concurrency on the server side, usually equal to the number of CPU cores |
| `port` | int | `9292` | Port exposed to users |
| `name` | str | `""` | Service name; if specified, an HTTP service is started under this name |
| `model` | str | `""` | Servable models for Paddle Serving |
| `gpu_ids` | str | `""` | Supported only in paddle_serving_server_gpu, similar to the usage of CUDA_VISIBLE_DEVICES |
</center>
For example, `python -m paddle_serving_server.serve --model your_servable_model --thread 10 --port 9292` is equivalent to the following server-side code that a user could write:
``` python
from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)
server.load_model_config(your_servable_model)
server.prepare_server(port=9292, device="cpu")
server.run_server()
```
#### 2.1.3 Paddle Serving Client API
Paddle Serving supports remote service access through RPC (remote procedure call) and HTTP. The RPC service can be accessed through the Client API of Paddle Serving, and a user can define a data preprocessing function before calling the client API. The example below shows how to define the input data of a Paddle Serving Client. The servable model has two inputs with the alias names `sparse` and `dense`: `sparse` corresponds to a sparse sequence of ids such as `[1, 1001, 100001]`, and `dense` corresponds to a dense vector such as `[0.2, 0.5, 0.1, 0.4, 0.11, 0.22]`. For sparse sequence data, the current design supports `lod_level=0` and `lod_level=1` of Paddle, which correspond to `Tensor` and `LodTensor`. For dense vectors, the current design supports any `N-D Tensor`. Users do not need to specify the shape of the inference model input; the Paddle Serving Client API checks the input data's shape against the servable configuration.
``` python
feed_dict["sparse"] = [1, 1001, 100001]
feed_dict["dense"] = [0.2, 0.5, 0.1, 0.4, 0.11, 0.22]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
```
The following code sample shows how the Paddle Serving Client connects to the server with its endpoint. To keep the ability to scale out with data parallelism, the Paddle Serving Client allows users to define multiple server endpoints.
``` python
client = Client()
client.load_client_config('servable_client_configs')
client.connect(["127.0.0.1:9292"])
```
### 2.2 Underlying Communication Mechanism
Paddle Serving adopts [baidu-rpc](https://github.com/apache/incubator-brpc) as the underlying communication layer. baidu-rpc is an open-source RPC communication library with high concurrency and low latency compared with other open-source RPC libraries. Millions of online inference instances and thousands of online services inside Baidu rely on baidu-rpc, so it is stable and reliable.
### 2.3 Core Execution Engine
The core execution engine of Paddle Serving is a directed acyclic graph (DAG). Each node in the DAG represents a phase of the inference service, such as Paddle inference, data preprocessing, or data postprocessing. The DAG lets independent nodes run concurrently, fully utilizing the computation resources of a deployment instance and reducing latency. For example, when the same input has to be fed into two different models and the two scores are then combined, the scoring of the two models can be parallelized through the DAG.
<p align="center">
<br>
<img src='design_doc.png'>
<br>
<p>
### 2.4 Micro service plugin
The underlying communication of Paddle Serving, like the core framework, is implemented in C++, so it is hard for users who are not familiar with C++ to implement new Paddle Serving server operators. Another approach is to use the lightweight web service embedded in the Paddle Serving server, which can be viewed as a plugin: a user can implement complex data preprocessing and postprocessing logic there to build a complete AI service. If the traffic of the AI service grows beyond what the web service can handle, it is worth implementing the preprocessing as high-performance C++ Paddle Serving server operators. The relationship between the Web Service and the RPC Service is described in the `User Types` section.
## 3. Industrial Features
### 3.1 Distributed Sparse Parameter Indexing
Distributed sparse parameter indexing is commonly seen in advertising and recommendation scenarios and is often coupled with distributed training. The figure below shows a typical online recommendation architecture. When the recommendation service receives a user request, the system automatically collects training logs for offline distributed training. Meanwhile, the request is sent to the Paddle Serving server. For sparse features, the distributed sparse parameter indexing service is called to look up the sparse parameters. The dense input features, together with the looked-up sparse parameters, are fed into the Paddle inference node of the DAG in the Paddle Serving server, and the score is then returned through RPC to the product service for item ranking.
<p align="center">
<br>
<img src='cube_eng.png' width = "450" height = "230">
<br>
<p>
Why does Paddle Serving need to support distributed sparse parameter indexing? 1) In some recommendation scenarios, the number of features can reach hundreds of billions, so a single node cannot hold all the parameters in memory. 2) Paddle Serving's distributed sparse parameter indexing couples with Paddle inference, so users get a low-latency inference engine over hundreds of billions of parameters without extra work.
### 3.2 Model Management, online A/B test, Model Online Reloading
Paddle Serving's C++ engine supports model management, online A/B traffic testing and model hot reloading. The Python API does not fully expose the configuration of these features yet; please wait for an upcoming release.
## 4. User Types
Paddle Serving provides RPC and HTTP protocols for users. The HTTP service is recommended for small- or medium-traffic services without strict latency requirements. The RPC protocol is recommended for high-traffic services and services with stricter latency requirements; RPC clients are also a natural fit when the caller itself is part of a larger system. Users of the built-in distributed sparse parameter indexing service do not need to care about the underlying communication details, which boil down to one RPC service calling another. The following figure shows several scenarios in which a user may want to use Paddle Serving.
<p align="center">
<br>
<img src='user_groups.png' width = "700" height = "470">
<br>
<p>
For servable models saved with the Paddle Serving IO API (and not post-processed afterwards), users can start an RPC service without extra development, but they need to write some client-side code to use the service. For web service development, users implement the preprocessing and postprocessing inside the Web Service framework provided by Paddle Serving to build the complete HTTP service.
### 4.1 Web Service Development
There are many open-source web service frameworks. Paddle Serving currently integrates Flask as the built-in service framework, but this is not visible to users; a more performant web framework may be integrated as the underlying HTTP engine in the future. Users inherit from `WebService` to adapt the inputs and outputs of the RPC service, as in the example below:
``` python
from paddle_serving_server.web_service import WebService
from imdb_reader import IMDBDataset
import sys
class IMDBService(WebService):
def prepare_dict(self, args={}):
if len(args) == 0:
exit(-1)
self.dataset = IMDBDataset()
self.dataset.load_resource(args["dict_file_path"])
def preprocess(self, feed={}, fetch=[]):
if "words" not in feed:
exit(-1)
res_feed = {}
res_feed["words"] = self.dataset.get_words_only(feed["words"])[0]
return res_feed, fetch
imdb_service = IMDBService(name="imdb")
imdb_service.load_model_config(sys.argv[1])
imdb_service.prepare_server(
workdir=sys.argv[2], port=int(sys.argv[3]), device="cpu")
imdb_service.prepare_dict({"dict_file_path": sys.argv[4]})
imdb_service.run_server()
```
`WebService` is a base class that provides the overridable interfaces `preprocess`, which converts an incoming HTTP request into the RPC input, and `postprocess`, which post-processes the result returned by the RPC call. In a subclass of `WebService`, users can define whatever member functions they need, and the startup interface is the same as for an ordinary RPC service.
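As an illustration of the `postprocess` hook, a minimal sketch is given below; the argument list (mirroring `preprocess` plus the RPC result) is an assumption and may need to be adapted to the actual `WebService` base class:
``` python
# Hypothetical subclass of the IMDBService above, adding a postprocess hook.
class IMDBServiceWithPost(IMDBService):
    def postprocess(self, feed={}, fetch=[], fetch_map=None):  # assumed signature
        if fetch_map is not None:
            fetch_map["service"] = "imdb"   # attach extra info to the HTTP response
        return fetch_map
```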
## 5. Future Plan
### 5.1 Open DAG definition API
The Python API of the current version only lets users define a Sequential execution flow. A corresponding user API needs to be added for complex in-process computation on the server.
### 5.2 Auto Deployment on Cloud
To make it easier to deploy Paddle inference models online, Paddle Serving will provide job orchestration tools for the Kubernetes ecosystem in upcoming releases.
### 5.3 Vector Indexing and Tree based Indexing
In the recall stage of recommendation and advertising systems, vector-based or tree-based fast retrieval is commonly used. Paddle Serving will integrate or extend retrieval engines for these tasks.
# Install
## 系统需求
OS: Linux
CMake: (验证过的版本:3.2/3.5.2)
C++编译器 (验证过的版本:GCC 4.8.2/5.4.0)
python (验证过的版本:2.7)
Go编译器 (>=1.8 验证过的版本:1.9.2/1.12.0)
openssl & openssl-devel
curl-devel
bzip2-devel
## 编译
推荐使用Docker准备Paddle Serving编译环境。[Docker编译使用说明](./DOCKER.md)
以下命令将会下载Paddle Serving最新代码,并执行编译。
```shell
$ git clone https://github.com/PaddlePaddle/Serving.git
$ cd Serving
$ mkdir build
$ cd build
$ cmake ..
$ make -j4
$ make install
```
`make install`将把目标产出放在/path/to/Paddle-Serving/build/output/目录下,目录结构:
```
.
|-- bin # Paddle Serving工具和protobuf编译插件pdcodegen所在目录
|-- conf
|-- demo # demo总目录
| |-- client # Demo client端
| | |-- bert # bert模型客户端
| | |-- ctr_prediction # CTR prediction模型客户端
| | |-- dense_format # dense_format客户端
| | |-- echo # 最简单的echo service客户端
| | |-- echo_kvdb # local KV读取demo客户端
| | |-- image_classification # 图像分类任务客户端
| | |-- int64tensor_format # int64tensor_format示例客户端
| | |-- sparse_format # sparse_format示例客户端
| | `-- text_classification # 文本分类任务示例客户端
| |-- db_func
| |-- db_thread
| |-- kvdb_test
| `-- serving # Demo serving端;该serving可同时响应所有demo client请求
|-- include # Paddle Serving发布的头文件
|-- lib # Paddle Serving发布的libs
`-- tool # Paddle Serving发布的工具目录
```
如要编写新的预测服务,请参考[从零开始写一个预测服务](CREATING.md)
# CMake编译选项说明
| 编译选项 | 说明 |
|----------|------|
| WITH_AVX | For configuring PaddlePaddle. Compile PaddlePaddle with AVX intrinsics |
| WITH_MKL | For configuring PaddlePaddle. Compile PaddlePaddle with MKLML library |
| WITH_GPU | For configuring PaddlePaddle. Compile PaddlePaddle with NVIDIA GPU |
| CUDNN_ROOT| For configuring PaddlePaddle. Define CuDNN library and header path |
| CLIENT_ONLY | Compile client libraries and demos only |
## WITH_GPU选项
Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选项用于检测系统上CUDA/CUDNN等基础库,如检测到合适版本,在编译PaddlePaddle时就会编译出GPU版本的OP Kernel。
在裸机上编译Paddle Serving GPU版本,需要安装这些基础库:
- CUDA
- CuDNN
- NCCL2
这里要注意的是:
1) 编译Serving所在的系统上所安装的CUDA/CUDNN等基础库版本,需要兼容实际的GPU设备。例如,Tesla V100卡至少要CUDA 9.0。如果编译时所用CUDA等基础库版本过低,由于生成的GPU代码和实际硬件设备不兼容,会导致Serving进程无法启动,或出现coredump等严重问题。
2) 运行Paddle Serving的系统上安装与实际GPU设备兼容的CUDA driver,并安装与编译期所用的CUDA/CuDNN等版本兼容的基础库。如运行Paddle Serving的系统上安装的CUDA/CuDNN的版本低于编译时所用版本,可能会导致奇怪的cuda函数调用失败等问题。
以下是PaddlePaddle发布版本所使用的基础库版本匹配关系,供参考:
| | CUDA | CuDNN | NCCL2 |
|-|-------|--------------------------|-------|
| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 |
| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0| 2.2.12 |
### 如何让Paddle Serving编译系统探测到CuDNN库
从NVIDIA developer官网下载对应版本CuDNN并在本地解压后,在cmake编译命令中增加-DCUDNN_ROOT参数,指定CuDNN库所在路径:
```
$ pwd
/path/to/paddle-serving
$ mkdir build && cd build
$ cmake -DWITH_GPU=ON -DCUDNN_ROOT=/path/to/cudnn/cudnn_v7/cuda ..
```
### 如何让Paddle Serving编译系统探测到nccl库
从NVIDIA developer官网下载对应版本nccl2库并解压后,增加如下环境变量 (以nccl2.1.4为例):
```
$ export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
$ export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
$ export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
```
......@@ -13,7 +13,7 @@ You can get images in two ways:
1. Pull image directly
```bash
docker pull hub.baidubce.com/ctr/paddleserving:0.1.3
docker pull hub.baidubce.com/paddlepaddle/serving:0.1.3
```
2. Building image based on dockerfile
......@@ -21,13 +21,13 @@ You can get images in two ways:
Create a new folder and copy [Dockerfile](../tools/Dockerfile) to this folder, and run the following command:
```bash
docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3 .
docker build -t hub.baidubce.com/paddlepaddle/serving:0.1.3 .
```
### Create container
```bash
docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3
docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:0.1.3
docker exec -it test bash
```
......@@ -99,7 +99,7 @@ You can also get images in two ways:
1. Pull image directly
```bash
nvidia-docker pull hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:0.1.3-gpu
```
2. Building image based on dockerfile
......@@ -107,13 +107,13 @@ You can also get images in two ways:
Create a new folder and copy [Dockerfile.gpu](../tools/Dockerfile.gpu) to this folder, and run the following command:
```bash
nvidia-docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3-gpu .
nvidia-docker build -t hub.baidubce.com/paddlepaddle/serving:0.1.3-gpu .
```
### Create container
```bash
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:0.1.3-gpu
nvidia-docker exec -it test bash
```
......
......@@ -13,7 +13,7 @@ Docker(GPU版本需要在GPU机器上安装nvidia-docker)
1. 直接拉取镜像
```bash
docker pull hub.baidubce.com/ctr/paddleserving:0.1.3
docker pull hub.baidubce.com/paddlepaddle/serving:0.1.3
```
2. 基于Dockerfile构建镜像
......@@ -21,13 +21,13 @@ Docker(GPU版本需要在GPU机器上安装nvidia-docker)
建立新目录,复制[Dockerfile](../tools/Dockerfile)内容到该目录下Dockerfile文件。执行
```bash
docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3 .
docker build -t hub.baidubce.com/paddlepaddle/serving:0.1.3 .
```
### 创建容器并进入
```bash
docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3
docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:0.1.3
docker exec -it test bash
```
......@@ -97,7 +97,7 @@ GPU版本与CPU版本基本一致,只有部分接口命名的差别(GPU版
1. 直接拉取镜像
```bash
nvidia-docker pull hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:0.1.3-gpu
```
2. 基于Dockerfile构建镜像
......@@ -105,13 +105,13 @@ GPU版本与CPU版本基本一致,只有部分接口命名的差别(GPU版
建立新目录,复制[Dockerfile.gpu](../tools/Dockerfile.gpu)内容到该目录下Dockerfile文件。执行
```bash
nvidia-docker build -t hub.baidubce.com/ctr/paddleserving:0.1.3-gpu .
nvidia-docker build -t hub.baidubce.com/paddlepaddle/serving:0.1.3-gpu .
```
### 创建容器并进入
```bash
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/ctr/paddleserving:0.1.3-gpu
nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:0.1.3-gpu
nvidia-docker exec -it test bash
```
......
......@@ -7,7 +7,7 @@ serving_io.save_model("imdb_model", "imdb_client_conf",
fluid.default_main_program())
```
`imdb_model` is the server side model with serving configurations. `imdb_client_conf` is the client rpc configurations. Serving has a
dictionary for `Feed` and `Fetch` variables for client to assign. An alias name can be defined for each variable. An example of how to use alias name
dictionary for `Feed` and `Fetch` variables for client to assign. In the example, `{"words": data}` is the feed dict that specifies the input of the saved inference model, and `{"prediction": prediction}` is the fetch dict that specifies the output of the saved inference model. An alias name can be defined for feed and fetch variables. An example of how to use alias names
is as follows:
``` python
from paddle_serving_client import Client
......
# 使用PaddleServing快速搭建预测服务
# 端到端完成从训练到部署全流程
Paddle Serving是Paddle的高性能在线预测服务框架,可以灵活支持大多数模型的部署。本文中将以IMDB评论情感分析任务为例通过9步展示从模型的训练到部署预测服务的全流程。
......
if (CLIENT_ONLY)
if (CLIENT)
file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py)
set(PY_FILES ${SERVING_CLIENT_PY_FILES})
SET(PACKAGE_NAME "serving_client")
set(SETUP_LOG_FILE "setup.py.client.log")
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
if (NOT WITH_GPU)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
else()
......@@ -16,12 +16,17 @@ if (NOT CLIENT_ONLY)
set(SETUP_LOG_FILE "setup.py.server.log")
endif()
if (CLIENT_ONLY)
if (CLIENT)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
if (NOT CLIENT_ONLY)
if (APP)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
if (SERVER)
if (NOT WITH_GPU)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
......@@ -34,7 +39,15 @@ endif()
set (SERVING_CLIENT_CORE ${PADDLE_SERVING_BINARY_DIR}/core/general-client/*.so)
message("python env: " ${py_env})
if (CLIENT_ONLY)
if (APP)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel)
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif()
if (CLIENT)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/
......@@ -44,7 +57,7 @@ add_custom_command(
add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
if(NOT WITH_GPU)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
......@@ -66,20 +79,22 @@ endif()
set(SERVING_CLIENT_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
set(SERVING_SERVER_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
if (CLIENT_ONLY)
if (CLIENT)
install(DIRECTORY ${SERVING_CLIENT_PYTHON_PACKAGE_DIR}
DESTINATION opt/serving_client/share/wheels
)
endif()
if (NOT CLIENT_ONLY)
if (SERVER)
install(DIRECTORY ${SERVING_SERVER_PYTHON_PACKAGE_DIR}
DESTINATION opt/serving_server/share/wheels
)
endif()
if (CLIENT OR SERVER)
find_program(PATCHELF_EXECUTABLE patchelf)
if(NOT PATCHELF_EXECUTABLE)
if (NOT PATCHELF_EXECUTABLE)
message(FATAL_ERROR "patchelf not found, please install it.\n"
"For Ubuntu, the command is: apt-get install -y patchelf.")
endif()
endif()
......@@ -41,13 +41,13 @@ def single_func(idx, resource):
client = Client()
client.load_client_config(args.model)
client.connect([resource["endpoint"][idx % len(resource["endpoint"])]])
feed_batch = []
for bi in range(args.batch_size):
feed_batch.append(reader.process(dataset[bi]))
start = time.time()
for i in range(1000):
if args.batch_size >= 1:
feed_batch = []
for bi in range(args.batch_size):
feed_batch.append(reader.process(dataset[i]))
result = client.batch_predict(
feed_batch=feed_batch, fetch=fetch)
else:
......@@ -61,7 +61,9 @@ def single_func(idx, resource):
if __name__ == '__main__':
multi_thread_runner = MultiThreadRunner()
endpoint_list = ["127.0.0.1:9292"]
endpoint_list = [
"127.0.0.1:9295", "127.0.0.1:9296", "127.0.0.1:9297", "127.0.0.1:9298"
]
result = multi_thread_runner.run(single_func, args.thread,
{"endpoint": endpoint_list})
avg_cost = 0
......
rm profile_log
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
sleep 5
for thread_num in 1 2 4 8 16
do
for batch_size in 1 2 4 8 16 32 64 128 256 512
do
$PYTHONROOT/bin/python benchmark_batch.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
echo "========================================"
echo "thread num: ", $thread_num
echo "batch size: ", $batch_size
echo "batch size : $batch_size" >> profile_log
$PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
tail -n 1 profile >> profile_log
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
export FLAGS_profile_client=1
export FLAGS_profile_server=1
sleep 5
thread_num=4
python benchmark_batch.py --thread ${thread_num} --batch_size 64 --model serving_client_conf/serving_client_conf.prototxt 2> profile
python show_profile.py profile ${thread_num}
python timeline_trace.py profile trace
## 带稀疏参数服务器的CTR预测服务
## Criteo CTR with Sparse Parameter Indexing Service
([简体中文](./README_CN.md)|English)
### Get Sample Dataset
### 获取样例数据
```
sh get_data.sh
```
### 保存模型和配置文件
### Train and Save Model
```
python local_train.py
```
执行脚本后会在当前目录生成ctr_server_model和ctr_client_config文件夹,以及ctr_server_model_kv, ctr_client_conf_kv。
The trained model will be saved in ./ctr_server_model and ./ctr_client_config, together with ctr_server_model_kv and ctr_client_conf_kv.
### 启动稀疏参数服务器
### Start Sparse Parameter Indexing Service
```
cp ../../../build_server/core/predictor/seq_generator seq_generator
cp ../../../build_server/output/bin/cube* ./cube/
sh cube_prepare.sh &
```
### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置)
Here, the sparse parameters are loaded by the sparse parameter indexing service Cube. For more details, please read [Cube: Sparse Parameter Indexing Service (Local Mode)](../../../doc/CUBE_LOCAL.md).
### Start the RPC inference service with 4 server threads (configurable in test_server.py)
```
python test_server.py ctr_serving_model_kv
```
### 执行预测
### Run Prediction
```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
......@@ -32,17 +37,17 @@ python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
### Benchmark
设备 :Intel(R) Xeon(R) CPU 6148 @ 2.40GHz
CPU: Intel(R) Xeon(R) CPU 6148 @ 2.40GHz
模型[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/ctr_criteo_with_cube/network_conf.py)
Model: [Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/ctr_criteo_with_cube/network_conf.py)
server core/thread num : 4/8
执行
Run
```
bash benchmark.sh
```
客户端每个线程会发送1000个batch
Each client thread sends 1000 batches.
| client thread num | prepro | client infer | op0 | op1 | op2 | postpro | avg_latency | qps |
| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
......@@ -52,10 +57,10 @@ bash benchmark.sh
| 8 | 0.044 | 8.230 | 0.028 | 0.464 | 0.0023 | 0.0034 | 14.191 | 563.8 |
| 16 | 0.048 | 21.037 | 0.028 | 0.455 | 0.0025 | 0.0041 | 27.236 | 587.5 |
平均每个线程耗时图如下
The average latency per thread is shown below:
![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
每个线程QPS耗时如下
The QPS per thread is shown below:
![qps](../../../doc/criteo-cube-benchmark-qps.png)
## 带稀疏参数索引服务的CTR预测服务
(简体中文|[English](./README.md))
### 获取样例数据
```
sh get_data.sh
```
### 保存模型和配置文件
```
python local_train.py
```
执行脚本后会在当前目录生成ctr_server_model和ctr_client_config文件夹,以及ctr_server_model_kv, ctr_client_conf_kv。
### 启动稀疏参数索引服务
```
cp ../../../build_server/core/predictor/seq_generator seq_generator
cp ../../../build_server/output/bin/cube* ./cube/
sh cube_prepare.sh &
```
此处,模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中,关于稀疏参数索引服务Cube的介绍,请阅读[稀疏参数索引服务Cube单机版使用指南](../../../doc/CUBE_LOCAL_CN.md)
### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置)
```
python test_server.py ctr_serving_model_kv
```
### 执行预测
```
python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
```
### Benchmark
设备 :Intel(R) Xeon(R) CPU 6148 @ 2.40GHz
模型 :[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/ctr_criteo_with_cube/network_conf.py)
server core/thread num : 4/8
执行
```
bash benchmark.sh
```
客户端每个线程会发送1000个batch
| client thread num | prepro | client infer | op0 | op1 | op2 | postpro | avg_latency | qps |
| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
| 1 | 0.035 | 1.596 | 0.021 | 0.518 | 0.0024 | 0.0025 | 6.774 | 147.7 |
| 2 | 0.034 | 1.780 | 0.027 | 0.463 | 0.0020 | 0.0023 | 6.931 | 288.3 |
| 4 | 0.038 | 2.954 | 0.025 | 0.455 | 0.0019 | 0.0027 | 8.378 | 477.5 |
| 8 | 0.044 | 8.230 | 0.028 | 0.464 | 0.0023 | 0.0034 | 14.191 | 563.8 |
| 16 | 0.048 | 21.037 | 0.028 | 0.455 | 0.0025 | 0.0041 | 27.236 | 587.5 |
平均每个线程耗时图如下
![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
每个线程QPS耗时如下
![qps](../../../doc/criteo-cube-benchmark-qps.png)
......@@ -25,11 +25,21 @@ class ImageService(WebService):
reader = ImageReader()
if "image" not in feed:
raise ("feed data error!")
sample = base64.b64decode(feed["image"])
img = reader.process_image(sample)
res_feed = {}
res_feed["image"] = img.reshape(-1)
return res_feed, fetch
if isinstance(feed["image"], list):
feed_batch = []
for image in feed["image"]:
sample = base64.b64decode(image)
img = reader.process_image(sample)
res_feed = {}
res_feed["image"] = img.reshape(-1)
feed_batch.append(res_feed)
return feed_batch, fetch
else:
sample = base64.b64decode(feed["image"])
img = reader.process_image(sample)
res_feed = {}
res_feed["image"] = img.reshape(-1)
return res_feed, fetch
image_service = ImageService(name="image")
......
......@@ -25,16 +25,27 @@ class ImageService(WebService):
reader = ImageReader()
if "image" not in feed:
raise ("feed data error!")
sample = base64.b64decode(feed["image"])
img = reader.process_image(sample)
res_feed = {}
res_feed["image"] = img.reshape(-1)
return res_feed, fetch
print(type(feed["image"]), isinstance(feed["image"], list))
if isinstance(feed["image"], list):
feed_batch = []
for image in feed["image"]:
sample = base64.b64decode(image)
img = reader.process_image(sample)
res_feed = {}
res_feed["image"] = img.reshape(-1)
feed_batch.append(res_feed)
return feed_batch, fetch
else:
sample = base64.b64decode(feed["image"])
img = reader.process_image(sample)
res_feed = {}
res_feed["image"] = img.reshape(-1)
return res_feed, fetch
image_service = ImageService(name="image")
image_service.load_model_config(sys.argv[1])
image_service.set_gpus("0,1,2,3")
image_service.set_gpus("0,1")
image_service.prepare_server(
workdir=sys.argv[2], port=int(sys.argv[3]), device="gpu")
image_service.run_server()
......@@ -24,17 +24,26 @@ def predict(image_path, server):
req = json.dumps({"image": image, "fetch": ["score"]})
r = requests.post(
server, data=req, headers={"Content-Type": "application/json"})
print(r.json()["score"][0])
return r
def batch_predict(image_path, server):
image = base64.b64encode(open(image_path).read())
req = json.dumps({"image": [image, image], "fetch": ["score"]})
r = requests.post(
server, data=req, headers={"Content-Type": "application/json"})
print(r.json()["result"][1]["score"][0])
return r
if __name__ == "__main__":
server = "http://127.0.0.1:9295/image/prediction"
server = "http://127.0.0.1:9393/image/prediction"
#image_path = "./data/n01440764_10026.JPEG"
image_list = os.listdir("./data/image_data/n01440764/")
image_list = os.listdir("./image_data/n01440764/")
start = time.time()
for img in image_list:
image_file = "./data/image_data/n01440764/" + img
image_file = "./image_data/n01440764/" + img
res = predict(image_file, server)
print(res.json()["score"][0])
end = time.time()
print(end - start)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .reader.chinese_bert_reader import ChineseBertReader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def prepare_batch_data(insts,
total_token_num,
max_seq_len=128,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_sent_ids = [inst[1] for inst in insts]
batch_pos_ids = [inst[2] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(3, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, max_seq_len=max_seq_len, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
max_seq_len=max_seq_len,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
max_seq_len=max_seq_len,
return_pos=False,
return_input_mask=False)
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
max_seq_len=128,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
return_seq_lens=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
#max_len = max(len(inst) for inst in insts)
max_len = max_seq_len
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array(
[[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype("int64").reshape([-1, 1])]
return return_list if len(return_list) > 1 else return_list[0]
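As a quick illustration of the padding helper above (a minimal sketch, not part of the original file), the call below pads two variable-length id sequences to `max_seq_len` and also returns the matching attention mask:
```
# Sketch only: pad two token-id sequences with pad_batch_data defined above.
import numpy as np  # already imported at the top of the file

insts = [[5, 6, 7], [8, 9]]  # two sequences of different lengths
padded_ids, input_mask = pad_batch_data(
    insts, pad_idx=0, max_seq_len=8, return_input_mask=True)
print(padded_ids.shape)   # (2, 8, 1): ids padded with pad_idx up to max_seq_len
print(input_mask.shape)   # (2, 8, 1): 1.0 for real tokens, 0.0 for padding
```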
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .reader import ReaderBase
class BertBaseReader(ReaderBase):
def __init__(self):
super(BertBaseReader, self).__init__()
pass
def process(self, line):
super(BertBaseReader, self).process(line)
pass
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=utf-8
from .bert_base_reader import BertBaseReader
from .batching import pad_batch_data
from .tokenization import FullTokenizer, convert_to_unicode
class ChineseBertReader(BertBaseReader):
"""
ChineseBertReader handles the most traditional Chinese Bert
preprocessing; a user can specify the vocab file at initialization
Examples:
from paddle_serving_app import ChineseBertReader
line = ["this is China"]
reader = ChineseBertReader()
reader.process(line[0])
"""
def __init__(self, args={}):
super(ChineseBertReader, self).__init__()
vocab_file = ""
if "vocab_file" in args:
vocab_file = args["vocab_file"]
else:
vocab_file = self._download_or_not()
self.tokenizer = FullTokenizer(vocab_file=vocab_file)
if "max_seq_len" in args:
self.max_seq_len = args["max_seq_len"]
else:
self.max_seq_len = 20
self.vocab = self.tokenizer.vocab
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.feed_keys = [
"input_ids", "position_ids", "segment_ids", "input_mask"
]
"""
inner function
"""
def _download_or_not(self):
import os
import paddle_serving_app
module_path = os.path.dirname(paddle_serving_app.__file__)
full_path = "{}/tmp/chinese_bert".format(module_path)
os.system("mkdir -p {}".format(full_path))
if os.path.exists("{}/vocab.txt".format(full_path)):
pass
else:
url = "https://paddle-serving.bj.bcebos.com/reader/chinese_bert/vocab.txt"
r = os.system("wget --no-check-certificate " + url)
os.system("mv vocab.txt {}".format(full_path))
if r != 0:
raise SystemExit('Download failed, please check your network')
return "{}/vocab.txt".format(full_path)
"""
inner function
"""
def _pad_batch(self, token_ids, text_type_ids, position_ids):
batch_token_ids = [token_ids]
batch_text_type_ids = [text_type_ids]
batch_position_ids = [position_ids]
padded_token_ids, input_mask = pad_batch_data(
batch_token_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id,
return_input_mask=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
return padded_token_ids, padded_position_ids, padded_text_type_ids, input_mask
"""
process function deals with a raw Chinese string as a sentence
this function returns a feed_dict
default key of the returned feed_dict: input_ids, position_ids, segment_ids, input_mask
"""
def process(self, line):
text_a = convert_to_unicode(line)
tokens_a = self.tokenizer.tokenize(text_a)
if len(tokens_a) > self.max_seq_len - 2:
tokens_a = tokens_a[0:(self.max_seq_len - 2)]
tokens = []
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
text_type_ids.append(0)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
p_token_ids, p_pos_ids, p_text_type_ids, input_mask = \
self._pad_batch(token_ids, text_type_ids, position_ids)
feed_result = {
self.feed_keys[0]: p_token_ids.reshape(-1).tolist(),
self.feed_keys[1]: p_pos_ids.reshape(-1).tolist(),
self.feed_keys[2]: p_text_type_ids.reshape(-1).tolist(),
self.feed_keys[3]: input_mask.reshape(-1).tolist()
}
return feed_result
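A hedged usage sketch of the reader defined above: when no `vocab_file` is passed, the vocabulary is downloaded on first use, and the returned feed dict carries the keys listed in `self.feed_keys`.
```
# Sketch only: feed a single Chinese sentence through ChineseBertReader.
reader = ChineseBertReader({"max_seq_len": 20})  # vocab.txt is fetched on first use
feed_dict = reader.process("百度是一家高科技公司")
for key in ["input_ids", "position_ids", "segment_ids", "input_mask"]:
    print(key, len(feed_dict[key]))  # each value is a flat list of max_seq_len entries
```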
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class ReaderBase(object):
def __init__(self):
self.feed_keys = []
def set_feed_keys(self, keys):
self.feed_keys = keys
def get_feed_keys(self):
return self.feed_keys
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import io
import unicodedata
import six
import sentencepiece as spm
import pickle
def convert_to_unicode(text): # pylint: disable=doc-string-with-all-args
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode): # noqa
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text): # pylint: disable=doc-string-with-all-args
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode): # noqa
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file): # pylint: disable=doc-string-with-all-args, doc-string-with-returns
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = io.open(vocab_file, "r", encoding="UTF-8")
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
fin.close()
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self,
vocab_file,
do_lower_case=True,
use_sentence_piece_vocab=False):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.use_sentence_piece_vocab = use_sentence_piece_vocab
self.wordpiece_tokenizer = WordpieceTokenizer(
vocab=self.vocab,
use_sentence_piece_vocab=self.use_sentence_piece_vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WSSPTokenizer(object): # pylint: disable=doc-string-missing
def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True,
lower=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.ws = ws
self.lower = lower
self.dict = pickle.load(open(word_dict, 'rb'))
self.sp_model = spm.SentencePieceProcessor()
self.window_size = 5
self.sp_model.Load(sp_model_dir)
def cut(self, chars): # pylint: disable=doc-string-missing
words = []
idx = 0
while idx < len(chars):
matched = False
for i in range(self.window_size, 0, -1):
cand = chars[idx:idx + i]
if cand in self.dict:
words.append(cand)
matched = True
break
if not matched:
i = 1
words.append(chars[idx])
idx += i
return words
def tokenize(self, text, unk_token="[UNK]"): # pylint: disable=doc-string-missing
text = convert_to_unicode(text)
if self.ws:
text = [s for s in self.cut(text) if s != ' ']
else:
text = text.split(' ')
if self.lower:
text = [s.lower() for s in text]
text = ' '.join(text)
tokens = self.sp_model.EncodeAsPieces(text)
in_vocab_tokens = []
for token in tokens:
if token in self.vocab:
in_vocab_tokens.append(token)
else:
in_vocab_tokens.append(unk_token)
return in_vocab_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text): # pylint: disable=doc-string-with-all-args, doc-string-with-returns
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self,
vocab,
unk_token="[UNK]",
max_input_chars_per_word=100,
use_sentence_piece_vocab=False):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
self.use_sentence_piece_vocab = use_sentence_piece_vocab
def tokenize(self, text): # pylint: disable=doc-string-with-all-args
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start == 0 and self.use_sentence_piece_vocab:
substr = u'\u2581' + substr
if start > 0 and not self.use_sentence_piece_vocab:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
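A hedged usage sketch for the `FullTokenizer` defined above; the vocabulary path is a placeholder for a BERT-style vocab file.
```
# Sketch only: end-to-end tokenization with FullTokenizer defined above.
# "vocab.txt" is a placeholder path to a BERT-style vocabulary file.
tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
tokens = tokenizer.tokenize(u"Paddle Serving 部署")   # basic + wordpiece tokenization
ids = tokenizer.convert_tokens_to_ids(tokens)         # out-of-vocab pieces map to [UNK]
print(tokens, ids)
```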
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Paddle Serving App version string """
serving_app_version = "0.0.1"
......@@ -79,6 +79,8 @@ class Client(object):
self.feed_names_to_idx_ = {}
self.rpath()
self.pid = os.getpid()
self.producers = []
self.consumer = None
def rpath(self):
lib_path = os.path.dirname(paddle_serving_client.__file__)
......@@ -137,7 +139,6 @@ class Client(object):
predictor_sdk = SDKConfig()
predictor_sdk.set_server_endpoints(endpoints)
sdk_desc = predictor_sdk.gen_desc()
print(sdk_desc)
self.client_handle_.create_predictor_by_desc(sdk_desc.SerializeToString(
))
......@@ -155,44 +156,26 @@ class Client(object):
raise SystemExit("The shape of feed tensor {} not match.".format(
key))
def predict(self, feed={}, fetch=[]):
int_slot = []
float_slot = []
int_feed_names = []
float_feed_names = []
fetch_names = []
for key in feed:
self.shape_check(feed, key)
if key not in self.feed_names_:
continue
if self.feed_types_[key] == int_type:
int_feed_names.append(key)
int_slot.append(feed[key])
elif self.feed_types_[key] == float_type:
float_feed_names.append(key)
float_slot.append(feed[key])
for key in fetch:
if key in self.fetch_names_:
fetch_names.append(key)
def predict(self, feed=None, fetch=None):
if feed is None or fetch is None:
raise ValueError("You should specify feed and fetch for prediction")
fetch_list = []
if isinstance(fetch, str):
fetch_list = [fetch]
elif isinstance(fetch, list):
fetch_list = fetch
else:
raise ValueError("fetch only accepts string and list of string")
feed_batch = []
if isinstance(feed, dict):
feed_batch.append(feed)
elif isinstance(feed, list):
feed_batch = feed
else:
raise ValueError("feed only accepts dict and list of dict")
ret = self.client_handle_.predict(float_slot, float_feed_names,
int_slot, int_feed_names, fetch_names,
self.result_handle_, self.pid)
result_map = {}
for i, name in enumerate(fetch_names):
if self.fetch_names_to_type_[name] == int_type:
result_map[name] = self.result_handle_.get_int64_by_name(name)[
0]
elif self.fetch_names_to_type_[name] == float_type:
result_map[name] = self.result_handle_.get_float_by_name(name)[
0]
return result_map
def batch_predict(self, feed_batch=[], fetch=[]):
int_slot_batch = []
float_slot_batch = []
int_feed_names = []
......@@ -200,28 +183,33 @@ class Client(object):
fetch_names = []
counter = 0
batch_size = len(feed_batch)
for feed in feed_batch:
for key in fetch_list:
if key in self.fetch_names_:
fetch_names.append(key)
if len(fetch_names) == 0:
raise ValueError(
"fetch names should not be empty or out of saved fetch list")
return {}
for i, feed_i in enumerate(feed_batch):
int_slot = []
float_slot = []
for key in feed:
for key in feed_i:
if key not in self.feed_names_:
continue
if self.feed_types_[key] == int_type:
if counter == 0:
if i == 0:
int_feed_names.append(key)
int_slot.append(feed[key])
elif self.feed_types_[key] == float_type:
if counter == 0:
if i == 0:
float_feed_names.append(key)
float_slot.append(feed[key])
counter += 1
float_slot.append(feed_i[key])
int_slot_batch.append(int_slot)
float_slot_batch.append(float_slot)
for key in fetch:
if key in self.fetch_names_:
fetch_names.append(key)
result_batch = self.result_handle_
res = self.client_handle_.batch_predict(
float_slot_batch, float_feed_names, int_slot_batch, int_feed_names,
......@@ -240,7 +228,10 @@ class Client(object):
single_result[key] = result_map[key][i]
result_map_batch.append(single_result)
return result_map_batch
if batch_size == 1:
return result_map_batch[0]
else:
return result_map_batch
def release(self):
self.client_handle_.destroy_predictor()
......
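Based on the updated `predict` interface shown in the diff above, a minimal client-side sketch would look like the following; the endpoint, config path, and feed names are placeholders. A dict feed returns a single fetch map, while a list of dicts returns a list of fetch maps, and `fetch` may be a string or a list of strings.
```
# Sketch only: the updated Client.predict interface (paths and endpoint are placeholders).
from paddle_serving_client import Client

client = Client()
client.load_client_config("serving_client_conf/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9292"])

x = [0.0] * 13  # dummy feature vector; real feed names/shapes come from the client config
single = client.predict(feed={"x": x}, fetch=["price"])           # dict feed  -> one fetch map
batch = client.predict(feed=[{"x": x}, {"x": x}], fetch="price")  # list feed  -> list of fetch maps
```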
......@@ -64,12 +64,19 @@ class WebService(object):
if "fetch" not in request.json:
abort(400)
feed, fetch = self.preprocess(request.json, request.json["fetch"])
if "fetch" in feed:
del feed["fetch"]
fetch_map = client_service.predict(feed=feed, fetch=fetch)
fetch_map = self.postprocess(
feed=request.json, fetch=fetch, fetch_map=fetch_map)
return fetch_map
if isinstance(feed, list):
fetch_map_batch = client_service.batch_predict(
feed_batch=feed, fetch=fetch)
fetch_map_batch = self.postprocess(
feed=request.json, fetch=fetch, fetch_map=fetch_map_batch)
result = {"result": fetch_map_batch}
elif isinstance(feed, dict):
if "fetch" in feed:
del feed["fetch"]
fetch_map = client_service.predict(feed=feed, fetch=fetch)
result = self.postprocess(
feed=request.json, fetch=fetch, fetch_map=fetch_map)
return result
app_instance.run(host="0.0.0.0",
port=self.port,
......@@ -92,5 +99,5 @@ class WebService(object):
def preprocess(self, feed={}, fetch=[]):
return feed, fetch
def postprocess(self, feed={}, fetch=[], fetch_map={}):
def postprocess(self, feed={}, fetch=[], fetch_map=None):
return fetch_map
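To make the dict-vs-list dispatch above concrete, here is a hedged sketch of a `WebService` subclass whose `preprocess` returns either a single feed dict or a list of feed dicts; the import path, class name, and feed key "x" are assumptions for illustration, following the ImageService example earlier on this page.
```
# Sketch only: preprocess may return a dict (single sample) or a list of dicts (batch),
# which the handler above routes to predict() or batch_predict() respectively.
from paddle_serving_server.web_service import WebService  # assumed import path

class EchoService(WebService):  # hypothetical subclass
    def preprocess(self, feed={}, fetch=[]):
        if isinstance(feed.get("x"), list) and feed["x"] and isinstance(feed["x"][0], list):
            return [{"x": x} for x in feed["x"]], fetch   # batch request
        return {"x": feed["x"]}, fetch                    # single request

    def postprocess(self, feed={}, fetch=[], fetch_map=None):
        return fetch_map
```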
......@@ -23,14 +23,14 @@ from multiprocessing import Pool, Process
from paddle_serving_server_gpu import serve_args
def start_gpu_card_model(gpuid, args): # pylint: disable=doc-string-missing
def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-missing
gpuid = int(gpuid)
device = "gpu"
port = args.port
if gpuid == -1:
device = "cpu"
elif gpuid >= 0:
port = args.port + gpuid
port = args.port + index
thread_num = args.thread
model = args.model
workdir = "{}_{}".format(args.workdir, gpuid)
......@@ -78,6 +78,7 @@ def start_multi_card(args): # pylint: disable=doc-string-missing
p = Process(
target=start_gpu_card_model, args=(
i,
gpu_id,
args, ))
gpu_processes.append(p)
for p in gpu_processes:
......@@ -91,15 +92,15 @@ if __name__ == "__main__":
if args.name == "None":
start_multi_card(args)
else:
from .web_service import WebService
web_service = WebService(name=args.name)
web_service.load_model_config(args.model)
gpu_ids = []
if args.gpu_ids == "":
gpu_ids = args.gpu_ids
if gpu_ids == "":
if "CUDA_VISIBLE_DEVICES" in os.environ:
gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"]
if len(gpu_ids) > 0:
gpus = [int(x) for x in gpu_ids.split(",")]
web_service.set_gpus(gpus)
web_service.set_gpus(gpu_ids)
web_service.prepare_server(
workdir=args.workdir, port=args.port, device=args.device)
web_service.run_server()
......@@ -95,12 +95,20 @@ class WebService(object):
while True:
request_json = inputqueue.get()
feed, fetch = self.preprocess(request_json, request_json["fetch"])
if "fetch" in feed:
del feed["fetch"]
fetch_map = client.predict(feed=feed, fetch=fetch)
fetch_map = self.postprocess(
feed=request_json, fetch=fetch, fetch_map=fetch_map)
self.output_queue.put(fetch_map)
if isinstance(feed, list):
fetch_map_batch = client.batch_predict(
feed_batch=feed, fetch=fetch)
fetch_map_batch = self.postprocess(
feed=request_json, fetch=fetch, fetch_map=fetch_map_batch)
result = {"result": fetch_map_batch}
elif isinstance(feed, dict):
if "fetch" in feed:
del feed["fetch"]
fetch_map = client.predict(feed=feed, fetch=fetch)
result = self.postprocess(
feed=request_json, fetch=fetch, fetch_map=fetch_map)
self.output_queue.put(result)
def _launch_web_service(self, gpu_num):
app_instance = Flask(__name__)
......@@ -186,5 +194,5 @@ class WebService(object):
def preprocess(self, feed={}, fetch=[]):
return feed, fetch
def postprocess(self, feed={}, fetch=[], fetch_map={}):
def postprocess(self, feed={}, fetch=[], fetch_map=None):
return fetch_map
......@@ -10,6 +10,6 @@ RUN yum -y install wget && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && rm get-pip.py && \
ln -s /usr/local/cuda-9.0/lib64/libcublas.so.9.0 /usr/local/cuda-9.0/lib64/libcublas.so && \
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64':$LD_LIBRARY_PATH >> /root/.bashrc && \
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> /root/.bashrc && \
ln -s /usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudnn.so.7 /usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudnn.so && \
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-9.0/targets/x86_64-linux/lib:$LD_LIBRARY_PATH' >> /root/.bashrc
FROM nvidia/cuda:9.0-cudnn7-devel-centos7
RUN yum -y install wget >/dev/null \
&& yum -y install gcc gcc-c++ make glibc-static which >/dev/null \
&& yum -y install git openssl-devel curl-devel bzip2-devel python-devel >/dev/null \
&& wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
&& tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \
&& mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \
&& echo 'export PATH=/usr/local/cmake3.2.0/bin:$PATH' >> /root/.bashrc \
&& rm cmake-3.2.0-Linux-x86_64.tar.gz \
&& wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
&& tar xzf go1.14.linux-amd64.tar.gz \
&& mv go /usr/local/go \
&& echo 'export GOROOT=/usr/local/go' >> /root/.bashrc \
&& echo 'export PATH=/usr/local/go/bin:$PATH' >> /root/.bashrc \
&& rm go1.14.linux-amd64.tar.gz \
&& yum -y install python-devel sqlite-devel >/dev/null \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
&& python get-pip.py >/dev/null \
&& pip install google protobuf setuptools wheel flask >/dev/null \
&& rm get-pip.py \
&& yum -y install epel-release && yum -y install patchelf \
&& yum clean all
#!/usr/bin/env bash
function unsetproxy() {
HTTP_PROXY_TEMP=$http_proxy
HTTPS_PROXY_TEMP=$https_proxy
unset http_proxy
unset https_proxy
}
function setproxy() {
export http_proxy=$HTTP_PROXY_TEMP
export https_proxy=$HTTPS_PROXY_TEMP
}
function init() {
source /root/.bashrc
set -v
#export http_proxy=http://172.19.56.199:3128
#export https_proxy=http://172.19.56.199:3128
export PYTHONROOT=/usr
cd Serving
export SERVING_WORKDIR=$PWD
}
function check_cmd() {
......@@ -16,18 +27,40 @@ function check_cmd() {
fi
}
function rerun() {
if [ $# -ne 2 ]; then
echo "usage: rerun command rerun-times"
exit 1
fi
local command=$1
local times=$2
for((i=1;i<=${times};i++))
do
if [ ${i} != 1 ]; then
echo "${i}-th run command: ${command}..."
fi
eval $command
if [ $? -eq 0 ]; then
return 0
fi
echo "${i}-th run(command: ${command}) failed."
done
exit 1
}
function build_client() {
local TYPE=$1
local DIRNAME=build-client-$TYPE
mkdir $DIRNAME && cd $DIRNAME
mkdir $DIRNAME # pwd: /Serving
cd $DIRNAME # pwd: /Serving/build-client-$TYPE
case $TYPE in
CPU|GPU)
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCLIENT_ONLY=ON ..
check_cmd "make -j2 >/dev/null"
pip install python/dist/paddle_serving_client* >/dev/null
-DCLIENT=ON ..
rerun "make -j2 >/dev/null" 3 # due to some network reasons, compilation may fail
pip install -U python/dist/paddle_serving_client* >/dev/null
;;
*)
echo "error type"
......@@ -35,31 +68,34 @@ function build_client() {
;;
esac
echo "build client $TYPE part finished as expected."
cd ..
rm -rf $DIRNAME
cd .. # pwd: /Serving
# rm -rf $DIRNAME
}
function build_server() {
local TYPE=$1
local DIRNAME=build-server-$TYPE
mkdir $DIRNAME && cd $DIRNAME
mkdir $DIRNAME # pwd: /Serving
cd $DIRNAME # pwd: /Serving/build-server-$TYPE
case $TYPE in
CPU)
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCLIENT_ONLY=OFF ..
check_cmd "make -j2 >/dev/null && make install -j2 >/dev/null"
pip install python/dist/paddle_serving_server* >/dev/null
-DSERVER=ON ..
rerun "make -j2 >/dev/null" 3 # due to some network reasons, compilation may fail
check_cmd "make install -j2 >/dev/null"
pip install -U python/dist/paddle_serving_server* >/dev/null
;;
GPU)
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCLIENT_ONLY=OFF \
-DSERVER=ON \
-DWITH_GPU=ON ..
check_cmd "make -j2 >/dev/null && make install -j2 >/dev/null"
pip install python/dist/paddle_serving_server* >/dev/null
rerun "make -j2 >/dev/null" 3 # due to some network reasons, compilation may fail
check_cmd "make install -j2 >/dev/null"
pip install -U python/dist/paddle_serving_server* >/dev/null
;;
*)
echo "error type"
......@@ -67,30 +103,63 @@ function build_server() {
;;
esac
echo "build server $TYPE part finished as expected."
cd ..
cd .. # pwd: /Serving
# rm -rf $DIRNAME for export SERVING_BIN
}
function kill_server_process() {
ps -ef | grep "serving" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill
}
function python_test_fit_a_line() {
cd fit_a_line
# pwd: /Serving/python/examples
cd fit_a_line # pwd: /Serving/python/examples/fit_a_line
sh get_data.sh
local TYPE=$1
echo $TYPE
export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving
case $TYPE in
CPU)
# test rpc
check_cmd "python test_server.py uci_housing_model/ > /dev/null &"
sleep 5
check_cmd "python -m paddle_serving_server.serve --model uci_housing_model --port 9393 --thread 4 > /dev/null &"
sleep 5 # wait for the server to start
check_cmd "python test_client.py uci_housing_client/serving_client_conf.prototxt > /dev/null"
ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill
kill_server_process
# test web
check_cmd "python -m paddle_serving_server.serve --model uci_housing_model/ --name uci --port 9399 --name uci > /dev/null &"
sleep 5
check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], \"fetch\":[\"price\"]}' http://127.0.0.1:9399/uci/prediction"
ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill
unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
check_cmd "python -m paddle_serving_server.serve --model uci_housing_model --name uci --port 9393 --thread 4 --name uci > /dev/null &"
sleep 5 # wait for the server to start
check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
http_code=`curl -H "Content-Type:application/json" -X POST -d '{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
setproxy # recover proxy state
kill_server_process
if [ ${http_code} -ne 200 ]; then
echo "HTTP status code -ne 200"
exit 1
fi
;;
GPU)
echo "not support yet"
exit 1
# test rpc
check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 4 --gpu_ids 0 > /dev/null &"
sleep 5 # wait for the server to start
check_cmd "python test_client.py uci_housing_client/serving_client_conf.prototxt > /dev/null"
kill_server_process
# test web
unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 2 --gpu_ids 0 --name uci > /dev/null &"
sleep 5 # wait for the server to start
check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
# check http code
http_code=`curl -H "Content-Type:application/json" -X POST -d '{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
setproxy # recover proxy state
kill_server_process
if [ ${http_code} -ne 200 ]; then
echo "HTTP status code -ne 200"
exit 1
fi
;;
*)
echo "error type"
......@@ -99,57 +168,69 @@ function python_test_fit_a_line() {
esac
echo "test fit_a_line $TYPE part finished as expected."
rm -rf image kvdb log uci_housing* work*
cd ..
unset SERVING_BIN
cd .. # pwd: /Serving/python/examples
}
function python_run_criteo_ctr_with_cube() {
# pwd: /Serving/python/examples
local TYPE=$1
yum install -y bc >/dev/null
cd criteo_ctr_with_cube
check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz"
check_cmd "tar xf ctr_cube_unittest.tar.gz"
check_cmd "mv models/ctr_client_conf ./"
check_cmd "mv models/ctr_serving_model_kv ./"
check_cmd "mv models/data ./cube/"
check_cmd "mv models/ut_data ./"
cp ../../../build-server-$TYPE/output/bin/cube* ./cube/
mkdir -p $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server/serving-cpu-avx-openblas-0.1.3/
yes | cp ../../../build-server-$TYPE/output/demo/serving/bin/serving $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server/serving-cpu-avx-openblas-0.1.3/
cd criteo_ctr_with_cube # pwd: /Serving/python/examples/criteo_ctr_with_cube
case $TYPE in
CPU)
check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz"
check_cmd "tar xf ctr_cube_unittest.tar.gz"
check_cmd "mv models/ctr_client_conf ./"
check_cmd "mv models/ctr_serving_model_kv ./"
check_cmd "mv models/data ./cube/"
check_cmd "mv models/ut_data ./"
cp ../../../build-server-$TYPE/output/bin/cube* ./cube/
mkdir -p $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server/serving-cpu-avx-openblas-0.1.3/
yes | cp ../../../build-server-$TYPE/output/demo/serving/bin/serving $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server/serving-cpu-avx-openblas-0.1.3/
sh cube_prepare.sh &
check_cmd "mkdir work_dir1 && cp cube/conf/cube.conf ./work_dir1/"
python test_server.py ctr_serving_model_kv &
check_cmd "python test_client.py ctr_client_conf/serving_client_conf.prototxt ./ut_data >score"
AUC=$(tail -n 2 score | awk 'NR==1')
VAR2="0.70"
RES=$( echo "$AUC>$VAR2" | bc )
if [[ $RES -eq 0 ]]; then
echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.70"
exit 1
fi
echo "criteo_ctr_with_cube inference auc test success"
ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill
ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill
sh cube_prepare.sh &
check_cmd "mkdir work_dir1 && cp cube/conf/cube.conf ./work_dir1/"
python test_server.py ctr_serving_model_kv &
check_cmd "python test_client.py ctr_client_conf/serving_client_conf.prototxt ./ut_data >score"
AUC=$(tail -n 2 score | awk 'NR==1')
VAR2="0.70"
RES=$( echo "$AUC>$VAR2" | bc )
if [[ $RES -eq 0 ]]; then
echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.70"
exit 1
fi
echo "criteo_ctr_with_cube inference auc test success"
ps -ef | grep "paddle_serving_server" | grep -v grep | awk '{print $2}' | xargs kill
ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill
;;
GPU)
;;
*)
echo "error type"
exit 1
;;
esac
echo "test criteo_ctr_with_cube $TYPE part finished as expected."
cd .. # pwd: /Serving/python/examples
}
function python_run_test() {
cd python/examples
local TYPE=$1
# Frist time run, downloading PaddleServing components ...
python -c "from paddle_serving_server import Server; server = Server(); server.download_bin()"
python_test_fit_a_line $TYPE
python_run_criteo_ctr_with_cube $TYPE
# Using the compiled binary
local TYPE=$1 # pwd: /Serving
cd python/examples # pwd: /Serving/python/examples
python_test_fit_a_line $TYPE # pwd: /Serving/python/examples
python_run_criteo_ctr_with_cube $TYPE # pwd: /Serving/python/examples
echo "test python $TYPE part finished as expected."
cd ../..
cd ../.. # pwd: /Serving
}
function main() {
local TYPE=$1
init
build_client $TYPE
build_server $TYPE
cd Serving/
python_run_test $TYPE
local TYPE=$1 # pwd: /
init # pwd: /Serving
build_client $TYPE # pwd: /Serving
build_server $TYPE # pwd: /Serving
python_run_test $TYPE # pwd: /Serving
echo "serving $TYPE part finished as expected."
}
......