Merge remote-tracking branch 'upstream/develop' into trt

c1d8bc9f · MRXLT · 03883072 · f78c66ba · c1d8bc9f · c1d8bc9f
35 changed files
--- a/README.md
+++ b/README.md
@@ -54,8 +54,11 @@ You may need to use a domestic mirror source (in China, you can use the Tsinghua

 If you need install modules compiled with develop branch, please download packages from [latest packages list](./doc/LATEST_PACKAGES.md) and install with `pip install` command.

-Packages of Paddle Serving support Centos 6/7 and Ubuntu 16/18, or you can use HTTP service without install client.
+Packages of paddle-serving-server and paddle-serving-server-gpu support Centos 6/7 and Ubuntu 16/18.

+Packages of paddle-serving-client and paddle-serving-app support Linux and Windows, but paddle-serving-client only supports python2.7/3.6/3.7.
+
+It is recommended to install paddle >= 1.8.2.

 <h2 align="center"> Pre-built services with Paddle Serving</h2>

@@ -121,7 +124,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `port` | int | `9292` | Exposed port of current service to users|
 | `name` | str | `""` | Service name, can be used to generate HTTP request url |
 | `model` | str | `""` | Path of paddle model directory to be served |
-| `mem_optim` | - | - | Enable memory / graphic memory optimization |
+| `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |


--- a/README_CN.md
+++ b/README_CN.md
@@ -56,7 +56,11 @@ pip install paddle-serving-server-gpu # GPU

 如果需要使用develop分支编译的安装包，请从[最新安装包列表](./doc/LATEST_PACKAGES.md)中获取下载地址进行下载，使用`pip install`命令进行安装。

-Paddle Serving安装包支持Centos 6/7和Ubuntu 16/18，或者您可以使用HTTP服务，这种情况下不需要安装客户端。
+paddle-serving-server和paddle-serving-server-gpu安装包支持Centos 6/7和Ubuntu 16/18。
+
+paddle-serving-client和paddle-serving-app安装包支持Linux和Windows，其中paddle-serving-client仅支持python2.7/3.6/3.7。
+
+推荐安装1.8.2及以上版本的paddle。

 <h2 align="center"> Paddle Serving预装的服务 </h2>

@@ -116,7 +120,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `port` | int | `9292` | Exposed port of current service to users|
 | `name` | str | `""` | Service name, can be used to generate HTTP request url |
 | `model` | str | `""` | Path of paddle model directory to be served |
-| `mem_optim` | - | - | Enable memory optimization |
+| `mem_optim_off` | - | - | Disable memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |


--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -40,8 +40,8 @@ ExternalProject_Add(
    extern_brpc
    ${EXTERNAL_PROJECT_LOG_ARGS}
    # TODO(gongwb): change to de newst repo when they changed.
-    GIT_REPOSITORY  "https://github.com/gongweibao/brpc"
-    GIT_TAG         "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4"
+    GIT_REPOSITORY  "https://github.com/wangjiawei04/brpc"
+    GIT_TAG         "6d79e0b17f25107c35b705ea58d888083f59ff47"
    PREFIX          ${BRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/doc/COMPILE.md
+++ b/doc/COMPILE.md
@@ -4,12 +4,26 @@

 ## Compilation environment requirements

- OS: CentOS 7
- GCC: 4.8.2 and later
- Golang: 1.9.2 and later
- Git：2.17.1 and later
- CMake：3.2.2 and later
- Python：2.7.2 and later / 3.6 and later
+|            module            |                           version                            |
+| :--------------------------: | :----------------------------------------------------------: |
+|              OS              |                           CentOS 7                           |
+|             gcc              |                       4.8.5 and later                        |
+|           gcc-c++            |                       4.8.5 and later                        |
+|             make             |                        3.82 and later                        |
+|            cmake             |                       3.2.0 and later                        |
+|            Python            |               2.7.2 and later / 3.6 and later                |
+|              Go              |                       1.9.2 and later                        |
+|             git              |                       2.17.1 and later                       |
+|         glibc-static         |                             2.17                             |
+|        openssl-devel         |                            1.0.2k                            |
+|         bzip2-devel          |                       1.0.6 and later                        |
+| python-devel / python3-devel |              2.7.5 and later / 3.6.8 and later               |
+|         sqlite-devel         |                       3.7.17 and later                       |
+|           patchelf           |                        0.9 and later                         |
+|           libXext            |                            1.3.3                             |
+|            libSM             |                            1.2.2                             |
+|          libXrender          |                            0.9.10                            |
+|          python-whl          | numpy>=1.12, <=1.16.4<br/>google>=2.0.3<br/>protobuf>=3.12.2<br/>grpcio-tools>=1.28.1<br/>grpcio>=1.28.1<br/>func-timeout>=4.3.5<br/>pyyaml>=1.3.0<br/>sentencepiece==0.1.92<br/>flask>=1.1.2<br/>ujson>=2.0.3 |

 It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you, see [this document](DOCKER_IMAGES.md).


--- a/doc/COMPILE_CN.md
+++ b/doc/COMPILE_CN.md
@@ -4,12 +4,26 @@

 ## 编译环境设置

- OS: CentOS 7
- GCC: 4.8.2及以上
- Golang: 1.9.2及以上
- Git：2.17.1及以上
- CMake：3.2.2及以上
- Python：2.7.2及以上 / 3.6及以上
+|             组件             |                           版本要求                           |
+| :--------------------------: | :----------------------------------------------------------: |
+|              OS              |                           CentOS 7                           |
+|             gcc              |                       4.8.5 and later                        |
+|           gcc-c++            |                       4.8.5 and later                        |
+|             make             |                        3.82 and later                        |
+|            cmake             |                       3.2.0 and later                        |
+|            Python            |               2.7.2 and later / 3.6 and later                |
+|              Go              |                       1.9.2 and later                        |
+|             git              |                       2.17.1 and later                       |
+|         glibc-static         |                             2.17                             |
+|        openssl-devel         |                            1.0.2k                            |
+|         bzip2-devel          |                       1.0.6 and later                        |
+| python-devel / python3-devel |              2.7.5 and later / 3.6.8 and later               |
+|         sqlite-devel         |                       3.7.17 and later                       |
+|           patchelf           |                        0.9 and later                         |
+|           libXext            |                            1.3.3                             |
+|            libSM             |                            1.2.2                             |
+|          libXrender          |                            0.9.10                            |
+|          python-whl          | numpy>=1.12, <=1.16.4<br/>google>=2.0.3<br/>protobuf>=3.12.2<br/>grpcio-tools>=1.28.1<br/>grpcio>=1.28.1<br/>func-timeout>=4.3.5<br/>pyyaml>=1.3.0<br/>sentencepiece==0.1.92<br/>flask>=1.1.2<br/>ujson>=2.0.3 |

 推荐使用Docker编译，我们已经为您准备好了Paddle Serving编译环境，详见[该文档](DOCKER_IMAGES_CN.md)。


--- a/doc/CONTRIBUTE.md
+++ b/doc/CONTRIBUTE.md
@@ -68,7 +68,7 @@ Paddle Serving uses this [Git branching model](http://nvie.com/posts/a-successfu

 1. Build and test

-   Users can build Paddle Serving natively on Linux, see the [BUILD steps](doc/INSTALL.md).
+   Users can build Paddle Serving natively on Linux, see the [BUILD steps](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md).

 1. Keep pulling


--- a/doc/CUBE_LOCAL.md
+++ b/doc/CUBE_LOCAL.md
@@ -6,7 +6,8 @@

 There are two examples on CTR under python / examples, they are criteo_ctr, criteo_ctr_with_cube. The former is to save the entire model during training, including sparse parameters. The latter is to cut out the sparse parameters and save them into two parts, one is the sparse parameter and the other is the dense parameter. Because the scale of sparse parameters is very large in industrial cases, reaching the order of 10 ^ 9. Therefore, it is not practical to start large-scale sparse parameter prediction on one machine. Therefore, we introduced Baidu's industrial-grade product Cube to provide the sparse parameter service for many years to provide distributed sparse parameter services.

-The local mode of Cube is different from distributed Cube, which is designed to be convenient for developers to use in experiments and demos. If there is a demand for distributed sparse parameter service, please continue reading [Distributed Cube User Guide](./Distributed_Cube) after reading this document (still developing).
+The local mode of Cube is different from distributed Cube, which is designed to be convenient for developers to use in experiments and demos. 
+<!--If there is a demand for distributed sparse parameter service, please continue reading [Distributed Cube User Guide](./Distributed_Cube) after reading this document (still developing).-->

 This document uses the original model without any compression algorithm. If there is a need for a quantitative model to go online, please read the [Quantization Storage on Cube Sparse Parameter Indexing](./CUBE_QUANT.md)


--- a/doc/CUBE_LOCAL_CN.md
+++ b/doc/CUBE_LOCAL_CN.md
@@ -6,7 +6,7 @@

 在python/examples下有两个关于CTR的示例，他们分别是criteo_ctr, criteo_ctr_with_cube。前者是在训练时保存整个模型，包括稀疏参数。后者是将稀疏参数裁剪出来，保存成两个部分，一个是稀疏参数，另一个是稠密参数。由于在工业级的场景中，稀疏参数的规模非常大，达到10^9数量级。因此在一台机器上启动大规模稀疏参数预测是不实际的，因此我们引入百度多年来在稀疏参数索引领域的工业级产品Cube，提供分布式的稀疏参数服务。

-单机版Cube是分布式Cube的弱化版本，旨在方便开发者做实验和Demo时使用。如果有分布式稀疏参数服务的需求，请在读完此文档之后，继续阅读  [稀疏参数索引服务Cube使用指南](分布式Cube)（正在建设中）。
+单机版Cube是分布式Cube的弱化版本，旨在方便开发者做实验和Demo时使用。<!--如果有分布式稀疏参数服务的需求，请在读完此文档之后，继续阅读  [稀疏参数索引服务Cube使用指南](分布式Cube)（正在建设中）。-->

 本文档使用的都是未经过任何压缩算法处理的原始模型，如果有量化模型上线需求，请阅读[Cube稀疏参数索引量化存储使用指南](./CUBE_QUANT_CN.md)


--- a/doc/DESIGN_CN.md
+++ b/doc/DESIGN_CN.md
@@ -106,7 +106,7 @@ class FluidFamilyCore {

 ![预测服务Service](predict-service.png)

-关于OP之间的依赖关系，以及通过OP组建workflow，可以参考[从零开始写一个预测服务](CREATING.md)的相关章节
+关于OP之间的依赖关系，以及通过OP组建workflow，可以参考[从零开始写一个预测服务](https://github.com/PaddlePaddle/Serving/blob/develop/doc/deprecated/CREATING.md)的相关章节

 服务端实例透视图


--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -13,3 +13,6 @@
  client.set_rpc_timeout_ms(100000)
  client.connect(["127.0.0.1:9393"])
   ```
+
+- Q: 如何使用自己编译的Paddle Serving进行预测？
+  A：通过pip命令安装自己编译出的whl包，并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。
--- a/doc/LATEST_PACKAGES.md
+++ b/doc/LATEST_PACKAGES.md
@@ -3,45 +3,51 @@
 ## CPU server
 ### Python 3
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.3.1-py3-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.3.2-py3-none-any.whl
 ```

 ### Python 2
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.3.1-py2-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.3.2-py2-none-any.whl
 ```

 ## GPU server
 ### Python 3
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.1-py3-none-any.whl
+#cuda 9.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post9-py3-none-any.whl
+#cuda 10.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post10-py3-none-any.whl
 ```
 ### Python 2
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.1-py2-none-any.whl
+#cuda 9.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post9-py2-none-any.whl
+#cuda 10.0
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post10-py2-none-any.whl
 ```

 ## Client
 ### Python 3.7
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.1-cp37-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.2-cp37-none-any.whl
 ```
 ### Python 3.6
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.1-cp36-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.2-cp36-none-any.whl
 ```
 ### Python 2.7
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.1-cp27-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.2-cp27-none-any.whl
 ```

 ## App
 ### Python 3
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.1.1-py3-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.1.2-py3-none-any.whl
 ```

 ### Python 2
 ```
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.1.1-py2-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.1.2-py2-none-any.whl
 ```
--- a/doc/PERFORMANCE_OPTIM.md
+++ b/doc/PERFORMANCE_OPTIM.md
@@ -14,7 +14,35 @@ Under the same conditions, the communication time of the HTTP prediction service

 Parameters for performance optimization:

+The memory/graphic memory optimization option is enabled by default in Paddle Serving. It reduces memory/graphic memory usage and usually does not affect performance. If you need to turn it off, pass --mem_optim_off on the command line.
+
+ir_optim can optimize the calculation graph and increase the inference speed. It is turned off by default and can be turned on with --ir_optim on the command line.
+
 | Parameters | Type | Default | Description                                                  |
 | ---------- | ---- | ------- | ------------------------------------------------------------ |
-| mem_optim  | - | - | Enable memory / graphic memory optimization                                   |
+| mem_optim_off  | - | - | Disable memory / graphic memory optimization                                   |
 | ir_optim   | - | -  | Enable analysis and optimization of calculation graph,including OP fusion, etc |
+
+
+When the prediction service is started from Python code, the two options above are set through the following APIs:
+
+RPC Service
+```
+from paddle_serving_server import Server
+server = Server()
+...
+server.set_memory_optimize(mem_optim)
+server.set_ir_optimize(ir_optim)
+...
+```
+
+HTTP Service
+```
+from paddle_serving_server import WebService
+class NewService(WebService):
+...
+new_service = NewService(name="new")
+...
+new_service.prepare_server(mem_optim=True, ir_optim=False)
+...
+```
--- a/doc/PERFORMANCE_OPTIM_CN.md
+++ b/doc/PERFORMANCE_OPTIM_CN.md
@@ -14,7 +14,33 @@

 性能优化相关参数：

+Paddle Serving中默认开启内存/显存优化选项，可以减少对内存/显存的占用，通常不会对性能造成影响，如果需要关闭可以在命令行启动模式中使用--mem_optim_off。
+ir_optim可以优化计算图，提升推理速度，默认关闭，在命令行启动的模式中通过--ir_optim开启。
+
 | 参数      | 类型 | 默认值 | 含义                      |
 | --------- | ---- | ------ | -------------------------------- |
-| mem_optim | - | -  | 开启内存/显存优化                |
+| mem_optim_off | - | -  | 关闭内存/显存优化                |
 | ir_optim  | - | -  | 开启计算图分析优化，包括OP融合等 |
+
+
+对于使用Python代码启动预测服务的模式，以上两个参数的接口如下：
+RPC服务
+```
+from paddle_serving_server import Server
+server = Server()
+...
+server.set_memory_optimize(mem_optim)
+server.set_ir_optimize(ir_optim)
+...
+```
+
+HTTP服务
+```
+from paddle_serving_server import WebService
+class NewService(WebService):
+...
+new_service = NewService(name="new")
+...
+new_service.prepare_server(mem_optim=True, ir_optim=False)
+...
+```
--- a/doc/deprecated/CREATING.md
+++ b/doc/deprecated/CREATING.md
@@ -77,7 +77,7 @@ service ImageClassifyService {

 关于Serving端的配置的详细信息，可以参考[Serving端配置](SERVING_CONFIGURE.md)

-以下配置文件将ReaderOP, ClassifyOP和WriteJsonOP串联成一个workflow (关于OP/workflow等概念，可参考[设计文档](DESIGN.md))
+以下配置文件将ReaderOP, ClassifyOP和WriteJsonOP串联成一个workflow (关于OP/workflow等概念，可参考[设计文档](../DESIGN.md))

 - 配置文件示例：


--- a/doc/deprecated/CTR_PREDICTION.md
+++ b/doc/deprecated/CTR_PREDICTION.md
@@ -26,7 +26,7 @@

 第1) - 第5)步裁剪完毕后的模型网络配置如下：

-![Pruned CTR prediction network](pruned-ctr-network.png)
+![Pruned CTR prediction network](../pruned-ctr-network.png)


 整个裁剪过程具体说明如下：

--- a/doc/deprecated/DOCKER.md
+++ b/doc/deprecated/DOCKER.md
-# Docker compilation environment preparation
-
-([简体中文](./DOCKER_CN.md)|English)
-
-## Environmental requirements
-
-+ Docker is installed on the development machine.
-+ Compiling the GPU version requires nvidia-docker.
-
-## Dockerfile
-
-[CPU Version Dockerfile](../tools/Dockerfile)
-
-[GPU Version Dockerfile](../tools/Dockerfile.gpu)
-
-## Instructions
-
-### Building Docker Image
-
-Create a new directory and copy the Dockerfile to this directory.
-
-Run
-
-```bash
-docker build -t serving_compile:cpu .
-```
-
-Or
-
-```bash
-docker build -t serving_compile:cuda9 .
-```
-
-## Enter Docker Container
-
-CPU Version please run
-
-```bash
-docker run -it serving_compile:cpu bash
-```
-
-GPU Version please run
-
-```bash
-docker run -it --runtime=nvidia -it serving_compile:cuda9 bash
-```
-
-##  List of supported environments compiled by Docker
-
-The list of supported environments is as follows:：
-
-| System Environment Supported by CPU Docker Compiled Executables |
-| -------------------------- |
-| Centos6                    |
-| Centos7                    |
-| Ubuntu16.04                |
-| Ubuntu18.04               |
-
-
-
-| System Environment Supported by GPU Docker Compiled Executables |
-| ---------------------------------- |
-| Centos6_cuda9_cudnn7                       |
-| Centos7_cuda9_cudnn7                  |
-| Ubuntu16.04_cuda9_cudnn7                       |
-| Ubuntu16.04_cuda10_cudnn7                  |
-
-
-
-**Remarks:**
-+ If you cannot find libcrypto.so.10 and libssl.so.10 when you execute the pre-compiled version, you can change /usr/lib64/libssl.so.10 and /usr/lib64/libcrypto.so in the Docker environment. 10 Copy to the directory where the executable is located.
-+ CPU pre-compiled version can only be executed on CPU machines, GPU pre-compiled version can only be executed on GPU machines.
--- a/doc/deprecated/DOCKER_CN.md
+++ b/doc/deprecated/DOCKER_CN.md
-# Docker编译环境准备
-
-(简体中文|[English](./DOCKER.md))
-
-## 环境要求
-
-+ 开发机上已安装Docker。
-+ 编译GPU版本需要安装nvidia-docker。
-
-## Dockerfile文件
-
-[CPU版本Dockerfile](../tools/Dockerfile)
-
-[GPU版本Dockerfile](../tools/Dockerfile.gpu)
-
-## 使用方法
-
-### 构建Docker镜像
-
-建立新目录，复制Dockerfile内容到该目录下Dockerfile文件。
-
-执行
-
-```bash
-docker build -t serving_compile:cpu .
-```
-
-或者
-
-```bash
-docker build -t serving_compile:cuda9 .
-```
-
-## 进入Docker
-
-CPU版本请执行
-
-```bash
-docker run -it serving_compile:cpu bash
-```
-
-GPU版本请执行
-
-```bash
-docker run -it --runtime=nvidia -it serving_compile:cuda9 bash
-```
-
-## Docker编译出的可执行文件支持的环境列表
-
-经过验证的环境列表如下：
-
-| CPU Docker编译出的可执行文件支持的系统环境 |
-| -------------------------- |
-| Centos6                    |
-| Centos7                    |
-| Ubuntu16.04                |
-| Ubuntu18.04               |
-
-
-
-| GPU Docker编译出的可执行文件支持的系统环境 |
-| ---------------------------------- |
-| Centos6_cuda9_cudnn7                       |
-| Centos7_cuda9_cudnn7                  |
-| Ubuntu16.04_cuda9_cudnn7                       |
-| Ubuntu16.04_cuda10_cudnn7                  |
-
-
-
-**备注：** 
-+ 若执行预编译版本出现找不到libcrypto.so.10、libssl.so.10的情况，可以将Docker环境中的/usr/lib64/libssl.so.10与/usr/lib64/libcrypto.so.10复制到可执行文件所在目录。
-+ CPU预编译版本仅可在CPU机器上执行，GPU预编译版本仅可在GPU机器上执行。
--- a/doc/deprecated/GETTING_STARTED.md
+++ b/doc/deprecated/GETTING_STARTED.md
-
-# Getting Started
-
-请先按照[编译安装说明](INSTALL.md)完成编译
-
-## 运行示例
-说明：Imagenet图像分类模型，默认采用CPU模式（GPU模式当前版本暂未提供支持）
-
-Step1：启动Server端：
-```shell
-cd /path/to/paddle-serving/output/demo/serving/ && ./bin/serving &
-```
-
-默认启动后日志写在./log/下，可tail日志查看serving端接收请求的日志：
-```shell
-tail -f log/serving.INFO
-```
-
-Step2：启动Client端：
-```shell
-cd path/to/paddle-serving/output/demo/client/image_classification &&  ./bin/ximage &
-```
-
-默认启动后日志写在./log/下，可tail日志查看分类结果：
-```shell
-tail -f log/ximage.INFO
-```
--- a/doc/deprecated/HTTP_INTERFACE.md
+++ b/doc/deprecated/HTTP_INTERFACE.md
@@ -72,7 +72,7 @@ for i in range(0, len(samples) - BATCH_SIZE, BATCH_SIZE):
        print e.reason
 ```

-完整示例请参考[text_classification.py](../demo-client/python/text_classification.py)
+完整示例请参考[text_classification.py](https://github.com/PaddlePaddle/Serving/blob/develop/tools/cpp_examples/demo-client/python/text_classification.py)

 ## 3. PHP访问HTTP Serving

@@ -128,4 +128,4 @@ for ($i = 0; $i < count($samples) - BATCH_SIZE; $i += BATCH_SIZE) {
 curl_close($ch);
 ```

-完整代码请参考[text_classification.php](../demo-client/php/text_classification.php)
+完整代码请参考[text_classification.php](https://github.com/PaddlePaddle/Serving/blob/develop/tools/cpp_examples/demo-client/php/text_classification.php)
--- a/doc/deprecated/INDEX.md
+++ b/doc/deprecated/INDEX.md
-[Design](DESIGN.md)
-
-[Installation](INSTALL.md)
-
-[Getting Started](GETTING_STARTED.md)
-
-[Creating a Prediction Service](CREATING.md)
-
-[Client Configure](CLIENT_CONFIGURE.md)
-
-[Server Side Configuration](SERVING_CONFIGURE.md)
-
-[How to Configure a Clustered Service](CLUSTERING.md)
-
-[Multiple Serving Instances over Single GPU Card](MULTI_SERVING_OVER_SINGLE_GPU_CARD.md)
-
-[Benchmarking](BENCHMARKING.md)
-
-[GPU Benchmarking](GPU_BENCHMARKING.md)
-
-[FAQ](FAQ.md)
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -83,6 +83,7 @@ if (SERVER)
            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
            COMMAND cp -r
            ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
+            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} paddle_serving_server_gpu/gen_cuda_version.py ${CUDA_VERSION_MAJOR}
            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)

--- a/python/examples/grpc_impl_example/criteo_ctr_with_cube/README_CN.md
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/README_CN.md
-## 带稀疏参数索引服务的CTR预测服务
-
-该样例是为了展示gRPC Server 端 `load_model_config` 函数，在这个例子中，bRPC Server 端与 bRPC Client 端的配置文件是不同的（bPRC Client 端的数据先交给 cube，经过 cube 处理后再交给预测库）
-
-### 获取样例数据
-```
-sh get_data.sh
-```
-
-### 下载模型和稀疏参数序列文件
-```
-wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
-tar xf ctr_cube_unittest.tar.gz
-mv models/ctr_client_conf ./
-mv models/ctr_serving_model_kv ./
-mv models/data ./cube/
-```
-执行脚本后会在当前目录有ctr_server_model_kv和ctr_client_config文件夹。
-
-### 启动稀疏参数索引服务
-```
-wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
-tar xf cube_app.tar.gz
-mv cube_app/cube* ./cube/
-sh cube_prepare.sh &
-```
-
-此处，模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中，关于稀疏参数索引服务Cube的介绍，请阅读[稀疏参数索引服务Cube单机版使用指南](../../../doc/CUBE_LOCAL_CN.md)
-
-### 启动RPC预测服务，服务端线程数为4（可在test_server.py配置）
-
-```
-python test_server.py ctr_serving_model_kv ctr_client_conf/serving_client_conf.prototxt 
-```
-
-### 执行预测
-
-```
-python test_client.py ./raw_data
-```
--- a/python/examples/imagenet/resnet50_web_service.py
+++ b/python/examples/imagenet/resnet50_web_service.py
@@ -54,6 +54,7 @@ class ImageService(WebService):
        score_list = fetch_map["score"]
        result = {"label": [], "prob": []}
        for score in score_list:
+            score = score.tolist()
            max_score = max(score)
            result["label"].append(self.label_dict[score.index(max_score)]
                                   .strip().replace(",", ""))
@@ -65,7 +66,7 @@ image_service = ImageService(name="image")
 image_service.load_model_config(sys.argv[1])
 image_service.init_imagenet_setting()
 if device == "gpu":
-    image_service.set_gpus("0,1")
+    image_service.set_gpus("0")
 image_service.prepare_server(
    workdir="workdir", port=int(sys.argv[3]), device=device)
 image_service.run_rpc_service()

--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
@@ -40,7 +40,7 @@ def parse_args():  # pylint: disable=doc-string-missing
    parser.add_argument(
        "--device", type=str, default="cpu", help="Type of device")
    parser.add_argument(
-        "--mem_optim",
+        "--mem_optim_off",
        default=False,
        action="store_true",
        help="Memory optimize")
@@ -68,7 +68,7 @@ def start_standard_model():  # pylint: disable=doc-string-missing
    port = args.port
    workdir = args.workdir
    device = args.device
-    mem_optim = args.mem_optim
+    mem_optim = args.mem_optim_off is False
    ir_optim = args.ir_optim
    max_body_size = args.max_body_size
    use_mkl = args.use_mkl

--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -41,6 +41,8 @@ class WebService(object):
        server = Server()
        server.set_op_sequence(op_seq_maker.get_op_sequence())
        server.set_num_threads(16)
+        server.set_memory_optimize(self.mem_optim)
+        server.set_ir_optimize(self.ir_optim)
        server.load_model_config(self.model_config)
        server.prepare_server(
            workdir=self.workdir, port=self.port_list[0], device=self.device)
@@ -55,12 +57,19 @@ class WebService(object):
        else:
            return False

-    def prepare_server(self, workdir="", port=9393, device="cpu"):
+    def prepare_server(self,
+                       workdir="",
+                       port=9393,
+                       device="cpu",
+                       mem_optim=True,
+                       ir_optim=False):
        self.workdir = workdir
        self.port = port
        self.device = device
        default_port = 12000
        self.port_list = []
+        self.mem_optim = mem_optim
+        self.ir_optim = ir_optim
        for i in range(1000):
            if self.port_is_available(default_port + i):
                self.port_list.append(default_port + i)
@@ -83,8 +92,6 @@ class WebService(object):
            if isinstance(feed, dict) and "fetch" in feed:
                del feed["fetch"]
            fetch_map = self.client.predict(feed=feed, fetch=fetch)
-            for key in fetch_map:
-                fetch_map[key] = fetch_map[key].tolist()
            result = self.postprocess(
                feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
            result = {"result": result}
@@ -128,4 +135,6 @@ class WebService(object):
        return feed, fetch

    def postprocess(self, feed=[], fetch=[], fetch_map=None):
+        for key in fetch_map:
+            fetch_map[key] = fetch_map[key].tolist()
        return fetch_map
--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -41,7 +41,7 @@ from concurrent import futures
 def serve_args():
    parser = argparse.ArgumentParser("serve")
    parser.add_argument(
-        "--thread", type=int, default=10, help="Concurrency of server")
+        "--thread", type=int, default=2, help="Concurrency of server")
    parser.add_argument(
        "--model", type=str, default="", help="Model for serving")
    parser.add_argument(
@@ -57,7 +57,7 @@ def serve_args():
    parser.add_argument(
        "--name", type=str, default="None", help="Default service name")
    parser.add_argument(
-        "--mem_optim",
+        "--mem_optim_off",
        default=False,
        action="store_true",
        help="Memory optimize")
@@ -187,7 +187,7 @@ class Server(object):
        self.cube_config_fn = "cube.conf"
        self.workdir = ""
        self.max_concurrency = 0
-        self.num_threads = 4
+        self.num_threads = 2
        self.port = 8080
        self.reload_interval_s = 10
        self.max_body_size = 64 * 1024 * 1024
@@ -363,7 +363,15 @@ class Server(object):
    def download_bin(self):
        os.chdir(self.module_path)
        need_download = False
-        device_version = "serving-gpu-"
+
+        #acquire lock
+        version_file = open("{}/version.py".format(self.module_path), "r")
+        import re
+        for line in version_file.readlines():
+            if re.match("cuda_version", line):
+                cuda_version = line.split("\"")[1]
+                device_version = "serving-gpu-cuda" + cuda_version + "-"
+
        folder_name = device_version + serving_server_version
        tar_name = folder_name + ".tar.gz"
        bin_url = "https://paddle-serving.bj.bcebos.com/bin/" + tar_name
@@ -372,8 +380,6 @@ class Server(object):
        download_flag = "{}/{}.is_download".format(self.module_path,
                                                   folder_name)

-        #acquire lock
-        version_file = open("{}/version.py".format(self.module_path), "r")
        fcntl.flock(version_file, fcntl.LOCK_EX)

        if os.path.exists(download_flag):
@@ -385,6 +391,7 @@ class Server(object):
            os.system("touch {}/{}.is_download".format(self.module_path,
                                                       folder_name))
            print('Frist time run, downloading PaddleServing components ...')
+
            r = os.system('wget ' + bin_url + ' --no-check-certificate')
            if r != 0:
                if os.path.exists(tar_name):

--- a/python/paddle_serving_server_gpu/gen_cuda_version.py
+++ b/python/paddle_serving_server_gpu/gen_cuda_version.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+import os
+
+new_str = ""
+with open("paddle_serving_server_gpu/version.py", "r") as f:
+    for line in f.readlines():
+        if re.match("cuda_version", line):
+            line = re.sub(r"\d+", sys.argv[1], line)
+        new_str = new_str + line
+
+with open("paddle_serving_server_gpu/version.py", "w") as f:
+    f.write(new_str)
--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -34,7 +34,7 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-miss
        port = args.port + index
    thread_num = args.thread
    model = args.model
-    mem_optim = args.mem_optim
+    mem_optim = args.mem_optim_off is False
    ir_optim = args.ir_optim
    max_body_size = args.max_body_size
    use_multilang = args.use_multilang

--- a/python/paddle_serving_server_gpu/version.py
+++ b/python/paddle_serving_server_gpu/version.py
@@ -15,3 +15,4 @@
 serving_client_version = "0.3.2"
 serving_server_version = "0.3.2"
 module_proto_version = "0.3.2"
+cuda_version = "9"
--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -41,7 +41,9 @@ class WebService(object):
                            workdir="conf",
                            port=9292,
                            gpuid=0,
-                            thread_num=10):
+                            thread_num=2,
+                            mem_optim=True,
+                            ir_optim=False):
        device = "gpu"
        if gpuid == -1:
            device = "cpu"
@@ -58,6 +60,8 @@ class WebService(object):
        server = Server()
        server.set_op_sequence(op_seq_maker.get_op_sequence())
        server.set_num_threads(thread_num)
+        server.set_memory_optimize(mem_optim)
+        server.set_ir_optimize(ir_optim)

        server.load_model_config(self.model_config)
        if gpuid >= 0:
@@ -77,7 +81,13 @@ class WebService(object):
        else:
            return False

-    def prepare_server(self, workdir="", port=9393, device="gpu", gpuid=0):
+    def prepare_server(self,
+                       workdir="",
+                       port=9393,
+                       device="gpu",
+                       gpuid=0,
+                       mem_optim=True,
+                       ir_optim=False):
        self.workdir = workdir
        self.port = port
        self.device = device
@@ -94,7 +104,12 @@ class WebService(object):
            # init cpu service
            self.rpc_service_list.append(
                self.default_rpc_service(
-                    self.workdir, self.port_list[0], -1, thread_num=10))
+                    self.workdir,
+                    self.port_list[0],
+                    -1,
+                    thread_num=2,
+                    mem_optim=mem_optim,
+                    ir_optim=ir_optim))
        else:
            for i, gpuid in enumerate(self.gpus):
                self.rpc_service_list.append(
@@ -102,7 +117,9 @@ class WebService(object):
                        "{}_{}".format(self.workdir, i),
                        self.port_list[i],
                        gpuid,
-                        thread_num=10))
+                        thread_num=2,
+                        mem_optim=mem_optim,
+                        ir_optim=ir_optim))

    def _launch_web_service(self):
        gpu_num = len(self.gpus)
@@ -204,4 +221,10 @@ class WebService(object):
        return feed, fetch

    def postprocess(self, feed=[], fetch=[], fetch_map=None):
+        # Replace each fetched value with the result of its tolist() call.
+        # Iterate the dict directly: dict.iterkeys() exists only on
+        # Python 2 and raises AttributeError on Python 3. Only values are
+        # reassigned, so the key set does not change during iteration.
+        for key in fetch_map:
+            fetch_map[key] = fetch_map[key].tolist()
        return fetch_map
--- a/python/requirements.txt
+++ b/python/requirements.txt
 numpy>=1.12, <=1.16.4 ; python_version<"3.5"
+google>=2.0.3
 protobuf>=3.12.2
 grpcio-tools>=1.28.1
 grpcio>=1.28.1
 func-timeout>=4.3.5
 pyyaml>=1.3.0
+sentencepiece==0.1.92
+flask>=1.1.2
+ujson>=2.0.3
--- a/python/setup.py.server_gpu.in
+++ b/python/setup.py.server_gpu.in
@@ -41,7 +41,6 @@ REQUIRED_PACKAGES = [
    'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app'
 ]

-
 packages=['paddle_serving_server_gpu',
          'paddle_serving_server_gpu.proto',
          'paddle_serving_server_gpu.pipeline',
@@ -58,7 +57,7 @@ package_dir={'paddle_serving_server_gpu':

 setup(
    name='paddle-serving-server-gpu',
-    version=serving_server_version.replace('-', ''),
+    version=serving_server_version.replace('-', '') + '.post@CUDA_VERSION_MAJOR@',
    description=
    ('Paddle Serving Package for saved model with PaddlePaddle'),
    url='https://github.com/PaddlePaddle/Serving',

--- a/tools/Dockerfile
+++ b/tools/Dockerfile
@@ -2,7 +2,7 @@ FROM centos:7.3.1611

 RUN yum -y install wget && \
    yum -y install epel-release && yum -y install patchelf && \
-    yum -y install gcc make python-devel && \
+    yum -y install gcc gcc-c++ make python-devel && \
    yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false && \
    yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false && \
    yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && \

--- a/tools/Dockerfile.cuda10.0-cudnn7.devel
+++ b/tools/Dockerfile.cuda10.0-cudnn7.devel
-FROM nvidia/cuda:10.0-cudnn7-runtime-centos7
+FROM nvidia/cuda:10.0-cudnn7-devel-centos7

 RUN yum -y install wget >/dev/null \
    && yum -y install gcc gcc-c++ make glibc-static which  \

--- a/tools/serving_build.sh
+++ b/tools/serving_build.sh
@@ -54,7 +54,6 @@ function build_app() {
    local DIRNAME=build-app-$TYPE
    mkdir $DIRNAME # pwd: /Serving
    cd $DIRNAME # pwd: /Serving/build-app-$TYPE
-    pip install numpy sentencepiece
    case $TYPE in
        CPU|GPU)
            cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
@@ -295,8 +294,6 @@ function python_run_criteo_ctr_with_cube() {
 function python_test_bert() {
    # pwd: /Serving/python/examples
    local TYPE=$1
-    yum install -y libXext libSM libXrender >/dev/null
-    pip install ujson
    export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving
    cd bert # pwd: /Serving/python/examples/bert
    case $TYPE in