fix_encryption and doc

81593989 · HexToString · 67889ab6 · 98a195fa · 81593989 · 81593989
26 changed file
--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -31,8 +31,7 @@ message( "WITH_GPU = ${WITH_GPU}")
 # Paddle Version should be one of:
 # latest: latest develop build
 # version number like 1.5.2
-#SET(PADDLE_VERSION "2.0.0-rc1")
+SET(PADDLE_VERSION "2.0.0")
-SET(PADDLE_VERSION "latest")
 if (WITH_GPU)
    if (WITH_TRT)
        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7-avx-mkl-trt6")
@@ -136,8 +135,8 @@ if (WITH_TRT)
 endif()
 if (WITH_LITE)
-    ADD_LIBRARY(paddle_api_full_bundled STATIC IMPORTED GLOBAL)
+    # libpaddle_full_api_shared.so is a shared object, so the imported target is
+    # declared SHARED (like xpuapi below) rather than STATIC.
+    ADD_LIBRARY(paddle_full_api_shared SHARED IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET paddle_api_full_bundled PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_api_full_bundled.a)
+    SET_PROPERTY(TARGET paddle_full_api_shared PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_full_api_shared.so)
    if (WITH_XPU)
        ADD_LIBRARY(xpuapi SHARED IMPORTED GLOBAL)
@@ -160,7 +159,7 @@ LIST(APPEND paddle_depend_libs
        xxhash cryptopp)
 if(WITH_LITE)
-    LIST(APPEND paddle_depend_libs paddle_api_full_bundled)
+    LIST(APPEND paddle_depend_libs paddle_full_api_shared)
    if(WITH_XPU)
        LIST(APPEND paddle_depend_libs xpuapi xpurt)
    endif()

--- a/core/pdcodegen/CMakeLists.txt
+++ b/core/pdcodegen/CMakeLists.txt
@@ -7,6 +7,7 @@ PROTOBUF_GENERATE_CPP(pdcodegen_proto_srcs pdcodegen_proto_hdrs
 LIST(APPEND pdcodegen_srcs ${pdcodegen_proto_srcs})
 add_executable(pdcodegen ${pdcodegen_srcs})
+add_dependencies(pdcodegen boost)
 target_link_libraries(pdcodegen protobuf ${PROTOBUF_PROTOC_LIBRARY})
 # install

--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -34,6 +34,42 @@
 **A:** http rpc 
+## 安装问题
+#### Q: pip install安装whl包过程，报错信息如下：
+```
+Collecting opencv-python
+  Using cached opencv-python-4.3.0.38.tar.gz (88.0 MB)
+  Installing build dependencies ... done
+  Getting requirements to build wheel ... error
+  ERROR: Command errored out with exit status 1:
+   command: /home/work/Python-2.7.17/build/bin/python /home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py get_requires_for_build_wheel /tmp/tmpLiweA9
+       cwd: /tmp/pip-install-_w6AUI/opencv-python
+  Complete output (22 lines):
+  Traceback (most recent call last):
+    File "/home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py", line 280, in <module>
+      main()
+    File "/home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py", line 263, in main
+      json_out['return_val'] = hook(**hook_input['kwargs'])
+    File "/home/work/Python-2.7.17/build/lib/python2.7/site-packages/pip/_vendor/pep517/_in_process.py", line 114, in get_requires_for_build_wheel
+      return hook(config_settings)
+    File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 146, in get_requires_for_build_wheel
+      return self._get_build_requires(config_settings, requirements=['wheel'])
+    File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 127, in _get_build_requires
+      self.run_setup()
+    File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 243, in run_setup
+      self).run_setup(setup_script=setup_script)
+    File "/tmp/pip-build-env-AUCbP4/overlay/lib/python2.7/site-packages/setuptools/build_meta.py", line 142, in run_setup
+      exec(compile(code, __file__, 'exec'), locals())
+    File "setup.py", line 448, in <module>
+      main()
+    File "setup.py", line 99, in main
+      % {"ext": re.escape(sysconfig.get_config_var("EXT_SUFFIX"))}
+    File "/home/work/Python-2.7.17/build/lib/python2.7/re.py", line 210, in escape
+      s = list(pattern)
+  TypeError: 'NoneType' object is not iterable
+```
+**A:** 指定opencv-python版本安装，pip install opencv-python==4.2.0.32，再安装whl包
 ## 编译问题

--- a/doc/INFERENCE_TO_SERVING.md
+++ b/doc/INFERENCE_TO_SERVING.md
@@ -2,35 +2,15 @@
 ([简体中文](./INFERENCE_TO_SERVING_CN.md)|English)
-We should know something before converting to serving model
+You can use a built-in Python module called `paddle_serving_client.convert` to convert it.
+```shell
-**inference_model_dir**：the directory of Paddle inference model
+python -m paddle_serving_client.convert --dirname ./your_inference_model_dir
-**serving_client_dir**: the directory of server side configuration
-**serving_client_dir**: the directory of client side configuration
-**model_filename**: this is model description file whose default value is `__model__`, if it's not default name, set `model_filename` explicitly
-**params_filename**: during `save_inference_model` every Variable will be save as a single file. If we have the inference model whose params are compressed into one file, please set `params_filename` explicitly
-## Example
-``` python
-from paddle_serving_client.io import inference_model_to_serving
-inference_model_dir = "your_inference_model"
-serving_client_dir = "serving_client_dir"
-serving_server_dir = "serving_server_dir"
-feed_var_names, fetch_var_names = inference_model_to_serving(
-		inference_model_dir, serving_server_dir, serving_client_dir)
-```
-if your model file and params file are both standalone, please use the following api.
-```
-feed_var_names, fetch_var_names = inference_model_to_serving(
-		inference_model_dir, serving_server_dir, serving_client_dir,
-		model_filename="model", params_filename="params")
 ```
+Arguments are the same as `inference_model_to_serving` API.
+| Argument | Type | Default | Description |
+|--------------|------|-----------|--------------------------------|
+| `dirname` | str | - | Path of saved model files. Program file and parameter files are saved in this directory. |
+| `serving_server` | str | `"serving_server"` | The path of model files and configuration files for server. |
+| `serving_client` | str | `"serving_client"` | The path of configuration files for client. |
+| `model_filename` | str | None | The name of file to load the inference program. If it is None, the default filename `__model__` will be used. |
+| `params_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
--- a/doc/INFERENCE_TO_SERVING_CN.md
+++ b/doc/INFERENCE_TO_SERVING_CN.md
@@ -2,32 +2,15 @@
 ([English](./INFERENCE_TO_SERVING.md)|简体中文)
-## 示例
+你可以使用Paddle Serving提供的名为`paddle_serving_client.convert`的内置模块进行转换。
+```shell
-在下列代码中，我们需要知道以下信息。
+python -m paddle_serving_client.convert --dirname ./your_inference_model_dir
-**模型文件夹**：这个文件夹就是Paddle的inference_model所在的文件夹
-**serving_client_dir**: 这个文件夹是inference_model转换成Serving模型后，服务端配置的保存路径
-**serving_client_dir**: 这个文件夹是inference_model转换成Serving模型后，客户端配置的保存路径
-**模型描述文件**: 模型描述文件也就是`model_filename`默认值为`__model__`,是一个pb2文本文件，如果是别的文件名需要显式指定
-**模型参数文件**: 在`save_inference_model`阶段，默认方式是每一个Variable保存一个二进制文件，如果是这种情况就不需要做指定。如果所有参数用压缩成一个文件的形式保存，则需要显式指定`params_filename`
-``` python
-from paddle_serving_client.io import inference_model_to_serving
-inference_model_dir = "your_inference_model"
-serving_client_dir = "serving_client_dir"
-serving_server_dir = "serving_server_dir"
-feed_var_names, fetch_var_names = inference_model_to_serving(
-		inference_model_dir, serving_server_dir, serving_client_dir)
-```
-如果模型中有模型描述文件`model_filename` 和 模型参数文件`params_filename`，那么请用
-```
-feed_var_names, fetch_var_names = inference_model_to_serving(
-		inference_model_dir, serving_server_dir, serving_client_dir,
-		 model_filename="model", params_filename="params")
 ```
+模块参数与`inference_model_to_serving`接口参数相同。
+| 参数 | 类型 | 默认值 | 描述 |
+|--------------|------|-----------|--------------------------------|
+| `dirname` | str | - | 需要转换的模型文件存储路径，Program结构文件和参数文件均保存在此目录。|
+| `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server |
+| `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client |
+| `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None，则使用 `__model__` 作为默认的文件名 |
+| `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中，它才需要被指定。如果模型参数是存储在各自分离的文件中，设置它的值为None |
--- a/doc/LATEST_PACKAGES.md
+++ b/doc/LATEST_PACKAGES.md
@@ -19,7 +19,9 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-p
 #cuda 10.0
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
 #cuda10.1 with TensorRT 6
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl
+#cuda10.2 with TensorRT 7
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post102-py3-none-any.whl
 ```
 ### Python 2
 ```
@@ -27,8 +29,11 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl
 #cuda 10.0
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl
-##cuda10.1 with TensorRT 6
+#cuda10.1 with TensorRT 6
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post101-py2-none-any.whl
+#cuda10.2 with TensorRT 7
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post102-py2-none-any.whl
 ```
 ## Client

--- a/java/README.md
+++ b/java/README.md
@@ -112,10 +112,8 @@ The second is to deploy GPU Serving and Java Client separately. If they are on t
 **It should be noted that in the example, all models(not pipeline) need to use `--use_multilang` to start GRPC multi-programming language support, and the port number is 9393. If you need another port, you need to modify it in the java file**
-**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). Pipeline Serving Client for Java is released, the next version multi-thread java client example will be released**
+**Currently Serving has launched the Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). Pipeline Serving Client for Java is released.**
-**It should be noted that in the example, Java Pipeline Client code is in path /Java/Examples and /Java/src/main, and the Pipeline server code is in path /python/examples/pipeline/
+**It should be noted that in the example, the Java Pipeline Client code is in /Java/Examples and /Java/src/main, and the Pipeline server code is in /python/examples/pipeline/. The Client IP and Port (configured in java/examples/src/main/java/PipelineClientExample.java) should correspond to the Pipeline Server IP and Port (configured in config.yaml).**
-The Client IP and Port(which is configured in java/examples/src/main/java/PipelineClientExample.java) should be corresponding to the Pipeline Server IP and Port(which is configured in config.yaml)
-**
--- a/java/README_CN.md
+++ b/java/README_CN.md
@@ -111,11 +111,9 @@ java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar Pipeli
 第二种是GPU Serving和Java Client分开部署，如果在同一台宿主机，可以通过ifconfig了解对应容器的IP地址，然后在`examples/src/main/java/PaddleServingClientExample.java`当中对client.connect时的endpoint做修改，然后再编译一次。 或者在docker启动时选择 `--net=host`来绑定docker和宿主机的网络设备，这样不需要定制java代码可以直接运行。
-**需要注意的是，在示例中，所有非pipeline模型都需要使用`--use_multilang`来启动GRPC多编程语言支持，以及端口号都是9393，如果需要别的端口，需要在java文件里修改**
+**需要注意的是，在示例中，所有非Pipeline模型都需要使用`--use_multilang`来启动GRPC多编程语言支持，以及端口号都是9393，如果需要别的端口，需要在java文件里修改**
 **目前Serving已推出Pipeline模式（详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)），面向Java的Pipeline Serving Client已发布。**
-**需要注意的是，Java Pipeline Client相关示例在/Java/Examples和/Java/src/main中，对应的Pipeline server在/python/examples/pipeline/中。
+**需要注意的是，Java Pipeline Client相关示例在/Java/Examples和/Java/src/main中，对应的Pipeline server在/python/examples/pipeline/中。
-  注意java/examples/src/main/java/PipelineClientExample.java中的ip和port，需要与/python/examples/pipeline/中对应Pipeline server的config.yaml文件中配置的ip和port相对应**
+注意java/examples/src/main/java/PipelineClientExample.java中的ip和port，需要与/python/examples/pipeline/中对应Pipeline server的config.yaml文件中配置的ip和port相对应。**
--- a/paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h
+++ b/paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h
@@ -128,20 +128,22 @@ class FluidArmAnalysisCore : public FluidFamilyCore {
    config.DisableGpu();
    config.SetCpuMathLibraryNumThreads(1);
-    if (params.enable_memory_optimization()) {
+    if (params.use_lite()) {
-      config.EnableMemoryOptim();
+      config.EnableLiteEngine(PrecisionType::kFloat32, true);
    }
-    if (params.enable_memory_optimization()) {
+    if (params.use_xpu()) {
-      config.EnableMemoryOptim();
+      config.EnableXpu(2 * 1024 * 1024);
    }
-    if (params.use_lite()) {
+    if (params.enable_memory_optimization()) {
-      config.EnableLiteEngine(PrecisionType::kFloat32, true);
+      config.EnableMemoryOptim();
    }
-    if (params.use_xpu()) {
+    if (params.enable_ir_optimization()) {
-      config.EnableXpu(100);
+      config.SwitchIrOptim(true);
+    } else {
+      config.SwitchIrOptim(false);
    }
    config.SwitchSpecifyInputNames(true);
@@ -173,6 +175,14 @@ class FluidArmAnalysisDirCore : public FluidFamilyCore {
    config.SwitchSpecifyInputNames(true);
    config.SetCpuMathLibraryNumThreads(1);
+    if (params.use_lite()) {
+      config.EnableLiteEngine(PrecisionType::kFloat32, true);
+    }
+    if (params.use_xpu()) {
+      config.EnableXpu(2 * 1024 * 1024);
+    }
    if (params.enable_memory_optimization()) {
      config.EnableMemoryOptim();
    }
@@ -183,14 +193,6 @@ class FluidArmAnalysisDirCore : public FluidFamilyCore {
      config.SwitchIrOptim(false);
    }
-    if (params.use_lite()) {
-      config.EnableLiteEngine(PrecisionType::kFloat32, true);
-    }
-    if (params.use_xpu()) {
-      config.EnableXpu(100);
-    }
    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core = CreatePredictor(config);
    if (NULL == _core.get()) {

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -81,25 +81,45 @@ if (SERVER)
            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
    elseif(WITH_TRT)
+        # Map the CUDA version to the suffix handed to gen_version.py below.
+        # VERSION_EQUAL compares version components (EQUAL compares real numbers),
+        # and quoting keeps the test well-formed when CUDA_VERSION is unset.
+        if("${CUDA_VERSION}" VERSION_EQUAL "10.1")
+            set(SUFFIX 101)
+        elseif("${CUDA_VERSION}" VERSION_EQUAL "10.2")
+            set(SUFFIX 102)
+        elseif("${CUDA_VERSION}" VERSION_EQUAL "11.0")
+            set(SUFFIX 110)
+        else()
+            # Without a suffix gen_version.py would be invoked with a missing argument.
+            message(FATAL_ERROR "WITH_TRT: unsupported CUDA_VERSION '${CUDA_VERSION}' (expected 10.1, 10.2 or 11.0)")
+        endif()
        add_custom_command(
            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
            COMMAND cp -r
            ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
-            "server_gpu" trt
+            "server_gpu"  ${SUFFIX}
            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
    elseif(WITH_LITE)
-        add_custom_command(
+        if(WITH_XPU)
-            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
+            add_custom_command(
-            COMMAND cp -r
+                OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-            ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
+                COMMAND cp -r
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
+                ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
-            "server_gpu" arm 
+                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+                "server_gpu" arm-xpu 
-            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
+                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+                DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
+            add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+        else()
+            add_custom_command(
+                OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
+                COMMAND cp -r
+                ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
+                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
+                "server_gpu" arm 
+                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+                DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
+            add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+        endif()
    else()
        add_custom_command(
            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp

--- a/python/examples/bert/README.md
+++ b/python/examples/bert/README.md
@@ -3,9 +3,10 @@
 ([简体中文](./README_CN.md)|English)
 In the example, a BERT model is used for semantic understanding prediction, and the text is represented as a vector, which can be used for further analysis and prediction.
+If your Python version is 3.X, replace 'pip' with 'pip3' and 'python' with 'python3' in the following commands.
 ### Getting Model
+Method 1:
 This example use model [BERT Chinese Model](https://www.paddlepaddle.org.cn/hubdetail?name=bert_chinese_L-12_H-768_A-12&en_category=SemanticModel) from [Paddlehub](https://github.com/PaddlePaddle/PaddleHub).
 Install paddlehub first
@@ -22,11 +23,13 @@ the 128 in the command above means max_seq_len in BERT model, which is the lengt
 the config file and model file for server side are saved in the folder bert_seq128_model.
 the config file generated for client side is saved in the folder bert_seq128_client.
+Method 2:
 You can also download the above model from BOS(max_seq_len=128). After decompression, the config file and model file for server side are stored in the bert_chinese_L-12_H-768_A-12_model folder, and the config file generated for client side is stored in the bert_chinese_L-12_H-768_A-12_client folder:
 ```shell
 wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz
 tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz
 ```
+If your model is bert_chinese_L-12_H-768_A-12_model, replace 'bert_seq128_model' with 'bert_chinese_L-12_H-768_A-12_model' and 'bert_seq128_client' with 'bert_chinese_L-12_H-768_A-12_client' in the following commands.
 ### Getting Dict and Sample Dataset
@@ -36,11 +39,11 @@ sh get_data.sh
 this script will download Chinese Dictionary File vocab.txt and Chinese Sample Data data-c.txt
 ### RPC Inference Service
-Run
+To start the CPU inference service, run
 ```
 python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292  #cpu inference service
 ```
-Or
+Or, to start the GPU inference service, run
 ```
 python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
 ```
@@ -59,12 +62,18 @@ head data-c.txt | python bert_client.py --model bert_seq128_client/serving_clien
 the client reads data from data-c.txt and send prediction request, the prediction is given by word vector. (Due to massive data in the word vector, we do not print it).
 ### HTTP Inference Service
+To start the CPU HTTP inference service, run
+```
+ python bert_web_service.py bert_seq128_model/ 9292 #launch cpu inference service
+```
+Or, to start the GPU HTTP inference service, run
 ```
 export CUDA_VISIBLE_DEVICES=0,1
 ```
 Set the environment variable to specify which GPUs are used; the command above means that GPU 0 and GPU 1 are used.
 ```
- python bert_web_service.py bert_seq128_model/ 9292 #launch gpu inference service
+ python bert_web_service_gpu.py bert_seq128_model/ 9292 #launch gpu inference service
 ```
 ### HTTP Inference 

--- a/python/examples/bert/README_CN.md
+++ b/python/examples/bert/README_CN.md
@@ -4,8 +4,9 @@
 示例中采用BERT模型进行语义理解预测，将文本表示为向量的形式，可以用来做进一步的分析和预测。
+若使用python的版本为3.X, 将以下命令中的pip 替换为pip3, python替换为python3.
 ### 获取模型
+方法1：
 示例中采用[Paddlehub](https://github.com/PaddlePaddle/PaddleHub)中的[BERT中文模型](https://www.paddlepaddle.org.cn/hubdetail?name=bert_chinese_L-12_H-768_A-12&en_category=SemanticModel)。
 请先安装paddlehub
 ```
@@ -19,11 +20,15 @@ python prepare_model.py 128
 生成server端配置文件与模型文件，存放在bert_seq128_model文件夹。
 生成client端配置文件，存放在bert_seq128_client文件夹。
+方法2：
 您也可以从bos上直接下载上述模型（max_seq_len=128），解压后server端配置文件与模型文件存放在bert_chinese_L-12_H-768_A-12_model文件夹，client端配置文件存放在bert_chinese_L-12_H-768_A-12_client文件夹：
 ```shell
 wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz
 tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz
 ```
+若使用bert_chinese_L-12_H-768_A-12_model模型，将下面命令中的bert_seq128_model字段替换为bert_chinese_L-12_H-768_A-12_model，bert_seq128_client字段替换为bert_chinese_L-12_H-768_A-12_client.
 ### 获取词典和样例数据
@@ -33,13 +38,15 @@ sh get_data.sh
 脚本将下载中文词典vocab.txt和中文样例数据data-c.txt
 ### 启动RPC预测服务
-执行
+启动cpu预测服务，执行
 ```
 python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292  #启动cpu预测服务
 ```
-或者
+或者，启动gpu预测服务，执行
 ```
 python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务
 ```
 ### 执行预测
@@ -51,17 +58,28 @@ pip install paddle_serving_app
 执行
 ```
 head data-c.txt | python bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
 ```
 启动client读取data-c.txt中的数据进行预测，预测结果为文本的向量表示（由于数据较多，脚本中没有将输出进行打印），server端的地址在脚本中修改。
 ### 启动HTTP预测服务
+启动cpu HTTP预测服务，执行
+```
+python bert_web_service.py bert_seq128_model/ 9292 #启动cpu预测服务
+```
+或者，启动gpu HTTP预测服务，执行
 ```
 export CUDA_VISIBLE_DEVICES=0,1
 ```
 通过环境变量指定gpu预测服务使用的gpu，示例中指定索引为0和1的两块gpu
 ```
- python bert_web_service.py bert_seq128_model/ 9292 #启动gpu预测服务
+python bert_web_service_gpu.py bert_seq128_model/ 9292 #启动gpu预测服务
 ```
 ### 执行预测
 ```

--- a/python/examples/bert/bert_web_service_gpu.py
+++ b/python/examples/bert/bert_web_service_gpu.py
+# coding=utf-8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+from paddle_serving_server_gpu.web_service import WebService
+from paddle_serving_app.reader import ChineseBertReader
+import sys
+import os
+import numpy as np
+class BertService(WebService):
+    def load(self):
+        self.reader = ChineseBertReader({
+            "vocab_file": "vocab.txt",
+            "max_seq_len": 128
+        })
+    def preprocess(self, feed=[], fetch=[]):
+        feed_res = []
+        is_batch = False
+        for ins in feed:
+            feed_dict = self.reader.process(ins["words"].encode("utf-8"))
+            for key in feed_dict.keys():
+                feed_dict[key] = np.array(feed_dict[key]).reshape(
+                    (len(feed_dict[key]), 1))
+            feed_res.append(feed_dict)
+        return feed_res, fetch, is_batch
+# Build the service, create its reader, then load the model config whose path
+# is the first command-line argument.
+bert_service = BertService(name="bert")
+bert_service.load()
+bert_service.load_model_config(sys.argv[1])
+# The second command-line argument is the port; device is fixed to "gpu".
+bert_service.prepare_server(
+    workdir="workdir", port=int(sys.argv[2]), device="gpu")
+# The RPC service is started before the web service.
+bert_service.run_rpc_service()
+bert_service.run_web_service()
--- a/python/examples/encryption/get_data.sh
+++ b/python/examples/encryption/get_data.sh
 wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing_example/encrypt.tar.gz
 tar -xzf encrypt.tar.gz
-cp -rvf ../fit_a_line/uci_housing_model .
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
-cp -rvf ../fit_a_line/uci_housing_client .
+tar -xzf uci_housing.tar.gz
--- a/python/examples/fit_a_line/README.md
+++ b/python/examples/fit_a_line/README.md
@@ -34,7 +34,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt
 Start a web service with default web service hosting modules:
 ``` shell
-python test_server.py
+python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci
 ```
 ### Client prediction

--- a/python/examples/fit_a_line/README_CN.md
+++ b/python/examples/fit_a_line/README_CN.md
@@ -35,7 +35,7 @@ python test_client.py uci_housing_client/serving_client_conf.prototxt
 通过下面的一行代码开启默认web服务：
 ``` shell
-python test_server.py
+python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --name uci
 ```
 ### 客户端预测

--- a/python/examples/pipeline/imagenet/config.yml
+++ b/python/examples/pipeline/imagenet/config.yml
@@ -20,6 +20,9 @@ op:
            #uci模型路径
            model_config: ResNet50_vd_model
+            #计算硬件类型: 空缺时由devices决定(CPU/GPU)，0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+            device_type: 1
            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
            devices: "0" # "0,1"

--- a/python/examples/pipeline/simple_web_service/config.yml
+++ b/python/examples/pipeline/simple_web_service/config.yml
@@ -20,7 +20,10 @@ op:
            #uci模型路径
            model_config: uci_housing_model
-            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
+            #计算硬件类型: 空缺时由devices决定(CPU/GPU)，0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+            device_type: 0
+            #计算硬件ID，优先由device_type决定硬件类型。devices为""或空缺时为CPU预测；当为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
            devices: "" # "0,1"
            #client类型，包括brpc, grpc和local_predictor.local_predictor不启动Serving服务，进程内预测

--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -20,6 +20,7 @@ import google.protobuf.text_format
 import numpy as np
 import argparse
 import paddle.fluid as fluid
+import paddle.inference as inference
 from .proto import general_model_config_pb2 as m_config
 from paddle.fluid.core import PaddleTensor
 from paddle.fluid.core import AnalysisConfig
@@ -125,14 +126,14 @@ class LocalPredictor(object):
        if use_lite:
            config.enable_lite_engine(
-                precision_mode = PrecisionType.Float32,
+                precision_mode=inference.PrecisionType.Float32,
-                zero_copy = True,
+                zero_copy=True,
-                passes_filter = [],
+                passes_filter=[],
-                ops_filter = []
+                ops_filter=[])
-            )
        if use_xpu:
-            config.enable_xpu(100 * 1024 * 1024)
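+            # L3 cache size in bytes (8 * 1024 * 1024 = 8MB).
+            # NOTE(review): the previous comment said 2MB - confirm the intended size.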
+            config.enable_xpu(8 * 1024 * 1024)
        self.predictor = create_paddle_predictor(config)

--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -20,7 +20,7 @@ from paddle_serving_server import OpMaker, OpSeqMaker, Server
 from paddle_serving_client import Client
 from contextlib import closing
 import socket
+import numpy as np
 from paddle_serving_server import pipeline
 from paddle_serving_server.pipeline import Op
@@ -74,8 +74,8 @@ class WebService(object):
        f = open(client_config, 'r')
        model_conf = google.protobuf.text_format.Merge(
            str(f.read()), model_conf)
-        self.feed_names = [var.alias_name for var in model_conf.feed_var]
+        self.feed_vars = {var.name: var for var in model_conf.feed_var}
-        self.fetch_names = [var.alias_name for var in model_conf.fetch_var]
+        self.fetch_vars = {var.name: var for var in model_conf.fetch_var}
    def _launch_rpc_service(self):
        op_maker = OpMaker()
@@ -211,6 +211,15 @@ class WebService(object):
    def preprocess(self, feed=[], fetch=[]):
+        # Default preprocessing: merge the per-instance dicts in `feed` into one
+        # dict of numpy arrays keyed by feed variable name.
+        # Returns (feed, fetch, is_batch) with is_batch always True.
        print("This API will be deprecated later. Please do not use it")
        is_batch = True
+        # One accumulator list per feed variable in self.feed_vars.
+        feed_dict = {}
+        for var_name in self.feed_vars.keys():
+            feed_dict[var_name] = []
+        for feed_ins in feed:
+            for key in feed_ins:
+                # Reshape to the variable's configured shape and add a leading axis
+                # of size 1. A key that is not in self.feed_vars raises KeyError.
+                feed_dict[key].append(np.array(feed_ins[key]).reshape(list(self.feed_vars[key].shape))[np.newaxis,:])
+        feed = {}
+        for key in feed_dict:
+            # Join the per-instance arrays along axis 0.
+            # NOTE(review): np.concatenate raises ValueError on an empty list, so
+            # presumably every feed variable must appear in at least one instance - confirm.
+            feed[key] = np.concatenate(feed_dict[key], axis=0) 
        return feed, fetch, is_batch
    def postprocess(self, feed=[], fetch=[], fetch_map=None):

--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -217,6 +217,7 @@ class Server(object):
        self.module_path = os.path.dirname(paddle_serving_server.__file__)
        self.cur_path = os.getcwd()
        self.use_local_bin = False
+        self.device = "cpu"
        self.gpuid = 0
        self.use_trt = False
        self.use_lite = False
@@ -284,6 +285,9 @@ class Server(object):
                "GPU not found, please check your environment or use cpu version by \"pip install paddle_serving_server\""
            )
+    def set_device(self, device="cpu"):
+        self.device = device
    def set_gpuid(self, gpuid=0):
        self.gpuid = gpuid
@@ -316,24 +320,25 @@ class Server(object):
            engine.static_optimization = False
            engine.force_update_static_cache = False
            engine.use_trt = self.use_trt
-            engine.use_lite = self.use_lite
+            if os.path.exists('{}/__params__'.format(model_config_path)):
-            engine.use_xpu = self.use_xpu
+                suffix = ""
+            else:
+                suffix = "_DIR" 
+            if device == "arm":
+                engine.use_lite = self.use_lite
+                engine.use_xpu = self.use_xpu
            if device == "cpu":
 		if use_encryption_model:
                    engine.type = "FLUID_CPU_ANALYSIS_ENCRPT"
                else:
-                    engine.type = "FLUID_CPU_ANALYSIS_DIR"
+                    engine.type = "FLUID_CPU_ANALYSIS"+suffix
            elif device == "gpu":
 		if use_encryption_model:
                    engine.type = "FLUID_GPU_ANALYSIS_ENCRPT"
                else:
-                    engine.type = "FLUID_GPU_ANALYSIS_DIR"
+                    engine.type = "FLUID_GPU_ANALYSIS"+suffix
            elif device == "arm":
-                engine.type = "FLUID_ARM_ANALYSIS_DIR"
+                engine.type = "FLUID_ARM_ANALYSIS" + suffix
            self.model_toolkit_conf.engines.extend([engine])
    def _prepare_infer_service(self, port):
@@ -434,9 +439,9 @@ class Server(object):
        for line in version_file.readlines():
            if re.match("cuda_version", line):
                cuda_version = line.split("\"")[1]
-                if cuda_version == "trt":
+                if cuda_version == "101" or cuda_version == "102" or cuda_version == "110":
                    device_version = "serving-gpu-" + cuda_version + "-"
-                elif cuda_version == "arm":
+                elif cuda_version == "arm" or cuda_version == "arm-xpu":
                    device_version = "serving-" + cuda_version + "-"
                else:
                    device_version = "serving-gpu-cuda" + cuda_version + "-"
@@ -541,7 +546,8 @@ class Server(object):
        else:
            print("Use local bin : {}".format(self.bin_path))
        #self.check_cuda()
-        if self.use_lite:
+        # Todo: merge CPU and GPU code, remove device to model_toolkit
+        if self.device == "cpu" or self.device == "arm":
            command = "{} " \
                      "-enable_model_toolkit " \
                      "-inferservice_path {} " \

--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -76,6 +76,7 @@ def start_gpu_card_model(index, gpuid, port, args):  # pylint: disable=doc-strin
        server.set_lite()
        device = "arm"
+    server.set_device(device)
    if args.use_xpu:
        server.set_xpu()

--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -81,8 +81,8 @@ class WebService(object):
        f = open(client_config, 'r')
        model_conf = google.protobuf.text_format.Merge(
            str(f.read()), model_conf)
-        self.feed_names = [var.alias_name for var in model_conf.feed_var]
+        self.feed_vars = {var.name: var for var in model_conf.feed_var}
-        self.fetch_names = [var.alias_name for var in model_conf.fetch_var]
+        self.fetch_vars = {var.name: var for var in model_conf.fetch_var}
    def set_gpus(self, gpus):
        print("This API will be deprecated later. Please do not use it")
@@ -118,6 +118,7 @@ class WebService(object):
        server.set_num_threads(thread_num)
        server.set_memory_optimize(mem_optim)
        server.set_ir_optimize(ir_optim)
+        server.set_device(device)
        if use_lite:
            server.set_lite()
@@ -289,6 +290,15 @@ class WebService(object):
    def preprocess(self, feed=[], fetch=[]):
        print("This API will be deprecated later. Please do not use it")
        is_batch = True
+        feed_dict = {}
+        for var_name in self.feed_vars.keys():
+            feed_dict[var_name] = []
+        for feed_ins in feed:
+            for key in feed_ins:
+                feed_dict[key].append(np.array(feed_ins[key]).reshape(list(self.feed_vars[key].shape))[np.newaxis,:])
+        feed = {}
+        for key in feed_dict:
+            feed[key] = np.concatenate(feed_dict[key], axis=0)
        return feed, fetch, is_batch
    def postprocess(self, feed=[], fetch=[], fetch_map=None):

--- a/python/pipeline/local_service_handler.py
+++ b/python/pipeline/local_service_handler.py
@@ -38,14 +38,12 @@ class LocalServiceHandler(object):
                 client_type='local_predictor',
                 workdir="",
                 thread_num=2,
+                 device_type=-1,
                 devices="",
                 fetch_names=None,
                 mem_optim=True,
                 ir_optim=False,
                 available_port_generator=None,
-                 use_trt=False,
-                 use_lite=False,
-                 use_xpu=False,
                 use_profile=False):
        """
        Initialization of localservicehandler
@@ -55,15 +53,14 @@ class LocalServiceHandler(object):
           client_type: brpc, grpc and local_predictor[default]
           workdir: work directory
           thread_num: number of threads, concurrent quantity.
+           device_type: target device selector. -1=not set (inferred from
+               `devices`), 0=cpu, 1=gpu, 2=TensorRT, 3=arm cpu, 4=kunlun xpu
           devices: gpu id list[gpu], "" default[cpu]
           fetch_names: get fetch names out of LocalServiceHandler in 
               local_predictor mode. fetch_names_ is compatible for Client().
           mem_optim: use memory/graphics memory optimization, True default.
           ir_optim: use calculation chart optimization, False default.
           available_port_generator: generate available ports
-           use_trt: use nvidia tensorRt engine, False default.
-           use_lite: use Paddle-Lite engine, False default.
-           use_xpu: run predict on Baidu Kunlun, False default.
           use_profile: use profiling, False default.
        Returns:
@@ -74,28 +71,61 @@ class LocalServiceHandler(object):
        self._model_config = model_config
        self._port_list = []
-        self._device_type = "cpu"
+        self._device_name = "cpu"
-        if devices == "":
+        self._use_gpu = False
-            # cpu
+        self._use_trt = False
-            devices = [-1]
+        self._use_lite = False
-            if use_lite:
+        self._use_xpu = False
-                self._device_type = "arm"
-                self._port_list.append(available_port_generator.next())
+        if device_type == -1:
-                _LOGGER.info("Model({}) will be launch in arm device. Port({})"
+            # device_type is not set, so the device is inferred from `devices`.
-                             .format(model_config, self._port_list))
+            if devices == "":
+                # CPU
+                self._device_name = "cpu"
+                devices = [-1]
            else:
-                self._device_type = "cpu"
+                # GPU
-                self._port_list.append(available_port_generator.next())
+                self._device_name = "gpu"
-                _LOGGER.info("Model({}) will be launch in cpu device. Port({})"
+                self._use_gpu = True
-                             .format(model_config, self._port_list))
+                devices = [int(x) for x in devices.split(",")]
-        else:
-            # gpu
+        elif device_type == 0:
-            self._device_type = "gpu"
+            # CPU
+            self._device_name = "cpu"
+            devices = [-1]
+        elif device_type == 1:
+            # GPU
+            self._device_name = "gpu"
+            self._use_gpu = True
+            devices = [int(x) for x in devices.split(",")]
+        elif device_type == 2:
+            # Nvidia Tensor RT
+            self._device_name = "gpu"
+            self._use_gpu = True
            devices = [int(x) for x in devices.split(",")]
+            self._use_trt = True
+        elif device_type == 3:
+            # ARM CPU
+            self._device_name = "arm"
+            devices = [-1]
+            self._use_lite = True
+        elif device_type == 4:
+            # Kunlun XPU
+            self._device_name = "arm"
+            devices = [int(x) for x in devices.split(",")]
+            self._use_lite = True
+            self._use_xpu = True
+        else:
+            _LOGGER.error(
+                "LocalServiceHandler initialization fail. device_type={}"
+                .format(device_type))
+        if client_type == "brpc" or client_type == "grpc":
            for _ in devices:
                self._port_list.append(available_port_generator.next())
-            _LOGGER.info("Model({}) will be launch in gpu device: {}. Port({})"
+            _LOGGER.info("Create ports for devices:{}. Port:{}"
-                         .format(model_config, devices, self._port_list))
+                         .format(devices, self._port_list))
        self._client_type = client_type
        self._workdir = workdir
        self._devices = devices
@@ -105,14 +135,21 @@ class LocalServiceHandler(object):
        self._local_predictor_client = None
        self._rpc_service_list = []
        self._server_pros = []
-        self._use_trt = use_trt
-        self._use_lite = use_lite
-        self._use_xpu = use_xpu
        self._use_profile = use_profile
-        self.fetch_names_ = fetch_names
+        self._fetch_names = fetch_names
+        _LOGGER.info(
+            "Models({}) will be launched by device {}. use_gpu:{}, "
+            "use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
+            "mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
+            "client_type:{}, fetch_names:{}".format(
+                model_config, self._device_name, self._use_gpu, self._use_trt,
+                self._use_lite, self._use_xpu, device_type, self._devices,
+                self._mem_optim, self._ir_optim, self._use_profile,
+                self._thread_num, self._client_type, self._fetch_names))
    def get_fetch_list(self):
-        return self.fetch_names_
+        return self._fetch_names
    def get_port_list(self):
        return self._port_list
@@ -149,22 +186,17 @@ class LocalServiceHandler(object):
        from paddle_serving_app.local_predict import LocalPredictor
        if self._local_predictor_client is None:
            self._local_predictor_client = LocalPredictor()
-            use_gpu = False
-            use_lite = False
-            if self._device_type == "gpu":
-                use_gpu = True
-            elif self._device_type == "arm":
-                use_lite = True
            self._local_predictor_client.load_model_config(
                model_path=self._model_config,
-                use_gpu=use_gpu,
+                use_gpu=self._use_gpu,
                gpu_id=self._devices[concurrency_idx],
                use_profile=self._use_profile,
                thread_num=self._thread_num,
                mem_optim=self._mem_optim,
                ir_optim=self._ir_optim,
                use_trt=self._use_trt,
-                use_lite=use_lite,
+                use_lite=self._use_lite,
                use_xpu=self._use_xpu)
        return self._local_predictor_client
@@ -174,7 +206,7 @@ class LocalServiceHandler(object):
    def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim,
                            ir_optim):
        """
-        According to _device_type, generating one CpuServer or GpuServer, and
+        According to self._device_name, generating one Cpu/Gpu/Arm Server, and
        setting the model config amd startup params.
        Args:
@@ -188,7 +220,7 @@ class LocalServiceHandler(object):
        Returns:
            server: CpuServer/GpuServer
        """
-        if self._device_type == "cpu":
+        if self._device_name == "cpu":
            from paddle_serving_server import OpMaker, OpSeqMaker, Server
            op_maker = OpMaker()
            read_op = op_maker.create('general_reader')
@@ -217,6 +249,8 @@ class LocalServiceHandler(object):
            server = Server()
            if gpuid >= 0:
                server.set_gpuid(gpuid)
+            # TODO: support arm or arm + xpu later
+            server.set_device(self._device_name)
        server.set_op_sequence(op_seq_maker.get_op_sequence())
        server.set_num_threads(thread_num)
@@ -225,9 +259,9 @@ class LocalServiceHandler(object):
        server.load_model_config(self._model_config)
        server.prepare_server(
-            workdir=workdir, port=port, device=self._device_type)
+            workdir=workdir, port=port, device=self._device_name)
-        if self.fetch_names_ is None:
+        if self._fetch_names is None:
-            self.fetch_names_ = server.get_fetch_list()
+            self._fetch_names = server.get_fetch_list()
        return server
    def _start_one_server(self, service_idx):
@@ -264,7 +298,7 @@ class LocalServiceHandler(object):
        """
        Start multiple processes and start one server in each process
        """
-        for i, service in enumerate(self._rpc_service_list):
+        for i, _ in enumerate(self._rpc_service_list):
            p = multiprocessing.Process(
                target=self._start_one_server, args=(i, ))
            p.daemon = True

--- a/python/pipeline/operator.py
+++ b/python/pipeline/operator.py
@@ -134,6 +134,7 @@ class Op(object):
        self.model_config = None
        self.workdir = None
        self.thread_num = self.concurrency
+        self.device_type = -1
        self.devices = ""
        self.mem_optim = False
        self.ir_optim = False
@@ -153,6 +154,7 @@ class Op(object):
                    self.client_type = local_service_conf.get("client_type")
                    self.workdir = local_service_conf.get("workdir")
                    self.thread_num = local_service_conf.get("thread_num")
+                    self.device_type = local_service_conf.get("device_type")
                    self.devices = local_service_conf.get("devices")
                    self.mem_optim = local_service_conf.get("mem_optim")
                    self.ir_optim = local_service_conf.get("ir_optim")
@@ -168,6 +170,7 @@ class Op(object):
                                client_type=self.client_type,
                                workdir=self.workdir,
                                thread_num=self.thread_num,
+                                device_type=self.device_type,
                                devices=self.devices,
                                mem_optim=self.mem_optim,
                                ir_optim=self.ir_optim)
@@ -188,8 +191,11 @@ class Op(object):
                                client_type=self.client_type,
                                workdir=self.workdir,
                                thread_num=self.thread_num,
+                                device_type=self.device_type,
                                devices=self.devices,
-                                fetch_names=self._fetch_names)
+                                fetch_names=self._fetch_names,
+                                mem_optim=self.mem_optim,
+                                ir_optim=self.ir_optim)
                            if self._client_config is None:
                                self._client_config = service_handler.get_client_config(
                                )
@@ -550,7 +556,8 @@ class Op(object):
                args=(concurrency_idx, self._get_input_channel(),
                      self._get_output_channels(), False, trace_buffer,
                      self.model_config, self.workdir, self.thread_num,
-                      self.devices, self.mem_optim, self.ir_optim))
+                      self.device_type, self.devices, self.mem_optim,
+                      self.ir_optim))
            p.daemon = True
            p.start()
            process.append(p)
@@ -583,7 +590,8 @@ class Op(object):
                args=(concurrency_idx, self._get_input_channel(),
                      self._get_output_channels(), True, trace_buffer,
                      self.model_config, self.workdir, self.thread_num,
-                      self.devices, self.mem_optim, self.ir_optim))
+                      self.device_type, self.devices, self.mem_optim,
+                      self.ir_optim))
            # When a process exits, it attempts to terminate
            # all of its daemonic child processes.
            t.daemon = True
@@ -991,7 +999,7 @@ class Op(object):
    def _run(self, concurrency_idx, input_channel, output_channels,
             is_thread_op, trace_buffer, model_config, workdir, thread_num,
-             devices, mem_optim, ir_optim):
+             device_type, devices, mem_optim, ir_optim):
        """
        _run() is the entry function of OP process / thread model.When client 
        type is local_predictor in process mode, the CUDA environment needs to 
@@ -1009,6 +1017,7 @@ class Op(object):
            model_config: model config path
            workdir: work directory
            thread_num: number of threads, concurrent quantity
+            device_type: target device code; -1 means infer it from `devices`
            devices: gpu id list[gpu], "" default[cpu]
            mem_optim: use memory/graphics memory optimization, True default.
            ir_optim: use calculation chart optimization, False default. 
@@ -1017,7 +1026,6 @@ class Op(object):
            None
        """
        op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
-        tid = threading.current_thread().ident
        # init ops
        profiler = None
@@ -1028,6 +1036,7 @@ class Op(object):
                    client_type="local_predictor",
                    workdir=workdir,
                    thread_num=thread_num,
+                    device_type=device_type,
                    devices=devices,
                    mem_optim=mem_optim,
                    ir_optim=ir_optim)

--- a/python/pipeline/pipeline_server.py
+++ b/python/pipeline/pipeline_server.py
@@ -234,6 +234,7 @@ class PipelineServer(object):
            "local_service_conf": {
                "workdir": "",
                "thread_num": 2,
+                "device_type": -1,
                "devices": "",
                "mem_optim": True,
                "ir_optim": False,
@@ -389,6 +390,7 @@ class ServerYamlConfChecker(object):
        default_conf = {
            "workdir": "",
            "thread_num": 2,
+            "device_type": -1,
            "devices": "",
            "mem_optim": True,
            "ir_optim": False,
@@ -397,6 +399,7 @@ class ServerYamlConfChecker(object):
            "model_config": str,
            "workdir": str,
            "thread_num": int,
+            "device_type": int,
            "devices": str,
            "mem_optim": bool,
            "ir_optim": bool,