Merge branch 'develop' of github.com:HexToString/Serving into develop

0cef40a5 · HexToString · 2fd830c6 · ebe51dfb · 0cef40a5 · 0cef40a5
102 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,9 @@ set(THIRD_PARTY_BUILD_TYPE Release)
 option(WITH_AVX	    "Compile Paddle Serving with AVX intrinsics"    OFF)
 option(WITH_MKL	    "Compile Paddle Serving with MKL support."      OFF)
 option(WITH_GPU	    "Compile Paddle Serving with NVIDIA GPU"        OFF)
+option(WITH_LITE    "Compile Paddle Serving with Paddle Lite Engine"    OFF)
+option(WITH_XPU	    "Compile Paddle Serving with Baidu Kunlun"        OFF)
+option(WITH_PYTHON  "Compile Paddle Serving with Python"		    ON)
 option(CLIENT  	    "Compile Paddle Serving Client"		    OFF)
 option(SERVER	    "Compile Paddle Serving Server"		    OFF)
 option(APP          "Compile Paddle Serving App package"	    OFF)
@@ -66,40 +69,40 @@ if (NOT DEFINED WITH_MKLDNN)
    endif()
 endif()

-if (SERVER)
-include(external/jsoncpp)
-#include(external/rocksdb)
-endif()

 if (SERVER OR CLIENT)
-include(external/snappy)
-include(external/leveldb)
-include(external/zlib)
-include(external/boost)
-include(external/protobuf)
-include(external/brpc)
-include(external/gflags)
-include(external/glog)
-include(external/pybind11)
-include(external/python)
-include(generic)
-include(flags)
+    include(external/snappy)
+    include(external/leveldb)
+    include(external/zlib)
+    include(external/boost)
+    include(external/protobuf)
+    include(external/brpc)
+    include(external/gflags)
+    include(external/glog)
+    if (WITH_PYTHON)
+        include(external/pybind11)
+        include(external/python)
+    endif()
+    include(generic)
+    include(flags)
 endif()

 if (APP)
-include(external/zlib)
-include(external/boost)
-include(external/protobuf)
-include(external/gflags)
-include(external/glog)
-include(external/pybind11)
-include(external/python)
-include(generic)
+    include(external/zlib)
+    include(external/boost)
+    include(external/protobuf)
+    include(external/gflags)
+    include(external/glog)
+    include(external/pybind11)
+    include(external/python)
+    include(generic)
 endif()

 if (SERVER)
-include(external/cudnn)
-include(paddlepaddle)
+    include(external/jsoncpp)
+    #include(external/rocksdb)
+    include(external/cudnn)
+    include(paddlepaddle)
 endif()

 message("paddle serving source dir: " ${PADDLE_SERVING_SOURCE_DIR})
@@ -125,26 +128,24 @@ set(EXTERNAL_LIBS
 )

 if(SERVER)
-if(WITH_MKLML)
+    if(WITH_MKLML)
        list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
-endif()
-endif()
-
+    endif()

-if(SERVER)
-if(WITH_MKLDNN)
+    if(WITH_MKLDNN)
        list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
-endif()
-endif()
+    endif()

-if (SERVER)
    list(APPEND EXTERNAL_LIBS paddlepaddle)
 endif()

+
 add_subdirectory(core)

 if(SERVER)
-add_subdirectory(paddle_inference)
+    add_subdirectory(paddle_inference)
 endif()

-add_subdirectory(python)
+if (WITH_PYTHON)
+    add_subdirectory(python)
+endif()
--- a/README.md
+++ b/README.md
@@ -47,9 +47,10 @@ nvidia-docker exec -it test bash
 ```shell
 pip install paddle-serving-client==0.4.0 
 pip install paddle-serving-server==0.4.0 # CPU
+pip install paddle-serving-app==0.2.0
 pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
 pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
-pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
+pip install paddle-serving-server-gpu==0.4.0.100 # GPU with CUDA10.1+TensorRT
 ```

 You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source, add `-i https://pypi.tuna.tsinghua.edu.cn/simple` to pip command) to speed up the download.
@@ -66,15 +67,6 @@ For **Windows Users**, please read the document [Paddle Serving for Windows User

 <h2 align="center"> Pre-built services with Paddle Serving</h2>

-<h3 align="center">Latest release</h4>
-<p align="center">
-    <a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/ocr">Optical Character Recognition</a>
-    <br>
-    <a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/faster_rcnn_model">Object Detection</a>
-    <br>
-    <a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/deeplabv3">Image Segmentation</a>
-<p>
-
 <h3 align="center">Chinese Word Segmentation</h4>

 ``` shell
@@ -133,7 +125,8 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `use_trt` (Only for trt version) | - | - | Run inference with TensorRT  |

 </center>
-``` python
+
+```python
 # A user can visit rpc service through paddle_serving_client API
 from paddle_serving_client import Client
 import numpy as np
@@ -147,13 +140,6 @@ print(fetch_map)
 ```
 Here, `client.predict` function has two arguments. `feed` is a `python dict` with model input variable alias name and values. `fetch` assigns the prediction variables to be returned from servers. In the example, the name of `"x"` and `"price"` are assigned when the servable model is saved during training.

-<h2 align="center">Some Key Features of Paddle Serving</h2>
-
- Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed **with one line command**.
- **Industrial serving features** supported, such as models management, online loading, online A/B testing etc.
- **Distributed Key-Value indexing** supported which is especially useful for large scale sparse features as model inputs.
- **Highly concurrent and efficient communication** between clients and servers supported.
- **Multiple programming languages** supported on client side, such as Golang, C++ and python.

 ### WEB service

@@ -189,6 +175,14 @@ the response is
 {"result":{"price":[[18.901151657104492]]}}
 ```

+<h2 align="center">Some Key Features of Paddle Serving</h2>
+
+- Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed **with one line command**.
+- **Industrial serving features** supported, such as models management, online loading, online A/B testing etc.
+- **Distributed Key-Value indexing** supported which is especially useful for large scale sparse features as model inputs.
+- **Highly concurrent and efficient communication** between clients and servers supported.
+- **Multiple programming languages** supported on client side, such as Golang, C++ and python.
+
 <h2 align="center">Document</h2>

 ### New to Paddle Serving
@@ -235,6 +229,10 @@ To connect with other users and contributors, welcome to join our [Slack channel

 If you want to contribute code to Paddle Serving, please reference [Contribution Guidelines](doc/CONTRIBUTE.md)

+- Special thanks to [@BeyondYourself](https://github.com/BeyondYourself) for complementing the gRPC tutorial, updating the FAQ doc, and modifying the mkdir command
+- Special thanks to [@mcl-stone](https://github.com/mcl-stone) for updating the faster_rcnn benchmark
+- Special thanks to [@cg82616424](https://github.com/cg82616424) for updating the unet benchmark and fixing an error in the resize comment
+
 ### Feedback

 For any feedback or to report a bug, please propose a [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues).

--- a/README_CN.md
+++ b/README_CN.md
@@ -49,9 +49,10 @@ nvidia-docker exec -it test bash
 ```shell
 pip install paddle-serving-client==0.4.0
 pip install paddle-serving-server==0.4.0 # CPU
+pip install paddle-serving-app==0.2.0
 pip install paddle-serving-server-gpu==0.4.0.post9 # GPU with CUDA9.0
 pip install paddle-serving-server-gpu==0.4.0.post10 # GPU with CUDA10.0
-pip install paddle-serving-server-gpu==0.4.0.trt # GPU with CUDA10.1+TensorRT
+pip install paddle-serving-server-gpu==0.4.0.100 # GPU with CUDA10.1+TensorRT
 ```

 您可能需要使用国内镜像源（例如清华源, 在pip命令中添加`-i https://pypi.tuna.tsinghua.edu.cn/simple`）来加速下载。
@@ -148,7 +149,7 @@ print(fetch_map)
 在这里，`client.predict`函数具有两个参数。 `feed`是带有模型输入变量别名和值的`python dict`。 `fetch`被要从服务器返回的预测变量赋值。 在该示例中，在训练过程中保存可服务模型时，被赋值的tensor名为`"x"`和`"price"`。

 <h3 align="center">HTTP服务</h3>
-用户也可以将数据格式处理逻辑放在服务器端进行，这样就可以直接用curl去访问服务，参考如下案例，在目录``python/examples/fit_a_line``
+用户也可以将数据格式处理逻辑放在服务器端进行，这样就可以直接用curl去访问服务，参考如下案例，在目录`python/examples/fit_a_line`

 ```python
 from paddle_serving_server.web_service import WebService
@@ -232,6 +233,10 @@ curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1

 如果您想为Paddle Serving贡献代码，请参考 [Contribution Guidelines](doc/CONTRIBUTE.md)

+- 特别感谢 [@BeyondYourself](https://github.com/BeyondYourself) 提供grpc教程，更新FAQ教程，整理文件目录。
+- 特别感谢 [@mcl-stone](https://github.com/mcl-stone) 提供faster rcnn benchmark脚本
+- 特别感谢 [@cg82616424](https://github.com/cg82616424) 提供unet benchmark脚本和修改部分注释错误
+
 ### 反馈

 如有任何反馈或是bug，请在 [GitHub Issue](https://github.com/PaddlePaddle/Serving/issues)提交

--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -22,8 +22,9 @@ set(BOOST_PROJECT       "extern_boost")
 # version of boost, say, 1.66.0, doesn't build on CentOS 6.  We
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
-set(BOOST_VER           "1.41.0")
-set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+# NOTE: BOOST_VER below is now 1.74.0, so the 1.41.0 / CentOS 6 rationale above is stale.
+set(BOOST_VER           "1.74.0")
+set(BOOST_TAR "boost_1_74_0" CACHE STRING "" FORCE)
 set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)

 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")

--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -13,6 +13,9 @@
 # limitations under the License.

 INCLUDE(ExternalProject)
+set(BRPC_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-narrowing")
+set(BRPC_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
+set(BRPC_CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-narrowing")

 find_package(OpenSSL REQUIRED) 

@@ -35,19 +38,28 @@ INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
 set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")

+if(WITH_LITE)
+    set(BRPC_REPO "https://github.com/zhangjun/incubator-brpc.git")
+    set(BRPC_TAG "master")
+else()
+    set(BRPC_REPO "https://github.com/wangjiawei04/brpc")
+    set(BRPC_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47")
+endif()
+
 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
    extern_brpc
    ${EXTERNAL_PROJECT_LOG_ARGS}
    # TODO(gongwb): change to de newst repo when they changed.
-    GIT_REPOSITORY  "https://github.com/wangjiawei04/brpc"
-    GIT_TAG         "6d79e0b17f25107c35b705ea58d888083f59ff47"
+    GIT_REPOSITORY  ${BRPC_REPO}
+    GIT_TAG         ${BRPC_TAG}
    PREFIX          ${BRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${BRPC_CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${BRPC_CMAKE_C_FLAGS}
+                    -DCMAKE_CPP_FLAGS=${BRPC_CMAKE_CPP_FLAGS}
                    -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,11 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 if(NOT APPLE)
  find_package(Threads REQUIRED)
  link_libraries(${CMAKE_THREAD_LIBS_INIT})
+  if(WITH_LITE OR WITH_XPU)
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -fopenmp -pthread -ldl -lrt")
+  else()
    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
+  endif()
 endif(NOT APPLE)

 set_property(GLOBAL PROPERTY FLUID_MODULES "")

--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -31,14 +31,20 @@ message( "WITH_GPU = ${WITH_GPU}")
 # Paddle Version should be one of:
 # latest: latest develop build
 # version number like 1.5.2
-SET(PADDLE_VERSION "1.8.4")
-
+#SET(PADDLE_VERSION "2.0.0-rc1")
+SET(PADDLE_VERSION "latest")
 if (WITH_GPU)
    if (WITH_TRT)
-        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7-avx-mkl-trt6")
    else()
        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
    endif()
+elseif (WITH_LITE)
+    if (WITH_XPU)
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm-xpu")
+    else()
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm")
+    endif()
 else()
    if (WITH_AVX)
        if (WITH_MKLML)
@@ -51,7 +57,12 @@ else()
    endif()
 endif()

-SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz")
+if(WITH_LITE)
+    SET(PADDLE_LIB_PATH "http://paddle-serving.bj.bcebos.com/inferlib/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
+else()
+    SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/paddle_inference.tgz")
+endif()
+
 MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}")
 if (WITH_GPU OR WITH_MKLML)
    if (WITH_TRT)
@@ -114,25 +125,48 @@ ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)

 ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so)
+SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a)

 if (WITH_TRT)
-ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
+    ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
+    
+    ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
+endif()

-ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
+if (WITH_LITE)
+    ADD_LIBRARY(paddle_api_full_bundled STATIC IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET paddle_api_full_bundled PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/lite/cxx/lib/libpaddle_api_full_bundled.a)
+    
+    if (WITH_XPU)
+        ADD_LIBRARY(xpuapi SHARED IMPORTED GLOBAL)
+        SET_PROPERTY(TARGET xpuapi PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpuapi.so)
+    
+        ADD_LIBRARY(xpurt SHARED IMPORTED GLOBAL)
+        SET_PROPERTY(TARGET xpurt PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xpu/lib/libxpurt.so)
+    endif()
 endif()

 ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xxhash/lib/libxxhash.a)

+ADD_LIBRARY(cryptopp STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cryptopp PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/cryptopp/lib/libcryptopp.a)
+
 LIST(APPEND external_project_dependencies paddle)

 LIST(APPEND paddle_depend_libs
-    xxhash)
+        xxhash cryptopp)
+
+if(WITH_LITE)
+    LIST(APPEND paddle_depend_libs paddle_api_full_bundled)
+    if(WITH_XPU)
+        LIST(APPEND paddle_depend_libs xpuapi xpurt)
+    endif()
+endif()

 if(WITH_TRT)
-LIST(APPEND paddle_depend_libs
+    LIST(APPEND paddle_depend_libs
        nvinfer nvinfer_plugin)
 endif()
--- a/core/configure/CMakeLists.txt
+++ b/core/configure/CMakeLists.txt
@@ -14,10 +14,6 @@ list(APPEND configure_srcs ${CMAKE_CURRENT_LIST_DIR}/src/configure_parser.cpp)
 add_library(configure ${configure_srcs})
 add_dependencies(configure brpc)

-add_executable(test_configure
-        ${CMAKE_CURRENT_LIST_DIR}/tests/test_configure.cpp)
-target_link_libraries(test_configure configure protobuf)
-
 install(TARGETS configure 
        ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
        )
@@ -31,6 +27,8 @@ install(FILES ${inc}
        DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure)
 endif()

+if (WITH_PYTHON)
+
 py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.proto)
 add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
@@ -45,19 +43,19 @@ add_custom_target(sdk_configure_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E to
 add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
 add_custom_command(TARGET sdk_configure_py_proto POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-		COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+		COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
 		COMMENT "Copy generated python proto into directory paddle_serving_client/proto."
 		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

 add_custom_command(TARGET general_model_config_py_proto POST_BUILD
                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
                COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

 add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
                COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
@@ -65,7 +63,7 @@ endif()
 if (APP)
 add_custom_command(TARGET general_model_config_py_proto POST_BUILD
                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
-                COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
+                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
                COMMENT "Copy generated general_model_config proto file into directory paddle_serving_app/proto."
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
@@ -74,29 +72,29 @@ if (SERVER)
 py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
 add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(server_config_py_proto server_config_py_proto_init)
-if (NOT WITH_GPU)
+if (NOT WITH_GPU AND NOT WITH_LITE)
 add_custom_command(TARGET server_config_py_proto POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-		COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+		COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
 		COMMENT "Copy generated python proto into directory paddle_serving_server/proto."
 		WORKING_DIRECTORY ${CMAKE_CURRENT_BINRARY_DIR})

 add_custom_command(TARGET general_model_config_py_proto POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-		COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+		COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
 		COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
 		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

 add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-                COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
                COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 else()
 add_custom_command(TARGET server_config_py_proto POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E make_directory
        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-		COMMAND cp *.py
+		COMMAND cp -f *.py
        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
 		COMMENT "Copy generated python proto into directory
        paddle_serving_server_gpu/proto."
@@ -105,7 +103,7 @@ add_custom_command(TARGET server_config_py_proto POST_BUILD
 add_custom_command(TARGET general_model_config_py_proto POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E make_directory
        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-		COMMAND cp *.py
+		COMMAND cp -f *.py
        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
 		COMMENT "Copy generated general_model_config proto file into directory
        paddle_serving_server_gpu/proto."
@@ -113,8 +111,10 @@ add_custom_command(TARGET general_model_config_py_proto POST_BUILD

 add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-                COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
+                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
                COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server_gpu/proto."
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
 endif()
+
+endif()
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -45,6 +45,8 @@ message EngineDesc {
  optional bool force_update_static_cache = 15;
  optional bool enable_ir_optimization = 16;
  optional bool use_trt = 17;
+  optional bool use_lite = 18;
+  optional bool use_xpu = 19;
 };

 // model_toolkit conf

--- a/core/general-server/CMakeLists.txt
+++ b/core/general-server/CMakeLists.txt
@@ -6,6 +6,11 @@ add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid cube-
 if (WITH_GPU)
    add_dependencies(serving fluid_gpu_engine)
 endif()
+
+if (WITH_LITE)
+    add_dependencies(serving fluid_arm_engine)
+endif()
+
 target_include_directories(serving PUBLIC
        ${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
        )
@@ -15,6 +20,11 @@ if(WITH_GPU)
            -Wl,--no-whole-archive)
 endif()

+if(WITH_LITE)
+    target_link_libraries(serving -Wl,--whole-archive fluid_arm_engine
+            -Wl,--no-whole-archive)
+endif()
+
 target_link_libraries(serving -Wl,--whole-archive fluid_cpu_engine
        -Wl,--no-whole-archive)


--- a/core/general-server/op/general_dist_kv_infer_op.cpp
+++ b/core/general-server/op/general_dist_kv_infer_op.cpp
@@ -38,145 +38,7 @@ using baidu::paddle_serving::predictor::general_model::FetchInst;
 using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;

-int GeneralDistKVInferOp::inference() {
-  VLOG(2) << "Going to run inference";
-  const std::vector<std::string> pre_node_names = pre_names();
-  if (pre_node_names.size() != 1) {
-    LOG(ERROR) << "This op(" << op_name()
-               << ") can only have one predecessor op, but received "
-               << pre_node_names.size();
-    return -1;
-  }
-  const std::string pre_name = pre_node_names[0];
-
-  const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
-  uint64_t log_id = input_blob->GetLogId();
-  VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
-  GeneralBlob *output_blob = mutable_data<GeneralBlob>();
-
-  if (!input_blob) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed mutable depended argument, op:" << pre_name;
-    return -1;
-  }
-
-  const TensorVector *in = &input_blob->tensor_vector;
-  TensorVector *out = &output_blob->tensor_vector;
-  int batch_size = input_blob->GetBatchSize();
-  VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
-  std::vector<uint64_t> keys;
-  std::vector<rec::mcube::CubeValue> values;
-  int sparse_count = 0;
-  int dense_count = 0;
-  std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
-  size_t key_len = 0;
-  for (size_t i = 0; i < in->size(); ++i) {
-    if (in->at(i).dtype != paddle::PaddleDType::INT64) {
-      ++dense_count;
-      continue;
-    }
-    ++sparse_count;
-    size_t elem_num = 1;
-    for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
-      elem_num *= in->at(i).shape[s];
-    }
-    key_len += elem_num;
-    int64_t *data_ptr = static_cast<int64_t *>(in->at(i).data.data());
-    dataptr_size_pairs.push_back(std::make_pair(data_ptr, elem_num));
-  }
-  keys.resize(key_len);
-  int key_idx = 0;
-  for (size_t i = 0; i < dataptr_size_pairs.size(); ++i) {
-    std::copy(dataptr_size_pairs[i].first,
-              dataptr_size_pairs[i].first + dataptr_size_pairs[i].second,
-              keys.begin() + key_idx);
-    key_idx += dataptr_size_pairs[i].second;
-  }
-  Timer timeline;
-  int64_t cube_start = timeline.TimeStampUS();
-  timeline.Start();
-  rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
-  std::vector<std::string> table_names = cube->get_table_names();
-  if (table_names.size() == 0) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") cube init error or cube config not given.";
-    return -1;
-  }
-  int ret = cube->seek(table_names[0], keys, &values);
-  int64_t cube_end = timeline.TimeStampUS();
-  if (values.size() != keys.size() || values[0].buff.size() == 0) {
-    LOG(ERROR) << "(logid=" << log_id << ") cube value return null";
-  }
-  size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
-  TensorVector sparse_out;
-  sparse_out.resize(sparse_count);
-  TensorVector dense_out;
-  dense_out.resize(dense_count);
-  int cube_val_idx = 0;
-  int sparse_idx = 0;
-  int dense_idx = 0;
-  std::unordered_map<int, int> in_out_map;
-  baidu::paddle_serving::predictor::Resource &resource =
-      baidu::paddle_serving::predictor::Resource::instance();
-  std::shared_ptr<PaddleGeneralModelConfig> model_config =
-      resource.get_general_model_config();
-  for (size_t i = 0; i < in->size(); ++i) {
-    if (in->at(i).dtype != paddle::PaddleDType::INT64) {
-      dense_out[dense_idx] = in->at(i);
-      ++dense_idx;
-      continue;
-    }
-
-    sparse_out[sparse_idx].lod.resize(in->at(i).lod.size());
-    for (size_t x = 0; x < sparse_out[sparse_idx].lod.size(); ++x) {
-      sparse_out[sparse_idx].lod[x].resize(in->at(i).lod[x].size());
-      std::copy(in->at(i).lod[x].begin(),
-                in->at(i).lod[x].end(),
-                sparse_out[sparse_idx].lod[x].begin());
-    }
-    sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
-    sparse_out[sparse_idx].shape.push_back(
-        sparse_out[sparse_idx].lod[0].back());
-    sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
-    sparse_out[sparse_idx].name = model_config->_feed_name[i];
-    sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
-                                       EMBEDDING_SIZE * sizeof(float));
-    float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
-    for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
-      float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
-      memcpy(data_ptr,
-             values[cube_val_idx].buff.data(),
-             values[cube_val_idx].buff.size());
-      cube_val_idx++;
-    }
-    ++sparse_idx;
-  }
-  TensorVector infer_in;
-  infer_in.insert(infer_in.end(), dense_out.begin(), dense_out.end());
-  infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end());
-
-  output_blob->SetBatchSize(batch_size);
-  output_blob->SetLogId(log_id);
-
-  VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
-
-  int64_t start = timeline.TimeStampUS();
-
-  if (InferManager::instance().infer(
-          engine_name().c_str(), &infer_in, out, batch_size)) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed do infer in fluid model: " << engine_name();
-    return -1;
-  }
-
-  int64_t end = timeline.TimeStampUS();
-  CopyBlobInfo(input_blob, output_blob);
-  AddBlobInfo(output_blob, cube_start);
-  AddBlobInfo(output_blob, cube_end);
-  AddBlobInfo(output_blob, start);
-  AddBlobInfo(output_blob, end);
-  return 0;
-}
+int GeneralDistKVInferOp::inference() { return 0; }
 DEFINE_OP(GeneralDistKVInferOp);

 }  // namespace serving

--- a/core/general-server/op/general_dist_kv_quant_infer_op.cpp
+++ b/core/general-server/op/general_dist_kv_quant_infer_op.cpp
@@ -188,21 +188,6 @@ int GeneralDistKVQuantInferOp::inference() {

  VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;

-  Timer timeline;
-  int64_t start = timeline.TimeStampUS();
-  timeline.Start();
-
-  if (InferManager::instance().infer(
-          engine_name().c_str(), &infer_in, out, batch_size)) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed do infer in fluid model: " << engine_name();
-    return -1;
-  }
-
-  int64_t end = timeline.TimeStampUS();
-  CopyBlobInfo(input_blob, output_blob);
-  AddBlobInfo(output_blob, start);
-  AddBlobInfo(output_blob, end);
  return 0;
 }
 DEFINE_OP(GeneralDistKVQuantInferOp);

--- a/core/general-server/op/general_infer_op.cpp
+++ b/core/general-server/op/general_infer_op.cpp
@@ -44,45 +44,9 @@ int GeneralInferOp::inference() {
               << pre_node_names.size();
    return -1;
  }
-  const std::string pre_name = pre_node_names[0];
-
-  const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
-  uint64_t log_id = input_blob->GetLogId();
-  VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
-  GeneralBlob *output_blob = mutable_data<GeneralBlob>();
-  output_blob->SetLogId(log_id);
-
-  if (!input_blob) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed mutable depended argument, op:" << pre_name;
+  if (InferManager::instance().infer(engine_name().c_str())) {
    return -1;
  }
-
-  const TensorVector *in = &input_blob->tensor_vector;
-  TensorVector *out = &output_blob->tensor_vector;
-
-  int batch_size = input_blob->_batch_size;
-  VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
-
-  output_blob->_batch_size = batch_size;
-
-  VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
-
-  Timer timeline;
-  int64_t start = timeline.TimeStampUS();
-  timeline.Start();
-
-  if (InferManager::instance().infer(
-          engine_name().c_str(), in, out, batch_size)) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed do infer in fluid model: " << engine_name().c_str();
-    return -1;
-  }
-
-  int64_t end = timeline.TimeStampUS();
-  CopyBlobInfo(input_blob, output_blob);
-  AddBlobInfo(output_blob, start);
-  AddBlobInfo(output_blob, end);
  return 0;
 }
 DEFINE_OP(GeneralInferOp);

--- a/core/general-server/op/general_reader_op.cpp
+++ b/core/general-server/op/general_reader_op.cpp
@@ -20,6 +20,7 @@
 #include "core/general-server/op/general_infer_helper.h"
 #include "core/predictor/framework/infer.h"
 #include "core/predictor/framework/memory.h"
+#include "core/predictor/framework/resource.h"
 #include "core/util/include/timer.h"

 namespace baidu {
@@ -32,6 +33,7 @@ using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Request;
 using baidu::paddle_serving::predictor::general_model::FeedInst;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
+using baidu::paddle_serving::predictor::InferManager;

 int conf_check(const Request *req,
               const std::shared_ptr<PaddleGeneralModelConfig> &model_config) {
@@ -71,75 +73,34 @@ int conf_check(const Request *req,

 int GeneralReaderOp::inference() {
  // read request from client
+  // TODO: only support one engine here
+  std::string engine_name = "general_infer_0";
  const Request *req = dynamic_cast<const Request *>(get_request_message());
  uint64_t log_id = req->log_id();
  int input_var_num = 0;
  std::vector<int64_t> elem_type;
  std::vector<int64_t> elem_size;
  std::vector<int64_t> capacity;
-
-  GeneralBlob *res = mutable_data<GeneralBlob>();
-  TensorVector *out = &res->tensor_vector;
-
-  res->SetLogId(log_id);
-
-  if (!res) {
-    LOG(ERROR) << "(logid=" << log_id
-               << ") Failed get op tls reader object output";
-  }
-
-  Timer timeline;
-  int64_t start = timeline.TimeStampUS();
  int var_num = req->insts(0).tensor_array_size();
-  VLOG(2) << "(logid=" << log_id << ") var num: " << var_num;
-
-  VLOG(2) << "(logid=" << log_id
-          << ") start to call load general model_conf op";
-
  baidu::paddle_serving::predictor::Resource &resource =
      baidu::paddle_serving::predictor::Resource::instance();
-
-  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
      resource.get_general_model_config();
-
-  VLOG(2) << "(logid=" << log_id << ") print general model config done.";
-
-  // TODO(guru4elephant): how to do conditional check?
-  /*
-  int ret = conf_check(req, model_config);
-  if (ret != 0) {
-    LOG(ERROR) << "model conf of server:";
-    resource.print_general_model_config(model_config);
-    return 0;
-  }
-  */
-  // package tensor
-
  elem_type.resize(var_num);
  elem_size.resize(var_num);
  capacity.resize(var_num);
-  // prepare basic information for input
  for (int i = 0; i < var_num; ++i) {
-    paddle::PaddleTensor lod_tensor;
-    elem_type[i] = req->insts(0).tensor_array(i).elem_type();
-    VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i];
-    if (elem_type[i] == 0) {  // int64
-      elem_size[i] = sizeof(int64_t);
-      lod_tensor.dtype = paddle::PaddleDType::INT64;
-    } else if (elem_type[i] == 1) {
-      elem_size[i] = sizeof(float);
-      lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
-    } else if (elem_type[i] == 2) {
-      elem_size[i] = sizeof(int32_t);
-      lod_tensor.dtype = paddle::PaddleDType::INT32;
-    }
-    // implement lod tensor here
+    std::string tensor_name = model_config->_feed_name[i];
+    VLOG(2) <<  "(logid=" << log_id << ") get tensor name: " << tensor_name;
+    auto lod_tensor = InferManager::instance().GetInputHandle(
+        engine_name.c_str(), tensor_name.c_str());
+    std::vector<std::vector<size_t>> lod;
+    std::vector<int> shape;
+    // get lod info here
    if (req->insts(0).tensor_array(i).lod_size() > 0) {
-      VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
-      lod_tensor.lod.resize(1);
+      lod.resize(1);
      for (int k = 0; k < req->insts(0).tensor_array(i).lod_size(); ++k) {
-        lod_tensor.lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
+        lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
      }
      capacity[i] = 1;
      for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
@@ -147,7 +108,7 @@ int GeneralReaderOp::inference() {
        VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
                << "]: " << dim;
        capacity[i] *= dim;
-        lod_tensor.shape.push_back(dim);
+        shape.push_back(dim);
      }
      VLOG(2) << "(logid=" << log_id << ") var[" << i
              << "] is tensor, capacity: " << capacity[i];
@@ -158,92 +119,41 @@ int GeneralReaderOp::inference() {
        VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
                << "]: " << dim;
        capacity[i] *= dim;
-        lod_tensor.shape.push_back(dim);
+        shape.push_back(dim);
      }
      VLOG(2) << "(logid=" << log_id << ") var[" << i
              << "] is tensor, capacity: " << capacity[i];
    }
-    lod_tensor.name = model_config->_feed_name[i];
-    out->push_back(lod_tensor);
-  }
-  // specify the memory needed for output tensor_vector
-  for (int i = 0; i < var_num; ++i) {
-    if (out->at(i).lod.size() == 1) {
-      int tensor_size = 0;
-      const Tensor &tensor = req->insts(0).tensor_array(i);
-      int data_len = 0;
-      if (tensor.int64_data_size() > 0) {
-        data_len = tensor.int64_data_size();
-      } else if (tensor.float_data_size() > 0) {
-        data_len = tensor.float_data_size();
-      } else if (tensor.int_data_size() > 0) {
-        data_len = tensor.int_data_size();
-      }
-      VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
-              << "]: " << data_len;
-      tensor_size += data_len;
-
-      int cur_len = out->at(i).lod[0].back();
-      VLOG(2) << "(logid=" << log_id << ") current len: " << cur_len;
-
-      int sample_len = 0;
-      if (tensor.shape_size() == 1) {
-        sample_len = data_len;
-      } else {
-        sample_len = tensor.shape(0);
-      }
-      VLOG(2) << "(logid=" << log_id << ") new len: " << cur_len + sample_len;
-      out->at(i).data.Resize(tensor_size * elem_size[i]);
-      VLOG(2) << "(logid=" << log_id << ") var[" << i
-              << "] is lod_tensor and len=" << out->at(i).lod[0].back();
-    } else {
-      out->at(i).data.Resize(capacity[i] * elem_size[i]);
-      VLOG(2) << "(logid=" << log_id << ") var[" << i
-              << "] is tensor and capacity=" << capacity[i];
-    }
-  }
-
-  // fill the data into output general_blob
-  for (int i = 0; i < var_num; ++i) {
-    if (elem_type[i] == 0) {
-      int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
-      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
-              << "] is " << req->insts(0).tensor_array(i).int64_data(0);
-      int offset = 0;
+    lod_tensor->SetLoD(lod);
+    lod_tensor->Reshape(shape);
+    // insert data here
+    if (req->insts(0).tensor_array(i).elem_type() == 0) {
+      // TODO: Copy twice here, can optimize
      int elem_num = req->insts(0).tensor_array(i).int64_data_size();
+      std::vector<int64_t> data(elem_num);
+      int64_t *dst_ptr = data.data();
      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[offset + k] = req->insts(0).tensor_array(i).int64_data(k);
+        dst_ptr[k] = req->insts(0).tensor_array(i).int64_data(k);
      }
-    } else if (elem_type[i] == 1) {
-      float *dst_ptr = static_cast<float *>(out->at(i).data.data());
-      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
-              << "] is " << req->insts(0).tensor_array(i).float_data(0);
-      int offset = 0;
+      lod_tensor->CopyFromCpu(dst_ptr);
+    } else if (req->insts(0).tensor_array(i).elem_type() == 1) {
      int elem_num = req->insts(0).tensor_array(i).float_data_size();
+      std::vector<float> data(elem_num);
+      float *dst_ptr = data.data();
      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[offset + k] = req->insts(0).tensor_array(i).float_data(k);
+        dst_ptr[k] = req->insts(0).tensor_array(i).float_data(k);
      }
-    } else if (elem_type[i] == 2) {
-      int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
-      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
-              << "] is " << req->insts(0).tensor_array(i).int_data(0);
-      int offset = 0;
+      lod_tensor->CopyFromCpu(dst_ptr);
+    } else if (req->insts(0).tensor_array(i).elem_type() == 2) {
      int elem_num = req->insts(0).tensor_array(i).int_data_size();
+      std::vector<int32_t> data(elem_num);
+      int32_t *dst_ptr = data.data();
      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[offset + k] = req->insts(0).tensor_array(i).int_data(k);
+        dst_ptr[k] = req->insts(0).tensor_array(i).int_data(k);
      }
+      lod_tensor->CopyFromCpu(dst_ptr);
    }
  }
-
-  VLOG(2) << "(logid=" << log_id << ") output size: " << out->size();
-  timeline.Pause();
-  int64_t end = timeline.TimeStampUS();
-  res->p_size = 0;
-  res->_batch_size = 1;
-  AddBlobInfo(res, start);
-  AddBlobInfo(res, end);
-
-  VLOG(2) << "(logid=" << log_id << ") read data from client success";
  return 0;
 }
 DEFINE_OP(GeneralReaderOp);

--- a/core/general-server/op/general_response_op.cpp
+++ b/core/general-server/op/general_response_op.cpp
@@ -40,160 +40,60 @@ using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;

 int GeneralResponseOp::inference() {
-  const std::vector<std::string> pre_node_names = pre_names();
-  VLOG(2) << "pre node names size: " << pre_node_names.size();
-  const GeneralBlob *input_blob;
-  uint64_t log_id =
-      get_depend_argument<GeneralBlob>(pre_node_names[0])->GetLogId();
-
  const Request *req = dynamic_cast<const Request *>(get_request_message());
  // response inst with only fetch_var_names
  Response *res = mutable_data<Response>();
-
-  Timer timeline;
-  // double response_time = 0.0;
-  // timeline.Start();
-  int64_t start = timeline.TimeStampUS();
-
-  VLOG(2) << "(logid=" << log_id
-          << ") start to call load general model_conf op";
  baidu::paddle_serving::predictor::Resource &resource =
      baidu::paddle_serving::predictor::Resource::instance();
-
-  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
      resource.get_general_model_config();
-
-  VLOG(2) << "(logid=" << log_id
-          << ") max body size : " << brpc::fLU64::FLAGS_max_body_size;
-
-  std::vector<int> fetch_index;
-  fetch_index.resize(req->fetch_var_names_size());
-  for (int i = 0; i < req->fetch_var_names_size(); ++i) {
-    fetch_index[i] =
-        model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
-  }
-
-  for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
-    const std::string &pre_name = pre_node_names[pi];
-    VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name
-            << " (" << pre_node_names.size() << ")";
-    input_blob = get_depend_argument<GeneralBlob>(pre_name);
-    // fprintf(stderr, "input(%s) blob address %x\n", pre_names.c_str(),
-    // input_blob);
-    if (!input_blob) {
-      LOG(ERROR) << "(logid=" << log_id
-                 << ") Failed mutable depended argument, op: " << pre_name;
-      return -1;
-    }
-
-    const TensorVector *in = &input_blob->tensor_vector;
-
+  std::vector<int> capacity(req->fetch_var_names_size(), 1);
+  std::string engine_name = "general_infer_0";
  ModelOutput *output = res->add_outputs();
-    // To get the order of model return values
-    output->set_engine_name(pre_name);
  FetchInst *fetch_inst = output->add_insts();
-
-    for (auto &idx : fetch_index) {
-      Tensor *tensor = fetch_inst->add_tensor_array();
-      if (model_config->_is_lod_fetch[idx]) {
-        VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
-                << model_config->_fetch_name[idx] << " is lod_tensor";
-        for (int k = 0; k < in->at(idx).shape.size(); ++k) {
-          VLOG(2) << "(logid=" << log_id << ") shape[" << k
-                  << "]: " << in->at(idx).shape[k];
-          tensor->add_shape(in->at(idx).shape[k]);
-        }
-      } else {
-        VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
-                << model_config->_fetch_name[idx] << " is tensor";
-        for (int k = 0; k < in->at(idx).shape.size(); ++k) {
-          VLOG(2) << "(logid=" << log_id << ") shape[" << k
-                  << "]: " << in->at(idx).shape[k];
-          tensor->add_shape(in->at(idx).shape[k]);
-        }
-      }
-    }
-
-    int var_idx = 0;
-    for (auto &idx : fetch_index) {
-      int cap = 1;
-      for (int j = 0; j < in->at(idx).shape.size(); ++j) {
-        cap *= in->at(idx).shape[j];
-      }
-
  FetchInst *fetch_p = output->mutable_insts(0);
-      auto dtype = in->at(idx).dtype;
-
+  std::vector<std::string> outs =
+      InferManager::instance().GetOutputNames(engine_name.c_str());
+  for (int i = 0; i < req->fetch_var_names_size(); ++i) {
+    Tensor *tensor = fetch_inst->add_tensor_array();
+    std::string tensor_name = outs[i];
+    auto lod_tensor = InferManager::instance().GetOutputHandle(
+        engine_name.c_str(), tensor_name.c_str());
+    std::vector<int> shape = lod_tensor->shape();
+    for (int k = 0; k < shape.size(); ++k) {
+      capacity[i] *= shape[k];
+      tensor->add_shape(shape[k]);
+    }
+    auto dtype = lod_tensor->type();
    if (dtype == paddle::PaddleDType::INT64) {
-        VLOG(2) << "(logid=" << log_id << ") Prepare int64 var ["
-                << model_config->_fetch_name[idx] << "].";
-        int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
-        // from
-        // https://stackoverflow.com/questions/15499641/copy-a-stdvector-to-a-repeated-field-from-protobuf-with-memcpy
-        // `Swap` method is faster than `{}` method.
+      std::vector<int64_t> datas(capacity[i]);
+      int64_t *data_ptr = datas.data();
+      lod_tensor->CopyToCpu(data_ptr);
      google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
-                                                          data_ptr + cap);
-        fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap(
-            &tmp_data);
+                                                        data_ptr + capacity[i]);
+      tensor->mutable_int64_data()->Swap(&tmp_data);
    } else if (dtype == paddle::PaddleDType::FLOAT32) {
-        VLOG(2) << "(logid=" << log_id << ") Prepare float var ["
-                << model_config->_fetch_name[idx] << "].";
-        float *data_ptr = static_cast<float *>(in->at(idx).data.data());
+      std::vector<float> datas(capacity[i]);
+      float *data_ptr = datas.data();
+      lod_tensor->CopyToCpu(data_ptr);
      google::protobuf::RepeatedField<float> tmp_data(data_ptr,
-                                                        data_ptr + cap);
-        fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap(
-            &tmp_data);
+                                                      data_ptr + capacity[i]);
+      tensor->mutable_float_data()->Swap(&tmp_data);
    } else if (dtype == paddle::PaddleDType::INT32) {
-        VLOG(2) << "(logid=" << log_id << ")Prepare int32 var ["
-                << model_config->_fetch_name[idx] << "].";
-        int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data());
+      std::vector<int32_t> datas(capacity[i]);
+      int32_t *data_ptr = datas.data();
+      lod_tensor->CopyToCpu(data_ptr);
      google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
-                                                          data_ptr + cap);
-        fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap(
-            &tmp_data);
+                                                        data_ptr + capacity[i]);
+      tensor->mutable_int_data()->Swap(&tmp_data);
    }
-
-      if (model_config->_is_lod_fetch[idx]) {
-        if (in->at(idx).lod.size() > 0) {
-          for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
-            fetch_p->mutable_tensor_array(var_idx)->add_lod(
-                in->at(idx).lod[0][j]);
+    std::vector<std::vector<size_t>> lod = lod_tensor->lod();
+    if (lod.size() > 0) {
+      for (int j = 0; j < lod[0].size(); ++j) {
+        tensor->add_lod(lod[0][j]);
      }
    }
  }
-
-      VLOG(2) << "(logid=" << log_id << ") fetch var ["
-              << model_config->_fetch_name[idx] << "] ready";
-      var_idx++;
-    }
-  }
-
-  if (req->profile_server()) {
-    int64_t end = timeline.TimeStampUS();
-    // TODO(barriery): multi-model profile_time.
-    // At present, only the response_op is multi-input, so here we get
-    // the profile_time by hard coding. It needs to be replaced with
-    // a more elegant way.
-    for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
-      input_blob = get_depend_argument<GeneralBlob>(pre_node_names[pi]);
-      VLOG(2) << "(logid=" << log_id
-              << ") p size for input blob: " << input_blob->p_size;
-      int profile_time_idx = -1;
-      if (pi == 0) {
-        profile_time_idx = 0;
-      } else {
-        profile_time_idx = input_blob->p_size - 2;
-      }
-      for (; profile_time_idx < input_blob->p_size; ++profile_time_idx) {
-        res->add_profile_time(input_blob->time_stamp[profile_time_idx]);
-      }
-    }
-    // TODO(guru4elephant): find more elegant way to do this
-    res->add_profile_time(start);
-    res->add_profile_time(end);
-  }
-
  return 0;
 }


--- a/core/predictor/CMakeLists.txt
+++ b/core/predictor/CMakeLists.txt
@@ -12,13 +12,12 @@ set_source_files_properties(
        ${pdserving_srcs}
        PROPERTIES
        COMPILE_FLAGS  "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
+add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure extern_paddle paddle_fluid)
 if (WITH_TRT)
    add_definitions(-DWITH_TRT)
 endif()
 target_link_libraries(pdserving
-        brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
-
+        brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz paddle_fluid ${paddle_depend_libs})
 # install
 install(TARGETS pdserving
        RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin

--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
@@ -20,10 +20,9 @@
 #include <utility>
 #include <vector>
 #include "core/predictor/common/inner_common.h"
-#include "core/predictor/framework/bsf.h"
 #include "core/predictor/framework/factory.h"
 #include "core/predictor/framework/infer_data.h"
-
+#include "paddle_inference_api.h"  // NOLINT
 namespace baidu {
 namespace paddle_serving {
 namespace predictor {
@@ -39,6 +38,8 @@ class InferEngineCreationParams {
    _static_optimization = false;
    _force_update_static_cache = false;
    _use_trt = false;
+    _use_lite = false;
+    _use_xpu = false;
  }

  void set_path(const std::string& path) { _path = path; }
@@ -53,6 +54,10 @@ class InferEngineCreationParams {

  void set_use_trt(bool use_trt) { _use_trt = use_trt; }

+  void set_use_lite(bool use_lite) { _use_lite = use_lite; }
+
+  void set_use_xpu(bool use_xpu) { _use_xpu = use_xpu; }
+
  bool enable_memory_optimization() const {
    return _enable_memory_optimization;
  }
@@ -61,6 +66,10 @@ class InferEngineCreationParams {

  bool use_trt() const { return _use_trt; }

+  bool use_lite() const { return _use_lite; }
+
+  bool use_xpu() const { return _use_xpu; }
+
  void set_static_optimization(bool static_optimization = false) {
    _static_optimization = static_optimization;
  }
@@ -80,6 +89,9 @@ class InferEngineCreationParams {
              << "model_path = " << _path << ", "
              << "enable_memory_optimization = " << _enable_memory_optimization
              << ", "
+              << "enable_tensorrt = " << _use_trt << ", "
+              << "enable_lite = " << _use_lite << ", "
+              << "enable_xpu = " << _use_xpu << ", "
              << "enable_ir_optimization = " << _enable_ir_optimization << ", "
              << "static_optimization = " << _static_optimization << ", "
              << "force_update_static_cache = " << _force_update_static_cache;
@@ -92,6 +104,8 @@ class InferEngineCreationParams {
  bool _static_optimization;
  bool _force_update_static_cache;
  bool _use_trt;
+  bool _use_lite;
+  bool _use_xpu;
 };

 class InferEngine {
@@ -105,9 +119,7 @@ class InferEngine {
  virtual int thrd_initialize() { return thrd_initialize_impl(); }
  virtual int thrd_clear() { return thrd_clear_impl(); }
  virtual int thrd_finalize() { return thrd_finalize_impl(); }
-  virtual int infer(const void* in, void* out, uint32_t batch_size = -1) {
-    return infer_impl1(in, out, batch_size);
-  }
+  virtual int infer() { return infer_impl(); }

  virtual int reload() = 0;

@@ -120,11 +132,13 @@ class InferEngine {
  virtual int thrd_finalize_impl() = 0;
  virtual int thrd_clear_impl() = 0;
  virtual int proc_finalize_impl() = 0;
-  virtual int infer_impl1(const void* in,
-                          void* out,
-                          uint32_t batch_size = -1) = 0;
-  virtual int infer_impl2(const BatchTensor& in,
-                          BatchTensor& out) = 0;  // NOLINT
+  virtual std::vector<std::string> GetInputNames() = 0;
+  virtual std::vector<std::string> GetOutputNames() = 0;
+  virtual std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
+      const std::string& name) = 0;
+  virtual std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
+      const std::string& name) = 0;
+  virtual int infer_impl() = 0;
  // end: framework inner call
 };

@@ -138,8 +152,6 @@ class ReloadableInferEngine : public InferEngine {
    uint64_t last_revision;
  };

-  typedef im::bsf::Task<Tensor, Tensor> TaskT;
-
  virtual int load(const InferEngineCreationParams& params) = 0;

  int proc_initialize_impl(const configure::EngineDesc& conf, bool version) {
@@ -182,6 +194,14 @@ class ReloadableInferEngine : public InferEngine {
      _infer_engine_params.set_use_trt(conf.use_trt());
    }

+    if (conf.has_use_lite()) {
+      _infer_engine_params.set_use_lite(conf.use_lite());
+    }
+
+    if (conf.has_use_xpu()) {
+      _infer_engine_params.set_use_xpu(conf.use_xpu());
+    }
+
    if (!check_need_reload() || load(_infer_engine_params) != 0) {
      LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
      return -1;
@@ -201,45 +221,10 @@ class ReloadableInferEngine : public InferEngine {
      LOG(ERROR) << "Failed proc initialize impl";
      return -1;
    }
-
-    // init bsf framework
-    if (_infer_thread_num <= 0) {
-      return 0;
-    }
-
-    im::bsf::TaskExecutor<TaskT>::instance()->set_thread_init_fn(
-        boost::bind(&InferEngine::thrd_initialize_impl, this));
-    im::bsf::TaskExecutor<TaskT>::instance()->set_thread_reset_fn(
-        boost::bind(&InferEngine::thrd_clear_impl, this));
-    im::bsf::TaskExecutor<TaskT>::instance()->set_thread_callback_fn(
-        boost::bind(&InferEngine::infer_impl2, this, _1, _2));
-    im::bsf::TaskExecutor<TaskT>::instance()->set_batch_size(_infer_batch_size);
-    im::bsf::TaskExecutor<TaskT>::instance()->set_batch_align(
-        _infer_batch_align);
-    if (im::bsf::TaskExecutor<TaskT>::instance()->start(_infer_thread_num) !=
-        0) {
-      LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num;
-      return -1;
-    }
-
-    LOG(WARNING) << "Enable batch schedule framework, thread_num:"
-                 << _infer_thread_num << ", batch_size:" << _infer_batch_size
-                 << ", enable_batch_align:" << _infer_batch_align;
-
    return 0;
  }

-  int infer(const void* in, void* out, uint32_t batch_size = -1) {
-    if (_infer_thread_num <= 0) {
-      return infer_impl1(in, out, batch_size);
-    }
-
-    im::bsf::TaskManager<Tensor, Tensor> task_manager;
-    task_manager.schedule(*(reinterpret_cast<const BatchTensor*>(in)),
-                          *(reinterpret_cast<BatchTensor*>(out)));
-    task_manager.wait();
-    return 0;
-  }
+  int infer() { return infer_impl(); }

  int thrd_initialize() {
    if (_infer_thread_num > 0) {
@@ -263,10 +248,6 @@ class ReloadableInferEngine : public InferEngine {
      return -1;
    }

-    if (_infer_thread_num > 0) {
-      im::bsf::TaskExecutor<TaskT>::instance()->stop();
-    }
-
    return 0;
  }

@@ -417,10 +398,6 @@ class DBReloadableInferEngine : public ReloadableInferEngine {

  virtual int thrd_initialize_impl() {
    // memory pool to be inited in non-serving-threads
-    if (MempoolWrapper::instance().thread_initialize() != 0) {
-      LOG(ERROR) << "Failed thread initialize mempool";
-      return -1;
-    }

    ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
    if (!md || load_data(md, _infer_engine_params) != 0) {
@@ -430,17 +407,12 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
    }

    THREAD_SETSPECIFIC(_skey, md);
-    im::bsf::AutoMutex lock(_mutex);
    _reload_vec.push_back(md);
    return 0;
  }

  int thrd_clear_impl() {
    // for non-serving-threads
-    if (MempoolWrapper::instance().thread_clear() != 0) {
-      LOG(ERROR) << "Failed thread clear mempool";
-      return -1;
-    }
    return 0;
  }

@@ -538,12 +510,6 @@ class CloneDBReloadableInferEngine
  }

  virtual int thrd_initialize_impl() {
-    // memory pool to be inited in non-serving-threads
-    if (MempoolWrapper::instance().thread_initialize() != 0) {
-      LOG(ERROR) << "Failed thread initialize mempool";
-      return -1;
-    }
-
    ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
    if (!md || load_data(md, _pd->cores[_pd->current_idx]) != 0) {
      LOG(ERROR) << "Failed clone thread data, origin_core["
@@ -552,7 +518,6 @@ class CloneDBReloadableInferEngine
    }

    THREAD_SETSPECIFIC(DBReloadableInferEngine<EngineCore>::_skey, md);
-    im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex);
    DBReloadableInferEngine<EngineCore>::_reload_vec.push_back(md);
    return 0;
  }
@@ -571,8 +536,45 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
 public:  // NOLINT
  FluidInferEngine() {}
  ~FluidInferEngine() {}
+  // Returns the input tensor names reported by the core obtained from
+  // DBReloadableInferEngine<FluidFamilyCore>::get_core().
+  // When no usable core is available (get_core() is null or core->get()
+  // is null) an error is logged and an empty vector is returned, so the
+  // core is never dereferenced in that state.
+  std::vector<std::string> GetInputNames() {
+    FluidFamilyCore* core =
+        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    if (!core || !core->get()) {
+      LOG(ERROR) << "Failed get fluid core in GetInputNames()";
+      return std::vector<std::string>();
+    }
+    return core->GetInputNames();
+  }
+
+  // Returns the output tensor names reported by the core obtained from
+  // DBReloadableInferEngine<FluidFamilyCore>::get_core().
+  // When no usable core is available (get_core() is null or core->get()
+  // is null) an error is logged and an empty vector is returned, so the
+  // core is never dereferenced in that state.
+  std::vector<std::string> GetOutputNames() {
+    FluidFamilyCore* core =
+        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    if (!core || !core->get()) {
+      LOG(ERROR) << "Failed get fluid core in GetOutputNames()";
+      return std::vector<std::string>();
+    }
+    return core->GetOutputNames();
+  }
+
+  // Returns the handle of the input tensor called `name` from the core
+  // obtained via DBReloadableInferEngine<FluidFamilyCore>::get_core().
+  // When no usable core is available an error is logged and a null
+  // unique_ptr is returned; callers must check the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
+      const std::string& name) {
+    FluidFamilyCore* core =
+        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    if (!core || !core->get()) {
+      LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
+      return nullptr;
+    }
+    return core->GetInputHandle(name);
+  }
+
+  // Returns the handle of the output tensor called `name` from the core
+  // obtained via DBReloadableInferEngine<FluidFamilyCore>::get_core().
+  // When no usable core is available an error is logged and a null
+  // unique_ptr is returned; callers must check the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
+      const std::string& name) {
+    FluidFamilyCore* core =
+        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    if (!core || !core->get()) {
+      LOG(ERROR) << "Failed get fluid core in GetOutputHandle()";
+      return nullptr;
+    }
+    return core->GetOutputHandle(name);
+  }

-  int infer_impl1(const void* in, void* out, uint32_t batch_size = -1) {
+  int infer_impl() {
    FluidFamilyCore* core =
        DBReloadableInferEngine<FluidFamilyCore>::get_core();
    if (!core || !core->get()) {
@@ -580,16 +582,12 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
      return -1;
    }

-    if (!core->Run(in, out)) {
+    if (!core->Run()) {
      LOG(ERROR) << "Failed run fluid family core";
      return -1;
    }
    return 0;
  }
-
-  int infer_impl2(const BatchTensor& in, BatchTensor& out) {  // NOLINT
-    return infer_impl1(&in, &out);
-  }
 };

 typedef FactoryPool<InferEngine> StaticInferFactory;
@@ -715,13 +713,45 @@ class VersionedInferEngine : public InferEngine {
    return _versions.begin()->second;
  }

-  int infer(const void* in, void* out, uint32_t batch_size) {
+  int infer() {
    InferEngine* engine = default_engine();
    if (!engine) {
      LOG(WARNING) << "fail to get default engine";
      return -1;
    }
-    return engine->infer(in, out, batch_size);
+    return engine->infer();
+  }
+
+  // Forwards to GetInputNames() of the engine returned by
+  // default_engine(). If there is no default engine, a warning is logged
+  // and an empty vector is returned instead of dereferencing null.
+  std::vector<std::string> GetInputNames() {
+    InferEngine* engine = default_engine();
+    if (!engine) {
+      LOG(WARNING) << "fail to get default engine";
+      return std::vector<std::string>();
+    }
+    return engine->GetInputNames();
+  }
+  // Forwards to GetOutputNames() of the engine returned by
+  // default_engine(). If there is no default engine, a warning is logged
+  // and an empty vector is returned instead of dereferencing null.
+  std::vector<std::string> GetOutputNames() {
+    InferEngine* engine = default_engine();
+    if (!engine) {
+      LOG(WARNING) << "fail to get default engine";
+      return std::vector<std::string>();
+    }
+    return engine->GetOutputNames();
+  }
+  // Forwards to GetInputHandle(name) of the engine returned by
+  // default_engine(). If there is no default engine, a warning is logged
+  // and a null unique_ptr is returned; callers must check the result.
+  std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
+      const std::string& name) {
+    InferEngine* engine = default_engine();
+    if (!engine) {
+      LOG(WARNING) << "fail to get default engine";
+      return nullptr;
+    }
+    return engine->GetInputHandle(name);
+  }
+
+  // Forwards to GetOutputHandle(name) of the engine returned by
+  // default_engine(). If there is no default engine, a warning is logged
+  // and a null unique_ptr is returned; callers must check the result.
+  std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
+      const std::string& name) {
+    InferEngine* engine = default_engine();
+    if (!engine) {
+      LOG(WARNING) << "fail to get default engine";
+      return nullptr;
+    }
+    return engine->GetOutputHandle(name);
  }

  template <typename T>
@@ -740,14 +770,47 @@ class VersionedInferEngine : public InferEngine {
  }

  // versioned inference interface
-  int infer(const void* in, void* out, uint32_t batch_size, uint64_t version) {
+  int infer(uint64_t version) {
    auto iter = _versions.find(version);
    if (iter == _versions.end()) {
      LOG(ERROR) << "Not found version engine: " << version;
      return -1;
    }

-    return iter->second->infer(in, out, batch_size);
+    return iter->second->infer();
+  }
+  // Looks up the engine registered in _versions under `version` and
+  // forwards to its GetInputNames(). If the version is unknown, an error
+  // is logged and an empty vector is returned; the end iterator is never
+  // dereferenced.
+  std::vector<std::string> GetInputNames(uint64_t version) {
+    auto iter = _versions.find(version);
+    if (iter == _versions.end()) {
+      LOG(ERROR) << "Not found version engine: " << version;
+      return std::vector<std::string>();
+    }
+    return iter->second->GetInputNames();
+  }
+
+  // Looks up the engine registered in _versions under `version` and
+  // forwards to its GetOutputNames(). If the version is unknown, an error
+  // is logged and an empty vector is returned; the end iterator is never
+  // dereferenced.
+  std::vector<std::string> GetOutputNames(uint64_t version) {
+    auto iter = _versions.find(version);
+    if (iter == _versions.end()) {
+      LOG(ERROR) << "Not found version engine: " << version;
+      return std::vector<std::string>();
+    }
+    return iter->second->GetOutputNames();
+  }
+
+  // Looks up the engine registered in _versions under `version` and
+  // forwards to its GetInputHandle(name). If the version is unknown, an
+  // error is logged and a null unique_ptr is returned; callers must check
+  // the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
+      uint64_t version, const std::string& name) {
+    auto iter = _versions.find(version);
+    if (iter == _versions.end()) {
+      LOG(ERROR) << "Not found version engine: " << version;
+      return nullptr;
+    }
+    return iter->second->GetInputHandle(name);
+  }
+
+  // Looks up the engine registered in _versions under `version` and
+  // forwards to its GetOutputHandle(name). If the version is unknown, an
+  // error is logged and a null unique_ptr is returned; callers must check
+  // the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
+      uint64_t version, const std::string& name) {
+    auto iter = _versions.find(version);
+    if (iter == _versions.end()) {
+      LOG(ERROR) << "Not found version engine: " << version;
+      return nullptr;
+    }
+    return iter->second->GetOutputHandle(name);
  }

  template <typename T>
@@ -774,12 +837,7 @@ class VersionedInferEngine : public InferEngine {
  int thrd_finalize_impl() { return -1; }
  int thrd_clear_impl() { return -1; }
  int proc_finalize_impl() { return -1; }
-  int infer_impl1(const void* in, void* out, uint32_t batch_size = -1) {
-    return -1;
-  }
-  int infer_impl2(const BatchTensor& in, BatchTensor& out) {  // NOLINT
-    return -1;
-  }  // NOLINT
+  int infer_impl() { return -1; }

 private:
  boost::unordered_map<uint64_t, InferEngine*> _versions;
@@ -877,16 +935,44 @@ class InferManager {
  }

  // Inference interface
-  int infer(const char* model_name,
-            const void* in,
-            void* out,
-            uint32_t batch_size = -1) {
+  // Runs inference on the engine registered under `model_name`.
+  // Returns the engine's infer() result, or -1 (after logging a warning)
+  // when `model_name` is not found in _map.
+  int infer(const char* model_name) {
    auto it = _map.find(model_name);
    if (it == _map.end()) {
      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
      return -1;
    }
-    return it->second->infer(in, out, batch_size);
+    return it->second->infer();
+  }
+
+  // Returns the input tensor names of the engine registered under
+  // `model_name`. When the model is not in _map, logs a warning and returns
+  // an empty vector instead of dereferencing the end iterator (undefined
+  // behaviour).
+  std::vector<std::string> GetInputNames(const char* model_name) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return std::vector<std::string>();
+    }
+    return it->second->GetInputNames();
+  }
+  // Returns the output tensor names of the engine registered under
+  // `model_name`. When the model is not in _map, logs a warning and returns
+  // an empty vector instead of dereferencing the end iterator (undefined
+  // behaviour).
+  std::vector<std::string> GetOutputNames(const char* model_name) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return std::vector<std::string>();
+    }
+    return it->second->GetOutputNames();
+  }
+  // Returns the input tensor handle `name` from the engine registered under
+  // `model_name`. When the model is not in _map, logs a warning and returns
+  // nullptr instead of dereferencing the end iterator (undefined behaviour);
+  // callers must check the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
+      const char* model_name, const std::string& name) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return nullptr;
+    }
+    return it->second->GetInputHandle(name);
+  }
+  // Returns the output tensor handle `name` from the engine registered under
+  // `model_name`. When the model is not in _map, logs a warning and returns
+  // nullptr instead of dereferencing the end iterator (undefined behaviour);
+  // callers must check the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
+      const char* model_name, const std::string& name) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return nullptr;
+    }
+    return it->second->GetOutputHandle(name);
  }

  template <typename T>
@@ -906,19 +992,48 @@ class InferManager {
  }

  // Versioned inference interface
-  int infer(const char* model_name,
-            const void* in,
-            void* out,
-            uint32_t batch_size,
-            uint64_t version) {
+  // Runs inference on version `version` of the engine registered under
+  // `model_name`. Returns the engine's infer(version) result, or -1 (after
+  // logging a warning) when `model_name` is not found in _map.
+  int infer(const char* model_name, uint64_t version) {
    auto it = _map.find(model_name);
    if (it == _map.end()) {
      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
      return -1;
    }
-    return it->second->infer(in, out, batch_size, version);
+    return it->second->infer(version);
+  }
+  // Returns the input tensor names for version `version` of the engine
+  // registered under `model_name`. When the model is not in _map, logs a
+  // warning and returns an empty vector instead of dereferencing the end
+  // iterator (undefined behaviour).
+  std::vector<std::string> GetInputNames(const char* model_name,
+                                         uint64_t version) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return std::vector<std::string>();
+    }
+    return it->second->GetInputNames(version);
+  }
+
+  // Returns the output tensor names for version `version` of the engine
+  // registered under `model_name`. When the model is not in _map, logs a
+  // warning and returns an empty vector instead of dereferencing the end
+  // iterator (undefined behaviour).
+  std::vector<std::string> GetOutputNames(const char* model_name,
+                                          uint64_t version) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return std::vector<std::string>();
+    }
+    return it->second->GetOutputNames(version);
  }

+  // Returns the input tensor handle `name` for version `version` of the
+  // engine registered under `model_name`. When the model is not in _map, logs
+  // a warning and returns nullptr instead of dereferencing the end iterator
+  // (undefined behaviour); callers must check the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
+      const char* model_name, uint64_t version, const std::string& name) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return nullptr;
+    }
+    return it->second->GetInputHandle(version, name);
+  }
+  // Returns the output tensor handle `name` for version `version` of the
+  // engine registered under `model_name`. When the model is not in _map, logs
+  // a warning and returns nullptr instead of dereferencing the end iterator
+  // (undefined behaviour); callers must check the result before use.
+  std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
+      const char* model_name, uint64_t version, const std::string& name) {
+    auto it = _map.find(model_name);
+    if (it == _map.end()) {
+      LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+      return nullptr;
+    }
+    return it->second->GetOutputHandle(version, name);
+  }
  template <typename T>
  T* get_core(const char* model_name, uint64_t version) {
    auto it = _map.find(model_name);

--- a/doc/BERT_10_MINS.md
+++ b/doc/BERT_10_MINS.md
@@ -56,21 +56,25 @@ the script of client side bert_client.py is as follow:

 [//file]:#bert_client.py
 ``` python
-import os
 import sys
 from paddle_serving_client import Client
+from paddle_serving_client.utils import benchmark_args
 from paddle_serving_app.reader import ChineseBertReader
+import numpy as np
+args = benchmark_args()

-reader = ChineseBertReader()
+reader = ChineseBertReader({"max_seq_len": 128})
 fetch = ["pooled_output"]
-endpoint_list = ["127.0.0.1:9292"]
+endpoint_list = ['127.0.0.1:9292']
 client = Client()
-client.load_client_config("bert_seq20_client/serving_client_conf.prototxt")
+client.load_client_config(args.model)
 client.connect(endpoint_list)

 for line in sys.stdin:
    feed_dict = reader.process(line)
-    result = client.predict(feed=feed_dict, fetch=fetch)
+    for key in feed_dict.keys():
+        feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
+    result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
 ```

 run

--- a/doc/BERT_10_MINS_CN.md
+++ b/doc/BERT_10_MINS_CN.md
@@ -52,18 +52,23 @@ pip install paddle_serving_app
 ``` python
 import sys
 from paddle_serving_client import Client
+from paddle_serving_client.utils import benchmark_args
 from paddle_serving_app.reader import ChineseBertReader
+import numpy as np
+args = benchmark_args()

-reader = ChineseBertReader()
+reader = ChineseBertReader({"max_seq_len": 128})
 fetch = ["pooled_output"]
-endpoint_list = ["127.0.0.1:9292"]
+endpoint_list = ['127.0.0.1:9292']
 client = Client()
-client.load_client_config("bert_seq20_client/serving_client_conf.prototxt")
+client.load_client_config(args.model)
 client.connect(endpoint_list)

 for line in sys.stdin:
    feed_dict = reader.process(line)
-    result = client.predict(feed=feed_dict, fetch=fetch)
+    for key in feed_dict.keys():
+        feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
+    result = client.predict(feed=feed_dict, fetch=fetch, batch=False)
 ```

 执行

--- a/doc/COMPILE_CN.md
+++ b/doc/COMPILE_CN.md
@@ -122,6 +122,7 @@ make -j10
 export CUDA_PATH='/usr/local'
 export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
 export CUDA_CUDART_LIBRARY="/usr/local/cuda/lib64/"
+export TENSORRT_LIBRARY_PATH="/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/"

 mkdir server-build-trt && cd server-build-trt
 cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \

--- a/doc/PIPELINE_SERVING_CN.md
+++ b/doc/PIPELINE_SERVING_CN.md
@@ -676,7 +676,7 @@ service_throughput = 1 / 最慢OP的耗时 * 并发数
 service_avg_cost = ∑op_concurrency 【关键路径】

 Channel堆积：
-channel_acc_size = QPS(down - up) * time
+channel_acc_size = QPS(down - up) * time

 批量预测平均耗时：
 avg_batch_cost = (N * pre + mid + post) / N 

--- a/doc/RUN_IN_DOCKER.md
+++ b/doc/RUN_IN_DOCKER.md
@@ -32,63 +32,9 @@ The `-p` option is to map the `9292` port of the container to the `9292` port of

 ### Install PaddleServing

-In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it:
-
-```bash
-pip install paddle-serving-server
-```
-
-You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source of the following example) to speed up the download:
-
-```shell
-pip install paddle-serving-server -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
-
-### Test example
-
-Get the trained Boston house price prediction model by the following command:
-
-```bash
-wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
-tar -xzf uci_housing.tar.gz
-```
-
- Test HTTP service
-
-  Running on the Server side (inside the container):
-
-  ```bash
-  python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci >std.log 2>err.log &
-  ```
-
-  Running on the Client side (inside or outside the container):
-
-  ```bash
-  curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
-  ```
-
- Test RPC service
-
-  Running on the Server side (inside the container):
-
-  ```bash
-  python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 >std.log 2>err.log &
-  ```
-
-  Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
-
-  ```bash
-  from paddle_serving_client import Client
-  
-  client = Client()
-  client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
-  client.connect(["127.0.0.1:9292"])
-  data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-          -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
-  fetch_map = client.predict(feed={"x": data}, fetch=["price"])
-  print(fetch_map)
-  ```
+The image comes with `paddle_serving_server`, `paddle_serving_client`, and `paddle_serving_app` pre-installed, matching the image tag version. If you don't need a different version, you can use them directly, which is convenient for environments without external network access.

+If you need to change the version, please refer to the instructions on the homepage to download the pip package of the corresponding version.
  

 ## GPU
@@ -100,7 +46,7 @@ The GPU version is basically the same as the CPU version, with only some differe
 Refer to [this document](DOCKER_IMAGES.md) for a docker image, the following is an example of an `cuda9.0-cudnn7` image:

 ```shell
-nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
+docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
 ```

 ### Create container
@@ -110,77 +56,21 @@ nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/se
 nvidia-docker exec -it test bash
 ```

-The `-p` option is to map the `9292` port of the container to the `9292` port of the host.
-
-### Install PaddleServing
-
-In order to make the image smaller, the PaddleServing package is not installed in the image. You can run the following command to install it:
+or

 ```bash
-pip install paddle-serving-server-gpu
-```
-
-You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source of the following example) to speed up the download:
-
-```shell
-pip install paddle-serving-server-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
-
-### Test example
-
-When running the GPU Server, you need to set the GPUs used by the prediction service through the `--gpu_ids` option, and the CPU is used by default. An error will be reported when the value of `--gpu_ids` exceeds the environment variable `CUDA_VISIBLE_DEVICES`. The following example specifies to use a GPU with index 0:
-```shell
-export CUDA_VISIBLE_DEVICES=0,1
-python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9292 --gpu_ids 0
-```
-
-
-Get the trained Boston house price prediction model by the following command:
-
-```bash
-wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
-tar -xzf uci_housing.tar.gz
+docker run --gpus all -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
+docker exec -it test bash
 ```

- Test HTTP service
-
-  Running on the Server side (inside the container):
-
-  ```bash
-  python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --name uci --gpu_ids 0
-  ```
-
-  Running on the Client side (inside or outside the container):
-
-  ```bash
-  curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
-  ```
-
- Test RPC service
-
-  Running on the Server side (inside the container):
-
-  ```bash
-  python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0
-  ```
-
-  Running following Python code on the Client side (inside or outside the container, The `paddle-serving-client` package needs to be installed):
-
-  ```bash
-  from paddle_serving_client import Client
-  
-  client = Client()
-  client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
-  client.connect(["127.0.0.1:9292"])
-  data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-          -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
-  fetch_map = client.predict(feed={"x": data}, fetch=["price"])
-  print(fetch_map)
-  ```
+The `-p` option is to map the `9292` port of the container to the `9292` port of the host.

+### Install PaddleServing

+The image comes with `paddle_serving_server_gpu`, `paddle_serving_client`, and `paddle_serving_app` pre-installed, matching the image tag version. If you don't need a different version, you can use them directly, which is convenient for environments without external network access.

+If you need to change the version, please refer to the instructions on the homepage to download the pip package of the corresponding version.

-## Attention
+## Precautions

 Runtime images cannot be used for compilation. If you want to compile from source, refer to [COMPILE](COMPILE.md).
--- a/doc/RUN_IN_DOCKER_CN.md
+++ b/doc/RUN_IN_DOCKER_CN.md
@@ -20,7 +20,6 @@ Docker（GPU版本需要在GPU机器上安装nvidia-docker）
 docker pull hub.baidubce.com/paddlepaddle/serving:latest
 ```

-
 ### 创建容器并进入

 ```bash
@@ -32,74 +31,11 @@ docker exec -it test bash

 ### 安装PaddleServing

-为了减小镜像的体积，镜像中没有安装Serving包，要执行下面命令进行安装。
-
-```bash
-pip install paddle-serving-server
-```
-
-您可能需要使用国内镜像源（例如清华源）来加速下载。
-
-```shell
-pip install paddle-serving-server -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
-
-### 测试example
-
-通过下面命令获取训练好的Boston房价预估模型：
-
-```bash
-wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
-tar -xzf uci_housing.tar.gz
-```
-
- 测试HTTP服务
-
-  在Server端（容器内）运行：
+镜像里自带对应镜像tag版本的`paddle_serving_server`，`paddle_serving_client`，`paddle_serving_app`，如果用户不需要更改版本，可以直接使用，适用于没有外网服务的环境。

-  ```bash
-  python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci >std.log 2>err.log &
-  ```
+如果需要更换版本，请参照首页的指导，下载对应版本的pip包。

-  在Client端（容器内或容器外）运行：
-
-  ```bash
-  curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
-  ```
-
- 测试RPC服务
-
-  在Server端（容器内）运行：
-
-  ```bash
-  python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 >std.log 2>err.log &
-  ```
-
-  在Client端（容器内或容器外，需要安装`paddle-serving-client`包）运行下面Python代码：
-
-  ```python
-  from paddle_serving_client import Client
-  
-  client = Client()
-  client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
-  client.connect(["127.0.0.1:9292"])
-  data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-          -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
-  fetch_map = client.predict(feed={"x": data}, fetch=["price"])
-  print(fetch_map)
-  ```
-
-## GPU版本
-
-GPU版本与CPU版本基本一致，只有部分接口命名的差别（GPU版本需要在GPU机器上安装nvidia-docker）。
-
-### 获取镜像
-
-参考[该文档](DOCKER_IMAGES_CN.md)获取镜像，这里以 `cuda9.0-cudnn7` 的镜像为例：
-
-```shell
-nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
-```
+## GPU 版本

 ### 创建容器并进入

@@ -107,74 +43,19 @@ nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
 nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
 nvidia-docker exec -it test bash
 ```
-
-`-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。
-
-### 安装PaddleServing
-
-为了减小镜像的体积，镜像中没有安装Serving包，要执行下面命令进行安装。
-
+或者
 ```bash
-pip install paddle-serving-server-gpu
-```
-
-您可能需要使用国内镜像源（例如清华源）来加速下载。
-
-```shell
-pip install paddle-serving-server-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
-
-### 测试example
-
-在运行GPU版Server时需要通过`--gpu_ids`选项设置预测服务使用的GPU，缺省状态默认使用CPU。当设置的`--gpu_ids`超出环境变量`CUDA_VISIBLE_DEVICES`时会报错。下面的示例为指定使用索引为0的GPU：
-```shell
-export CUDA_VISIBLE_DEVICES=0,1
-python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9292 --gpu_ids 0
-```
-
-
-通过下面命令获取训练好的Boston房价预估模型：
-
-```bash
-wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
-tar -xzf uci_housing.tar.gz
+docker run --gpus all -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
+docker exec -it test bash
 ```

- 测试HTTP服务
-
-  在Server端（容器内）运行：
-
-  ```bash
-  python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --name uci --gpu_ids 0
-  ```
-
-  在Client端（容器内或容器外）运行：
-
-  ```bash
-  curl -H "Content-Type:application/json" -X POST -d '{"feed":{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, "fetch":["price"]}' http://127.0.0.1:9292/uci/prediction
-  ```
-
- 测试RPC服务
-
-  在Server端（容器内）运行：
-
-  ```bash
-  python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0
-  ```
+`-p`选项是为了将容器的`9292`端口映射到宿主机的`9292`端口。

-  在Client端（容器内或容器外，需要安装`paddle-serving-client`包）运行下面Python代码：
+### 安装PaddleServing

-  ```bash
-  from paddle_serving_client import Client
+镜像里自带对应镜像tag版本的`paddle_serving_server_gpu`，`paddle_serving_client`，`paddle_serving_app`，如果用户不需要更改版本，可以直接使用，适用于没有外网服务的环境。

-  client = Client()
-  client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
-  client.connect(["127.0.0.1:9292"])
-  data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
-          -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]
-  fetch_map = client.predict(feed={"x": data}, fetch=["price"])
-  print(fetch_map)
-  ```
+如果需要更换版本，请参照首页的指导，下载对应版本的pip包。

 ## 注意事项


--- a/doc/SAVE.md
+++ b/doc/SAVE.md
@@ -49,4 +49,4 @@ Arguments are the same as `inference_model_to_serving` API.
 | `serving_server` | str | `"serving_server"` | The path of model files and configuration files for server. |
 | `serving_client` | str | `"serving_client"` | The path of configuration files for client. |
 | `model_filename` | str | None | The name of file to load the inference program. If it is None, the default filename `__model__` will be used. |
-| `paras_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
+| `params_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. |
--- a/doc/SAVE_CN.md
+++ b/doc/SAVE_CN.md
@@ -50,4 +50,4 @@ python -m paddle_serving_client.convert --dirname ./your_inference_model_dir
 | `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server |
 | `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client |
 | `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None，则使用 `__model__` 作为默认的文件名 |
-| `paras_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中，它才需要被指定。如果模型参数是存储在各自分离的文件中，设置它的值为None |
+| `params_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中，它才需要被指定。如果模型参数是存储在各自分离的文件中，设置它的值为None |
--- a/doc/WINDOWS_TUTORIAL.md
+++ b/doc/WINDOWS_TUTORIAL.md
@@ -8,7 +8,7 @@ This document guides users how to build Paddle Serving service on the Windows pl

 ### Running Paddle Serving on Native Windows System

-**Configure Python environment variables to PATH**: First, you need to add the directory where the Python executable program is located to the PATH. Usually in **System Properties/My Computer Properties**-**Advanced**-**Environment Variables**, click Path and add the path at the beginning. For example, `C:\Users\$USER\AppData\Local\Programs\Python\Python36`, and finally click **OK** continuously. If you enter python on Powershell, you can enter the python interactive interface, indicating that the environment variable configuration is successful.
+**Configure Python environment variables to PATH**: **We only support Python 3.5+ on native Windows systems.** First, you need to add the directory where the Python executable program is located to the PATH. Usually in **System Properties/My Computer Properties**-**Advanced**-**Environment Variables**, click Path and add the path at the beginning. For example, `C:\Users\$USER\AppData\Local\Programs\Python\Python36`, and finally click **OK** continuously. If you enter python on Powershell, you can enter the python interactive interface, indicating that the environment variable configuration is successful.

 **Install wget**: Because all the downloads in the tutorial and the built-in model download function in `paddle_serving_app` all use the wget tool, download the binary package at the [link](http://gnuwin32.sourceforge.net/packages/wget.htm), unzip and copy it to `C:\Windows\System32`, if there is a security prompt, you need to pass it.

@@ -32,6 +32,7 @@ python -m pip install -U paddle_serving_server_gpu paddle_serving_client paddle_

 ```
 git clone https://github.com/paddlepaddle/Serving
+pip install -r python/requirements_win.txt
 ```

 **Run OCR example**:

--- a/doc/WINDOWS_TUTORIAL_CN.md
+++ b/doc/WINDOWS_TUTORIAL_CN.md
@@ -8,7 +8,7 @@

 ### 原生Windows系统运行Paddle Serving

-**配置Python环境变量到PATH**：首先需要将Python的可执行程序所在目录加入到PATH当中。通常在**系统属性/我的电脑属性**-**高级**-**环境变量** ，点选Path，并在开头加上路径。例如`C:\Users\$USER\AppData\Local\Programs\Python\Python36`，最后连续点击**确定** 。在Powershell上如果输入python可以进入python交互界面，说明环境变量配置成功。
+**配置Python环境变量到PATH**：**目前原生Windows仅支持Python 3.5或更高版本**。首先需要将Python的可执行程序所在目录加入到PATH当中。通常在**系统属性/我的电脑属性**-**高级**-**环境变量** ，点选Path，并在开头加上路径。例如`C:\Users\$USER\AppData\Local\Programs\Python\Python36`，最后连续点击**确定** 。在Powershell上如果输入python可以进入python交互界面，说明环境变量配置成功。

 **安装wget工具**：由于教程当中所有的下载，以及`paddle_serving_app`当中内嵌的模型下载功能，都是用到wget工具，在链接[下载wget](http://gnuwin32.sourceforge.net/packages/wget.htm)，解压后复制到`C:\Windows\System32`下，如有安全提示需要通过。

@@ -32,6 +32,7 @@ python -m pip install -U paddle_serving_server_gpu paddle_serving_client paddle_

 ```
 git clone https://github.com/paddlepaddle/Serving
+pip install -r python/requirements_win.txt
 ```

 **运行OCR示例**：

--- a/java/README.md
+++ b/java/README.md
-## Java Demo
+## Tutorial of Java Client for Paddle Serving
+
+(English|[简体中文](./README_CN.md))
+
+### Development Environment
+
+To make Java development easier, we provide a Java Docker image that contains the compiled Serving project. To get the image and enter the development environment, run:
+
+```
+docker pull hub.baidubce.com/paddlepaddle/serving:0.4.0-java
+docker run --rm -dit --name java_serving hub.baidubce.com/paddlepaddle/serving:0.4.0-java
+docker exec -it java_serving bash
+cd Serving/java
+```
+
+The Serving folder is at the develop branch when the docker image is generated. You need to git pull to the latest version or git checkout to the desired branch.
+
+### Install client dependencies
+
+Due to the large number of dependent libraries, the image has been compiled once at the time of generation, and the user can perform the following operations

-### Install package
 ```
 mvn compile
 mvn install
@@ -9,18 +27,93 @@ mvn compile
 mvn install
 ```

-### Start Server
+### Start the server(not pipeline)

-take the fit_a_line demo as example
+Take the fit_a_line model as an example, the server starts

 ```
- python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #CPU
-python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #GPU
+cd ../../python/examples/fit_a_line
+sh get_data.sh
+python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang &
 ```

-### Client Predict
+Client prediction
+
 ```
+cd ../../../java/examples/target
 java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
 ```

-The Java example also contains the prediction client of Bert, Model_enaemble, asyn_predict, batch_predict, Cube_local, Cube_quant, and Yolov4 models.
+Take yolov4 as an example, the server starts
+
+```
+python -m paddle_serving_app.package --get_model yolov4
+tar -xzvf yolov4.tar.gz
+python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang & #It needs to be executed in GPU Docker, otherwise the execution method of CPU must be used.
+```
+
+Client prediction
+
+```
+# in /Serving/java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
+# The case of yolov4 needs to specify a picture as input
+```
+### Start the server(pipeline)
+
+For input data type = string, take the IMDB model ensemble as an example. Start the server:
+
+```
+cd ../../python/examples/pipeline/imdb_model_ensemble
+sh get_data.sh
+python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
+python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
+python test_pipeline_server.py &>pipeline.log &
+```
+
+Client prediction(Synchronous)
+
+```
+cd ../../../java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict
+```
+
+Client prediction(Asynchronous)
+
+```
+cd ../../../java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample asyn_predict
+```
+
+
+For input data type = INDArray, take uci_housing_model as an example. Start the server:
+
+```
+cd ../../python/examples/pipeline/simple_web_service
+sh get_data.sh
+python web_service_java.py &>log.txt &
+```
+
+Client prediction(Synchronous)
+
+```
+cd ../../../java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample indarray_predict
+```
+
+
+### Customization guidance
+
+The above example is running in CPU mode. If GPU mode is required, there are two options.
+
+The first is that GPU Serving and Java Client are in the same image. After starting the corresponding image, the user needs to move /Serving/java in the java image to the corresponding image.
+
+The second is to deploy GPU Serving and the Java Client separately. If they are on the same host, you can find the IP address of the corresponding container with `ifconfig`, change the endpoint passed to `client.connect` in `examples/src/main/java/PaddleServingClientExample.java` accordingly, and then compile again. Alternatively, pass `--net=host` when starting the container so that it shares the host's network; the example can then run directly without modifying the Java code.
+
+**It should be noted that in the example, all models need to use `--use_multilang` to start GRPC multi-programming language support, and the port number is 9393. If you need another port, you need to modify it in the java file**
+
+**Serving now provides a Pipeline mode (see [Pipeline Serving](../doc/PIPELINE_SERVING.md) for details). The Pipeline Serving Client for Java has been released; a multi-threaded Java client example will be released in the next version.**
+
+**It should be noted that in the example, Java Pipeline Client code is in path /Java/Examples and /Java/src/main, and the Pipeline server code is in path /python/examples/pipeline/**
+
+
--- a/java/README_CN.md
+++ b/java/README_CN.md
-## Java 示例
+## 用于Paddle Serving的Java客户端
+
+([English](./README.md)|简体中文)
+
+### 开发环境
+
+为了方便用户使用java进行开发，我们提供了编译好的Serving工程放置在java镜像当中，获取镜像并进入开发环境的方式是
+
+```
+docker pull hub.baidubce.com/paddlepaddle/serving:0.4.0-java
+docker run --rm -dit --name java_serving hub.baidubce.com/paddlepaddle/serving:0.4.0-java
+docker exec -it java_serving bash
+cd Serving/java
+```
+
+Serving文件夹是镜像生成时的develop分支工程目录，需要git pull 到最新版本，或者git checkout 到想要的分支。

 ### 安装客户端依赖
+
+由于依赖库数量庞大，因此镜像已经在生成时编译过一次，用户执行以下操作即可
+
 ```
 mvn compile
 mvn install
@@ -9,18 +27,95 @@ mvn compile
 mvn install
 ```

-### 启动服务端
+### 启动服务端(非pipeline方式)

-以fit_a_line模型为例
+以fit_a_line模型为例，服务端启动

 ```
- python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #CPU
-python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #GPU
+cd ../../python/examples/fit_a_line
+sh get_data.sh
+python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang &
 ```

-### 客户端预测
+客户端预测
+
 ```
+cd ../../../java/examples/target
 java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
 ```

-java示例中还包含了bert、model_enaemble、asyn_predict、batch_predict、cube_local、cube_quant、yolov4模型的预测客户端。
+以yolov4为例子，服务端启动
+
+```
+python -m paddle_serving_app.package --get_model yolov4
+tar -xzvf yolov4.tar.gz
+python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang &  #需要在GPU Docker当中执行，否则要使用CPU的执行方式。
+```
+
+客户端预测
+
+```
+# in /Serving/java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 ../../../python/examples/yolov4/000000570688.jpg
+# yolov4的案例需要指定一个图片作为输入
+
+```
+
+### 启动服务端(Pipeline方式)
+
+对于input data type = string类型，以IMDB model ensemble模型为例，服务端启动
+
+```
+cd ../../python/examples/pipeline/imdb_model_ensemble
+sh get_data.sh
+python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
+python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
+python test_pipeline_server.py &>pipeline.log &
+```
+
+客户端预测(同步)
+
+```
+cd ../../../java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample string_imdb_predict
+```
+
+客户端预测(异步)
+
+```
+cd ../../../java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample asyn_predict
+```
+
+
+对于input data type = INDArray类型，以Simple Pipeline WebService中的uci_housing_model模型为例，服务端启动
+
+```
+cd ../../python/examples/pipeline/simple_web_service
+sh get_data.sh
+python web_service_java.py &>log.txt &
+```
+
+客户端预测(同步)
+
+```
+cd ../../../java/examples/target
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PipelineClientExample indarray_predict
+```
+
+### 二次开发指导
+
+上述示例是在CPU模式下运行，如果需要GPU模式，可以有两种选择。
+
+第一种是GPU Serving和Java Client在同一个镜像，需要用户在启动对应的镜像后，把java镜像当中的/Serving/java移动到对应的镜像中。
+
+第二种是GPU Serving和Java Client分开部署，如果在同一台宿主机，可以通过ifconfig了解对应容器的IP地址，然后在`examples/src/main/java/PaddleServingClientExample.java`当中对client.connect时的endpoint做修改，然后再编译一次。 或者在docker启动时选择 `--net=host`来绑定docker和宿主机的网络设备，这样不需要定制java代码可以直接运行。
+
+**需要注意的是，在示例中，所有模型都需要使用`--use_multilang`来启动GRPC多编程语言支持，以及端口号都是9393，如果需要别的端口，需要在java文件里修改**
+
+**目前Serving已推出Pipeline模式（详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)），面向Java的Pipeline Serving Client已发布，下个更新会发布Java版本的多线程用例敬请期待。**
+
+**需要注意的是，Java Pipeline Client相关示例在/java/examples和/java/src/main中，对应的Pipeline server在/python/examples/pipeline/中。**
+
+
+**目前Serving已推出Pipeline模式（详见[Pipeline Serving](../doc/PIPELINE_SERVING_CN.md)），下个版本（0.4.1）面向Java的Pipeline Serving Client将会发布，敬请期待。**
--- a/java/examples/src/main/java/PipelineClientExample.java
+++ b/java/examples/src/main/java/PipelineClientExample.java
+import io.paddle.serving.pipelineclient.*;
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import org.nd4j.linalg.api.iter.NdIndexIterator;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.datavec.image.loader.NativeImageLoader;
+import org.nd4j.linalg.api.ops.CustomOp;
+import org.nd4j.linalg.api.ops.DynamicCustomOp;
+import org.nd4j.linalg.factory.Nd4j;
+import java.util.*;
+
+/**
+* this class give an example for using the client to predict(grpc)
+* StaticPipelineClient.client supports mutil-thread.
+* By setting StaticPipelineClient.client properties，you can change the Maximum concurrency
+* Do not need to generate multiple instances of client,Use the StaticPipelineClient.client or SingleTon instead.
+* @author HexToString
+*/
+public class PipelineClientExample {
+
+    /**
+   * This method gives an example of synchronous prediction whose input type is string.
+   */
+    boolean string_imdb_predict() {
+        HashMap<String, String> feed_data
+            = new HashMap<String, String>() {{
+                put("words", "i am very sad | 0");
+            }};
+        System.out.println(feed_data);
+        List<String> fetch = Arrays.asList("prediction");
+        System.out.println(fetch);
+        
+        if (StaticPipelineClient.succ != true) {
+            if(!StaticPipelineClient.initClient("172.17.0.2","18070")){
+                System.out.println("connect failed.");
+                return false;
+            }
+        }
+        HashMap<String,String> result = StaticPipelineClient.client.predict(feed_data, fetch,false,0);
+        if (result == null) {
+            return false;
+        }
+        System.out.println(result);
+        return true;
+    }
+
+    /**
+   * This method gives an example of asynchronous prediction whose input type is string.
+   */
+    boolean asyn_predict() {
+        HashMap<String, String> feed_data
+            = new HashMap<String, String>() {{
+                put("words", "i am very sad | 0");
+            }};
+        System.out.println(feed_data);
+        List<String> fetch = Arrays.asList("prediction");
+        System.out.println(fetch);
+        if (StaticPipelineClient.succ != true) {
+            if(!StaticPipelineClient.initClient("172.17.0.2","18070")){
+                System.out.println("connect failed.");
+                return false;
+            }
+        }
+        PipelineFuture future = StaticPipelineClient.client.asyn_pr::qedict(feed_data, fetch,false,0);
+        HashMap<String,String> result = future.get();
+        if (result == null) {
+            return false;
+        }
+        System.out.println(result);
+        return true;
+    }
+
+    /**
+   * This method gives an example of synchronous prediction whose input type is Array or list or matrix.
+   * use Nd4j.createFromArray method to convert Array to INDArray.
+   * use convertINDArrayToString method to convert INDArray to specified String type(for python Numpy eval method).
+   */
+    boolean indarray_predict() {
+        float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f, 0.0582f, -0.0727f, -0.1583f, -0.0584f, 0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
+        INDArray npdata = Nd4j.createFromArray(data);
+
+        HashMap<String, String> feed_data
+            = new HashMap<String, String>() {{
+                put("x", convertINDArrayToString(npdata));
+            }};
+        List<String> fetch = Arrays.asList("prediction");
+        if (StaticPipelineClient.succ != true) {
+            if(!StaticPipelineClient.initClient("172.17.0.2","9998")){
+                System.out.println("connect failed.");
+                return false;
+            }
+        }
+
+        HashMap<String,String> result = StaticPipelineClient.client.predict(feed_data, fetch,false,0);
+        if (result == null) {
+            return false;
+        }
+        System.out.println(result);
+        return true;
+    }
+
+    /**
+   * This method convert INDArray to specified String type.
+   * @param npdata INDArray type(The input data).
+   * @return String (specified String type for python Numpy eval method).
+   */
+    String convertINDArrayToString(INDArray npdata){
+        return "array("+npdata.toString()+")";
+    }
+
+    /**
+   * This method is entry function.
+   * @param args String[] type(Command line parameters)
+   */
+    public static void main( String[] args ) {
+
+        PipelineClientExample e = new PipelineClientExample();
+        boolean succ = false;
+        if (args.length < 1) {
+            System.out.println("Usage: java -cp <jar> PaddleServingClientExample <test-type>.");
+            System.out.println("<test-type>: fit_a_line bert model_ensemble asyn_predict batch_predict cube_local cube_quant yolov4");
+            return;
+        }
+        
+        String testType = args[0];
+        System.out.format("[Example] %s\n", testType);
+        if ("string_imdb_predict".equals(testType)) {
+            succ = e.string_imdb_predict();
+        }else if ("asyn_predict".equals(testType)) {
+            succ = e.asyn_predict();
+        }else if ("indarray_predict".equals(testType)) {
+            succ = e.indarray_predict();
+        } else {
+            System.out.format("test-type(%s) not match.\n", testType);
+            return;
+        }
+
+        if (succ == true) {
+            System.out.println("[Example] succ.");
+        } else {
+            System.out.println("[Example] fail.");
+        }
+    }
+}
+
+
--- a/java/examples/src/main/java/StaticPipelineClient.java
+++ b/java/examples/src/main/java/StaticPipelineClient.java
+import io.paddle.serving.pipelineclient.*;
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import org.nd4j.linalg.api.iter.NdIndexIterator;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.datavec.image.loader.NativeImageLoader;
+import org.nd4j.linalg.api.ops.CustomOp;
+import org.nd4j.linalg.api.ops.DynamicCustomOp;
+import org.nd4j.linalg.factory.Nd4j;
+import java.util.*;
+
+/**
+* static resource management class: holds the single PipelineClient shared by the
+* examples together with a flag recording whether it has been connected.
+* @author HexToString
+*/
+public class StaticPipelineClient {
+    /**
+     * Static Variable PipelineClient
+     */
+    public static PipelineClient client = new PipelineClient();
+    /**
+     * the sign of connect status (true once initClient has connected the client)
+     */
+    public static boolean succ = false;
+
+    /**
+     * Connects the shared client to strIp:strPort unless it is already connected.
+     * @param strIp String type(The server ipv4) such as "192.168.10.10".
+     * @param strPort String type(The server port) such as "8891".
+     * @return boolean (the sign of connect status).
+     */
+    public static boolean initClient(String strIp,String strPort){
+        // The gRPC target has the form "<ip>:<port>", e.g. "172.17.0.2:18070".
+        String target = strIp + ":" + strPort;
+        System.out.println("initial connect.");
+        if (succ) {
+            System.out.println("already connect.");
+            return true;
+        }
+        succ = client.connect(target);
+        if (!succ) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        return true;
+    }
+}
+
--- a/java/src/main/java/io/paddle/serving/client/PipelineClient.java
+++ b/java/src/main/java/io/paddle/serving/client/PipelineClient.java
+package io.paddle.serving.pipelineclient;
+
+import java.util.*;
+import java.util.function.Function;
+import java.lang.management.ManagementFactory;
+import java.lang.management.RuntimeMXBean;
+
+import io.grpc.ManagedChannel;
+import io.grpc.ManagedChannelBuilder;
+import io.grpc.StatusRuntimeException;
+import com.google.protobuf.ByteString;
+
+import com.google.common.util.concurrent.ListenableFuture;
+
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.iter.NdIndexIterator;
+import org.nd4j.linalg.factory.Nd4j;
+
+import io.paddle.serving.pipelineproto.*;
+import io.paddle.serving.pipelineclient.PipelineFuture;
+
+
+/**
+* PipelineClient class defination
+* @author HexToString
+*/
+public class PipelineClient {
+    private ManagedChannel channel_;
+    private PipelineServiceGrpc.PipelineServiceBlockingStub blockingStub_;
+    private PipelineServiceGrpc.PipelineServiceFutureStub futureStub_;
+    private String clientip;
+
+    private String _profile_key;
+    private String _profile_value;
+    
+    public PipelineClient() {
+        channel_ = null;
+        blockingStub_ = null;
+        futureStub_ = null;
+        boolean is_profile = false;
+        clientip = null;
+        _profile_value = "1";
+        _profile_key = "pipeline.profile";
+    }
+    
+    /**
+   * This method returns the sign of connect status.
+   * @param target String type(The server ipv4 and port) such as "192.168.10.10:8891".
+   * @return boolean (the sign of connect status).
+   */
+    public boolean connect(String target) {
+        try {
+            String[] temp = target.split(":");
+            this.clientip = temp[0] == "localhost"?"127.0.0.1":temp[0];
+            channel_ = ManagedChannelBuilder.forTarget(target)
+                .defaultLoadBalancingPolicy("round_robin")
+                .maxInboundMessageSize(Integer.MAX_VALUE)
+                .usePlaintext()
+                .build();
+            blockingStub_ = PipelineServiceGrpc.newBlockingStub(channel_);
+            futureStub_ = PipelineServiceGrpc.newFutureStub(channel_);
+        } catch (Exception e) {
+            System.out.format("Connect failed: %s\n", e.toString());
+            return false;
+        }
+        return true;
+    }
+
+    /**
+   * This method returns the Packaged Request.
+   * @param feed_dict HashMap<String, String>(input data).
+   * @param profile boolean(profile sign).
+   * @param logid int
+   * @return Request (the grpc protobuf Request).
+   */
+    private Request _packInferenceRequest(
+            HashMap<String, String> feed_dict,
+            boolean profile,
+            int logid) throws IllegalArgumentException {
+        List<String> keys = new ArrayList<String>();
+        List<String> values = new ArrayList<String>();
+        long[] flattened_shape = {-1};
+        
+        Request.Builder req_builder = Request.newBuilder()
+            .setClientip(this.clientip)
+            .setLogid(logid);
+        for (Map.Entry<String, String> entry : feed_dict.entrySet()) {
+            keys.add(entry.getKey());
+            values.add(entry.getValue());
+        }
+        if(profile){
+            keys.add(_profile_key);
+            values.add(_profile_value);
+        }
+        req_builder.addAllKey(keys);
+        req_builder.addAllValue(values);
+        return req_builder.build();
+    }
+
+    /**
+   * This method returns the HashMap which is unpackaged from Response.
+   * @param resp Response(the grpc protobuf Response).
+   * @return HashMap<String,String> (the output).
+   */
+    private HashMap<String,String> _unpackResponse(Response resp) throws IllegalArgumentException{
+        return PipelineClient._staitcUnpackResponse(resp);
+    }
+
+    /**
+   * This static method returns the HashMap which is unpackaged from Response.
+   * @param resp Response(the grpc protobuf Response).
+   * @return HashMap<String,String> (the output).
+   */
+    private static HashMap<String,String> _staitcUnpackResponse(Response resp) {
+        HashMap<String,String> ret_Map = new HashMap<String,String>();
+        int err_no  = resp.getErrNo();
+        if ( err_no!= 0) {
+            return null;
+        }
+        List<String> keys = resp.getKeyList();
+        List<String> values= resp.getValueList();
+        for (int i = 0;i<keys.size();i++) {
+            ret_Map.put(keys.get(i),values.get(i));
+        }
+        return ret_Map;
+    }
+
+    /**
+   * The synchronous prediction method.
+   * @param feed_batch HashMap<String, String>(input data).
+   * @param fetch Iterable<String>(the output key list).
+   * @param profile boolean(profile sign).
+   * @param logid int
+   * @return HashMap<String,String> (the output).
+   */
+    public HashMap<String,String> predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch,
+            boolean profile,
+            int logid) {
+        try {
+            Request req = _packInferenceRequest(
+                    feed_batch, profile,logid);
+            Response resp = blockingStub_.inference(req);
+            return _unpackResponse(resp);
+        } catch (StatusRuntimeException e) {
+            System.out.format("Failed to predict: %s\n", e.toString());
+            return null;
+        }
+    }
+
+    /**
+   * The synchronous prediction overload function.
+   */
+    public HashMap<String,String> predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch) {
+                return predict(feed_batch,fetch,false,0);
+    }
+
+    /**
+   * The synchronous prediction overload function.
+   */
+    public HashMap<String,String> predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch,
+            boolean profile) {
+                return predict(feed_batch,fetch,profile,0);
+    }
+
+    /**
+   * The synchronous prediction overload function.
+   */
+    public HashMap<String,String> predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch,
+            int logid) {
+                return predict(feed_batch,fetch,false,logid);
+    }
+
+    /**
+   * The asynchronous prediction method.use future.get() to get the result.
+   * @param feed_batch HashMap<String, String>(input data).
+   * @param fetch Iterable<String>(the output key list).
+   * @param profile boolean(profile sign).
+   * @param logid int
+   * @return PipelineFuture(the output future). 
+   */
+    public PipelineFuture asyn_predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch,
+            boolean profile,
+            int logid) {
+        Request req = _packInferenceRequest(
+                feed_batch, profile, logid);
+        ListenableFuture<Response> future = futureStub_.inference(req);
+        PipelineFuture predict_future = new PipelineFuture(future, 
+            (Response resp) -> {
+                return PipelineClient._staitcUnpackResponse(resp);
+            }
+        );
+        return predict_future;
+    }
+
+    /**
+   * The asynchronous prediction overload function.
+   */
+    public PipelineFuture asyn_predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch) {
+                return asyn_predict(feed_batch,fetch,false,0);
+    }
+
+    /**
+   * The asynchronous prediction overload function.
+   */
+    public PipelineFuture asyn_predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch,
+            boolean profile) {
+                return asyn_predict(feed_batch,fetch,profile,0);
+    }
+
+    /**
+   * The asynchronous prediction overload function.
+   */
+    public PipelineFuture asyn_predict(
+            HashMap<String, String> feed_batch,
+            Iterable<String> fetch,
+            int logid) {
+                return asyn_predict(feed_batch,fetch,false,logid);
+    }
+
+
+}
--- a/java/src/main/java/io/paddle/serving/client/PipelineFuture.java
+++ b/java/src/main/java/io/paddle/serving/client/PipelineFuture.java
+package io.paddle.serving.pipelineclient;
+
+import java.util.*;
+import java.util.function.Function;
+import io.grpc.StatusRuntimeException;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.nd4j.linalg.api.ndarray.INDArray;
+
+import io.paddle.serving.pipelineclient.PipelineClient;
+import io.paddle.serving.pipelineproto.*;
+
+/**
+* PipelineFuture class is for asynchronous prediction: it pairs the pending gRPC
+* response with the function that converts the response into the result map.
+* @author HexToString
+*/
+public class PipelineFuture {
+    private ListenableFuture<Response> callFuture_;
+    private Function<Response,
+        HashMap<String,String> > callBackFunc_;
+
+    /**
+     * @param call_future ListenableFuture<Response>(the pending response).
+     * @param call_back_func Function<Response, HashMap<String,String>>(converts the response to the output).
+     */
+    PipelineFuture(ListenableFuture<Response> call_future,
+            Function<Response,
+            HashMap<String,String> > call_back_func) {
+        callFuture_ = call_future;
+        callBackFunc_ = call_back_func;
+    }
+
+    /**
+     * use this method to get the result of asynchronous prediction.
+     * Waits for the response and applies the call-back function to it.
+     * @return HashMap<String,String> (the output), or null when waiting for the response failed.
+     */
+    public HashMap<String,String> get() {
+        Response resp;
+        try {
+            resp = callFuture_.get();
+        } catch (Exception e) {
+            if (e instanceof InterruptedException) {
+                // Restore the interrupt status so callers can still observe the interruption.
+                Thread.currentThread().interrupt();
+            }
+            System.out.format("predict failed: %s\n", e.toString());
+            return null;
+        }
+        return callBackFunc_.apply(resp);
+    }
+}
--- a/java/src/main/proto/pipeline_service.proto
+++ b/java/src/main/proto/pipeline_service.proto
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+option java_multiple_files = true;
+option java_package = "io.paddle.serving.pipelineproto";
+option java_outer_classname = "PipelineProto";
+
+package baidu.paddle_serving.pipeline_serving;
+
+// Inference request sent to PipelineService.inference.
+message Request {
+  // Input names; the Java PipelineClient adds one key per feed entry.
+  repeated string key = 1;
+  // Input values; the Java PipelineClient appends them in the same order as `key`.
+  repeated string value = 2;
+  // NOTE(review): not set by the Java PipelineClient; presumably the target service name - confirm.
+  optional string name = 3;
+  // NOTE(review): not set by the Java PipelineClient; presumably the method to invoke - confirm.
+  optional string method = 4;
+  // Caller-supplied log id attached to the request.
+  optional int64 logid = 5;
+  // Address string supplied by the client (the Java PipelineClient sends the host part of its connect target).
+  optional string clientip = 6;
+};
+
+// Inference response returned by PipelineService.inference.
+message Response {
+  // Error code; the Java PipelineClient treats any non-zero value as a failure.
+  optional int32 err_no = 1;
+  // Error description accompanying err_no.
+  optional string err_msg = 2;
+  // Output names; the Java PipelineClient pairs key[i] with value[i].
+  repeated string key = 3;
+  // Output values, read by position together with `key`.
+  repeated string value = 4;
+};
+
+// Service exposing a single unary inference rpc.
+service PipelineService {
+  rpc inference(Request) returns (Response) {}
+};
--- a/paddle_inference/CMakeLists.txt
+++ b/paddle_inference/CMakeLists.txt
@@ -13,8 +13,13 @@
 # limitations under the License

 if (NOT CLIENT_ONLY)
-add_subdirectory(inferencer-fluid-cpu)
-if (WITH_GPU)
-add_subdirectory(inferencer-fluid-gpu)
-endif()
+    add_subdirectory(inferencer-fluid-cpu)
+    
+    if (WITH_GPU)
+        add_subdirectory(inferencer-fluid-gpu)
+    endif()
+    
+    if (WITH_LITE)
+        add_subdirectory(inferencer-fluid-arm)
+    endif()
 endif()
--- a/paddle_inference/inferencer-fluid-arm/CMakeLists.txt
+++ b/paddle_inference/inferencer-fluid-arm/CMakeLists.txt
+# Collect every .cpp file under src/ as the sources of the fluid_arm_engine library.
+file(GLOB fluid_arm_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
+add_library(fluid_arm_engine ${fluid_arm_engine_srcs})
+
+# Expose the Paddle install directory of the build tree to this target and its consumers.
+target_include_directories(
+    fluid_arm_engine
+    PUBLIC ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/
+)
+
+# Order the build after the pdserving, extern_paddle and configure targets.
+add_dependencies(fluid_arm_engine pdserving extern_paddle configure)
+
+# The keyword-less link signature is kept as-is: CMake forbids mixing it with the
+# PUBLIC/PRIVATE form on one target, and other project files may also link this target.
+target_link_libraries(
+    fluid_arm_engine
+    pdserving
+    paddle_fluid
+    -lpthread
+    -lcrypto
+    -lm
+    -lrt
+    -lssl
+    -ldl
+    -lz
+)
+
+# Install the library archive into the serving install tree.
+install(
+    TARGETS fluid_arm_engine
+    ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
+)
--- a/paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h
+++ b/paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pthread.h>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include "core/configure/include/configure_parser.h"
+#include "core/configure/inferencer_configure.pb.h"
+#include "core/predictor/framework/infer.h"
+#include "paddle_inference_api.h"  // NOLINT
+
+namespace baidu {
+namespace paddle_serving {
+namespace fluid_arm {
+
+// Scope guard for a pthread mutex: the constructor locks the mutex it is given and
+// the destructor unlocks it again.
+class AutoLock {
+ public:
+  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
+    // _mut is a reference to the caller's mutex, so this locks that same mutex.
+    pthread_mutex_lock(&_mut);
+  }
+
+  ~AutoLock() {
+    pthread_mutex_unlock(&_mut);
+  }
+
+ private:
+  pthread_mutex_t& _mut;
+};
+
+// Holder of one process-wide pthread mutex; the create() methods in this header lock
+// it around CreatePredictor.
+class GlobalPaddleCreateMutex {
+ public:
+  // Returns the mutex owned by the single function-local static holder object.
+  static pthread_mutex_t& instance() {
+    static GlobalPaddleCreateMutex holder;
+    return holder.mutex();
+  }
+
+  // Returns the mutex owned by this holder.
+  pthread_mutex_t& mutex() { return _mut; }
+
+ private:
+  // Private so that the holder can only be created through instance().
+  GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
+
+  pthread_mutex_t _mut;
+};
+
+using paddle_infer::Config;
+using paddle_infer::Predictor;
+using paddle_infer::Tensor;
+using paddle_infer::PrecisionType;
+using paddle_infer::CreatePredictor;
+
+// data interface
+// Base class that owns a Paddle Predictor (_core) and forwards tensor-handle lookups
+// and Run() to it. Subclasses implement create() to build the predictor.
+class FluidFamilyCore {
+ public:
+  virtual ~FluidFamilyCore() {}
+
+  // Input names as reported by the wrapped predictor.
+  virtual std::vector<std::string> GetInputNames() {
+    return _core->GetInputNames();
+  }
+
+  // Handle of the input tensor called `name`, obtained from the wrapped predictor.
+  virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
+    return _core->GetInputHandle(name);
+  }
+
+  // Output names as reported by the wrapped predictor.
+  virtual std::vector<std::string> GetOutputNames() {
+    return _core->GetOutputNames();
+  }
+
+  // Handle of the output tensor called `name`, obtained from the wrapped predictor.
+  virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
+    return _core->GetOutputHandle(name);
+  }
+
+  // Runs the wrapped predictor; logs an error and returns false when it fails.
+  virtual bool Run() {
+    const bool ok = _core->Run();
+    if (!ok) {
+      LOG(ERROR) << "Failed call Run with paddle predictor";
+    }
+    return ok;
+  }
+
+  // Builds _core from the engine creation parameters; returns 0 on success.
+  virtual int create(const predictor::InferEngineCreationParams& params) = 0;
+
+  // Replaces _core with a clone of the Predictor that origin_core points to.
+  // Returns 0 on success, -1 when origin_core is NULL or cloning yields no predictor.
+  virtual int clone(void* origin_core) {
+    if (origin_core == NULL) {
+      LOG(ERROR) << "origin paddle Predictor is null.";
+      return -1;
+    }
+    Predictor* source = static_cast<Predictor*>(origin_core);
+    _core = source->Clone();
+    if (!_core) {
+      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
+      return -1;
+    }
+    return 0;
+  }
+
+  // Raw pointer to the wrapped predictor (the kind of pointer clone() accepts).
+  virtual void* get() { return _core.get(); }
+
+ protected:
+  std::shared_ptr<Predictor> _core;
+};
+
+// infer interface
+// Core that loads the model from two files inside the model directory:
+// "__model__" (program) and "__params__" (parameters).
+class FluidArmAnalysisCore : public FluidFamilyCore {
+ public:
+  // Builds the predictor from params.get_path().
+  // Returns 0 on success, -1 when the path does not exist or the predictor cannot
+  // be created.
+  int create(const predictor::InferEngineCreationParams& params) {
+    std::string data_path = params.get_path();
+    if (access(data_path.c_str(), F_OK) == -1) {
+      LOG(ERROR) << "create paddle predictor failed, path not exits: "
+                 << data_path;
+      return -1;
+    }
+
+    Config config;
+    config.SetParamsFile(data_path + "/__params__");
+    config.SetProgFile(data_path + "/__model__");
+    config.DisableGpu();
+    config.SetCpuMathLibraryNumThreads(1);
+
+    // Memory optimization is enabled once; the original repeated this block verbatim.
+    if (params.enable_memory_optimization()) {
+      config.EnableMemoryOptim();
+    }
+
+    if (params.use_lite()) {
+      config.EnableLiteEngine(PrecisionType::kFloat32, true);
+    }
+
+    if (params.use_xpu()) {
+      // NOTE(review): 100 is passed straight to EnableXpu; its meaning and unit are
+      // not visible here - confirm against the Paddle inference API.
+      config.EnableXpu(100);
+    }
+
+    config.SwitchSpecifyInputNames(true);
+    // Predictor creation is serialized through the process-wide create mutex.
+    AutoLock lock(GlobalPaddleCreateMutex::instance());
+    _core = CreatePredictor(config);
+    if (NULL == _core.get()) {
+      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
+      return -1;
+    }
+
+    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
+    return 0;
+  }
+};
+
+// Core that loads the model by handing the whole model directory to Config::SetModel.
+class FluidArmAnalysisDirCore : public FluidFamilyCore {
+ public:
+  // Builds the predictor from params.get_path().
+  // Returns 0 on success, -1 when the path does not exist or the predictor cannot
+  // be created.
+  int create(const predictor::InferEngineCreationParams& params) {
+    const std::string model_dir = params.get_path();
+    const bool missing = (access(model_dir.c_str(), F_OK) == -1);
+    if (missing) {
+      LOG(ERROR) << "create paddle predictor failed, path not exits: "
+                 << model_dir;
+      return -1;
+    }
+
+    Config config;
+    config.SetModel(model_dir);
+    config.DisableGpu();
+    config.SwitchSpecifyInputNames(true);
+    config.SetCpuMathLibraryNumThreads(1);
+
+    if (params.enable_memory_optimization()) {
+      config.EnableMemoryOptim();
+    }
+
+    // IR optimization is switched on or off explicitly to match the request.
+    const bool ir_optim = params.enable_ir_optimization() ? true : false;
+    config.SwitchIrOptim(ir_optim);
+
+    if (params.use_lite()) {
+      config.EnableLiteEngine(PrecisionType::kFloat32, true);
+    }
+
+    if (params.use_xpu()) {
+      config.EnableXpu(100);
+    }
+
+    // Predictor creation is serialized through the process-wide create mutex.
+    AutoLock lock(GlobalPaddleCreateMutex::instance());
+    _core = CreatePredictor(config);
+    if (!_core) {
+      LOG(ERROR) << "create paddle predictor failed, path: " << model_dir;
+      return -1;
+    }
+
+    VLOG(2) << "create paddle predictor sucess, path: " << model_dir;
+    return 0;
+  }
+};
+
+// Row-by-column float matrix loaded from a binary file: a 16-byte header followed
+// by _row * _col floats.
+// NOTE(review): the class owns _params through a raw pointer and declares no copy
+// constructor or assignment operator, so copying an instance would free the buffer
+// twice; avoid copies or confirm that no caller makes them.
+class Parameter {
+ public:
+  Parameter() : _row(0), _col(0), _params(NULL) {}
+  ~Parameter() {
+    VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
+    destroy();
+  }
+
+  // Releases any previous buffer, records the file name and allocates room for
+  // row * col floats. Returns 0 on success, -1 on invalid arguments or malloc failure.
+  int init(int row, int col, const char* file_name) {
+    destroy();
+    if (file_name == NULL || row <= 0 || col <= 0) {
+      // Assigning a NULL char* to std::string is undefined, and non-positive
+      // dimensions would produce a meaningless allocation size.
+      LOG(ERROR) << "init parameter error [invalid argument].";
+      return -1;
+    }
+    _file_name = file_name;
+    _row = row;
+    _col = col;
+    // Multiply in size_t so large dimensions do not overflow int arithmetic.
+    const size_t count = static_cast<size_t>(_row) * static_cast<size_t>(_col);
+    _params = reinterpret_cast<float*>(malloc(count * sizeof(float)));
+    if (_params == NULL) {
+      LOG(ERROR) << "Load " << _file_name << " malloc error.";
+      return -1;
+    }
+    VLOG(2) << "Load parameter file[" << _file_name << "] success.";
+    return 0;
+  }
+
+  // Frees the buffer and resets the dimensions; safe to call repeatedly.
+  void destroy() {
+    _row = 0;
+    _col = 0;
+    if (_params != NULL) {
+      free(_params);
+      _params = NULL;
+    }
+  }
+
+  // Reads the header and the matrix from _file_name into the buffer allocated by
+  // init(). Returns 0 on success; on a read error the buffer is released and -1 is
+  // returned.
+  int load() {
+    if (_params == NULL || _row <= 0 || _col <= 0) {
+      LOG(ERROR) << "load parameter error [not inited].";
+      return -1;
+    }
+
+    FILE* fs = fopen(_file_name.c_str(), "rb");
+    if (fs == NULL) {
+      LOG(ERROR) << "load " << _file_name << " fopen error.";
+      return -1;
+    }
+
+    static const uint32_t MODEL_FILE_HEAD_LEN = 16;
+    char head[MODEL_FILE_HEAD_LEN] = {0};
+    const bool head_ok =
+        (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) == MODEL_FILE_HEAD_LEN);
+
+    bool body_ok = false;
+    if (head_ok) {
+      // Element count in size_t so it cannot wrap for large matrices.
+      const size_t matrix_size =
+          static_cast<size_t>(_row) * static_cast<size_t>(_col);
+      body_ok = (fread(_params, sizeof(float), matrix_size, fs) == matrix_size);
+    }
+
+    // The file is closed exactly once, whatever the outcome of the reads.
+    fclose(fs);
+
+    if (!head_ok) {
+      destroy();
+      LOG(ERROR) << "Load " << _file_name << " read head error.";
+      return -1;
+    }
+    if (!body_ok) {
+      LOG(ERROR) << "load " << _file_name << " read error.";
+      destroy();
+      return -1;
+    }
+    VLOG(2) << "load " << _file_name << " read ok.";
+    return 0;
+  }
+
+ public:
+  std::string _file_name;
+  int _row;
+  int _col;
+  float* _params;
+};
+
+}  // namespace fluid_arm
+}  // namespace paddle_serving
+}  // namespace baidu
--- a/paddle_inference/inferencer-fluid-arm/src/fluid_arm_engine.cpp
+++ b/paddle_inference/inferencer-fluid-arm/src/fluid_arm_engine.cpp
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h"
+#include "core/predictor/framework/factory.h"
+
+namespace baidu {
+namespace paddle_serving {
+namespace fluid_arm {
+
+// Associates FluidInferEngine<FluidArmAnalysisCore> with the InferEngine base type
+// under the name "FLUID_ARM_ANALYSIS".
+// NOTE(review): the macro is defined in core/predictor/framework/factory.h;
+// presumably it registers the type so the engine can be created by name - confirm.
+REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
+    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidArmAnalysisCore>,
+    ::baidu::paddle_serving::predictor::InferEngine,
+    "FLUID_ARM_ANALYSIS");
+
+// Same association for the directory-based core, under the name
+// "FLUID_ARM_ANALYSIS_DIR".
+REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
+    ::baidu::paddle_serving::predictor::FluidInferEngine<
+        FluidArmAnalysisDirCore>,
+    ::baidu::paddle_serving::predictor::InferEngine,
+    "FLUID_ARM_ANALYSIS_DIR");
+
+}  // namespace fluid_arm
+}  // namespace paddle_serving
+}  // namespace baidu
--- a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
+++ b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
@@ -28,8 +28,6 @@ namespace baidu {
 namespace paddle_serving {
 namespace fluid_cpu {

-using configure::SigmoidConf;
-
 class AutoLock {
 public:
  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
@@ -57,31 +55,36 @@ class GlobalPaddleCreateMutex {
  pthread_mutex_t _mut;
 };

-class GlobalSigmoidCreateMutex {
- public:
-  pthread_mutex_t& mutex() { return _mut; }
-  static pthread_mutex_t& instance() {
-    static GlobalSigmoidCreateMutex gmutex;
-    return gmutex.mutex();
-  }
-
- private:
-  GlobalSigmoidCreateMutex() { pthread_mutex_init(&_mut, NULL); }
-
-  pthread_mutex_t _mut;
-};
+using paddle_infer::Config;
+using paddle_infer::Predictor;
+using paddle_infer::Tensor;
+using paddle_infer::CreatePredictor;

 // data interface
 class FluidFamilyCore {
 public:
  virtual ~FluidFamilyCore() {}
-  virtual bool Run(const void* in_data, void* out_data) {
-    if (!_core->Run(*(std::vector<paddle::PaddleTensor>*)in_data,
-                    (std::vector<paddle::PaddleTensor>*)out_data)) {
+  // Returns the input names reported by the wrapped predictor.
+  // _core is dereferenced without a null check, so it must already be set.
+  virtual std::vector<std::string> GetInputNames() {
+    return _core->GetInputNames();
+  }
+
+  // Returns the wrapped predictor's handle for the input tensor `name`.
+  // _core is dereferenced without a null check.
+  virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
+    return _core->GetInputHandle(name);
+  }
+
+  // Returns the output names reported by the wrapped predictor.
+  // _core is dereferenced without a null check.
+  virtual std::vector<std::string> GetOutputNames() {
+    return _core->GetOutputNames();
+  }
+
+  // Returns the wrapped predictor's handle for the output tensor `name`.
+  // _core is dereferenced without a null check.
+  virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
+    return _core->GetOutputHandle(name);
+  }
+
+  // Runs the wrapped predictor. Unlike the previous Run(in_data, out_data),
+  // no tensors are passed here; returns false (after logging) when the
+  // predictor's Run() reports failure, true otherwise.
+  virtual bool Run() {
+    if (!_core->Run()) {
       LOG(ERROR) << "Failed call Run with paddle predictor";
       return false;
     }
-
     return true;
   }

@@ -92,8 +95,7 @@ class FluidFamilyCore {
      LOG(ERROR) << "origin paddle Predictor is null.";
      return -1;
    }
-    paddle::PaddlePredictor* p_predictor =
-        (paddle::PaddlePredictor*)origin_core;
+    Predictor* p_predictor = (Predictor*)origin_core;
    _core = p_predictor->Clone();
    if (_core.get() == NULL) {
      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
@@ -105,7 +107,7 @@ class FluidFamilyCore {
  virtual void* get() { return _core.get(); }

 protected:
-  std::unique_ptr<paddle::PaddlePredictor> _core;
+  std::shared_ptr<Predictor> _core;
 };

 // infer interface
@@ -119,51 +121,19 @@ class FluidCpuAnalysisCore : public FluidFamilyCore {
      return -1;
    }

-    paddle::AnalysisConfig analysis_config;
-    analysis_config.SetParamsFile(data_path + "/__params__");
-    analysis_config.SetProgFile(data_path + "/__model__");
-    analysis_config.DisableGpu();
-    analysis_config.SetCpuMathLibraryNumThreads(1);
+    Config config;
+    config.SetParamsFile(data_path + "/__params__");
+    config.SetProgFile(data_path + "/__model__");
+    config.DisableGpu();
+    config.SetCpuMathLibraryNumThreads(1);

    if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim();
-    }
-
-    analysis_config.SwitchSpecifyInputNames(true);
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core =
-        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidCpuNativeCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
+      config.EnableMemoryOptim();
    }

-    paddle::NativeConfig native_config;
-    native_config.param_file = data_path + "/__params__";
-    native_config.prog_file = data_path + "/__model__";
-    native_config.use_gpu = false;
-    native_config.device = 0;
-    native_config.fraction_of_gpu_memory = 0;
-
+    config.SwitchSpecifyInputNames(true);
    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
-                                          paddle::PaddleEngineKind::kNative>(
-        native_config);
+    _core = CreatePredictor(config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
@@ -184,54 +154,24 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore {
      return -1;
    }

-    paddle::AnalysisConfig analysis_config;
-    analysis_config.SetModel(data_path);
-    analysis_config.DisableGpu();
-    analysis_config.SwitchSpecifyInputNames(true);
-    analysis_config.SetCpuMathLibraryNumThreads(1);
+    Config config;
+    config.SetModel(data_path);
+    config.DisableGpu();
+    config.SwitchSpecifyInputNames(true);
+    config.SetCpuMathLibraryNumThreads(1);

    if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim();
+      config.EnableMemoryOptim();
    }

    if (params.enable_ir_optimization()) {
-      analysis_config.SwitchIrOptim(true);
+      config.SwitchIrOptim(true);
    } else {
-      analysis_config.SwitchIrOptim(false);
-    }
-
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core =
-        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
+      config.SwitchIrOptim(false);
    }

-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidCpuNativeDirCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    paddle::NativeConfig native_config;
-    native_config.model_dir = data_path;
-    native_config.use_gpu = false;
-    native_config.device = 0;
-    native_config.fraction_of_gpu_memory = 0;
    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
-                                          paddle::PaddleEngineKind::kNative>(
-        native_config);
+    _core = CreatePredictor(config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
@@ -323,209 +263,57 @@ class Parameter {
  float* _params;
 };

-class SigmoidModel {
- public:
-  ~SigmoidModel() {}
-  int load(const char* sigmoid_w_file,
-           const char* sigmoid_b_file,
-           float exp_max,
-           float exp_min) {
-    AutoLock lock(GlobalSigmoidCreateMutex::instance());
-    if (0 != _sigmoid_w.init(2, 1, sigmoid_w_file) || 0 != _sigmoid_w.load()) {
-      LOG(ERROR) << "load params sigmoid_w failed.";
-      return -1;
-    }
-    VLOG(2) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
-            << _sigmoid_w._params[1] << "].";
-    if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
-      LOG(ERROR) << "load params sigmoid_b failed.";
-      return -1;
-    }
-    VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
-            << _sigmoid_b._params[1] << "].";
-    _exp_max_input = exp_max;
-    _exp_min_input = exp_min;
-    return 0;
-  }
-
-  int softmax(float x, double& o) {  // NOLINT
-    float _y0 = x * _sigmoid_w._params[0] + _sigmoid_b._params[0];
-    float _y1 = x * _sigmoid_w._params[1] + _sigmoid_b._params[1];
-    _y0 = (_y0 > _exp_max_input)
-              ? _exp_max_input
-              : ((_y0 < _exp_min_input) ? _exp_min_input : _y0);
-    _y1 = (_y1 > _exp_max_input)
-              ? _exp_max_input
-              : ((_y1 < _exp_min_input) ? _exp_min_input : _y1);
-    o = 1.0f / (1.0f + exp(_y0 - _y1));
-    return 0;
-  }
-
- public:
-  Parameter _sigmoid_w;
-  Parameter _sigmoid_b;
-  float _exp_max_input;
-  float _exp_min_input;
-};
-
-class SigmoidFluidModel {
+class FluidCpuAnalysisEncryptCore : public FluidFamilyCore {
 public:
-  int softmax(float x, double& o) {  // NOLINT
-    return _sigmoid_core->softmax(x, o);
-  }  // NOLINT
-
-  std::unique_ptr<SigmoidFluidModel> Clone() {
-    std::unique_ptr<SigmoidFluidModel> clone_model;
-    clone_model.reset(new SigmoidFluidModel());
-    clone_model->_sigmoid_core = _sigmoid_core;
-    clone_model->_fluid_core = _fluid_core->Clone();
-    return std::move(clone_model);  // NOLINT
+  // Reads the whole of `filename` into `*contents` as raw bytes.
+  //
+  // `*contents` is always cleared first. If the file cannot be opened, its
+  // size cannot be determined, or fewer bytes than expected are read, an
+  // error is logged and `*contents` is left empty; callers detect failure via
+  // contents->empty(). An empty file also yields an empty `*contents`.
+  void ReadBinaryFile(const std::string& filename, std::string* contents) {
+    contents->clear();
+    std::ifstream fin(filename, std::ios::in | std::ios::binary);
+    if (!fin.is_open()) {
+      LOG(ERROR) << "failed to open file: " << filename;
+      return;
+    }
+    fin.seekg(0, std::ios::end);
+    // tellg() returns -1 on failure; passing that to resize() would throw.
+    const std::streamoff file_size = fin.tellg();
+    if (file_size < 0) {
+      LOG(ERROR) << "failed to get size of file: " << filename;
+      return;
+    }
+    if (file_size == 0) {
+      // Nothing to read; indexing element 0 of an empty string is invalid.
+      return;
+    }
+    contents->resize(static_cast<size_t>(file_size));
+    fin.seekg(0, std::ios::beg);
+    fin.read(&(*contents)[0], file_size);
+    if (fin.gcount() != file_size) {
+      LOG(ERROR) << "short read on file: " << filename;
+      contents->clear();
+    }
+    // fin is closed by its destructor.
   }

- public:
-  std::unique_ptr<paddle::PaddlePredictor> _fluid_core;
-  std::shared_ptr<SigmoidModel> _sigmoid_core;
-};
-
-class FluidCpuWithSigmoidCore : public FluidFamilyCore {
- public:
-  virtual ~FluidCpuWithSigmoidCore() {}
-
- public:
  int create(const predictor::InferEngineCreationParams& params) {
-    std::string model_path = params.get_path();
-    size_t pos = model_path.find_last_of("/\\");
-    std::string conf_path = model_path.substr(0, pos);
-    std::string conf_file = model_path.substr(pos);
-    configure::SigmoidConf conf;
-    if (configure::read_proto_conf(conf_path, conf_file, &conf) != 0) {
-      LOG(ERROR) << "failed load model path: " << model_path;
-      return -1;
-    }
-
-    _core.reset(new SigmoidFluidModel);
-
-    std::string fluid_model_data_path = conf.dnn_model_path();
-    predictor::InferEngineCreationParams new_params(params);
-    new_params.set_path(fluid_model_data_path);
-    int ret = load_fluid_model(new_params);
-    if (ret < 0) {
-      LOG(ERROR) << "fail to load fluid model.";
-      return -1;
-    }
-    const char* sigmoid_w_file = conf.sigmoid_w_file().c_str();
-    const char* sigmoid_b_file = conf.sigmoid_b_file().c_str();
-    float exp_max = conf.exp_max_input();
-    float exp_min = conf.exp_min_input();
-    _core->_sigmoid_core.reset(new SigmoidModel);
-    VLOG(2) << "create sigmoid core[" << _core->_sigmoid_core.get()
-            << "], use count[" << _core->_sigmoid_core.use_count() << "].";
-    ret = _core->_sigmoid_core->load(
-        sigmoid_w_file, sigmoid_b_file, exp_max, exp_min);
-    if (ret < 0) {
-      LOG(ERROR) << "fail to load sigmoid model.";
-      return -1;
-    }
-    return 0;
-  }
-
-  virtual bool Run(const void* in_data, void* out_data) {
-    if (!_core->_fluid_core->Run(
-            *(std::vector<paddle::PaddleTensor>*)in_data,
-            (std::vector<paddle::PaddleTensor>*)out_data)) {
-      LOG(ERROR) << "Failed call Run with paddle predictor";
-      return false;
-    }
-
-    return true;
-  }
-
-  virtual int clone(SigmoidFluidModel* origin_core) {
-    if (origin_core == NULL) {
-      LOG(ERROR) << "origin paddle Predictor is null.";
-      return -1;
-    }
-    _core = origin_core->Clone();
-    if (_core.get() == NULL) {
-      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
-      return -1;
-    }
-    VLOG(2) << "clone sigmoid core[" << _core->_sigmoid_core.get()
-            << "] use count[" << _core->_sigmoid_core.use_count() << "].";
-    return 0;
-  }
-
-  virtual SigmoidFluidModel* get() { return _core.get(); }
-
-  virtual int load_fluid_model(
-      const predictor::InferEngineCreationParams& params) = 0;
-
-  int softmax(float x, double& o) {  // NOLINT
-    return _core->_sigmoid_core->softmax(x, o);
-  }
-
- protected:
-  std::unique_ptr<SigmoidFluidModel> _core;  // NOLINT
-};
-
-class FluidCpuNativeDirWithSigmoidCore : public FluidCpuWithSigmoidCore {
- public:
-  int load_fluid_model(const predictor::InferEngineCreationParams& params) {
    std::string data_path = params.get_path();
    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
+      LOG(ERROR) << "create paddle predictor failed, path note exits: "
                 << data_path;
      return -1;
    }

-    paddle::NativeConfig native_config;
-    native_config.model_dir = data_path;
-    native_config.use_gpu = false;
-    native_config.device = 0;
-    native_config.fraction_of_gpu_memory = 0;
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core->_fluid_core =
-        paddle::CreatePaddlePredictor<paddle::NativeConfig,
-                                      paddle::PaddleEngineKind::kNative>(
-            native_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
+    std::string model_buffer, params_buffer, key_buffer;
+    ReadBinaryFile(data_path + "encrypt_model", &model_buffer);
+    ReadBinaryFile(data_path + "encrypt_params", &params_buffer);
+    ReadBinaryFile(data_path + "key", &key_buffer);

-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
+    VLOG(2) << "prepare for encryption model";

-class FluidCpuAnalysisDirWithSigmoidCore : public FluidCpuWithSigmoidCore {
- public:
-  int load_fluid_model(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
+    auto cipher = paddle::MakeCipher("");
+    std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
+    std::string real_params_buffer = cipher->Decrypt(params_buffer, key_buffer);

-    paddle::AnalysisConfig analysis_config;
-    analysis_config.SetModel(data_path);
+    // Feed the decrypted program and parameters to the predictor from memory
+    // instead of from files on disk.
+    Config analysis_config;
+    analysis_config.SetModelBuffer(&real_model_buffer[0],
+                                   real_model_buffer.size(),
+                                   &real_params_buffer[0],
+                                   real_params_buffer.size());
    analysis_config.DisableGpu();
-    analysis_config.SwitchSpecifyInputNames(true);
    analysis_config.SetCpuMathLibraryNumThreads(1);
-
    if (params.enable_memory_optimization()) {
      analysis_config.EnableMemoryOptim();
    }
-
+    analysis_config.SwitchSpecifyInputNames(true);
    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core->_fluid_core =
-        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
+    VLOG(2) << "decrypt model file sucess";
+    _core =
+        CreatePredictor(analysis_config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
    }
-
    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
    return 0;
  }

--- a/paddle_inference/inferencer-fluid-cpu/src/fluid_cpu_engine.cpp
+++ b/paddle_inference/inferencer-fluid-cpu/src/fluid_cpu_engine.cpp
@@ -30,28 +30,13 @@ REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_CPU_ANALYSIS_DIR");

+// FluidInferEngine wrapping FluidCpuAnalysisEncryptCore, registered under the
+// name "FLUID_CPU_ANALYSIS_ENCRYPT".
 REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
     ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidCpuAnalysisDirWithSigmoidCore>,
+        FluidCpuAnalysisEncryptCore>,
     ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_CPU_ANALYSIS_DIR_SIGMOID");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuNativeCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_CPU_NATIVE");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuNativeDirCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_CPU_NATIVE_DIR");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidCpuNativeDirWithSigmoidCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_CPU_NATIVE_DIR_SIGMOID");
-
+    "FLUID_CPU_ANALYSIS_ENCRYPT");
+
 }  // namespace fluid_cpu
 }  // namespace paddle_serving
 }  // namespace baidu
--- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
@@ -61,31 +61,36 @@ class GlobalPaddleCreateMutex {
  pthread_mutex_t _mut;
 };

-class GlobalSigmoidCreateMutex {
- public:
-  pthread_mutex_t& mutex() { return _mut; }
-  static pthread_mutex_t& instance() {
-    static GlobalSigmoidCreateMutex gmutex;
-    return gmutex.mutex();
-  }
-
- private:
-  GlobalSigmoidCreateMutex() { pthread_mutex_init(&_mut, NULL); }
-
-  pthread_mutex_t _mut;
-};
+using paddle_infer::Config;
+using paddle_infer::Predictor;
+using paddle_infer::Tensor;
+using paddle_infer::CreatePredictor;

 // data interface
 class FluidFamilyCore {
 public:
  virtual ~FluidFamilyCore() {}
-  virtual bool Run(const void* in_data, void* out_data) {
-    if (!_core->Run(*(std::vector<paddle::PaddleTensor>*)in_data,
-                    (std::vector<paddle::PaddleTensor>*)out_data)) {
+  // Returns the input names reported by the wrapped predictor.
+  // _core is dereferenced without a null check, so it must already be set.
+  virtual std::vector<std::string> GetInputNames() {
+    return _core->GetInputNames();
+  }
+
+  // Returns the wrapped predictor's handle for the input tensor `name`.
+  // _core is dereferenced without a null check.
+  virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
+    return _core->GetInputHandle(name);
+  }
+
+  // Returns the output names reported by the wrapped predictor.
+  // _core is dereferenced without a null check.
+  virtual std::vector<std::string> GetOutputNames() {
+    return _core->GetOutputNames();
+  }
+
+  // Returns the wrapped predictor's handle for the output tensor `name`.
+  // _core is dereferenced without a null check.
+  virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
+    return _core->GetOutputHandle(name);
+  }
+
+  // Runs the wrapped predictor. Unlike the previous Run(in_data, out_data),
+  // no tensors are passed here; returns false (after logging) when the
+  // predictor's Run() reports failure, true otherwise.
+  virtual bool Run() {
+    if (!_core->Run()) {
       LOG(ERROR) << "Failed call Run with paddle predictor";
       return false;
     }
-
     return true;
   }

@@ -96,8 +101,7 @@ class FluidFamilyCore {
      LOG(ERROR) << "origin paddle Predictor is null.";
      return -1;
    }
-    paddle::PaddlePredictor* p_predictor =
-        (paddle::PaddlePredictor*)origin_core;
+    Predictor* p_predictor = (Predictor*)origin_core;
    _core = p_predictor->Clone();
    if (_core.get() == NULL) {
      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
@@ -109,7 +113,7 @@ class FluidFamilyCore {
  virtual void* get() { return _core.get(); }

 protected:
-  std::unique_ptr<paddle::PaddlePredictor> _core;
+  std::shared_ptr<Predictor> _core;
 };

 // infer interface
@@ -123,51 +127,19 @@ class FluidGpuAnalysisCore : public FluidFamilyCore {
      return -1;
    }

-    paddle::AnalysisConfig analysis_config;
-    analysis_config.SetParamsFile(data_path + "/__params__");
-    analysis_config.SetProgFile(data_path + "/__model__");
-    analysis_config.EnableUseGpu(100, FLAGS_gpuid);
-    analysis_config.SetCpuMathLibraryNumThreads(1);
+    Config config;
+    config.SetParamsFile(data_path + "/__params__");
+    config.SetProgFile(data_path + "/__model__");
+    config.EnableUseGpu(100, FLAGS_gpuid);
+    config.SetCpuMathLibraryNumThreads(1);

    if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim();
+      config.EnableMemoryOptim();
    }

-    analysis_config.SwitchSpecifyInputNames(true);
-
+    config.SwitchSpecifyInputNames(true);
    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core =
-        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidGpuNativeCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    paddle::NativeConfig native_config;
-    native_config.param_file = data_path + "/__params__";
-    native_config.prog_file = data_path + "/__model__";
-    native_config.use_gpu = true;
-    native_config.fraction_of_gpu_memory = 0.01;
-    native_config.device = FLAGS_gpuid;
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
-                                          paddle::PaddleEngineKind::kNative>(
-        native_config);
+    _core = CreatePredictor(config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
@@ -188,110 +160,38 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
      return -1;
    }

-    paddle::AnalysisConfig analysis_config;
-    analysis_config.SetModel(data_path);
-    analysis_config.EnableUseGpu(1500, FLAGS_gpuid);
-    analysis_config.SwitchSpecifyInputNames(true);
-    analysis_config.SetCpuMathLibraryNumThreads(1);
+    Config config;
+    config.SetModel(data_path);
+    config.EnableUseGpu(1500, FLAGS_gpuid);
+    config.SwitchSpecifyInputNames(true);
+    config.SetCpuMathLibraryNumThreads(1);

    if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim();
-    }
-
-#if 0  // todo: support flexible shape
-
-    int min_seq_len = 1;
-    int max_seq_len = 512;
-    int opt_seq_len = 128;
-    int head_number = 12;
-    int batch = 50;
-
-    std::vector<int> min_in_shape = {batch, min_seq_len, 1};
-    std::vector<int> max_in_shape = {batch, max_seq_len, 1};
-    std::vector<int> opt_in_shape = {batch, opt_seq_len, 1};
-
-    std::string input1_name = "src_text_a_ids";
-    std::string input2_name = "pos_text_a_ids";
-    std::string input3_name = "sent_text_a_ids";
-    std::string input4_name = "stack_0.tmp_0";
-
-    std::map<std::string, std::vector<int>> min_input_shape = {
-        {input1_name, min_in_shape},
-        {input2_name, min_in_shape},
-        {input3_name, min_in_shape},
-        {input4_name, {batch, head_number, min_seq_len, min_seq_len}},
-    };
-
-    std::map<std::string, std::vector<int>> max_input_shape = {
-        {input1_name, max_in_shape},
-        {input2_name, max_in_shape},
-        {input3_name, max_in_shape},
-        {input4_name, {batch, head_number, max_seq_len, max_seq_len}},
-    };
-    std::map<std::string, std::vector<int>> opt_input_shape = {
-        {input1_name, opt_in_shape},
-        {input2_name, opt_in_shape},
-        {input3_name, opt_in_shape},
-        {input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
-    };
-
-    analysis_config.SetTRTDynamicShapeInfo(
-        min_input_shape, max_input_shape, opt_input_shape);
-#endif
+      config.EnableMemoryOptim();
+    }
    int max_batch = 32;
    int min_subgraph_size = 3;
    if (params.use_trt()) {
-      analysis_config.EnableTensorRtEngine(
-          1 << 20,
+      config.EnableTensorRtEngine(1 << 20,
                                  max_batch,
                                  min_subgraph_size,
-          paddle::AnalysisConfig::Precision::kFloat32,
+                                  Config::Precision::kFloat32,
                                  false,
                                  false);
      LOG(INFO) << "create TensorRT predictor";
    } else {
      if (params.enable_memory_optimization()) {
-        analysis_config.EnableMemoryOptim();
+        config.EnableMemoryOptim();
      }

      if (params.enable_ir_optimization()) {
-        analysis_config.SwitchIrOptim(true);
+        config.SwitchIrOptim(true);
      } else {
-        analysis_config.SwitchIrOptim(false);
+        config.SwitchIrOptim(false);
      }
    }
    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core =
-        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidGpuNativeDirCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    paddle::NativeConfig native_config;
-    native_config.model_dir = data_path;
-    native_config.use_gpu = true;
-    native_config.fraction_of_gpu_memory = 0.01;
-    native_config.device = FLAGS_gpuid;
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
-                                          paddle::PaddleEngineKind::kNative>(
-        native_config);
+    _core = CreatePredictor(config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
@@ -383,214 +283,6 @@ class Parameter {
  float* _params;
 };

-class SigmoidModel {
- public:
-  ~SigmoidModel() {}
-  int load(const char* sigmoid_w_file,
-           const char* sigmoid_b_file,
-           float exp_max,
-           float exp_min) {
-    AutoLock lock(GlobalSigmoidCreateMutex::instance());
-    if (0 != _sigmoid_w.init(2, 1, sigmoid_w_file) || 0 != _sigmoid_w.load()) {
-      LOG(ERROR) << "load params sigmoid_w failed.";
-      return -1;
-    }
-    VLOG(2) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
-            << _sigmoid_w._params[1] << "].";
-    if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
-      LOG(ERROR) << "load params sigmoid_b failed.";
-      return -1;
-    }
-    VLOG(2) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
-            << _sigmoid_b._params[1] << "].";
-    _exp_max_input = exp_max;
-    _exp_min_input = exp_min;
-    return 0;
-  }
-
-  int softmax(float x, double& o) {  // NOLINT
-    float _y0 = x * _sigmoid_w._params[0] + _sigmoid_b._params[0];
-    float _y1 = x * _sigmoid_w._params[1] + _sigmoid_b._params[1];
-    _y0 = (_y0 > _exp_max_input)
-              ? _exp_max_input
-              : ((_y0 < _exp_min_input) ? _exp_min_input : _y0);
-    _y1 = (_y1 > _exp_max_input)
-              ? _exp_max_input
-              : ((_y1 < _exp_min_input) ? _exp_min_input : _y1);
-    o = 1.0f / (1.0f + exp(_y0 - _y1));
-    return 0;
-  }
-
- public:
-  Parameter _sigmoid_w;
-  Parameter _sigmoid_b;
-  float _exp_max_input;
-  float _exp_min_input;
-};
-
-class SigmoidFluidModel {
- public:
-  int softmax(float x, double& o) {  // NOLINT
-    return _sigmoid_core->softmax(x, o);
-  }  // NOLINT
-
-  std::unique_ptr<SigmoidFluidModel> Clone() {
-    std::unique_ptr<SigmoidFluidModel> clone_model;
-    clone_model.reset(new SigmoidFluidModel());
-    clone_model->_sigmoid_core = _sigmoid_core;
-    clone_model->_fluid_core = _fluid_core->Clone();
-    return std::move(clone_model);
-  }
-
- public:
-  std::unique_ptr<paddle::PaddlePredictor> _fluid_core;
-  std::shared_ptr<SigmoidModel> _sigmoid_core;
-};
-
-class FluidGpuWithSigmoidCore : public FluidFamilyCore {
- public:
-  virtual ~FluidGpuWithSigmoidCore() {}
-
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string model_path = params.get_path();
-    size_t pos = model_path.find_last_of("/\\");
-    std::string conf_path = model_path.substr(0, pos);
-    std::string conf_file = model_path.substr(pos);
-    configure::SigmoidConf conf;
-    if (configure::read_proto_conf(conf_path, conf_file, &conf) != 0) {
-      LOG(ERROR) << "failed load model path: " << model_path;
-      return -1;
-    }
-
-    _core.reset(new SigmoidFluidModel);
-
-    std::string fluid_model_data_path = conf.dnn_model_path();
-    predictor::InferEngineCreationParams new_params(params);
-    new_params.set_path(fluid_model_data_path);
-    int ret = load_fluid_model(new_params);
-    if (ret < 0) {
-      LOG(ERROR) << "fail to load fluid model.";
-      return -1;
-    }
-    const char* sigmoid_w_file = conf.sigmoid_w_file().c_str();
-    const char* sigmoid_b_file = conf.sigmoid_b_file().c_str();
-    float exp_max = conf.exp_max_input();
-    float exp_min = conf.exp_min_input();
-    _core->_sigmoid_core.reset(new SigmoidModel);
-    LOG(INFO) << "create sigmoid core[" << _core->_sigmoid_core.get()
-              << "], use count[" << _core->_sigmoid_core.use_count() << "].";
-    ret = _core->_sigmoid_core->load(
-        sigmoid_w_file, sigmoid_b_file, exp_max, exp_min);
-    if (ret < 0) {
-      LOG(ERROR) << "fail to load sigmoid model.";
-      return -1;
-    }
-    return 0;
-  }
-
-  virtual bool Run(const void* in_data, void* out_data) {
-    if (!_core->_fluid_core->Run(
-            *(std::vector<paddle::PaddleTensor>*)in_data,
-            (std::vector<paddle::PaddleTensor>*)out_data)) {
-      LOG(ERROR) << "Failed call Run with paddle predictor";
-      return false;
-    }
-
-    return true;
-  }
-
-  virtual int clone(SigmoidFluidModel* origin_core) {
-    if (origin_core == NULL) {
-      LOG(ERROR) << "origin paddle Predictor is null.";
-      return -1;
-    }
-    _core = origin_core->Clone();
-    if (_core.get() == NULL) {
-      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
-      return -1;
-    }
-    LOG(INFO) << "clone sigmoid core[" << _core->_sigmoid_core.get()
-              << "] use count[" << _core->_sigmoid_core.use_count() << "].";
-    return 0;
-  }
-
-  virtual SigmoidFluidModel* get() { return _core.get(); }
-
-  virtual int load_fluid_model(
-      const predictor::InferEngineCreationParams& params) = 0;
-
-  int softmax(float x, double& o) {  // NOLINT
-    return _core->_sigmoid_core->softmax(x, o);
-  }
-
- protected:
-  std::unique_ptr<SigmoidFluidModel> _core;
-};
-
-class FluidGpuNativeDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
- public:
-  int load_fluid_model(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    paddle::NativeConfig native_config;
-    native_config.model_dir = data_path;
-    native_config.use_gpu = true;
-    native_config.fraction_of_gpu_memory = 0.01;
-    native_config.device = FLAGS_gpuid;
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core->_fluid_core =
-        paddle::CreatePaddlePredictor<paddle::NativeConfig,
-                                      paddle::PaddleEngineKind::kNative>(
-            native_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidGpuAnalysisDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
- public:
-  int load_fluid_model(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    paddle::AnalysisConfig analysis_config;
-    analysis_config.SetModel(data_path);
-    analysis_config.EnableUseGpu(100, FLAGS_gpuid);
-    analysis_config.SwitchSpecifyInputNames(true);
-    analysis_config.SetCpuMathLibraryNumThreads(1);
-
-    if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim();
-    }
-
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core->_fluid_core =
-        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
 }  // namespace fluid_gpu
 }  // namespace paddle_serving
 }  // namespace baidu
--- a/paddle_inference/inferencer-fluid-gpu/src/fluid_gpu_engine.cpp
+++ b/paddle_inference/inferencer-fluid-gpu/src/fluid_gpu_engine.cpp
@@ -32,28 +32,6 @@ REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_GPU_ANALYSIS_DIR");

-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidGpuAnalysisDirWithSigmoidCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_GPU_ANALYSIS_DIR_SIGMOID");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuNativeCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_GPU_NATIVE");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuNativeDirCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_GPU_NATIVE_DIR");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidGpuNativeDirWithSigmoidCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_GPU_NATIVE_DIR_SIGMOID");
-
 }  // namespace fluid_gpu
 }  // namespace paddle_serving
 }  // namespace baidu
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -7,7 +7,7 @@ if (CLIENT)
 endif()

 if (SERVER)
-    if (NOT WITH_GPU)
+    if (NOT WITH_GPU AND NOT WITH_LITE)
        file(INSTALL pipeline DESTINATION paddle_serving_server)
        file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
    else()
@@ -34,7 +34,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
 endif()

 if (SERVER)
-    if (NOT WITH_GPU)
+    if (NOT WITH_GPU AND NOT WITH_LITE)
        configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
            ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
    else()
@@ -72,7 +72,7 @@ add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINA
 endif()

 if (SERVER)
-    if(NOT WITH_GPU)
+    if(NOT WITH_GPU AND NOT WITH_LITE)
        add_custom_command(
            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
            COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/
@@ -90,6 +90,16 @@ if (SERVER)
            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+    elseif(WITH_LITE)  # Paddle Lite server packaging; taken whenever WITH_LITE is ON, even if WITH_GPU is ON too
+        add_custom_command(
+            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp  # NOTE(review): no command listed here visibly writes .timestamp -- confirm what creates it
+            COMMAND cp -r  # stage the paddle_serving_server_gpu Python package (source and destination are on the next line)
+            ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
+            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py  # its arguments continue on the next line
+            "server_gpu" arm  # NOTE(review): "arm" presumably tags the target platform -- confirm against gen_version.py
+            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel  # build the wheel from the staged package
+            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
+        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)  # ALL: built as part of the default target
    else()
        add_custom_command(
            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp

--- a/python/examples/criteo_ctr_with_cube/README.md
+++ b/python/examples/criteo_ctr_with_cube/README.md
-## Criteo CTR with Sparse Parameter Indexing Service
-
-([简体中文](./README_CN.md)|English)
-
-### Get Sample Dataset
-
-go to directory `python/examples/criteo_ctr_with_cube`
-```
-sh get_data.sh
-```
-
-### Download Model and Sparse Parameter Sequence Files
-```
-wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
-tar xf ctr_cube_unittest.tar.gz
-mv models/ctr_client_conf ./
-mv models/ctr_serving_model_kv ./
-mv models/data ./cube/
-```
-the model will be in ./ctr_server_model_kv and ./ctr_client_config.
-
-### Start Sparse Parameter Indexing Service
-```
-wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
-tar xf cube_app.tar.gz
-mv cube_app/cube* ./cube/
-sh cube_prepare.sh &
-```
-
-Here, the sparse parameter is loaded by cube sparse parameter indexing service Cube.
-
-### Start RPC Predictor, the number of serving thread is 4（configurable in test_server.py）
-
-```
-python test_server.py ctr_serving_model_kv 
-```
-
-### Run Prediction
-
-```
-python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
-```
-
-### Benchmark
-
-CPU ：Intel(R) Xeon(R) CPU 6148 @ 2.40GHz 
-
-Model ：[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
-
-server core/thread num ： 4/8
-
-Run
-```
-bash benchmark.sh
-```
-1000 batches will be sent by every client
-
-| client  thread num | prepro | client infer | op0    | op1   | op2    | postpro | avg_latency | qps   |
-| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
-| 1                  | 0.035  | 1.596        | 0.021  | 0.518 | 0.0024 | 0.0025  | 6.774 | 147.7 |
-| 2                  | 0.034  | 1.780        | 0.027  | 0.463 | 0.0020 | 0.0023  | 6.931 | 288.3 |
-| 4                  | 0.038  | 2.954        | 0.025  | 0.455 | 0.0019 | 0.0027  | 8.378 | 477.5 |
-| 8                  | 0.044  | 8.230        | 0.028  | 0.464 | 0.0023 | 0.0034  | 14.191 | 563.8 |
-| 16                 | 0.048  | 21.037       | 0.028  | 0.455 | 0.0025 | 0.0041  | 27.236 | 587.5 |
-
-the average latency of threads
-
-![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
-
-The QPS is 
-
-![qps](../../../doc/criteo-cube-benchmark-qps.png)
--- a/python/examples/criteo_ctr_with_cube/README_CN.md
+++ b/python/examples/criteo_ctr_with_cube/README_CN.md
-## 带稀疏参数索引服务的CTR预测服务
-(简体中文|[English](./README.md))
-
-### 获取样例数据
-进入目录 `python/examples/criteo_ctr_with_cube`
-```
-sh get_data.sh
-```
-
-### 下载模型和稀疏参数序列文件
-```
-wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
-tar xf ctr_cube_unittest.tar.gz
-mv models/ctr_client_conf ./
-mv models/ctr_serving_model_kv ./
-mv models/data ./cube/
-```
-执行脚本后会在当前目录有ctr_server_model_kv和ctr_client_config文件夹。
-
-### 启动稀疏参数索引服务
-```
-wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
-tar xf cube_app.tar.gz
-mv cube_app/cube* ./cube/
-sh cube_prepare.sh &
-```
-
-此处，模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中。
-
-### 启动RPC预测服务，服务端线程数为4（可在test_server.py配置）
-
-```
-python test_server.py ctr_serving_model_kv 
-```
-
-### 执行预测
-
-```
-python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
-```
-
-### Benchmark
-
-设备 ：Intel(R) Xeon(R) CPU 6148 @ 2.40GHz 
-
-模型 ：[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
-
-server core/thread num ： 4/8
-
-执行
-```
-bash benchmark.sh
-```
-客户端每个线程会发送1000个batch
-
-| client  thread num | prepro | client infer | op0    | op1   | op2    | postpro | avg_latency | qps   |
-| ------------------ | ------ | ------------ | ------ | ----- | ------ | ------- | ----- | ----- |
-| 1                  | 0.035  | 1.596        | 0.021  | 0.518 | 0.0024 | 0.0025  | 6.774 | 147.7 |
-| 2                  | 0.034  | 1.780        | 0.027  | 0.463 | 0.0020 | 0.0023  | 6.931 | 288.3 |
-| 4                  | 0.038  | 2.954        | 0.025  | 0.455 | 0.0019 | 0.0027  | 8.378 | 477.5 |
-| 8                  | 0.044  | 8.230        | 0.028  | 0.464 | 0.0023 | 0.0034  | 14.191 | 563.8 |
-| 16                 | 0.048  | 21.037       | 0.028  | 0.455 | 0.0025 | 0.0041  | 27.236 | 587.5 |
-
-平均每个线程耗时图如下
-
-![avg cost](../../../doc/criteo-cube-benchmark-avgcost.png)
-
-每个线程QPS耗时如下
-
-![qps](../../../doc/criteo-cube-benchmark-qps.png)
--- a/python/examples/criteo_ctr_with_cube/args.py
+++ b/python/examples/criteo_ctr_with_cube/args.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-import argparse
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
-    parser.add_argument(
-        '--train_data_path',
-        type=str,
-        default='./data/raw/train.txt',
-        help="The path of training dataset")
-    parser.add_argument(
-        '--sparse_only',
-        type=bool,
-        default=False,
-        help="Whether we use sparse features only")
-    parser.add_argument(
-        '--test_data_path',
-        type=str,
-        default='./data/raw/valid.txt',
-        help="The path of testing dataset")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=1000,
-        help="The size of mini-batch (default:1000)")
-    parser.add_argument(
-        '--embedding_size',
-        type=int,
-        default=10,
-        help="The size for embedding layer (default:10)")
-    parser.add_argument(
-        '--num_passes',
-        type=int,
-        default=10,
-        help="The number of passes to train (default: 10)")
-    parser.add_argument(
-        '--model_output_dir',
-        type=str,
-        default='models',
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--sparse_feature_dim',
-        type=int,
-        default=1000001,
-        help='sparse feature hashing space for index processing')
-    parser.add_argument(
-        '--is_local',
-        type=int,
-        default=1,
-        help='Local train or distributed train (default: 1)')
-    parser.add_argument(
-        '--cloud_train',
-        type=int,
-        default=0,
-        help='Local train or distributed train on paddlecloud (default: 0)')
-    parser.add_argument(
-        '--async_mode',
-        action='store_true',
-        default=False,
-        help='Whether start pserver in async mode to support ASGD')
-    parser.add_argument(
-        '--no_split_var',
-        action='store_true',
-        default=False,
-        help='Whether split variables into blocks when update_method is pserver')
-    parser.add_argument(
-        '--role',
-        type=str,
-        default='pserver',  # trainer or pserver
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The path for model to store (default: 127.0.0.1:6000)')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    return parser.parse_args()
--- a/python/examples/criteo_ctr_with_cube/benchmark.py
+++ b/python/examples/criteo_ctr_with_cube/benchmark.py
-# -*- coding: utf-8 -*-
-#
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-from paddle_serving_client import Client
-import sys
-import os
-import criteo as criteo
-import time
-from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
-from paddle_serving_client.metric import auc
-
-py_version = sys.version_info[0]
-args = benchmark_args()
-
-
-def single_func(idx, resource):
-    client = Client()
-    print([resource["endpoint"][idx % len(resource["endpoint"])]])
-    client.load_client_config('ctr_client_conf/serving_client_conf.prototxt')
-    client.connect(['127.0.0.1:9292'])
-    batch = 1
-    buf_size = 100
-    dataset = criteo.CriteoDataset()
-    dataset.setup(1000001)
-    test_filelists = [
-        "./raw_data/part-%d" % x for x in range(len(os.listdir("./raw_data")))
-    ]
-    reader = dataset.infer_reader(test_filelists[len(test_filelists) - 40:],
-                                  batch, buf_size)
-    if args.request == "rpc":
-        fetch = ["prob"]
-        start = time.time()
-        itr = 1000
-        for ei in range(itr):
-            if args.batch_size > 0:
-                feed_batch = []
-                for bi in range(args.batch_size):
-                    if py_version == 2:
-                        data = reader().next()
-                    else:
-                        data = reader().__next__()
-                    feed_dict = {}
-                    feed_dict['dense_input'] = data[0][0]
-                    for i in range(1, 27):
-                        feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][
-                            i]
-                    feed_batch.append(feed_dict)
-                result = client.predict(feed=feed_batch, fetch=fetch)
-            else:
-                print("unsupport batch size {}".format(args.batch_size))
-
-    elif args.request == "http":
-        raise ("Not support http service.")
-    end = time.time()
-    qps = itr * args.batch_size / (end - start)
-    return [[end - start, qps]]
-
-
-if __name__ == '__main__':
-    multi_thread_runner = MultiThreadRunner()
-    endpoint_list = ["127.0.0.1:9292"]
-    #result = single_func(0, {"endpoint": endpoint_list})
-    start = time.time()
-    result = multi_thread_runner.run(single_func, args.thread,
-                                     {"endpoint": endpoint_list})
-    end = time.time()
-    total_cost = end - start
-    avg_cost = 0
-    qps = 0
-    for i in range(args.thread):
-        avg_cost += result[0][i * 2 + 0]
-        qps += result[0][i * 2 + 1]
-    avg_cost = avg_cost / args.thread
-    print("total cost: {}".format(total_cost))
-    print("average total cost {} s.".format(avg_cost))
-    print("qps {} ins/s".format(qps))
--- a/python/examples/criteo_ctr_with_cube/benchmark.sh
+++ b/python/examples/criteo_ctr_with_cube/benchmark.sh
-rm profile_log
-export FLAGS_profile_client=1
-export FLAGS_profile_server=1
-
-wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
-tar xf ctr_cube_unittest.tar.gz
-mv models/ctr_client_conf ./
-mv models/ctr_serving_model_kv ./
-mv models/data ./cube/
-
-wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
-tar xf cube_app.tar.gz
-mv cube_app/cube* ./cube/
-sh cube_prepare.sh &
-
-python test_server.py ctr_serving_model_kv > serving_log 2>&1 &
-
-for thread_num in 1 4 16
-do
-for batch_size in 1 4 16 64
-do
-    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
-    echo "batch size : $batch_size"
-    echo "thread num : $thread_num"
-    echo "========================================"
-    echo "batch size : $batch_size" >> profile_log
-    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 3 profile >> profile_log
-done
-done
-
-ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/criteo_ctr_with_cube/benchmark_cube.sh
+++ b/python/examples/criteo_ctr_with_cube/benchmark_cube.sh
-rm profile_log
-
-#wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
-#tar xf ctr_cube_unittest.tar.gz
-mv models/ctr_client_conf ./
-mv models/ctr_serving_model_kv ./
-mv models/data ./cube/
-
-#wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
-#tar xf cube_app.tar.gz
-mv cube_app/cube* ./cube/
-sh cube_prepare.sh &
-
-cp ../../../build_server/core/cube/cube-api/cube-cli .
-python gen_key.py
-
-for thread_num in 1 4 16 32
-do
-for batch_size in 1000
-do
-    ./cube-cli -config_file ./cube/conf/cube.conf -keys key -dict test_dict -thread_num $thread_num --batch $batch_size > profile 2>&1
-    echo "batch size : $batch_size"
-    echo "thread num : $thread_num"
-    echo "========================================"
-    echo "batch size : $batch_size" >> profile_log
-    echo "thread num : $thread_num" >> profile_log
-    tail -n 8 profile >> profile_log
-
-done
-done
-
-ps -ef|grep 'cube'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/criteo_ctr_with_cube/clean.sh
+++ b/python/examples/criteo_ctr_with_cube/clean.sh
-ps -ef | grep cube | awk {'print $2'} | xargs kill -9
-rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
-ps -ef | grep test | awk {'print $2'} | xargs kill -9
-ps -ef | grep serving | awk {'print $2'} | xargs kill -9
--- a/python/examples/criteo_ctr_with_cube/criteo.py
+++ b/python/examples/criteo_ctr_with_cube/criteo.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-
-
-class CriteoDataset(object):
-    def setup(self, sparse_feature_dim):
-        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-        self.cont_max_ = [
-            20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
-        ]
-        self.cont_diff_ = [
-            20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
-        ]
-        self.hash_dim_ = sparse_feature_dim
-        # here, training data are lines with line_index < train_idx_
-        self.train_idx_ = 41256555
-        self.continuous_range_ = range(1, 14)
-        self.categorical_range_ = range(14, 40)
-
-    def _process_line(self, line):
-        features = line.rstrip('\n').split('\t')
-        dense_feature = []
-        sparse_feature = []
-        for idx in self.continuous_range_:
-            if features[idx] == '':
-                dense_feature.append(0.0)
-            else:
-                dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
-                                     self.cont_diff_[idx - 1])
-        for idx in self.categorical_range_:
-            sparse_feature.append(
-                [hash(str(idx) + features[idx]) % self.hash_dim_])
-
-        return dense_feature, sparse_feature, [int(features[0])]
-
-    def infer_reader(self, filelist, batch, buf_size):
-        def local_iter():
-            for fname in filelist:
-                with open(fname.strip(), "r") as fin:
-                    for line in fin:
-                        dense_feature, sparse_feature, label = self._process_line(
-                            line)
-                        #yield dense_feature, sparse_feature, label
-                        yield [dense_feature] + sparse_feature + [label]
-
-        import paddle
-        batch_iter = paddle.batch(
-            paddle.reader.shuffle(
-                local_iter, buf_size=buf_size),
-            batch_size=batch)
-        return batch_iter
-
-    def generate_sample(self, line):
-        def data_iter():
-            dense_feature, sparse_feature, label = self._process_line(line)
-            feature_name = ["dense_input"]
-            for idx in self.categorical_range_:
-                feature_name.append("C" + str(idx - 13))
-            feature_name.append("label")
-            yield zip(feature_name, [dense_feature] + sparse_feature + [label])
-
-        return data_iter
-
-
-if __name__ == "__main__":
-    criteo_dataset = CriteoDataset()
-    criteo_dataset.setup(int(sys.argv[1]))
-    criteo_dataset.run_from_stdin()
--- a/python/examples/criteo_ctr_with_cube/criteo_reader.py
+++ b/python/examples/criteo_ctr_with_cube/criteo_reader.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-import sys
-import paddle.fluid.incubate.data_generator as dg
-
-
-class CriteoDataset(dg.MultiSlotDataGenerator):
-    def setup(self, sparse_feature_dim):
-        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-        self.cont_max_ = [
-            20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
-        ]
-        self.cont_diff_ = [
-            20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
-        ]
-        self.hash_dim_ = sparse_feature_dim
-        # here, training data are lines with line_index < train_idx_
-        self.train_idx_ = 41256555
-        self.continuous_range_ = range(1, 14)
-        self.categorical_range_ = range(14, 40)
-
-    def _process_line(self, line):
-        features = line.rstrip('\n').split('\t')
-        dense_feature = []
-        sparse_feature = []
-        for idx in self.continuous_range_:
-            if features[idx] == '':
-                dense_feature.append(0.0)
-            else:
-                dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
-                                     self.cont_diff_[idx - 1])
-        for idx in self.categorical_range_:
-            sparse_feature.append(
-                [hash(str(idx) + features[idx]) % self.hash_dim_])
-
-        return dense_feature, sparse_feature, [int(features[0])]
-
-    def infer_reader(self, filelist, batch, buf_size):
-        def local_iter():
-            for fname in filelist:
-                with open(fname.strip(), "r") as fin:
-                    for line in fin:
-                        dense_feature, sparse_feature, label = self._process_line(
-                            line)
-                        #yield dense_feature, sparse_feature, label
-                        yield [dense_feature] + sparse_feature + [label]
-
-        import paddle
-        batch_iter = paddle.batch(
-            paddle.reader.shuffle(
-                local_iter, buf_size=buf_size),
-            batch_size=batch)
-        return batch_iter
-
-    def generate_sample(self, line):
-        def data_iter():
-            dense_feature, sparse_feature, label = self._process_line(line)
-            feature_name = ["dense_input"]
-            for idx in self.categorical_range_:
-                feature_name.append("C" + str(idx - 13))
-            feature_name.append("label")
-            yield zip(feature_name, [dense_feature] + sparse_feature + [label])
-
-        return data_iter
-
-
-if __name__ == "__main__":
-    criteo_dataset = CriteoDataset()
-    criteo_dataset.setup(int(sys.argv[1]))
-    criteo_dataset.run_from_stdin()
--- a/python/examples/criteo_ctr_with_cube/cube/conf/cube.conf
+++ b/python/examples/criteo_ctr_with_cube/cube/conf/cube.conf
-[{
-    "dict_name": "test_dict",
-    "shard": 1,
-    "dup": 1,
-    "timeout": 200,
-    "retry": 3,
-    "backup_request": 100,
-    "type": "ipport_list",
-    "load_balancer": "rr",
-    "nodes": [{
-        "ipport_list": "list://127.0.0.1:8027"
-    }]
-}]
--- a/python/examples/criteo_ctr_with_cube/cube/conf/gflags.conf
+++ b/python/examples/criteo_ctr_with_cube/cube/conf/gflags.conf
--port=8027
--dict_split=1
--in_mem=true
--log_dir=./log/
--- a/python/examples/criteo_ctr_with_cube/cube/keys
+++ b/python/examples/criteo_ctr_with_cube/cube/keys
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
--- a/python/examples/criteo_ctr_with_cube/cube_prepare.sh
+++ b/python/examples/criteo_ctr_with_cube/cube_prepare.sh
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-#! /bin/bash
-
-mkdir -p cube_model
-mkdir -p cube/data
-./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1  -only_build=false
-cd cube && ./cube
--- a/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
+++ b/python/examples/criteo_ctr_with_cube/cube_quant_prepare.sh
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-#! /bin/bash
-
-mkdir -p cube_model
-mkdir -p cube/data
-./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8  
-./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1  -only_build=false
-mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
-cd cube && ./cube 
--- a/python/examples/criteo_ctr_with_cube/local_train.py
+++ b/python/examples/criteo_ctr_with_cube/local_train.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-from __future__ import print_function
-
-from args import parse_args
-import os
-import paddle.fluid as fluid
-import sys
-from network_conf import dnn_model
-
-dense_feature_dim = 13
-
-
-def train():
-    args = parse_args()
-    sparse_only = args.sparse_only
-    if not os.path.isdir(args.model_output_dir):
-        os.mkdir(args.model_output_dir)
-    dense_input = fluid.layers.data(
-        name="dense_input", shape=[dense_feature_dim], dtype='float32')
-    sparse_input_ids = [
-        fluid.layers.data(
-            name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
-        for i in range(1, 27)
-    ]
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    #nn_input = None if sparse_only else dense_input
-    nn_input = dense_input
-    predict_y, loss, auc_var, batch_auc_var, infer_vars = dnn_model(
-        nn_input, sparse_input_ids, label, args.embedding_size,
-        args.sparse_feature_dim)
-
-    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
-    optimizer.minimize(loss)
-
-    exe = fluid.Executor(fluid.CPUPlace())
-    exe.run(fluid.default_startup_program())
-    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-    dataset.set_use_var([dense_input] + sparse_input_ids + [label])
-
-    python_executable = "python"
-    pipe_command = "{} criteo_reader.py {}".format(python_executable,
-                                                   args.sparse_feature_dim)
-
-    dataset.set_pipe_command(pipe_command)
-    dataset.set_batch_size(128)
-    thread_num = 10
-    dataset.set_thread(thread_num)
-
-    whole_filelist = [
-        "raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
-    ]
-
-    print(whole_filelist)
-    dataset.set_filelist(whole_filelist[:100])
-    dataset.load_into_memory()
-    fluid.layers.Print(auc_var)
-    epochs = 1
-    for i in range(epochs):
-        exe.train_from_dataset(
-            program=fluid.default_main_program(), dataset=dataset, debug=True)
-        print("epoch {} finished".format(i))
-
-    import paddle_serving_client.io as server_io
-    feed_var_dict = {}
-    feed_var_dict['dense_input'] = dense_input
-    for i, sparse in enumerate(sparse_input_ids):
-        feed_var_dict["embedding_{}.tmp_0".format(i)] = sparse
-    fetch_var_dict = {"prob": predict_y}
-
-    feed_kv_dict = {}
-    feed_kv_dict['dense_input'] = dense_input
-    for i, emb in enumerate(infer_vars):
-        feed_kv_dict["embedding_{}.tmp_0".format(i)] = emb
-    fetch_var_dict = {"prob": predict_y}
-
-    server_io.save_model("ctr_serving_model", "ctr_client_conf", feed_var_dict,
-                         fetch_var_dict, fluid.default_main_program())
-
-    server_io.save_model("ctr_serving_model_kv", "ctr_client_conf_kv",
-                         feed_kv_dict, fetch_var_dict,
-                         fluid.default_main_program())
-
-
-if __name__ == '__main__':
-    train()
--- a/python/examples/criteo_ctr_with_cube/network_conf.py
+++ b/python/examples/criteo_ctr_with_cube/network_conf.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-import paddle.fluid as fluid
-import math
-
-
-def dnn_model(dense_input, sparse_inputs, label, embedding_size,
-              sparse_feature_dim):
-    def embedding_layer(input):
-        emb = fluid.layers.embedding(
-            input=input,
-            is_sparse=True,
-            is_distributed=False,
-            size=[sparse_feature_dim, embedding_size],
-            param_attr=fluid.ParamAttr(
-                name="SparseFeatFactors",
-                initializer=fluid.initializer.Uniform()))
-        x = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-        return emb, x
-
-    def mlp_input_tensor(emb_sums, dense_tensor):
-        #if isinstance(dense_tensor, fluid.Variable):
-        #    return fluid.layers.concat(emb_sums, axis=1)
-        #else:
-        return fluid.layers.concat(emb_sums + [dense_tensor], axis=1)
-
-    def mlp(mlp_input):
-        fc1 = fluid.layers.fc(input=mlp_input,
-                              size=400,
-                              act='relu',
-                              param_attr=fluid.ParamAttr(
-                                  initializer=fluid.initializer.Normal(
-                                      scale=1 / math.sqrt(mlp_input.shape[1]))))
-        fc2 = fluid.layers.fc(input=fc1,
-                              size=400,
-                              act='relu',
-                              param_attr=fluid.ParamAttr(
-                                  initializer=fluid.initializer.Normal(
-                                      scale=1 / math.sqrt(fc1.shape[1]))))
-        fc3 = fluid.layers.fc(input=fc2,
-                              size=400,
-                              act='relu',
-                              param_attr=fluid.ParamAttr(
-                                  initializer=fluid.initializer.Normal(
-                                      scale=1 / math.sqrt(fc2.shape[1]))))
-        pre = fluid.layers.fc(input=fc3,
-                              size=2,
-                              act='softmax',
-                              param_attr=fluid.ParamAttr(
-                                  initializer=fluid.initializer.Normal(
-                                      scale=1 / math.sqrt(fc3.shape[1]))))
-        return pre
-
-    emb_pair_sums = list(map(embedding_layer, sparse_inputs))
-    emb_sums = [x[1] for x in emb_pair_sums]
-    infer_vars = [x[0] for x in emb_pair_sums]
-    mlp_in = mlp_input_tensor(emb_sums, dense_input)
-    predict = mlp(mlp_in)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.reduce_sum(cost)
-    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    auc_var, batch_auc_var, auc_states = \
-        fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
-    return predict, avg_cost, auc_var, batch_auc_var, infer_vars
--- a/python/examples/criteo_ctr_with_cube/test_server.py
+++ b/python/examples/criteo_ctr_with_cube/test_server.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-import os
-import sys
-from paddle_serving_server import OpMaker
-from paddle_serving_server import OpSeqMaker
-from paddle_serving_server import Server
-
-op_maker = OpMaker()
-read_op = op_maker.create('general_reader')
-general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer')
-response_op = op_maker.create('general_response')
-
-op_seq_maker = OpSeqMaker()
-op_seq_maker.add_op(read_op)
-op_seq_maker.add_op(general_dist_kv_infer_op)
-op_seq_maker.add_op(response_op)
-
-server = Server()
-server.set_op_sequence(op_seq_maker.get_op_sequence())
-server.set_num_threads(4)
-server.load_model_config(sys.argv[1])
-server.prepare_server(
-    workdir="work_dir1",
-    port=9292,
-    device="cpu",
-    cube_conf="./cube/conf/cube.conf")
-server.run_server()
--- a/python/examples/criteo_ctr_with_cube/test_server_gpu.py
+++ b/python/examples/criteo_ctr_with_cube/test_server_gpu.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-import os
-import sys
-from paddle_serving_server_gpu import OpMaker
-from paddle_serving_server_gpu import OpSeqMaker
-from paddle_serving_server_gpu import Server
-
-op_maker = OpMaker()
-read_op = op_maker.create('general_reader')
-general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer')
-response_op = op_maker.create('general_response')
-
-op_seq_maker = OpSeqMaker()
-op_seq_maker.add_op(read_op)
-op_seq_maker.add_op(general_dist_kv_infer_op)
-op_seq_maker.add_op(response_op)
-
-server = Server()
-server.set_op_sequence(op_seq_maker.get_op_sequence())
-server.set_num_threads(4)
-server.load_model_config(sys.argv[1])
-server.prepare_server(
-    workdir="work_dir1",
-    port=9292,
-    device="cpu",
-    cube_conf="./cube/conf/cube.conf")
-server.run_server()
--- a/python/examples/criteo_ctr_with_cube/test_server_quant.py
+++ b/python/examples/criteo_ctr_with_cube/test_server_quant.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-import os
-import sys
-from paddle_serving_server import OpMaker
-from paddle_serving_server import OpSeqMaker
-from paddle_serving_server import Server
-
-op_maker = OpMaker()
-read_op = op_maker.create('general_reader')
-general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
-response_op = op_maker.create('general_response')
-
-op_seq_maker = OpSeqMaker()
-op_seq_maker.add_op(read_op)
-op_seq_maker.add_op(general_dist_kv_infer_op)
-op_seq_maker.add_op(response_op)
-
-server = Server()
-server.set_op_sequence(op_seq_maker.get_op_sequence())
-server.set_num_threads(4)
-server.load_model_config(sys.argv[1])
-server.prepare_server(
-    workdir="work_dir1",
-    port=9292,
-    device="cpu",
-    cube_conf="./cube/conf/cube.conf")
-server.run_server()
--- a/python/examples/encryption/README.md
+++ b/python/examples/encryption/README.md
+# Encryption Model Prediction
+
+([简体中文](README_CN.md)|English)
+
+## Get Original Model
+
+The example uses the model files of the fit_a_line example as the original (unencrypted) model.
+
+```
+sh get_data.sh
+```
+
+## Encrypt Model
+
+```
+python encrypt.py
+```
+The key is stored in the `key` file, and the encrypted model file and server-side configuration file are stored in the `encrypt_server` directory.
+The client-side configuration file is stored in the `encrypt_client` directory.
+
+## Start Encryption Service
+CPU Service
+```
+python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model
+```
+GPU Service
+```
+python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
+```
+
+## Prediction
+```
+python test_client.py uci_housing_client/serving_client_conf.prototxt
+```
--- a/python/examples/encryption/README_CN.md
+++ b/python/examples/encryption/README_CN.md
+# 加密模型预测
+
+(简体中文|[English](README.md))
+
+## 获取明文模型
+
+示例中使用fit_a_line示例的模型文件作为明文模型
+
+```
+sh get_data.sh
+```
+
+## 模型加密
+
+```
+python encrypt.py
+```
+密钥保存在`key`文件中，加密模型文件以及server端配置文件保存在`encrypt_server`目录下，client端配置文件保存在`encrypt_client`目录下。
+
+## 启动加密预测服务
+CPU预测服务
+```
+python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model
+```
+GPU预测服务
+```
+python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
+```
+
+## 预测
+```
+python test_client.py uci_housing_client/serving_client_conf.prototxt
+```
--- a/python/examples/criteo_ctr_with_cube/gen_key.py
+++ b/python/examples/criteo_ctr_with_cube/gen_key.py
@@ -12,9 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-import random
+from paddle_serving_client.io import inference_model_to_serving

-with open("key", "w") as f:
-    for i in range(1000000):
-        f.write("{}\n".format(random.randint(0, 999999)))
+
+def serving_encryption():
+    inference_model_to_serving(
+        dirname="./uci_housing_model",
+        serving_server="encrypt_server",
+        serving_client="encrypt_client",
+        encryption=True)
+
+
+if __name__ == "__main__":
+    serving_encryption()
--- a/python/examples/criteo_ctr_with_cube/get_data.sh
+++ b/python/examples/criteo_ctr_with_cube/get_data.sh
-wget --no-check-certificate https://paddle-serving.bj.bcebos.com/data/ctr_prediction/ctr_data.tar.gz
-tar -zxvf ctr_data.tar.gz
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing_example/encrypt.tar.gz
+tar -xzf encrypt.tar.gz
+cp -rvf ../fit_a_line/uci_housing_model .
+cp -rvf ../fit_a_line/uci_housing_client .
--- a/python/examples/grpc_impl_example/fit_a_line/test_numpy_input_client.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_numpy_input_client.py
@@ -13,19 +13,20 @@
 # limitations under the License.
 # pylint: disable=doc-string-missing

-from paddle_serving_client import MultiLangClient as Client
-import numpy as np
+from paddle_serving_client import Client
+import sys

 client = Client()
-client.connect(["127.0.0.1:9393"])
+client.load_client_config(sys.argv[1])
+client.use_key("./key")
+client.connect(["127.0.0.1:9300"], encryption=True)

-x = [
-    0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
-    0.4919, 0.1856, 0.0795, -0.0332
-]
-for i in range(3):
-    fetch_map = client.predict(feed={"x": np.array(x)}, fetch=["price"])
-    if fetch_map["serving_status_code"] == 0:
-        print(fetch_map)
-    else:
-        print(fetch_map["serving_status_code"])
+import paddle
+test_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.test(), buf_size=500),
+    batch_size=1)
+
+for data in test_reader():
+    fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"])
+    print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
--- a/python/examples/fit_a_line/README_CN.md
+++ b/python/examples/fit_a_line/README_CN.md
@@ -14,12 +14,6 @@ sh get_data.sh

 ### 开启服务端

-``` shell
-python test_server.py uci_housing_model/
-```
-
-也可以通过下面的一行代码开启默认RPC服务：
-
 ```shell
 python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393
 ```

--- a/python/examples/fit_a_line/local_train.py
+++ b/python/examples/fit_a_line/local_train.py
@@ -16,7 +16,7 @@
 import sys
 import paddle
 import paddle.fluid as fluid
-
+paddle.enable_static()
 train_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.uci_housing.train(), buf_size=500),

--- a/python/examples/fit_a_line/test_server.py
+++ b/python/examples/fit_a_line/test_server.py
@@ -31,6 +31,6 @@ class UciService(WebService):

 uci_service = UciService(name="uci")
 uci_service.load_model_config("uci_housing_model")
-uci_service.prepare_server(workdir="workdir", port=9292)
+uci_service.prepare_server(workdir="workdir", port=9393)
 uci_service.run_rpc_service()
 uci_service.run_web_service()
--- a/python/examples/grpc_impl_example/fit_a_line/README_CN.md
+++ b/python/examples/grpc_impl_example/fit_a_line/README_CN.md
@@ -38,20 +38,9 @@ python test_asyn_client.py
 python test_batch_client.py
 ```

-### 通用 pb 预测
-
-``` shell
-python test_general_pb_client.py
-```
-
 ### 预测超时

 ``` shell
 python test_timeout_client.py
 ```

-### List 输入
-
-``` shell
-python test_list_input_client.py
-```
--- a/python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py
@@ -18,7 +18,7 @@ import functools
 import time
 import threading
 import grpc
-
+import numpy as np
 client = Client()
 client.connect(["127.0.0.1:9393"])

@@ -43,7 +43,8 @@ x = [
 ]
 task_count = 0
 for i in range(3):
-    future = client.predict(feed={"x": x}, fetch=["price"], asyn=True)
+    new_data = np.array(x).astype("float32").reshape((1,13))
+    future = client.predict(feed={"x": new_data}, fetch=["price"], batch=False, asyn=True)
    task_count += 1
    future.add_done_callback(functools.partial(call_back))


--- a/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # pylint: disable=doc-string-missing
 from paddle_serving_client import MultiLangClient as Client
-
+import numpy as np
 client = Client()
 client.connect(["127.0.0.1:9393"])

@@ -24,8 +24,11 @@ x = [
 ]

 for i in range(3):
-    batch_feed = [{"x": x} for j in range(batch_size)]
-    fetch_map = client.predict(feed=batch_feed, fetch=["price"])
+    new_data = np.array(x).astype("float32").reshape((1, 1, 13))
+    batch_data = np.concatenate([new_data, new_data, new_data], axis=0)
+    print(batch_data.shape)
+    fetch_map = client.predict(feed={"x":batch_data}, fetch=["price"], batch=True)
+
    if fetch_map["serving_status_code"] == 0:
        print(fetch_map)
    else:

--- a/python/examples/grpc_impl_example/fit_a_line/test_general_pb_client.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_general_pb_client.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-
-from paddle_serving_client import MultiLangClient as Client
-
-client = Client()
-client.connect(["127.0.0.1:9393"])
-
-x = [
-    0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
-    0.4919, 0.1856, 0.0795, -0.0332
-]
-for i in range(3):
-    fetch_map = client.predict(feed={"x": x}, fetch=["price"], is_python=False)
-    if fetch_map["serving_status_code"] == 0:
-        print(fetch_map)
-    else:
-        print(fetch_map["serving_status_code"])
--- a/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py
@@ -14,16 +14,27 @@
 # pylint: disable=doc-string-missing

 from paddle_serving_client import MultiLangClient as Client
-
+import numpy as np
 client = Client()
 client.connect(["127.0.0.1:9393"])

+"""
+for data in test_reader():
+    new_data = np.zeros((1, 1, 13)).astype("float32")
+    new_data[0] = data[0][0]
+    fetch_map = client.predict(
+        feed={"x": new_data}, fetch=["price"], batch=True)
+    print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
+    print(fetch_map)
+"""
+
 x = [
    0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
    0.4919, 0.1856, 0.0795, -0.0332
 ]
 for i in range(3):
-    fetch_map = client.predict(feed={"x": x}, fetch=["price"])
+    new_data = np.array(x).astype("float32").reshape((1,13))
+    fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=False)
    if fetch_map["serving_status_code"] == 0:
        print(fetch_map)
    else:

--- a/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py
@@ -15,17 +15,18 @@

 from paddle_serving_client import MultiLangClient as Client
 import grpc
-
+import numpy as np
 client = Client()
 client.connect(["127.0.0.1:9393"])
-client.set_rpc_timeout_ms(1)
+client.set_rpc_timeout_ms(40)

 x = [
    0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
    0.4919, 0.1856, 0.0795, -0.0332
 ]
 for i in range(3):
-    fetch_map = client.predict(feed={"x": x}, fetch=["price"])
+    new_data = np.array(x).astype("float32").reshape((1,13))
+    fetch_map = client.predict(feed={"x": new_data}, fetch=["price"], batch=False)
    if fetch_map["serving_status_code"] == 0:
        print(fetch_map)
    elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED:

--- a/python/examples/grpc_impl_example/yolov4/test_client.py
+++ b/python/examples/grpc_impl_example/yolov4/test_client.py
@@ -27,7 +27,7 @@ preprocess = Sequential([
 postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608])
 client = Client()
 client.connect(['127.0.0.1:9393'])
-# client.set_rpc_timeout_ms(10000)
+client.set_rpc_timeout_ms(15000)

 im = preprocess(sys.argv[1])
 fetch_map = client.predict(
@@ -35,7 +35,8 @@ fetch_map = client.predict(
        "image": im,
        "im_size": np.array(list(im.shape[1:])),
    },
-    fetch=["save_infer_model/scale_0.tmp_0"])
+    fetch=["save_infer_model/scale_0.tmp_0"], batch=False)
+print(fetch_map)
 fetch_map.pop("serving_status_code")
 fetch_map["image"] = sys.argv[1]
 postprocess(fetch_map)
--- a/python/examples/pipeline/simple_web_service/config.yml
+++ b/python/examples/pipeline/simple_web_service/config.yml
@@ -3,6 +3,7 @@
 worker_num: 1

 #http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时，不自动生成http_port
+rpc_port: 9998
 http_port: 18082

 dag:
@@ -20,7 +21,7 @@ op:
            model_config: uci_housing_model

            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
-            devices: "0" # "0,1"
+            devices: "" # "0,1"

            #client类型，包括brpc, grpc和local_predictor.local_predictor不启动Serving服务，进程内预测
            client_type: local_predictor

--- a/python/examples/criteo_ctr_with_cube/test_client.py
+++ b/python/examples/criteo_ctr_with_cube/test_client.py
@@ -11,49 +11,50 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# pylint: disable=doc-string-missing
-
-from paddle_serving_client import Client
-import sys
-import os
-import criteo as criteo
-import time
-from paddle_serving_client.metric import auc
+try:
+    from paddle_serving_server.web_service import WebService, Op
+except ImportError:
+    from paddle_serving_server.web_service import WebService, Op
+import logging
 import numpy as np
+from numpy import array
+import sys
+import base64
+
+_LOGGER = logging.getLogger()
+np.set_printoptions(threshold=sys.maxsize)
+class UciOp(Op):
+    def init_op(self):
+        self.separator = ","
+
+    def preprocess(self, input_dicts, data_id, log_id):
+        """
+        Differs from web_server.py:
+        the Java client sends the input as an INDArray, while a RESTful request sends a list.
+        This function simply reshapes the input to the specified shape.
+        """
+        (_, input_dict), = input_dicts.items()
+        _LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format(
+            log_id, input_dict))
+        proc_dict = {}
+        x_value = input_dict["x"]
+        input_dict["x"] = x_value.reshape(1,13)
+        
+        return input_dict, False, None, ""
+
+    def postprocess(self, input_dicts, fetch_dict, log_id):
+        _LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format(
+            log_id, fetch_dict))
+        fetch_dict["price"] = str(fetch_dict["price"][0][0])
+        return fetch_dict, None, ""
+
+
+class UciService(WebService):
+    def get_pipeline_response(self, read_op):
+        uci_op = UciOp(name="uci", input_ops=[read_op])
+        return uci_op
+

-py_version = sys.version_info[0]
-
-client = Client()
-client.load_client_config(sys.argv[1])
-client.connect(["127.0.0.1:9292"])
-
-batch = 1
-buf_size = 100
-dataset = criteo.CriteoDataset()
-dataset.setup(1000001)
-test_filelists = ["{}/part-0".format(sys.argv[2])]
-reader = dataset.infer_reader(test_filelists, batch, buf_size)
-label_list = []
-prob_list = []
-start = time.time()
-for ei in range(10000):
-    if py_version == 2:
-        data = reader().next()
-    else:
-        data = reader().__next__()
-    feed_dict = {}
-    feed_dict['dense_input'] = np.array(data[0][0]).astype("float32").reshape(
-        1, 13)
-    feed_dict['dense_input.lod'] = [0, 1]
-    for i in range(1, 27):
-        tmp_data = np.array(data[0][i]).astype(np.int64)
-        feed_dict["embedding_{}.tmp_0".format(i - 1)] = tmp_data.reshape(
-            (1, len(data[0][i])))
-        feed_dict["embedding_{}.tmp_0.lod".format(i - 1)] = [0, 1]
-    fetch_map = client.predict(feed=feed_dict, fetch=["prob"], batch=True)
-    prob_list.append(fetch_map['prob'][0][1])
-    label_list.append(data[0][-1][0])
-
-print(auc(label_list, prob_list))
-end = time.time()
-print(end - start)
+uci_service = UciService(name="uci")
+uci_service.prepare_pipeline_config("config.yml")
+uci_service.run_service()
--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -57,6 +57,8 @@ class LocalPredictor(object):
                          mem_optim=True,
                          ir_optim=False,
                          use_trt=False,
+                          use_lite=False,
+                          use_xpu=False,
                          use_feed_fetch_ops=False):
        """
        Load model config and set the engine config for the paddle predictor
@@ -70,6 +72,8 @@ class LocalPredictor(object):
            mem_optim: memory optimization, True default.
            ir_optim: open calculation chart optimization, False default.
            use_trt: use nvidia TensorRT optimization, False default
+            use_lite: use Paddle-Lite Engine, False default
+            use_xpu: run predict on Baidu Kunlun, False default
            use_feed_fetch_ops: use feed/fetch ops, False default.
        """
        client_config = "{}/serving_server_conf.prototxt".format(model_path)
@@ -80,9 +84,9 @@ class LocalPredictor(object):
        config = AnalysisConfig(model_path)
        logger.info("load_model_config params: model_path:{}, use_gpu:{},\
            gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
-            use_trt:{}, use_feed_fetch_ops:{}".format(
+            use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format(
            model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
-            ir_optim, use_trt, use_feed_fetch_ops))
+            ir_optim, use_trt, use_lite, use_xpu, use_feed_fetch_ops))

        self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -119,6 +123,17 @@ class LocalPredictor(object):
                    use_static=False,
                    use_calib_mode=False)

+        if use_lite:
+            config.enable_lite_engine(
+                precision_mode = PrecisionType.Float32,
+                zero_copy = True,
+                passes_filter = [],
+                ops_filter = []
+            )
+
+        if use_xpu:
+            config.enable_xpu(100 * 1024 * 1024)
+
        self.predictor = create_paddle_predictor(config)

    def predict(self, feed=None, fetch=None, batch=False, log_id=0):

--- a/python/paddle_serving_client/__init__.py
+++ b/python/paddle_serving_client/__init__.py
@@ -19,6 +19,9 @@ from .proto import sdk_configure_pb2 as sdk
 from .proto import general_model_config_pb2 as m_config
 import google.protobuf.text_format
 import numpy as np
+import requests
+import json
+import base64
 import time
 import sys

@@ -161,6 +164,7 @@ class Client(object):
        self.fetch_names_to_idx_ = {}
        self.lod_tensor_set = set()
        self.feed_tensor_len = {}
+        self.key = None

        for i, var in enumerate(model_conf.feed_var):
            self.feed_names_to_idx_[var.alias_name] = i
@@ -193,7 +197,28 @@ class Client(object):
        else:
            self.rpc_timeout_ms = rpc_timeout

-    def connect(self, endpoints=None):
+    def use_key(self, key_filename):  # cache the key file's contents in self.key
+        with open(key_filename, "rb") as f:  # binary mode: the key is base64-encoded as bytes below
+            self.key = f.read()
+
+    def get_serving_port(self, endpoints):  # POST to endpoints[0]; return ["host:port"] built from the reply
+        if self.key is not None:
+            req = json.dumps({"key": base64.b64encode(self.key).decode("ascii")})  # b64encode takes/returns bytes; json needs str
+        else:
+            req = json.dumps({})  # no key loaded: send an empty JSON object
+        r = requests.post("http://" + endpoints[0], req)
+        result = r.json()
+        print(result)
+        if "endpoint_list" not in result:
+            raise ValueError("server not ready")  # reply carries no "endpoint_list" entry
+        else:
+            endpoints = [  # keep the original host, swap in the first entry the server returned
+                endpoints[0].split(":")[0] + ":" +
+                str(result["endpoint_list"][0])
+            ]
+            return endpoints
+
+    def connect(self, endpoints=None, encryption=False):
        # check whether current endpoint is available
        # init from client config
        # create predictor here
@@ -203,6 +228,8 @@ class Client(object):
                    "You must set the endpoints parameter or use add_variant function to create a variant."
                )
        else:
+            if encryption:  # replace endpoints with the one returned by get_serving_port()
+                endpoints = self.get_serving_port(endpoints)
            if self.predictor_sdk_ is None:
                self.add_variant('default_tag_{}'.format(id(self)), endpoints,
                                 100)
@@ -522,20 +549,15 @@ class MultiLangClient(object):
        req.fetch_var_names.extend(fetch)
        req.is_python = is_python
        req.log_id = log_id
-        feed_batch = None
-        if isinstance(feed, dict):
-            feed_batch = [feed]
-        elif isinstance(feed, list):
-            feed_batch = feed
-        else:
-            raise Exception("{} not support".format(type(feed)))
-        req.feed_var_names.extend(feed_batch[0].keys())
-        init_feed_names = False
-        for feed_data in feed_batch:
+        feed_var_names = []
+        for key in feed.keys():
+            if '.lod' not in key:
+                feed_var_names.append(key)
+        req.feed_var_names.extend(feed_var_names)
        inst = multi_lang_general_model_service_pb2.FeedInst()
        for name in req.feed_var_names:
            tensor = multi_lang_general_model_service_pb2.Tensor()
-                var = feed_data[name]
+            var = feed[name]
            v_type = self.feed_types_[name]
            if is_python:
                data = None
@@ -564,34 +586,9 @@ class MultiLangClient(object):
                else:
                    raise Exception("var must be list or ndarray.")
                tensor.data = data.tobytes()
-                else:
-                    if isinstance(var, np.ndarray):
-                        if v_type == 0:  # int64
-                            tensor.int64_data.extend(
-                                var.reshape(-1).astype("int64").tolist())
-                        elif v_type == 1:
-                            tensor.float_data.extend(
-                                var.reshape(-1).astype('float32').tolist())
-                        elif v_type == 2:
-                            tensor.int_data.extend(
-                                var.reshape(-1).astype('int32').tolist())
-                        else:
-                            raise Exception("error tensor value type.")
-                    elif isinstance(var, list):
-                        if v_type == 0:
-                            tensor.int64_data.extend(self._flatten_list(var))
-                        elif v_type == 1:
-                            tensor.float_data.extend(self._flatten_list(var))
-                        elif v_type == 2:
-                            tensor.int_data.extend(self._flatten_list(var))
-                        else:
-                            raise Exception("error tensor value type.")
-                    else:
-                        raise Exception("var must be list or ndarray.")
-                if isinstance(var, np.ndarray):
            tensor.shape.extend(list(var.shape))
-                else:
-                    tensor.shape.extend(self.feed_shapes_[name])
+            if "{}.lod".format(name) in feed.keys():
+                tensor.lod.extend(feed["{}.lod".format(name)])
            inst.tensor_array.append(tensor)
        req.insts.append(inst)
        return req
@@ -652,10 +649,17 @@ class MultiLangClient(object):
    def predict(self,
                feed,
                fetch,
+                batch=True,
                need_variant_tag=False,
                asyn=False,
                is_python=True,
                log_id=0):
+        if isinstance(feed, dict) is False:
+            raise ValueError("Type Error. grpc feed must be dict.")
+        if batch is False:
+            for key in feed:
+                if ".lod" not in key:
+                    feed[key] = feed[key][np.newaxis, :]
        if not asyn:
            try:
                self.profile_.record('py_prepro_0')

--- a/python/paddle_serving_client/io/__init__.py
+++ b/python/paddle_serving_client/io/__init__.py
@@ -21,15 +21,105 @@ from paddle.fluid.framework import Program
 from paddle.fluid import CPUPlace
 from paddle.fluid.io import save_inference_model
 import paddle.fluid as fluid
+from paddle.fluid.core import CipherUtils
+from paddle.fluid.core import CipherFactory
+from paddle.fluid.core import Cipher
 from ..proto import general_model_config_pb2 as model_conf
 import os
+import paddle
+import paddle.nn.functional as F
+import errno
+from paddle.jit import to_static

+def save_dygraph_model(serving_model_folder, client_config_folder, model):
+    """Export a dygraph model as a Paddle Serving server/client bundle.
+
+    The model is saved with ``paddle.jit.save`` under the temporary prefix
+    "serving_tmp" in the current directory, reloaded to read its input and
+    output specs, and then moved into ``serving_model_folder`` as
+    ``__model__`` / ``__params__``. The feed/fetch description is written to
+    both folders in text and serialized form.
+
+    Args:
+        serving_model_folder: directory that receives the model, the params
+            and the server-side config files (created if missing).
+        client_config_folder: directory that receives the client-side config
+            files (created if missing).
+        model: object handed to ``paddle.jit.save``.
+    """
+    # Local imports: only this function needs them.
+    import glob
+    import shutil
+
+    paddle.jit.save(model, "serving_tmp")
+    loaded_layer = paddle.jit.load(
+        path=".",
+        model_filename="serving_tmp.pdmodel",
+        params_filename="serving_tmp.pdiparams")
+    feed_target_names = [x.name for x in loaded_layer._input_spec()]
+    fetch_target_names = [x.name for x in loaded_layer._output_spec()]
+
+    inference_program = loaded_layer.program()
+    feed_var_dict = {
+        x: inference_program.global_block().var(x)
+        for x in feed_target_names
+    }
+    fetch_var_dict = {
+        x: inference_program.global_block().var(x)
+        for x in fetch_target_names
+    }
+    config = model_conf.GeneralModelConfig()
+
+    #int64 = 0; float32 = 1; int32 = 2;
+    for key in feed_var_dict:
+        feed_var = model_conf.FeedVar()
+        feed_var.alias_name = key
+        feed_var.name = feed_var_dict[key].name
+        feed_var.is_lod_tensor = feed_var_dict[key].lod_level >= 1
+        dtype = feed_var_dict[key].dtype
+        # Any other dtype leaves feed_type at its protobuf default.
+        if dtype == core.VarDesc.VarType.INT64:
+            feed_var.feed_type = 0
+        elif dtype == core.VarDesc.VarType.FP32:
+            feed_var.feed_type = 1
+        elif dtype == core.VarDesc.VarType.INT32:
+            feed_var.feed_type = 2
+        if feed_var.is_lod_tensor:
+            feed_var.shape.extend([-1])
+        else:
+            # Negative entries are dropped from the recorded shape.
+            feed_var.shape.extend(
+                [v for v in feed_var_dict[key].shape if v >= 0])
+        config.feed_var.extend([feed_var])
+    for key in fetch_var_dict:
+        fetch_var = model_conf.FetchVar()
+        fetch_var.alias_name = key
+        fetch_var.name = fetch_var_dict[key].name
+        # Fetch vars are always flagged as LoD tensors here, so the recorded
+        # shape is always [-1] (the former static-shape branch was dead).
+        fetch_var.is_lod_tensor = 1
+        dtype = fetch_var_dict[key].dtype
+        if dtype == core.VarDesc.VarType.INT64:
+            fetch_var.fetch_type = 0
+        elif dtype == core.VarDesc.VarType.FP32:
+            fetch_var.fetch_type = 1
+        elif dtype == core.VarDesc.VarType.INT32:
+            fetch_var.fetch_type = 2
+        fetch_var.shape.extend([-1])
+        config.fetch_var.extend([fetch_var])
+
+    # Filesystem work is done through os/shutil instead of shell strings so
+    # folder names containing spaces or shell metacharacters are handled
+    # and failures surface as exceptions.
+    for folder in (client_config_folder, serving_model_folder):
+        try:
+            os.makedirs(folder)
+        except OSError as e:
+            # Same tolerance as `mkdir -p`: an existing directory is fine.
+            if e.errno != errno.EEXIST or not os.path.isdir(folder):
+                raise
+    shutil.move("serving_tmp.pdmodel",
+                "{}/__model__".format(serving_model_folder))
+    shutil.move("serving_tmp.pdiparams",
+                "{}/__params__".format(serving_model_folder))
+    # Best-effort cleanup of any other "serving_tmp.pd*" leftovers.
+    for leftover in glob.glob("serving_tmp.pd*"):
+        try:
+            if os.path.isdir(leftover):
+                shutil.rmtree(leftover)
+            else:
+                os.remove(leftover)
+        except OSError:
+            pass
+
+    with open("{}/serving_client_conf.prototxt".format(client_config_folder),
+              "w") as fout:
+        fout.write(str(config))
+    with open("{}/serving_server_conf.prototxt".format(serving_model_folder),
+              "w") as fout:
+        fout.write(str(config))
+    with open("{}/serving_client_conf.stream.prototxt".format(
+            client_config_folder), "wb") as fout:
+        fout.write(config.SerializeToString())
+    with open("{}/serving_server_conf.stream.prototxt".format(
+            serving_model_folder), "wb") as fout:
+        fout.write(config.SerializeToString())

 def save_model(server_model_folder,
               client_config_folder,
               feed_var_dict,
               fetch_var_dict,
-               main_program=None):
+	       main_program=None,
+               encryption=False,
+               key_len=128,
+               encrypt_conf=None):
    executor = Executor(place=CPUPlace())

    feed_var_names = [feed_var_dict[x].name for x in feed_var_dict]
@@ -39,12 +129,31 @@ def save_model(server_model_folder,
        target_vars.append(fetch_var_dict[key])
        target_var_names.append(key)

+    if not encryption:
 	save_inference_model(
            server_model_folder,
            feed_var_names,
            target_vars,
            executor,
+            model_filename="__model__",
+            params_filename="__params__",
            main_program=main_program)
+    else:
+        if encrypt_conf is None:
+            aes_cipher = CipherFactory.create_cipher()
+        else:
+            #todo: more encryption algorithms
+            # Fail explicitly; falling through would hit a NameError on the
+            # never-assigned cipher below.
+            raise NotImplementedError(
+                "encrypt_conf is not supported yet, only the default cipher")
+        # Honour the caller-supplied key length (default 128) instead of a
+        # hard-coded value.
+        key = CipherUtils.gen_key_to_file(key_len, "key")
+        params = fluid.io.save_persistables(
+            executor=executor, dirname=None, main_program=main_program)
+        model = main_program.desc.serialize_to_string()
+        if not os.path.exists(server_model_folder):
+            os.makedirs(server_model_folder)
+        # Write straight into the target folder. Changing directory and then
+        # going back with ".." only restores the working directory when the
+        # folder is a single relative path component.
+        aes_cipher.encrypt_to_file(
+            params, key, os.path.join(server_model_folder, "encrypt_params"))
+        aes_cipher.encrypt_to_file(
+            model, key, os.path.join(server_model_folder, "encrypt_model"))

    config = model_conf.GeneralModelConfig()

@@ -116,7 +225,11 @@ def inference_model_to_serving(dirname,
                               serving_server="serving_server",
                               serving_client="serving_client",
                               model_filename=None,
-                               params_filename=None):
+                               params_filename=None,
+                               encryption=False,
+                               key_len=128,
+                               encrypt_conf=None):
+    paddle.enable_static()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_program, feed_target_names, fetch_targets = \
@@ -127,7 +240,7 @@ def inference_model_to_serving(dirname,
    }
    fetch_dict = {x.name: x for x in fetch_targets}
    save_model(serving_server, serving_client, feed_dict, fetch_dict,
-               inference_program)
+               inference_program, encryption, key_len, encrypt_conf)
    feed_names = feed_dict.keys()
    fetch_names = fetch_dict.keys()
    return feed_names, fetch_names
--- a/python/paddle_serving_server/__init__.py
+++ b/python/paddle_serving_server/__init__.py
@@ -157,6 +157,7 @@ class Server(object):
        self.cur_path = os.getcwd()
        self.use_local_bin = False
        self.mkl_flag = False
+        self.encryption_model = False
 	self.product_name = None
        self.container_id = None
        self.model_config_paths = None  # for multi-model in a workflow
@@ -196,6 +197,8 @@ class Server(object):

    def set_ir_optimize(self, flag=False):
        self.ir_optimization = flag
+    def use_encryption_model(self, flag=False):
+        """Record whether the model to serve is encrypted.
+
+        The stored flag is consulted when the engine type string is chosen
+        for each engine configuration.
+        """
+        self.encryption_model = flag

    def set_product_name(self, product_name=None):
        if product_name == None:
@@ -230,11 +233,21 @@ class Server(object):
            engine.enable_ir_optimization = self.ir_optimization
            engine.static_optimization = False
            engine.force_update_static_cache = False
+            if os.path.exists('{}/__params__'.format(model_config_path)):
+                suffix = ""
+            else:
+                suffix = "_DIR" 

            if device == "cpu":
-                engine.type = "FLUID_CPU_ANALYSIS_DIR"
+		if self.encryption_model:
+                    engine.type = "FLUID_CPU_ANALYSIS_ENCRYPT"
+                else:
+                    engine.type = "FLUID_CPU_ANALYSIS" + suffix
            elif device == "gpu":
-                engine.type = "FLUID_GPU_ANALYSIS_DIR"
+		if self.encryption_model:
+                    engine.type = "FLUID_GPU_ANALYSIS_ENCRYPT"
+                else:
+                    engine.type = "FLUID_GPU_ANALYSIS" + suffix

            self.model_toolkit_conf.engines.extend([engine])

@@ -523,9 +536,8 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
        fetch_names = list(request.fetch_var_names)
        is_python = request.is_python
        log_id = request.log_id
-        feed_batch = []
-        for feed_inst in request.insts:
        feed_dict = {}
+        feed_inst = request.insts[0]
        for idx, name in enumerate(feed_names):
            var = feed_inst.tensor_array[idx]
            v_type = self.feed_types_[name]
@@ -539,19 +551,11 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                    data = np.frombuffer(var.data, dtype="int32")
                else:
                    raise Exception("error type.")
-                else:
-                    if v_type == 0:  # int64
-                        data = np.array(list(var.int64_data), dtype="int64")
-                    elif v_type == 1:  # float32
-                        data = np.array(list(var.float_data), dtype="float32")
-                    elif v_type == 2:  # int32
-                        data = np.array(list(var.int_data), dtype="int32")
-                    else:
-                        raise Exception("error type.")
            data.shape = list(feed_inst.tensor_array[idx].shape)
            feed_dict[name] = data
-            feed_batch.append(feed_dict)
-        return feed_batch, fetch_names, is_python, log_id
+            if len(var.lod) > 0:
+                # Key the LoD under "<name>.lod"; format() needs the name.
+                feed_dict["{}.lod".format(name)] = var.lod
+        return feed_dict, fetch_names, is_python, log_id

    def _pack_inference_response(self, ret, fetch_names, is_python):
        resp = multi_lang_general_model_service_pb2.InferenceResponse()
@@ -608,6 +612,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
        ret = self.bclient_.predict(
            feed=feed_dict,
            fetch=fetch_names,
+            batch=True,
            need_variant_tag=True,
            log_id=log_id)
        return self._pack_inference_response(ret, fetch_names, is_python)

--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
@@ -18,8 +18,14 @@ Usage:
        python -m paddle_serving_server.serve --model ./serving_server_model --port 9292
 """
 import argparse
-from .web_service import WebService
+import sys
+import json
+import base64
+import time
+from multiprocessing import Process
+from web_service import WebService, port_is_available
 from flask import Flask, request
+from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer


 def parse_args():  # pylint: disable=doc-string-missing
@@ -53,6 +59,11 @@ def parse_args():  # pylint: disable=doc-string-missing
        type=int,
        default=512 * 1024 * 1024,
        help="Limit sizes of messages")
+    parser.add_argument(
+        "--use_encryption_model",
+        default=False,
+        action="store_true",
+        help="Use encryption model")
    parser.add_argument(
        "--use_multilang",
        default=False,
@@ -71,17 +82,18 @@ def parse_args():  # pylint: disable=doc-string-missing
    return parser.parse_args()


-def start_standard_model():  # pylint: disable=doc-string-missing
+def start_standard_model(serving_port):  # pylint: disable=doc-string-missing
    args = parse_args()
    thread_num = args.thread
    model = args.model
-    port = args.port
+    port = serving_port
    workdir = args.workdir
    device = args.device
    mem_optim = args.mem_optim_off is False
    ir_optim = args.ir_optim
    max_body_size = args.max_body_size
    use_mkl = args.use_mkl
+    use_encryption_model = args.use_encryption_model
    use_multilang = args.use_multilang

    if model == "":
@@ -111,6 +123,7 @@ def start_standard_model():  # pylint: disable=doc-string-missing
    server.use_mkl(use_mkl)
    server.set_max_body_size(max_body_size)
    server.set_port(port)
+    server.use_encryption_model(use_encryption_model)
    if args.product_name != None:
        server.set_product_name(args.product_name)
    if args.container_id != None:
@@ -120,12 +133,88 @@ def start_standard_model():  # pylint: disable=doc-string-missing
    server.prepare_server(workdir=workdir, port=port, device=device)
    server.run_server()

+class MainService(BaseHTTPRequestHandler):
+    """HTTP handler that receives the model key and launches the server.
+
+    Relies on module-level globals assigned in the ``__main__`` section:
+    ``args`` (parsed command line), ``p`` (the serving process), ``p_flag``
+    (True once ``p`` is running) and ``serving_port``.
+    """
+
+    def get_available_port(self):
+        """Return the first free port in 12000..12999, or None if none."""
+        default_port = 12000
+        for i in range(1000):
+            if port_is_available(default_port + i):
+                return default_port + i
+        return None
+
+    def start_serving(self):
+        """Process target: run the standard server on ``serving_port``."""
+        start_standard_model(serving_port)
+
+    def _decode_key(self, post_data):
+        """Return the base64-decoded "key" field, or None if absent/invalid."""
+        if not isinstance(post_data, dict) or "key" not in post_data:
+            return None
+        try:
+            return base64.b64decode(post_data["key"])
+        except (TypeError, ValueError):
+            # Bad padding or a non-string value: treat as "no key".
+            return None
+
+    def get_key(self, post_data):
+        """Store the request's key in ``<args.model>/key``.
+
+        Returns True when a key was present and written, False otherwise.
+        """
+        key = self._decode_key(post_data)
+        if key is None:
+            return False
+        # The decoded key is raw bytes, so write it in binary mode.
+        with open(args.model + "/key", "wb") as f:
+            f.write(key)
+        return True
+
+    def check_key(self, post_data):
+        """Return True when the request's key equals the stored key file."""
+        key = self._decode_key(post_data)
+        if key is None:
+            return False
+        with open(args.model + "/key", "rb") as f:
+            cur_key = f.read()
+        return (key == cur_key)
+
+    def start(self, post_data):
+        """Handle one request body; return True when serving is available.
+
+        The first successful request stores the key and spawns the serving
+        process. Later requests only succeed while that process is alive
+        and the supplied key matches the stored one.
+        """
+        global p_flag
+        global serving_port
+        global p
+        try:
+            post_data = json.loads(post_data)
+        except (TypeError, ValueError):
+            # Malformed body: report failure instead of crashing the handler.
+            print("request body is not valid json")
+            return False
+        if not p_flag:
+            if args.use_encryption_model:
+                print("waiting key for model")
+                if not self.get_key(post_data):
+                    print("not found key in request")
+                    return False
+            port = self.get_available_port()
+            if port is None:
+                print("no available port for serving")
+                return False
+            serving_port = port
+            p = Process(target=self.start_serving)
+            p.start()
+            # Give the child a moment to fail fast before reporting success.
+            time.sleep(3)
+            if p.is_alive():
+                p_flag = True
+            else:
+                return False
+        else:
+            if p.is_alive():
+                if not self.check_key(post_data):
+                    return False
+            else:
+                return False
+        return True
+
+    def do_POST(self):
+        """Read the JSON body and reply with the endpoint list or an error."""
+        try:
+            content_length = int(self.headers.get('Content-Length', 0))
+        except (TypeError, ValueError):
+            # Missing or non-numeric header: treat the body as empty.
+            content_length = 0
+        post_data = self.rfile.read(content_length)
+        if self.start(post_data):
+            response = {"endpoint_list": [serving_port]}
+        else:
+            response = {"message": "start serving failed"}
+        self.send_response(200)
+        self.send_header('Content-type', 'application/json')
+        self.end_headers()
+        self.wfile.write(json.dumps(response))

 if __name__ == "__main__":

    args = parse_args()
    if args.name == "None":
-        start_standard_model()
+        if args.use_encryption_model:
+            p_flag = False
+            p = None
+            serving_port = 0
+            server = HTTPServer(('localhost', int(args.port)), MainService)
+            print(
+                'Starting encryption server, waiting for key from client, use <Ctrl-C> to stop'
+            )
+            server.serve_forever()
+        else:
+            start_standard_model(args.port)
    else:
        service = WebService(name=args.name)
        service.load_model_config(args.model)

--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -25,6 +25,16 @@ from paddle_serving_server import pipeline
 from paddle_serving_server.pipeline import Op


+def port_is_available(port):
+    """Return True when no TCP listener accepts a connection on ``port``.
+
+    A single connect attempt to ('0.0.0.0', port) is made with a 2 second
+    timeout; any non-zero ``connect_ex`` status counts as "available".
+    """
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    with closing(probe):
+        probe.settimeout(2)
+        status = probe.connect_ex(('0.0.0.0', port))
+    return status != 0
+
+
 class WebService(object):
    def __init__(self, name="default_service"):
        self.name = name
@@ -110,7 +120,7 @@ class WebService(object):
        self.mem_optim = mem_optim
        self.ir_optim = ir_optim
        for i in range(1000):
-            if self.port_is_available(default_port + i):
+	    if port_is_available(default_port + i):
                self.port_list.append(default_port + i)
                break


--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -70,6 +70,11 @@ def serve_args():
        type=int,
        default=512 * 1024 * 1024,
        help="Limit sizes of messages")
+    parser.add_argument(
+        "--use_encryption_model",
+        default=False,
+        action="store_true",
+        help="Use encryption model")
    parser.add_argument(
        "--use_multilang",
        default=False,
@@ -77,6 +82,10 @@ def serve_args():
        help="Use Multi-language-service")
    parser.add_argument(
        "--use_trt", default=False, action="store_true", help="Use TensorRT")
+    parser.add_argument(
+        "--use_lite", default=False, action="store_true", help="Use PaddleLite")
+    parser.add_argument(
+        "--use_xpu", default=False, action="store_true", help="Use XPU")
    parser.add_argument(
        "--product_name",
        type=str,
@@ -210,6 +219,8 @@ class Server(object):
        self.use_local_bin = False
        self.gpuid = 0
        self.use_trt = False
+        self.use_lite = False
+        self.use_xpu = False
        self.model_config_paths = None  # for multi-model in a workflow
        self.product_name = None
        self.container_id = None
@@ -279,7 +290,13 @@ class Server(object):
    def set_trt(self):
        self.use_trt = True

-    def _prepare_engine(self, model_config_paths, device):
+    def set_lite(self):
+        """Turn on the Paddle-Lite flag, later copied to ``engine.use_lite``."""
+        self.use_lite = True
+
+    def set_xpu(self):
+        """Turn on the XPU flag, later copied to ``engine.use_xpu``."""
+        self.use_xpu = True
+
+    def _prepare_engine(self, model_config_paths, device, use_encryption_model):
        if self.model_toolkit_conf == None:
            self.model_toolkit_conf = server_sdk.ModelToolkitConf()

@@ -299,11 +316,23 @@ class Server(object):
            engine.static_optimization = False
            engine.force_update_static_cache = False
            engine.use_trt = self.use_trt
+            engine.use_lite = self.use_lite
+            engine.use_xpu = self.use_xpu
+
+

            if device == "cpu":
+		if use_encryption_model:
+                    engine.type = "FLUID_CPU_ANALYSIS_ENCRPT"
+                else:
                    engine.type = "FLUID_CPU_ANALYSIS_DIR"
            elif device == "gpu":
+		if use_encryption_model:
+                    engine.type = "FLUID_GPU_ANALYSIS_ENCRPT"
+                else:
                    engine.type = "FLUID_GPU_ANALYSIS_DIR"
+            elif device == "arm":
+                engine.type = "FLUID_ARM_ANALYSIS_DIR"

            self.model_toolkit_conf.engines.extend([engine])

@@ -405,10 +434,12 @@ class Server(object):
        for line in version_file.readlines():
            if re.match("cuda_version", line):
                cuda_version = line.split("\"")[1]
-                if cuda_version != "trt":
-                    device_version = "serving-gpu-cuda" + cuda_version + "-"
-                else:
+                if cuda_version == "trt":
                    device_version = "serving-gpu-" + cuda_version + "-"
+                elif cuda_version == "arm":
+                    device_version = "serving-" + cuda_version + "-"
+                else:
+                    device_version = "serving-gpu-cuda" + cuda_version + "-"

        folder_name = device_version + serving_server_version
        tar_name = folder_name + ".tar.gz"
@@ -460,6 +491,7 @@ class Server(object):
                       workdir=None,
                       port=9292,
                       device="cpu",
+		       use_encryption_model=False,
                       cube_conf=None):
        if workdir == None:
            workdir = "./tmp"
@@ -473,7 +505,8 @@ class Server(object):

        self.set_port(port)
        self._prepare_resource(workdir, cube_conf)
-        self._prepare_engine(self.model_config_paths, device)
+        self._prepare_engine(self.model_config_paths, device,
+                             use_encryption_model)
        self._prepare_infer_service(port)
        self.workdir = workdir

@@ -507,7 +540,36 @@ class Server(object):
                time.sleep(1)
        else:
            print("Use local bin : {}".format(self.bin_path))
-        self.check_cuda()
+        #self.check_cuda()
+        if self.use_lite:
+            command = "{} " \
+                      "-enable_model_toolkit " \
+                      "-inferservice_path {} " \
+                      "-inferservice_file {} " \
+                      "-max_concurrency {} " \
+                      "-num_threads {} " \
+                      "-port {} " \
+                      "-reload_interval_s {} " \
+                      "-resource_path {} " \
+                      "-resource_file {} " \
+                      "-workflow_path {} " \
+                      "-workflow_file {} " \
+                      "-bthread_concurrency {} " \
+                      "-max_body_size {} ".format(
+                          self.bin_path,
+                          self.workdir,
+                          self.infer_service_fn,
+                          self.max_concurrency,
+                          self.num_threads,
+                          self.port,
+                          self.reload_interval_s,
+                          self.workdir,
+                          self.resource_fn,
+                          self.workdir,
+                          self.workflow_fn,
+                          self.num_threads,
+                          self.max_body_size)
+        else:
            command = "{} " \
                      "-enable_model_toolkit " \
                      "-inferservice_path {} " \

--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -19,25 +19,29 @@ Usage:
 """
 import argparse
 import os
+import json
+import base64
+import time
 from multiprocessing import Pool, Process
 from paddle_serving_server_gpu import serve_args
 from flask import Flask, request
+from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer


-def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-missing
+def start_gpu_card_model(index, gpuid, port, args):  # pylint: disable=doc-string-missing
    gpuid = int(gpuid)
    device = "gpu"
-    port = args.port
    if gpuid == -1:
        device = "cpu"
    elif gpuid >= 0:
-        port = args.port + index
+        port = port + index
    thread_num = args.thread
    model = args.model
    mem_optim = args.mem_optim_off is False
    ir_optim = args.ir_optim
    max_body_size = args.max_body_size
    use_multilang = args.use_multilang
+    workdir = args.workdir
+    if gpuid >= 0:
        workdir = "{}_{}".format(args.workdir, gpuid)

    if model == "":
@@ -67,20 +71,32 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-miss
    if args.use_trt:
        server.set_trt()

+    if args.use_lite:
+        server.set_lite()
+        device = "arm"
+
+    if args.use_xpu:
+        server.set_xpu()
+
    if args.product_name != None:
        server.set_product_name(args.product_name)
    if args.container_id != None:
        server.set_container_id(args.container_id)

    server.load_model_config(model)
-    server.prepare_server(workdir=workdir, port=port, device=device)
+    server.prepare_server(
+        workdir=workdir,
+        port=port,
+        device=device,
+        use_encryption_model=args.use_encryption_model)
    if gpuid >= 0:
        server.set_gpuid(gpuid)
    server.run_server()

-
-def start_multi_card(args):  # pylint: disable=doc-string-missing
+def start_multi_card(args, serving_port=None):  # pylint: disable=doc-string-missing
    gpus = ""
+    if serving_port == None:
+        serving_port = args.port
    if args.gpu_ids == "":
        gpus = []
    else:
@@ -95,16 +111,21 @@ def start_multi_card(args):  # pylint: disable=doc-string-missing
                    exit(-1)
        else:
            env_gpus = []
-    if len(gpus) <= 0:
-        print("gpu_ids not set, going to run cpu service.")
+    if args.use_lite:
+        print("run arm server.")
-        start_gpu_card_model(-1, -1, args)
+        # start_gpu_card_model now takes the port as its third argument.
+        start_gpu_card_model(-1, -1, serving_port, args)
+    elif len(gpus) <= 0:
+        print("gpu_ids not set, going to run cpu service.")
+        start_gpu_card_model(-1, -1, serving_port, args)
    else:
        gpu_processes = []
        for i, gpu_id in enumerate(gpus):
            p = Process(
-                target=start_gpu_card_model, args=(
+		target=start_gpu_card_model,
+                args=(
                    i,
                    gpu_id,
+		    serving_port,
                    args, ))
            gpu_processes.append(p)
        for p in gpu_processes:
@@ -112,10 +133,88 @@ def start_multi_card(args):  # pylint: disable=doc-string-missing
        for p in gpu_processes:
            p.join()

+class MainService(BaseHTTPRequestHandler):
+    """HTTP handler that receives the model key and launches the servers.
+
+    Relies on module-level globals assigned in the ``__main__`` section:
+    ``args`` (parsed command line), ``p`` (the serving process), ``p_flag``
+    (True once ``p`` is running), ``serving_port`` and the imported
+    ``port_is_available`` helper.
+    """
+
+    def get_available_port(self):
+        """Return the first free port in 12000..12999, or None if none."""
+        default_port = 12000
+        for i in range(1000):
+            if port_is_available(default_port + i):
+                return default_port + i
+        return None
+
+    def start_serving(self):
+        """Process target: start the card servers from ``serving_port``."""
+        start_multi_card(args, serving_port)
+
+    def _decode_key(self, post_data):
+        """Return the base64-decoded "key" field, or None if absent/invalid."""
+        if not isinstance(post_data, dict) or "key" not in post_data:
+            return None
+        try:
+            return base64.b64decode(post_data["key"])
+        except (TypeError, ValueError):
+            # Bad padding or a non-string value: treat as "no key".
+            return None
+
+    def get_key(self, post_data):
+        """Store the request's key in ``<args.model>/key``.
+
+        Returns True when a key was present and written, False otherwise.
+        """
+        key = self._decode_key(post_data)
+        if key is None:
+            return False
+        # The decoded key is raw bytes, so write it in binary mode.
+        with open(args.model + "/key", "wb") as f:
+            f.write(key)
+        return True
+
+    def check_key(self, post_data):
+        """Return True when the request's key equals the stored key file."""
+        key = self._decode_key(post_data)
+        if key is None:
+            return False
+        with open(args.model + "/key", "rb") as f:
+            cur_key = f.read()
+        return (key == cur_key)
+
+    def start(self, post_data):
+        """Handle one request body; return True when serving is available.
+
+        The first successful request stores the key and spawns the serving
+        process. Later requests only succeed while that process is alive
+        and the supplied key matches the stored one.
+        """
+        global p_flag
+        global serving_port
+        global p
+        try:
+            post_data = json.loads(post_data)
+        except (TypeError, ValueError):
+            # Malformed body: report failure instead of crashing the handler.
+            print("request body is not valid json")
+            return False
+        if not p_flag:
+            if args.use_encryption_model:
+                print("waiting key for model")
+                if not self.get_key(post_data):
+                    print("not found key in request")
+                    return False
+            port = self.get_available_port()
+            if port is None:
+                print("no available port for serving")
+                return False
+            serving_port = port
+            p = Process(target=self.start_serving)
+            p.start()
+            # Give the child a moment to fail fast before reporting success.
+            time.sleep(3)
+            if p.is_alive():
+                p_flag = True
+            else:
+                return False
+        else:
+            if p.is_alive():
+                if not self.check_key(post_data):
+                    return False
+            else:
+                return False
+        return True
+
+    def do_POST(self):
+        """Read the JSON body and reply with the endpoint list or an error."""
+        try:
+            content_length = int(self.headers.get('Content-Length', 0))
+        except (TypeError, ValueError):
+            # Missing or non-numeric header: treat the body as empty.
+            content_length = 0
+        post_data = self.rfile.read(content_length)
+        if self.start(post_data):
+            response = {"endpoint_list": [serving_port]}
+        else:
+            response = {"message": "start serving failed"}
+        self.send_response(200)
+        self.send_header('Content-type', 'application/json')
+        self.end_headers()
+        self.wfile.write(json.dumps(response))
+
+

 if __name__ == "__main__":
    args = serve_args()
    if args.name == "None":
+        from .web_service import port_is_available
+        if args.use_encryption_model:
+            p_flag = False
+            p = None
+            serving_port = 0
+            server = HTTPServer(('localhost', int(args.port)), MainService)
+            print(
+                'Starting encryption server, waiting for key from client, use <Ctrl-C> to stop'
+            )
+            server.serve_forever()
+        else:
            start_multi_card(args)
    else:
        from .web_service import WebService
@@ -128,7 +227,8 @@ if __name__ == "__main__":
        if len(gpu_ids) > 0:
            web_service.set_gpus(gpu_ids)
        web_service.prepare_server(
-            workdir=args.workdir, port=args.port, device=args.device)
+            workdir=args.workdir, port=args.port, device=args.device,
+            use_lite=args.use_lite, use_xpu=args.use_xpu, ir_optim=args.ir_optim)
        web_service.run_rpc_service()

        app_instance = Flask(__name__)

--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -28,6 +28,16 @@ from paddle_serving_server_gpu import pipeline
 from paddle_serving_server_gpu.pipeline import Op


+def port_is_available(port):
+    """Tell whether ``port`` looks free by trying to connect to it.
+
+    Connects once to ('0.0.0.0', port) with a 2 second timeout and returns
+    True for any non-zero ``connect_ex`` status, False when the connect
+    succeeds.
+    """
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as conn:
+        conn.settimeout(2)
+        connect_status = conn.connect_ex(('0.0.0.0', port))
+    port_free = connect_status != 0
+    return port_free
+
+
 class WebService(object):
    def __init__(self, name="default_service"):
        self.name = name
@@ -83,9 +93,14 @@ class WebService(object):
                            gpuid=0,
                            thread_num=2,
                            mem_optim=True,
+                            use_lite=False,
+                            use_xpu=False,
                            ir_optim=False):
        device = "gpu"
        if gpuid == -1:
+            if use_lite:
+                device = "arm"
+            else:
                device = "cpu"
        op_maker = serving.OpMaker()
        read_op = op_maker.create('general_reader')
@@ -103,6 +118,11 @@ class WebService(object):
        server.set_memory_optimize(mem_optim)
        server.set_ir_optimize(ir_optim)

+        if use_lite:
+            server.set_lite()
+        if use_xpu:
+            server.set_xpu()
+
        server.load_model_config(self.model_config)
        if gpuid >= 0:
            server.set_gpuid(gpuid)
@@ -125,9 +145,11 @@ class WebService(object):
                       workdir="",
                       port=9393,
                       device="gpu",
+                       use_lite=False,
+                       use_xpu=False,
+                       ir_optim=False,
                       gpuid=0,
-                       mem_optim=True,
-                       ir_optim=False):
+                       mem_optim=True):
        print("This API will be deprecated later. Please do not use it")
        self.workdir = workdir
        self.port = port
@@ -136,7 +158,7 @@ class WebService(object):
        self.port_list = []
        default_port = 12000
        for i in range(1000):
-            if self.port_is_available(default_port + i):
+	    if port_is_available(default_port + i):
                self.port_list.append(default_port + i)
            if len(self.port_list) > len(self.gpus):
                break
@@ -150,6 +172,8 @@ class WebService(object):
                    -1,
                    thread_num=2,
                    mem_optim=mem_optim,
+                    use_lite=use_lite,
+                    use_xpu=use_xpu,
                    ir_optim=ir_optim))
        else:
            for i, gpuid in enumerate(self.gpus):
@@ -160,6 +184,8 @@ class WebService(object):
                        gpuid,
                        thread_num=2,
                        mem_optim=mem_optim,
+                        use_lite=use_lite,
+                        use_xpu=use_xpu,
                        ir_optim=ir_optim))

    def _launch_web_service(self):

--- a/python/pipeline/local_service_handler.py
+++ b/python/pipeline/local_service_handler.py
@@ -44,6 +44,8 @@ class LocalServiceHandler(object):
                 ir_optim=False,
                 available_port_generator=None,
                 use_trt=False,
+                 use_lite=False,
+                 use_xpu=False,
                 use_profile=False):
        """
        Initialization of localservicehandler
@@ -60,6 +62,8 @@ class LocalServiceHandler(object):
           ir_optim: use calculation chart optimization, False default.
           available_port_generator: generate available ports
           use_trt: use nvidia tensorRt engine, False default.
+           use_lite: use Paddle-Lite engine, False default.
+           use_xpu: run predict on Baidu Kunlun, False default.
           use_profile: use profiling, False default.

        Returns:
@@ -74,6 +78,12 @@ class LocalServiceHandler(object):
        if devices == "":
            # cpu
            devices = [-1]
+            if use_lite:
+                self._device_type = "arm"
+                self._port_list.append(available_port_generator.next())
+                _LOGGER.info("Model({}) will be launch in arm device. Port({})"
+                             .format(model_config, self._port_list))
+            else:
                self._device_type = "cpu"
                self._port_list.append(available_port_generator.next())
                _LOGGER.info("Model({}) will be launch in cpu device. Port({})"
@@ -96,6 +106,8 @@ class LocalServiceHandler(object):
        self._rpc_service_list = []
        self._server_pros = []
        self._use_trt = use_trt
+        self._use_lite = use_lite
+        self._use_xpu = use_xpu
        self._use_profile = use_profile
        self.fetch_names_ = fetch_names

@@ -138,8 +150,11 @@ class LocalServiceHandler(object):
        if self._local_predictor_client is None:
            self._local_predictor_client = LocalPredictor()
            use_gpu = False
+            use_lite = False
            if self._device_type == "gpu":
                use_gpu = True
+            elif self._device_type == "arm":
+                use_lite = True
            self._local_predictor_client.load_model_config(
                model_path=self._model_config,
                use_gpu=use_gpu,
@@ -148,7 +163,9 @@ class LocalServiceHandler(object):
                thread_num=self._thread_num,
                mem_optim=self._mem_optim,
                ir_optim=self._ir_optim,
-                use_trt=self._use_trt)
+                use_trt=self._use_trt,
+                use_lite=use_lite,
+                use_xpu=self._use_xpu)
        return self._local_predictor_client

    def get_client_config(self):
@@ -185,7 +202,7 @@ class LocalServiceHandler(object):

            server = Server()
        else:
-            #gpu
+            #gpu or arm
            from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server
            op_maker = OpMaker()
            read_op = op_maker.create('general_reader')

--- a/python/pipeline/pipeline_server.py
+++ b/python/pipeline/pipeline_server.py
@@ -21,6 +21,7 @@ import contextlib
 from contextlib import closing
 import multiprocessing
 import yaml
+import io

 from .proto import pipeline_service_pb2_grpc, pipeline_service_pb2
 from . import operator
@@ -333,7 +334,7 @@ class ServerYamlConfChecker(object):
            raise SystemExit("Failed to prepare_server: only one of yml_file"
                             " or yml_dict can be selected as the parameter.")
        if yml_file is not None:
-            with open(yml_file) as f:
+            with io.open(yml_file, encoding='utf-8') as f:
                conf = yaml.load(f.read())
        elif yml_dict is not None:
            conf = yml_dict

--- a/requirements_win.txt
+++ b/requirements_win.txt
--- a/python/setup.py.app.in
+++ b/python/setup.py.app.in
@@ -32,7 +32,7 @@ if '${PACK}' == 'ON':


 REQUIRED_PACKAGES = [
-    'six >= 1.10.0', 'sentencepiece<=0.1.92', 'opencv-python<=4.2.0.32', 'pillow',
+    'six >= 1.10.0', 'sentencepiece', 'opencv-python', 'pillow',
    'pyclipper'
 ]


--- a/python/setup.py.server.in
+++ b/python/setup.py.server.in
@@ -29,7 +29,7 @@ util.gen_pipeline_code("paddle_serving_server")

 REQUIRED_PACKAGES = [
    'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
-    'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app', 'func_timeout', 'pyyaml'
+    'flask >= 1.1.1', 'func_timeout', 'pyyaml'
 ]

 packages=['paddle_serving_server',

--- a/python/setup.py.server_gpu.in
+++ b/python/setup.py.server_gpu.in
@@ -31,7 +31,7 @@ util.gen_pipeline_code("paddle_serving_server_gpu")

 REQUIRED_PACKAGES = [
    'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
-    'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app', 'func_timeout', 'pyyaml'
+    'flask >= 1.1.1', 'func_timeout', 'pyyaml'
 ]

 packages=['paddle_serving_server_gpu',

--- a/requirements.txt
+++ b/requirements.txt
-sphinx==2.1.0
-mistune
-sphinx_rtd_theme
-paddlepaddle>=1.8.4
-shapely<=1.6.1
--- a/tools/Dockerfile.centos6.cuda9.0-cudnn7.devel
+++ b/tools/Dockerfile.centos6.cuda9.0-cudnn7.devel
@@ -39,6 +39,8 @@ RUN yum -y install wget && \
    make clean && \
    echo 'export PATH=/usr/local/python3.6/bin:$PATH' >> /root/.bashrc && \
    echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \
+    pip install requests && \
    source /root/.bashrc && \
+    pip3 install requests && \
    cd .. && rm -rf Python-3.6.8* && \
    wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \

--- a/tools/Dockerfile.centos6.devel
+++ b/tools/Dockerfile.centos6.devel
@@ -49,6 +49,8 @@ RUN yum -y install wget && \
    cd .. && rm -rf protobuf-* && \
    yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \
    yum clean all && \
+    pip install requests && \
+    pip3 install requests && \
    localedef -c -i en_US -f UTF-8 en_US.UTF-8 && \
    echo "export LANG=en_US.utf8" >> /root/.bashrc && \
    echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc
--- a/tools/Dockerfile.ci
+++ b/tools/Dockerfile.ci
@@ -23,7 +23,8 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
 RUN yum -y install python-devel sqlite-devel >/dev/null \
    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
    && python get-pip.py >/dev/null \
-    && rm get-pip.py
+    && rm get-pip.py \
+    && pip install requests 

 RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2 \
    && yum -y install bzip2 >/dev/null \
@@ -34,6 +35,9 @@ RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2
    && cd .. \
    && rm -rf patchelf-0.10*

+RUN yum install -y python3 python3-devel \
+    && pip3 install requests
+
 RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/protobuf-all-3.11.2.tar.gz && \
    tar zxf protobuf-all-3.11.2.tar.gz && \
    cd protobuf-3.11.2 && \
@@ -41,8 +45,6 @@ RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.2/p
    make clean && \
    cd .. && rm -rf protobuf-*

-RUN yum install -y python3 python3-devel
-
 RUN yum -y update >/dev/null \
    && yum -y install dnf >/dev/null \
    && yum -y install dnf-plugins-core >/dev/null \

--- a/tools/Dockerfile.cuda10.0-cudnn7.devel
+++ b/tools/Dockerfile.cuda10.0-cudnn7.devel
@@ -30,11 +30,13 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
 RUN yum -y install python-devel sqlite-devel  \
    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
    && python get-pip.py >/dev/null \
-    && rm get-pip.py 
+    && rm get-pip.py \
+    && pip install requests 

 RUN yum install -y python3 python3-devel \
    && yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
-    && yum clean all 
+    && yum clean all \
+    && pip3 install requests 

 RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
    && echo "export LANG=en_US.utf8" >> /root/.bashrc \

--- a/tools/Dockerfile.cuda9.0-cudnn7.devel
+++ b/tools/Dockerfile.cuda9.0-cudnn7.devel
@@ -29,11 +29,13 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
 RUN yum -y install python-devel sqlite-devel  \
    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
    && python get-pip.py >/dev/null \
-    && rm get-pip.py 
+    && rm get-pip.py \
+    && pip install requests 

 RUN yum install -y python3 python3-devel \
    && yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
-    && yum clean all 
+    && yum clean all \
+    && pip3 install requests

 RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
    && echo "export LANG=en_US.utf8" >> /root/.bashrc \

--- a/tools/Dockerfile.devel
+++ b/tools/Dockerfile.devel
@@ -19,11 +19,13 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
 RUN yum -y install python-devel sqlite-devel  \
    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
    && python get-pip.py >/dev/null \
-    && rm get-pip.py 
+    && rm get-pip.py \
+    && pip install requests

 RUN yum install -y python3 python3-devel \
    && yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
-    && yum clean all 
+    && yum clean all \
+    && pip3 install requests 

 RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
    && echo "export LANG=en_US.utf8" >> /root/.bashrc \

--- a/tools/serving_build.sh
+++ b/tools/serving_build.sh
@@ -174,7 +174,7 @@ function python_test_fit_a_line() {

            # test web
            unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
-            check_cmd "python -m paddle_serving_server.serve --model uci_housing_model --name uci --port 9393 --thread 4 --name uci > /dev/null &"
+            check_cmd "python test_server.py > /dev/null &"
            sleep 5 # wait for the server to start
            check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
            # check http code
@@ -183,14 +183,6 @@ function python_test_fit_a_line() {
                echo "HTTP status code -ne 200"
                exit 1
            fi
-            # test web batch
-            check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
-            # check http code
-            http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
-            if [ ${http_code} -ne 200 ]; then
-                echo "HTTP status code -ne 200"
-                exit 1
-            fi
            setproxy # recover proxy state
            kill_server_process
            ;;
@@ -202,27 +194,6 @@ function python_test_fit_a_line() {
            check_cmd "python test_client.py uci_housing_client/serving_client_conf.prototxt > /dev/null"
            kill_server_process

-            # test web
-            #unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
-            #check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 2 --gpu_ids 0 --name uci > /dev/null &"
-            #sleep 5 # wait for the server to start
-            #check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
-            # check http code
-            #http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
-            #if [ ${http_code} -ne 200 ]; then
-            #    echo "HTTP status code -ne 200"
-            #    exit 1
-            #fi
-            # test web batch
-            #check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
-            # check http code
-            #http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
-            #if [ ${http_code} -ne 200 ]; then
-            #    echo "HTTP status code -ne 200"
-            #    exit 1
-            #fi
-            #setproxy # recover proxy state
-            #kill_server_process
            ;;
        *)
            echo "error type"
@@ -514,6 +485,42 @@ function python_test_lac() {
    cd ..
 }

+
+function python_test_encryption(){
+    # Runs the encryption example for TYPE ($1): CPU or GPU; any other value exits 1.
+    # pwd: /Serving/python/examples
+    cd encryption
+    sh get_data.sh
+    local TYPE=$1
+    export SERVING_BIN=${SERIVNG_WORKDIR}/build-server-${TYPE}/core/general-server/serving
+    case $TYPE in
+        CPU)
+            # Launch the server in the background, then wait for it to start.
+            check_cmd "python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model > /dev/null &"
+            sleep 5
+            check_cmd "python test_client.py encrypt_client/serving_client_conf.prototxt"
+            kill_server_process
+            ;;
+        GPU)
+            # Launch in the background as in the CPU case; without '&' the
+            # server would presumably block the client check below.
+            check_cmd "python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0 > /dev/null &"
+            sleep 5
+            check_cmd "python test_client.py encrypt_client/serving_client_conf.prototxt"
+            kill_server_process
+            ;;
+        *)
+            echo "error type"
+            exit 1
+            ;;
+    esac
+    echo "encryption $TYPE test finished as expected"
+    setproxy
+    unset SERVING_BIN
+    cd ..
+}
+
+
 function java_run_test() {
    # pwd: /Serving
    local TYPE=$1
@@ -589,9 +596,6 @@ function python_test_grpc_impl() {
            sleep 5 # wait for the server to start
            check_cmd "python test_sync_client.py > /dev/null"
            check_cmd "python test_asyn_client.py > /dev/null"
-            check_cmd "python test_general_pb_client.py > /dev/null"
-            check_cmd "python test_numpy_input_client.py > /dev/null"
-            check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
            kill_process_by_port 9393
@@ -600,9 +604,6 @@ function python_test_grpc_impl() {
            sleep 5 # wait for the server to start
            check_cmd "python test_sync_client.py > /dev/null"
            check_cmd "python test_asyn_client.py > /dev/null"
-            check_cmd "python test_general_pb_client.py > /dev/null"
-            check_cmd "python test_numpy_input_client.py > /dev/null"
-            check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
            kill_process_by_port 9393
@@ -651,9 +652,7 @@ COMMENT
            sleep 5 # wait for the server to start
            check_cmd "python test_sync_client.py > /dev/null"
            check_cmd "python test_asyn_client.py > /dev/null"
-            check_cmd "python test_general_pb_client.py > /dev/null"
-            check_cmd "python test_numpy_input_client.py > /dev/null"
-            check_cmd "python test_batch_client.py > /dev/null"
+            #check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
            kill_process_by_port 9393
@@ -662,9 +661,7 @@ COMMENT
            sleep 5 # wait for the server to start
            check_cmd "python test_sync_client.py > /dev/null"
            check_cmd "python test_asyn_client.py > /dev/null"
-            check_cmd "python test_general_pb_client.py > /dev/null"
-            check_cmd "python test_numpy_input_client.py > /dev/null"
-            check_cmd "python test_batch_client.py > /dev/null"
+            #check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
            kill_process_by_port 9393
@@ -960,6 +957,7 @@ function python_run_test() {
    python_test_lac $TYPE # pwd: /Serving/python/examples
    python_test_multi_process $TYPE # pwd: /Serving/python/examples
    python_test_multi_fetch $TYPE # pwd: /Serving/python/examples
+    python_test_encryption $TYPE # pwd: /Serving/python/examples
    python_test_yolov4 $TYPE # pwd: /Serving/python/examples
    python_test_grpc_impl $TYPE # pwd: /Serving/python/examples
    python_test_resnet50 $TYPE # pwd: /Serving/python/examples