Unverified · Commit 7b77852b · Authored by: TeslaZhao · Committed by: GitHub

Merge pull request #32 from PaddlePaddle/develop

Sync codes
@@ -53,7 +53,7 @@ We consider deploying deep learning inference service online to be a user-facing
<h2 align="center">AIStudio Tutorial</h2>
-Here we provide tutorial on AIStudio(Chinese Version) [AIStudio教程-Paddle Serving服务化部署框架](https://aistudio.baidu.com/aistudio/projectdetail/1550674)
+Here we provide a tutorial on AIStudio (Chinese version): [AIStudio教程-Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/1555945)
The tutorial provides
<ul>
@@ -85,14 +85,14 @@ We **highly recommend** you to **run Paddle Serving in Docker**, please visit [R
```
# Run CPU Docker
docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
+docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel bash
docker exec -it test bash
git clone https://github.com/PaddlePaddle/Serving
```
```
# Run GPU Docker
nvidia-docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
-nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
+nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel bash
nvidia-docker exec -it test bash
git clone https://github.com/PaddlePaddle/Serving
```
...
@@ -53,7 +53,7 @@ Paddle Serving aims to help deep learning developers easily deploy online inference services
<h2 align="center">Tutorial</h2>
-Paddle Serving developers provide the easy-to-use [AIStudio教程-Paddle Serving服务化部署框架](https://aistudio.baidu.com/aistudio/projectdetail/1550674)
+Paddle Serving developers provide the easy-to-use [AIStudio教程-Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/1555945)
The tutorial provides the following content
@@ -86,14 +86,14 @@ Paddle Serving developers provide the easy-to-use [AIStudio教程-Paddle Serv
```
# Start CPU Docker
docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
+docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel bash
docker exec -it test bash
git clone https://github.com/PaddlePaddle/Serving
```
```
# Start GPU Docker
nvidia-docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
-nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
+nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel bash
nvidia-docker exec -it test bash
git clone https://github.com/PaddlePaddle/Serving
```
...
@@ -18,7 +18,7 @@ SET(PADDLE_SOURCES_DIR ${THIRD_PARTY_PATH}/Paddle)
SET(PADDLE_DOWNLOAD_DIR ${PADDLE_SOURCES_DIR}/src/extern_paddle)
SET(PADDLE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/Paddle/)
SET(PADDLE_INCLUDE_DIR "${PADDLE_INSTALL_DIR}/include" CACHE PATH "PaddlePaddle include directory." FORCE)
-SET(PADDLE_LIBRARIES "${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a" CACHE FILEPATH "Paddle library." FORCE)
+SET(PADDLE_LIBRARIES "${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a" CACHE FILEPATH "Paddle library." FORCE)
message("paddle install dir: " ${PADDLE_INSTALL_DIR})
@@ -31,7 +31,7 @@ message( "WITH_GPU = ${WITH_GPU}")
# Paddle Version should be one of:
# latest: latest develop build
# version number like 1.5.2
-SET(PADDLE_VERSION "2.0.0")
+SET(PADDLE_VERSION "2.0.1")
if (WITH_GPU)
  if(CUDA_VERSION EQUAL 11.0)
    set(CUDA_SUFFIX "cuda11-cudnn8-avx-mkl")
@@ -55,9 +55,9 @@ if (WITH_GPU)
  SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-${CUDA_SUFFIX}")
elseif (WITH_LITE)
  if (WITH_XPU)
-    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm-xpu")
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}-xpu")
  else()
-    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm")
+    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}")
  endif()
else()
  if (WITH_AVX)
@@ -139,8 +139,8 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
-ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a)
+ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)
if (WITH_TRT)
  ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
...
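The suffix logic above assembles the download tag for the prebuilt inference library. When configuring on a new platform, a hedged one-liner (not part of the original file) can surface the resolved tag:

```cmake
# Hypothetical debugging aid, not in the original file: print the resolved
# library tag, e.g. "2.0.1-gpu-cuda11-cudnn8-avx-mkl" or "2.0.1-aarch64-xpu".
message(STATUS "PADDLE_LIB_VERSION = ${PADDLE_LIB_VERSION}")
```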
if (SERVER OR CLIENT)
  LIST(APPEND protofiles
      ${CMAKE_CURRENT_LIST_DIR}/proto/server_configure.proto
      ${CMAKE_CURRENT_LIST_DIR}/proto/sdk_configure.proto
      ${CMAKE_CURRENT_LIST_DIR}/proto/inferencer_configure.proto
      ${CMAKE_CURRENT_LIST_DIR}/proto/general_model_config.proto
  )
  PROTOBUF_GENERATE_CPP(configure_proto_srcs configure_proto_hdrs ${protofiles})
  list(APPEND configure_srcs ${configure_proto_srcs})
  list(APPEND configure_srcs ${CMAKE_CURRENT_LIST_DIR}/src/configure_parser.cpp)
  add_library(configure ${configure_srcs})
  add_dependencies(configure brpc)
  install(TARGETS configure
      ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
  )
  install(FILES ${CMAKE_CURRENT_LIST_DIR}/include/configure_parser.h
      DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure/include)
  FILE(GLOB inc ${CMAKE_CURRENT_BINARY_DIR}/*.pb.h)
  install(FILES ${inc}
      DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure)
endif()

if (WITH_PYTHON)
  py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.proto)
  add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
  add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
  py_grpc_proto_compile(multi_lang_general_model_service_py_proto SRCS proto/multi_lang_general_model_service.proto)
  add_custom_target(multi_lang_general_model_service_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
  add_dependencies(multi_lang_general_model_service_py_proto multi_lang_general_model_service_py_proto_init)

  if (CLIENT)
    py_proto_compile(sdk_configure_py_proto SRCS proto/sdk_configure.proto)
    add_custom_target(sdk_configure_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
    add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
    add_custom_command(TARGET sdk_configure_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
        COMMENT "Copy generated python proto into directory paddle_serving_client/proto."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    add_custom_command(TARGET general_model_config_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
        COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
        COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()

  if (APP)
    add_custom_command(TARGET general_model_config_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
        COMMENT "Copy generated general_model_config proto file into directory paddle_serving_app/proto."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()

  if (SERVER)
    py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
    add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
    add_dependencies(server_config_py_proto server_config_py_proto_init)
-    if (NOT WITH_GPU AND NOT WITH_LITE)
    add_custom_command(TARGET server_config_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
        COMMENT "Copy generated python proto into directory paddle_serving_server/proto."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    add_custom_command(TARGET general_model_config_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
        COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
        COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-    add_custom_command(TARGET server_config_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-        COMMENT "Copy generated python proto into directory paddle_serving_server_gpu/proto."
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    add_custom_command(TARGET general_model_config_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-        COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server_gpu/proto."
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-        COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server_gpu/proto."
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
  endif()
endif()
@@ -20,7 +20,7 @@ message EngineDesc {
  required string type = 2;
  required string reloadable_meta = 3;
  required string reloadable_type = 4;
-  required string model_data_path = 5;
+  required string model_dir = 5;
  required int32 runtime_thread_num = 6;
  required int32 batch_infer_size = 7;
  required int32 enable_batch_align = 8;
@@ -41,12 +41,13 @@ message EngineDesc {
  optional SparseParamServiceType sparse_param_service_type = 11;
  optional string sparse_param_service_table_name = 12;
  optional bool enable_memory_optimization = 13;
-  optional bool static_optimization = 14;
-  optional bool force_update_static_cache = 15;
-  optional bool enable_ir_optimization = 16;
-  optional bool use_trt = 17;
-  optional bool use_lite = 18;
-  optional bool use_xpu = 19;
+  optional bool enable_ir_optimization = 14;
+  optional bool use_trt = 15;
+  optional bool use_lite = 16;
+  optional bool use_xpu = 17;
+  optional bool use_gpu = 18;
+  optional bool combined_model = 19;
+  optional bool encrypted_model = 20;
};

// model_toolkit conf
...
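For reference, a hedged sketch of what a model_toolkit engine entry could look like with the renamed and added fields. The `engines` wrapper, the `name` field, and the concrete values are assumptions for illustration, not taken from this diff:

```
engines {
  name: "general_infer_0"                                # assumed field and value
  type: "PADDLE_INFER"                                   # assumed value
  reloadable_meta: "uci_housing_model/fluid_time_file"   # example path
  reloadable_type: "timestamp_ne"                        # assumed reload strategy
  model_dir: "uci_housing_model"                         # renamed from model_data_path
  runtime_thread_num: 0
  batch_infer_size: 0
  enable_batch_align: 0
  enable_memory_optimization: true
  enable_ir_optimization: false
  use_trt: false
  use_lite: false
  use_xpu: false
  use_gpu: false
  combined_model: false
  encrypted_model: false
}
```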
@@ -69,8 +69,6 @@ int test_write_conf() {
  engine->set_sparse_param_service_type(EngineDesc::LOCAL);
  engine->set_sparse_param_service_table_name("local_kv");
  engine->set_enable_memory_optimization(true);
-  engine->set_static_optimization(false);
-  engine->set_force_update_static_cache(false);

  int ret = baidu::paddle_serving::configure::write_proto_conf(
      &model_toolkit_conf, output_dir, model_toolkit_conf_file);
...
if(CLIENT)
  add_subdirectory(pybind11)
  pybind11_add_module(serving_client src/general_model.cpp src/pybind_general_model.cpp)
-  add_dependencies(serving_client sdk_cpp)
  target_link_libraries(serving_client PRIVATE -Wl,--whole-archive utils sdk-cpp pybind python -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
endif()
@@ -2,33 +2,25 @@ include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../../)
include(op/CMakeLists.txt)
include(proto/CMakeLists.txt)
add_executable(serving ${serving_srcs})
-add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid cube-api utils)
+add_dependencies(serving pdcodegen paddle_inference_engine pdserving paddle_inference cube-api utils)
if (WITH_GPU)
-  add_dependencies(serving fluid_gpu_engine)
+  add_dependencies(serving paddle_inference_engine)
endif()
if (WITH_LITE)
-  add_dependencies(serving fluid_arm_engine)
+  add_dependencies(serving paddle_inference_engine)
endif()

target_include_directories(serving PUBLIC
    ${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
)
include_directories(${CUDNN_ROOT}/include/)

-if(WITH_GPU)
-  target_link_libraries(serving -Wl,--whole-archive fluid_gpu_engine
-      -Wl,--no-whole-archive)
-endif()
-if(WITH_LITE)
-  target_link_libraries(serving -Wl,--whole-archive fluid_arm_engine
-      -Wl,--no-whole-archive)
-endif()
-target_link_libraries(serving -Wl,--whole-archive fluid_cpu_engine
+target_link_libraries(serving -Wl,--whole-archive paddle_inference_engine
    -Wl,--no-whole-archive)
-target_link_libraries(serving paddle_fluid ${paddle_depend_libs})
+target_link_libraries(serving paddle_inference ${paddle_depend_libs})
target_link_libraries(serving brpc)
target_link_libraries(serving protobuf)
target_link_libraries(serving pdserving)
...
@@ -12,12 +12,12 @@ set_source_files_properties(
    ${pdserving_srcs}
    PROPERTIES
    COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure extern_paddle paddle_fluid)
+add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure extern_paddle paddle_inference)
if (WITH_TRT)
  add_definitions(-DWITH_TRT)
endif()
target_link_libraries(pdserving
-    brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz paddle_fluid ${paddle_depend_libs})
+    brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz paddle_inference ${paddle_depend_libs})

# install
install(TARGETS pdserving
    RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin
...
@@ -14,6 +14,7 @@
#pragma once
#include <string>
+#include <fstream>
#include "core/predictor/common/inner_common.h"
#include "core/predictor/common/macros.h"
@@ -148,6 +149,16 @@ class IsDerivedFrom {
  }
};

+static void ReadBinaryFile(const std::string& filename, std::string* contents) {
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  fin.seekg(0, std::ios::end);
+  contents->clear();
+  contents->resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(contents->at(0)), contents->size());
+  fin.close();
+}

}  // namespace predictor
}  // namespace paddle_serving
}  // namespace baidu
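A minimal usage sketch of the new helper (the model path is a placeholder). Note that ReadBinaryFile does not check whether the stream opened successfully, so callers should verify the file exists before calling it:

```cpp
// Hypothetical caller; "uci_housing_model/__model__" is a placeholder path.
std::string program_buf;
baidu::paddle_serving::predictor::ReadBinaryFile("uci_housing_model/__model__",
                                                 &program_buf);
// program_buf now holds the raw bytes of the file (empty if the open failed).
```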
@@ -16,6 +16,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
+#include <pthread.h>
#include <string>
#include <utility>
#include <vector>
@@ -29,83 +30,29 @@ namespace predictor {

using configure::ModelToolkitConf;
-class InferEngineCreationParams {
- public:
-  InferEngineCreationParams() {
-    _path = "";
-    _enable_memory_optimization = false;
-    _enable_ir_optimization = false;
-    _static_optimization = false;
-    _force_update_static_cache = false;
-    _use_trt = false;
-    _use_lite = false;
-    _use_xpu = false;
-  }
-
-  void set_path(const std::string& path) { _path = path; }
-
-  void set_enable_memory_optimization(bool enable_memory_optimization) {
-    _enable_memory_optimization = enable_memory_optimization;
-  }
-
-  void set_enable_ir_optimization(bool enable_ir_optimization) {
-    _enable_ir_optimization = enable_ir_optimization;
-  }
-
-  void set_use_trt(bool use_trt) { _use_trt = use_trt; }
-  void set_use_lite(bool use_lite) { _use_lite = use_lite; }
-  void set_use_xpu(bool use_xpu) { _use_xpu = use_xpu; }
-
-  bool enable_memory_optimization() const { return _enable_memory_optimization; }
-  bool enable_ir_optimization() const { return _enable_ir_optimization; }
-  bool use_trt() const { return _use_trt; }
-  bool use_lite() const { return _use_lite; }
-  bool use_xpu() const { return _use_xpu; }
-
-  void set_static_optimization(bool static_optimization = false) {
-    _static_optimization = static_optimization;
-  }
-
-  void set_force_update_static_cache(bool force_update_static_cache = false) {
-    _force_update_static_cache = force_update_static_cache;
-  }
-
-  bool static_optimization() const { return _static_optimization; }
-  bool force_update_static_cache() const { return _force_update_static_cache; }
-
-  std::string get_path() const { return _path; }
-
-  void dump() const {
-    LOG(INFO) << "InferEngineCreationParams: "
-              << "model_path = " << _path << ", "
-              << "enable_memory_optimization = " << _enable_memory_optimization << ", "
-              << "enable_tensorrt = " << _use_trt << ", "
-              << "enable_lite = " << _use_lite << ", "
-              << "enable_xpu = " << _use_xpu << ", "
-              << "enable_ir_optimization = " << _enable_ir_optimization << ", "
-              << "static_optimization = " << _static_optimization << ", "
-              << "force_update_static_cache = " << _force_update_static_cache;
-  }
-
- private:
-  std::string _path;
-  bool _enable_memory_optimization;
-  bool _enable_ir_optimization;
-  bool _static_optimization;
-  bool _force_update_static_cache;
-  bool _use_trt;
-  bool _use_lite;
-  bool _use_xpu;
-};
+class AutoLock {
+ public:
+  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
+    pthread_mutex_lock(&mutex);
+  }
+  ~AutoLock() { pthread_mutex_unlock(&_mut); }
+
+ private:
+  pthread_mutex_t& _mut;
+};
+
+class GlobalCreateMutex {
+ public:
+  pthread_mutex_t& mutex() { return _mut; }
+  static pthread_mutex_t& instance() {
+    static GlobalCreateMutex gmutex;
+    return gmutex.mutex();
+  }
+
+ private:
+  GlobalCreateMutex() { pthread_mutex_init(&_mut, NULL); }
+  pthread_mutex_t _mut;
+};
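AutoLock is a small RAII guard: the constructor locks the pthread mutex and the destructor unlocks it, so the lock is released on every exit path. A minimal sketch of how the pair is meant to be used (mirroring the GlobalPaddleCreateMutex usage elsewhere in this diff):

```cpp
{
  // Serialize a non-thread-safe section, e.g. predictor creation.
  AutoLock lock(GlobalCreateMutex::instance());
  // ... critical section ...
}  // lock released here
```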
class InferEngine {
@@ -152,57 +99,19 @@ class ReloadableInferEngine : public InferEngine {
    uint64_t last_revision;
  };

-  virtual int load(const InferEngineCreationParams& params) = 0;
+  virtual int load(const configure::EngineDesc& conf) = 0;

  int proc_initialize_impl(const configure::EngineDesc& conf, bool version) {
    _reload_tag_file = conf.reloadable_meta();
    _reload_mode_tag = conf.reloadable_type();
-    _model_data_path = conf.model_data_path();
+    _model_data_path = conf.model_dir();
    _infer_thread_num = conf.runtime_thread_num();
    _infer_batch_size = conf.batch_infer_size();
    _infer_batch_align = conf.enable_batch_align();

-    bool enable_memory_optimization = false;
-    if (conf.has_enable_memory_optimization()) {
-      enable_memory_optimization = conf.enable_memory_optimization();
-    }
-
-    bool static_optimization = false;
-    if (conf.has_static_optimization()) {
-      static_optimization = conf.static_optimization();
-    }
-
-    bool force_update_static_cache = false;
-    if (conf.has_force_update_static_cache()) {
-      force_update_static_cache = conf.force_update_static_cache();
-    }
-
-    if (conf.has_enable_ir_optimization()) {
-      _infer_engine_params.set_enable_ir_optimization(
-          conf.enable_ir_optimization());
-    }
-
-    _infer_engine_params.set_path(_model_data_path);
-    if (enable_memory_optimization) {
-      _infer_engine_params.set_enable_memory_optimization(true);
-      _infer_engine_params.set_static_optimization(static_optimization);
-      _infer_engine_params.set_force_update_static_cache(
-          force_update_static_cache);
-    }
-
-    if (conf.has_use_trt()) {
-      _infer_engine_params.set_use_trt(conf.use_trt());
-    }
-
-    if (conf.has_use_lite()) {
-      _infer_engine_params.set_use_lite(conf.use_lite());
-    }
-
-    if (conf.has_use_xpu()) {
-      _infer_engine_params.set_use_xpu(conf.use_xpu());
-    }
-
-    if (!check_need_reload() || load(_infer_engine_params) != 0) {
+    _conf = conf;
+
+    if (!check_need_reload() || load(conf) != 0) {
      LOG(ERROR) << "Failed to load model_data_path: " << _model_data_path;
      return -1;
    }
@@ -230,7 +139,6 @@ class ReloadableInferEngine : public InferEngine {
    if (_infer_thread_num > 0) {
      return 0;
    }
    return thrd_initialize_impl();
  }
@@ -254,13 +162,13 @@ class ReloadableInferEngine : public InferEngine {
  int reload() {
    if (check_need_reload()) {
      LOG(WARNING) << "begin reload model[" << _model_data_path << "].";
-      return load(_infer_engine_params);
+      return load(_conf);
    }
    return 0;
  }

  uint64_t version() const { return _version; }
  uint32_t thread_num() const { return _infer_thread_num; }

 private:
@@ -322,7 +230,7 @@ class ReloadableInferEngine : public InferEngine {
 protected:
  std::string _model_data_path;
-  InferEngineCreationParams _infer_engine_params;
+  configure::EngineDesc _conf;

 private:
  std::string _reload_tag_file;
@@ -361,25 +269,25 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
    return ReloadableInferEngine::proc_initialize(conf, version);
  }

-  virtual int load(const InferEngineCreationParams& params) {
+  virtual int load(const configure::EngineDesc& conf) {
    if (_reload_vec.empty()) {
      return 0;
    }

    for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) {
-      if (load_data(_reload_vec[ti], params) != 0) {
+      if (load_data(_reload_vec[ti], conf) != 0) {
        LOG(ERROR) << "Failed reload engine model: " << ti;
        return -1;
      }
    }

-    LOG(WARNING) << "Succ load engine, path: " << params.get_path();
+    LOG(WARNING) << "Succ load engine, path: " << conf.model_dir();
    return 0;
  }

  int load_data(ModelData<EngineCore>* md,
-                const InferEngineCreationParams& params) {
+                const configure::EngineDesc& conf) {
    uint32_t next_idx = (md->current_idx + 1) % 2;
    if (md->cores[next_idx]) {
      delete md->cores[next_idx];
@@ -387,9 +295,9 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
    md->cores[next_idx] = new (std::nothrow) EngineCore;

-    params.dump();
-    if (!md->cores[next_idx] || md->cores[next_idx]->create(params) != 0) {
-      LOG(ERROR) << "Failed create model, path: " << params.get_path();
+    // params.dump();
+    if (!md->cores[next_idx] || md->cores[next_idx]->create(conf) != 0) {
+      LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
      return -1;
    }
    md->current_idx = next_idx;
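load_data implements a double-buffered hot reload: each model keeps two EngineCore slots, the reload builds the inactive slot, and only on success does current_idx flip, so in-flight requests keep using the old core. A self-contained sketch of the pattern with hypothetical names:

```cpp
#include <cstdint>
#include <new>

struct EngineCoreSketch {
  int create(const char* model_dir) { return 0; }  // placeholder engine init
};

struct ModelDataSketch {
  EngineCoreSketch* cores[2] = {nullptr, nullptr};
  uint32_t current_idx = 0;
};

int reload_sketch(ModelDataSketch* md, const char* model_dir) {
  uint32_t next_idx = (md->current_idx + 1) % 2;   // the inactive slot
  delete md->cores[next_idx];                      // drop the stale core
  md->cores[next_idx] = new (std::nothrow) EngineCoreSketch;
  if (!md->cores[next_idx] || md->cores[next_idx]->create(model_dir) != 0) {
    return -1;  // keep serving from cores[current_idx]
  }
  md->current_idx = next_idx;                      // publish the new core
  return 0;
}
```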
@@ -400,9 +308,9 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
    // memory pool to be inited in non-serving-threads
    ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
-    if (!md || load_data(md, _infer_engine_params) != 0) {
+    if (!md || load_data(md, _conf) != 0) {
      LOG(ERROR) << "Failed create thread data from "
-                << _infer_engine_params.get_path();
+                << _conf.model_dir();
      return -1;
    }
@@ -458,16 +366,16 @@ class CloneDBReloadableInferEngine
    return DBReloadableInferEngine<EngineCore>::proc_initialize(conf, version);
  }

-  virtual int load(const InferEngineCreationParams& params) {
+  virtual int load(const configure::EngineDesc& conf) {
    // Load process-level model data
    if (!_pd ||
-        DBReloadableInferEngine<EngineCore>::load_data(_pd, params) != 0) {
-      LOG(ERROR) << "Failed to create common model from [" << params.get_path()
+        DBReloadableInferEngine<EngineCore>::load_data(_pd, conf) != 0) {
+      LOG(ERROR) << "Failed to create common model from [" << conf.model_dir()
                 << "].";
      return -1;
    }
    LOG(WARNING) << "Succ load common model[" << _pd->cores[_pd->current_idx]
-                << "], path[" << params.get_path() << "].";
+                << "], path[" << conf.model_dir() << "].";

    if (DBReloadableInferEngine<EngineCore>::_reload_vec.empty()) {
      return 0;
@@ -483,7 +391,7 @@ class CloneDBReloadableInferEngine
      }
    }

-    LOG(WARNING) << "Succ load clone model, path[" << params.get_path() << "]";
+    LOG(WARNING) << "Succ load clone model, path[" << conf.model_dir() << "]";
    return 0;
  }
@@ -527,18 +435,18 @@ class CloneDBReloadableInferEngine
      _pd;  // process-level EngineCore; thread-level EngineCores share this object's model data
};
-template <typename FluidFamilyCore>
+template <typename PaddleInferenceCore>
#ifdef WITH_TRT
-class FluidInferEngine : public DBReloadableInferEngine<FluidFamilyCore> {
+class FluidInferEngine : public DBReloadableInferEngine<PaddleInferenceCore> {
#else
-class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
+class FluidInferEngine : public CloneDBReloadableInferEngine<PaddleInferenceCore> {
#endif
 public:  // NOLINT
  FluidInferEngine() {}
  ~FluidInferEngine() {}

  std::vector<std::string> GetInputNames() {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetInputNames()";
    }
@@ -546,8 +454,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
  }

  std::vector<std::string> GetOutputNames() {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetOutputNames()";
    }
@@ -556,8 +464,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
  std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
      const std::string& name) {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
    }
@@ -566,8 +474,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
  std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
      const std::string& name) {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetOutputHandle()";
    }
@@ -575,8 +483,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
  }

  int infer_impl() {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in infer_impl()";
      return -1;
...
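The handles returned by GetInputHandle/GetOutputHandle are standard paddle_infer::Tensor objects. A hedged usage sketch of the wrapper (the engine variable, the input name "x", and the shape are assumptions for illustration):

```cpp
// Assumes an already-initialized FluidInferEngine<...>* named engine.
std::vector<float> data(1 * 3 * 224 * 224, 0.0f);
auto input = engine->GetInputHandle("x");   // "x" is a placeholder feed name
input->Reshape({1, 3, 224, 224});
input->CopyFromCpu(data.data());
engine->infer_impl();
auto output = engine->GetOutputHandle(engine->GetOutputNames()[0]);
```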
@@ -77,7 +77,7 @@ export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.8
## Install Python dependencies
```shell
-pip install -r python/requirements.txt
+pip install -r python/requirements.txt -i https://mirror.baidu.com/pypi/simple
```
If you use another Python version, use the matching `pip`.
@@ -123,14 +123,13 @@ Compared with CPU environment, GPU environment needs to refer to the following t
**Note: the following table is a reference for non-Docker compilation environments. The Docker compilation environment has already been configured with the relevant parameters, so they do not need to be specified during the cmake process.**

| cmake environment variable | meaning | GPU environment considerations | whether Docker environment is needed |
|-----------------------|-------------------------------------|-------------------------------|--------------------|
| CUDA_TOOLKIT_ROOT_DIR | CUDA installation path, usually /usr/local/cuda | Required for all environments | No (/usr/local/cuda) |
| CUDNN_LIBRARY | The directory where libcudnn.so.* is located, usually /usr/local/cuda/lib64/ | Required for all environments | No (/usr/local/cuda/lib64/) |
| CUDA_CUDART_LIBRARY | The directory where libcudart.so.* is located, usually /usr/local/cuda/lib64/ | Required for all environments | No (/usr/local/cuda/lib64/) |
| TENSORRT_ROOT | The parent directory of the directory where libnvinfer.so.* is located, depending on the TensorRT installation directory | Not needed for CUDA 9.0/10.0, needed otherwise | No (/usr) |

If you are not in a Docker environment, you can refer to the following commands; the exact paths depend on your environment, and the code is for reference only. TENSORRT_LIBRARY_PATH depends on the TensorRT version and should be set according to the actual installation: for example, in a CUDA 10.1 environment the TensorRT version is 6.0 (/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/), while in a CUDA 10.2 environment the TensorRT version is 7.1 (/usr/local/TensorRT-7.1.3.4/targets/x86_64-linux-gnu/).

``` shell
export CUDA_PATH='/usr/local/cuda'
@@ -145,7 +144,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
    -DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH}
+    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
    -DSERVER=ON \
    -DWITH_GPU=ON ..
make -j10
...
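Based on the TensorRT paths mentioned above, a hedged example of setting TENSORRT_LIBRARY_PATH before the cmake step (adjust to the actual installation):

``` shell
# Example for CUDA 10.1 + TensorRT 6; use TensorRT-7.1.3.4 for CUDA 10.2.
export TENSORRT_LIBRARY_PATH='/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/'
```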
@@ -76,7 +76,7 @@ export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.8
## Install Python dependencies
```shell
-pip install -r python/requirements.txt
+pip install -r python/requirements.txt -i https://mirror.baidu.com/pypi/simple
```
If you use another Python version, use the matching `pip`.
@@ -128,7 +128,7 @@ make -j10
| CUDA_CUDART_LIBRARY | The directory where libcudart.so.* is located, usually /usr/local/cuda/lib64/ | Required for all environments | No (/usr/local/cuda/lib64/) |
| TENSORRT_ROOT | The parent directory of the directory where libnvinfer.so.* is located, depending on the TensorRT installation directory | Not needed for CUDA 9.0/10.0, needed otherwise | No (/usr) |

If you are not in a Docker environment, you can refer to the following commands; the exact paths depend on your environment, and the code is for reference only. TENSORRT_LIBRARY_PATH depends on the TensorRT version and should be set according to the actual installation: for example, in a CUDA 10.1 environment the TensorRT version is 6.0 (/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/), while in a CUDA 10.2 environment the TensorRT version is 7.1 (/usr/local/TensorRT-7.1.3.4/targets/x86_64-linux-gnu/).

``` shell
export CUDA_PATH='/usr/local/cuda'
@@ -143,7 +143,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
    -DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH}
+    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
    -DSERVER=ON \
    -DWITH_GPU=ON ..
make -j10
@@ -159,7 +159,7 @@ make -j10
mkdir client-build && cd client-build
cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
    -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
    -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
    -DCLIENT=ON ..
make -j10
```
...
@@ -6,17 +6,17 @@
#### Q: What are the differences and relationships among Paddle Serving, Paddle Inference, and PaddleHub Serving?

**A:** Paddle Serving is a remote service: the device that issues the prediction (mobile phone, browser, client, etc.) is not the hardware that actually runs it. Paddle Inference is a library, suited to being embedded in a larger system to guarantee inference efficiency; Paddle Serving calls Paddle Inference to provide the remote service. PaddleHub Serving can be regarded as an example; all of them use Paddle Serving as the unified entry point for the inference service. For interaction on the web side, the remote-service form is generally used and can be built with Paddle Serving's web service.

#### Q: Does paddle-serving support Int32?

**A:** In the protobuf definition, the feed_type and fetch_type codes map to data types as follows:

0-int64
1-float32
2-int32
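As an illustration of the mapping, a hedged snippet of a general_model_config entry declaring an int32 feed variable; the field names follow general_model_config.proto, and the variable name is a placeholder:

```
feed_var {
  name: "x"       # placeholder variable name
  feed_type: 2    # 2 -> int32, per the mapping above
}
```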
#### Q: Does paddle-serving support multi-threaded invocation on Windows and Linux?
@@ -37,6 +37,7 @@
## Installation issues

#### Q: During pip install of the whl package, the error message is as follows:

```
Collecting opencv-python
  Using cached opencv-python-4.3.0.38.tar.gz (88.0 MB)
@@ -69,9 +70,11 @@ Collecting opencv-python
    s = list(pattern)
TypeError: 'NoneType' object is not iterable
```

**A:** Pin the opencv-python version, `pip install opencv-python==4.2.0.32`, then install the whl package.

#### Q: During pip3 install of the whl package, the error message is as follows:

```
Complete output from command python setup.py egg_info:
Found cython-generated files...
@@ -80,13 +83,16 @@ Collecting opencv-python
----------------------------------------
Command "python setup.py egg_info" failed with error code 1 in /tmp/pip-install-taoxz02y/grpcio/
```

**A:** Upgrade pip3, then re-run the install command.

```
pip3 install --upgrade pip
pip3 install --upgrade setuptools
```

#### Q: An error occurs at runtime, with the following message:

```
Traceback (most recent call last):
  File "../../deploy/serving/test_client.py", line 18, in <module>
@@ -97,7 +103,9 @@ Traceback (most recent call last):
from shapely.geometry import Polygon
ImportError: No module named shapely.geometry
```

**A:** There are two options: install shapely via pip/pip3, or install all dependencies via pip/pip3.

```
Option 1:
pip install shapely==1.7.0
@@ -116,7 +124,69 @@ pip install -r python/requirements.txt
**A:** The JDK is not installed, or JAVA_HOME is misconfigured (it must point to the JDK path; a common mistake is configuring it as the JRE path — a correct example is JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.262.b10-0.el7_8.x86_64/"). For Java JDK installation, see https://segmentfault.com/a/1190000015389941
## Environment issues

#### Q: A CXXABI error appears during use.

This problem occurs because the gcc version used by Python does not match the gcc version Serving requires. For Docker users, we recommend the [Docker container](./RUN_IN_DOCKER_CN.md): since the Python versions inside the Docker containers are adapted to Serving before release, this kind of error will not occur there. For any other development environment, first make sure GCC 8.2 is available; if it is not, it can be installed as follows:

```bash
wget -q https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz
tar -xvf gcc-8.2.0.tar.xz && \
cd gcc-8.2.0 && \
unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \
./contrib/download_prerequisites && \
cd .. && mkdir temp_gcc82 && cd temp_gcc82 && \
../gcc-8.2.0/configure --prefix=/usr/local/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \
make -j8 && make install
cd .. && rm -rf temp_gcc82
cp ${lib_so_6} ${lib_so_6}.bak && rm -f ${lib_so_6} &&
ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \
ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \
cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path}
```

If GCC 8.2 is already available, you can install Python yourself; we also provide two Pythons built with GCC 8.2, [Python2.7](https://paddle-serving.bj.bcebos.com/others/Python2.7.17-gcc82.tar) and [Python3.6](https://paddle-serving.bj.bcebos.com/others/Python3.6.10-gcc82.tar). After downloading and extracting them, set the corresponding directory as `PYTHONROOT` and configure `PATH` and `LD_LIBRARY_PATH`.

```bash
export PYTHONROOT=/path/of/python  # the extracted Python directory
export PATH=$PYTHONROOT/bin:$PATH
export LD_LIBRARY_PATH=$PYTHONROOT/lib:$LD_LIBRARY_PATH
```
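A hedged way to confirm which CXXABI versions the libstdc++ in use actually provides (the path is an example; point it at the library your process loads):

```bash
strings /usr/local/gcc-8.2/lib64/libstdc++.so.6 | grep CXXABI
```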
#### Q: The libstdc++.so.6 version is too old.

This problem is triggered because the Paddle Serving executables and dynamic libraries are built with GCC 8.2 (the server executables for CUDA 9.0 and 10.0 are built with GCC 4.8 due to CUDA compatibility constraints). When Python makes the call, it may link against a `libstdc++.so` from a different GCC version. What you need to do is first make sure GCC 8.2 is available in the environment, then copy GCC 8.2's `libstdc++.so.*` into some directory such as `/home/libstdcpp`, and finally run `export LD_LIBRARY_PATH=/home/libstdcpp:$LD_LIBRARY_PATH`.
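A minimal sketch of the copy-and-export fix just described (the /home/libstdcpp directory is only an example):

```bash
mkdir -p /home/libstdcpp
cp /usr/local/gcc-8.2/lib64/libstdc++.so.6* /home/libstdcpp/
export LD_LIBRARY_PATH=/home/libstdcpp:$LD_LIBRARY_PATH
```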
#### Q: The symbol OPENSSL_1.0.1EC cannot be found.

Currently, Serving's executables and the client dynamic library need to link against version 1.0.2k of the openssl dynamic libraries. If they are not present in the environment, you can run:

```bash
wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && \
tar xf centos_ssl.tar && rm -rf centos_ssl.tar && \
mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && \
ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && \
ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && \
ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && \
ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so
```

Here `/usr/lib` can be replaced with another directory, as long as that directory is on `LD_LIBRARY_PATH`.
### GPU-related environment issues

#### Q: What checks are needed to make sure Serving can run in a GPU environment?

**Note: if you use the images provided by Serving, the checks below are not needed; for other development environments they can serve as guidance.**

First make sure `nvidia-smi` works, and then make sure the required dynamic libraries (.so files) are in directories covered by `LD_LIBRARY_PATH` (including the system lib directories).

(1) The CUDA driver library: usually named `libcuda.so.$DRIVER_VERSION`; for example, for driver version 440.10.15 the file name is `libcuda.so.440.10.15`.

(2) The CUDA and cuDNN dynamic libraries: usually named `libcudart.so.$CUDA_VERSION` and `libcudnn.so.$CUDNN_VERSION`, e.g. `libcudart.so.9.0` for CUDA 9 and `libcudnn.so.7` for cuDNN 7. For the CUDA/cuDNN versions matching each Serving release, see [the full list of Serving images](DOCKER_IMAGES_CN.md#%E9%99%84%E5%BD%95%E6%89%80%E6%9C%89%E9%95%9C%E5%83%8F%E5%88%97%E8%A1%A8).

(3) CUDA 10.1 and later require TensorRT. For a script that installs the TensorRT files, see [install_trt.sh](../tools/dockerfile/build_scripts/install_trt.sh).
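Hedged sanity checks corresponding to the three items above (library names follow the text; exact paths vary by system):

```bash
nvidia-smi                                          # driver visible
ldconfig -p | grep -E 'libcuda|libcudart|libcudnn'  # CUDA/cuDNN libraries found
ldconfig -p | grep libnvinfer                       # TensorRT (CUDA 10.1+)
echo $LD_LIBRARY_PATH
```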
## Deployment issues
@@ -154,7 +224,7 @@ InvalidArgumentError: Device id must be less than GPU count, but received id is:
**A:** 1) Use [GPU docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md#gpunvidia-docker) to resolve the environment problem.

2) Change the gcc version of the Python installed in the anaconda virtual environment ([reference](https://www.jianshu.com/p/c498b3d86f77)).

#### Q: Does paddle-serving support local offline installation?
@@ -221,9 +291,10 @@ The client logs are printed directly to standard output.
**A:** 1) The warning is printed by the glog component; it indicates that logs are printed to STDERR before glog is initialized.

2) Generally the service is started with GLOG_v, which sets the log level at the same time.

For example:
```
GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999
```
...
@@ -13,13 +13,5 @@
# limitations under the License

if (NOT CLIENT_ONLY)
-  add_subdirectory(inferencer-fluid-cpu)
-  if (WITH_GPU)
-    add_subdirectory(inferencer-fluid-gpu)
-  endif()
-  if (WITH_LITE)
-    add_subdirectory(inferencer-fluid-arm)
-  endif()
+  add_subdirectory(paddle)
endif()
-FILE(GLOB fluid_arm_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
-add_library(fluid_arm_engine ${fluid_arm_engine_srcs})
-target_include_directories(fluid_arm_engine PUBLIC
-    ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
-add_dependencies(fluid_arm_engine pdserving extern_paddle configure)
-target_link_libraries(fluid_arm_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
-install(TARGETS fluid_arm_engine
-    ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
-)
...
-FILE(GLOB fluid_cpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
-add_library(fluid_cpu_engine ${fluid_cpu_engine_srcs})
-target_include_directories(fluid_cpu_engine PUBLIC
-    ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
-add_dependencies(fluid_cpu_engine pdserving extern_paddle configure)
-target_link_libraries(fluid_cpu_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
-install(TARGETS fluid_cpu_engine
-    ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
-)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pthread.h>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
namespace baidu {
namespace paddle_serving {
namespace fluid_cpu {
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
pthread_mutex_lock(&mutex);
}
~AutoLock() { pthread_mutex_unlock(&_mut); }
private:
pthread_mutex_t& _mut;
};
class GlobalPaddleCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalPaddleCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int create(const predictor::InferEngineCreationParams& params) = 0;
virtual int clone(void* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
return 0;
}
virtual void* get() { return _core.get(); }
protected:
std::shared_ptr<Predictor> _core;
};
// infer interface
class FluidCpuAnalysisCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.DisableGpu();
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidCpuAnalysisDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetModel(data_path);
config.DisableGpu();
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else {
config.SwitchIrOptim(false);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class Parameter {
public:
Parameter() : _row(0), _col(0), _params(NULL) {}
~Parameter() {
VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
destroy();
}
int init(int row, int col, const char* file_name) {
destroy();
_file_name = file_name;
_row = row;
_col = col;
_params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
if (_params == NULL) {
LOG(ERROR) << "Load " << _file_name << " malloc error.";
return -1;
}
VLOG(2) << "Load parameter file[" << _file_name << "] success.";
return 0;
}
void destroy() {
_row = 0;
_col = 0;
if (_params != NULL) {
free(_params);
_params = NULL;
}
}
int load() {
if (_params == NULL || _row <= 0 || _col <= 0) {
LOG(ERROR) << "load parameter error [not inited].";
return -1;
}
FILE* fs = fopen(_file_name.c_str(), "rb");
if (fs == NULL) {
LOG(ERROR) << "load " << _file_name << " fopen error.";
return -1;
}
static const uint32_t MODEL_FILE_HEAD_LEN = 16;
char head[MODEL_FILE_HEAD_LEN] = {0};
if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
destroy();
LOG(ERROR) << "Load " << _file_name << " read head error.";
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
uint32_t matrix_size = _row * _col;
if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
VLOG(2) << "load " << _file_name << " read ok.";
return 0;
} else {
LOG(ERROR) << "load " << _file_name << " read error.";
destroy();
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
return 0;
}
public:
std::string _file_name;
int _row;
int _col;
float* _params;
};
class FluidCpuAnalysisEncryptCore : public FluidFamilyCore {
public:
void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
fin.seekg(0, std::ios::end);
contents->clear();
contents->resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(contents->at(0)), contents->size());
fin.close();
}
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path note exits: "
<< data_path;
return -1;
}
std::string model_buffer, params_buffer, key_buffer;
ReadBinaryFile(data_path + "encrypt_model", &model_buffer);
ReadBinaryFile(data_path + "encrypt_params", &params_buffer);
ReadBinaryFile(data_path + "key", &key_buffer);
VLOG(2) << "prepare for encryption model";
auto cipher = paddle::MakeCipher("");
std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
std::string real_params_buffer = cipher->Decrypt(params_buffer, key_buffer);
Config analysis_config;
// paddle::AnalysisConfig analysis_config;
analysis_config.SetModelBuffer(&real_model_buffer[0],
real_model_buffer.size(),
&real_params_buffer[0],
real_params_buffer.size());
analysis_config.DisableGpu();
analysis_config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
analysis_config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
VLOG(2) << "decrypt model file sucess";
_core = CreatePredictor(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
} // namespace fluid_cpu
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h"
#include "core/predictor/framework/factory.h"
namespace baidu {
namespace paddle_serving {
namespace fluid_cpu {
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuAnalysisCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_ANALYSIS");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidCpuAnalysisDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_ANALYSIS_DIR");
#if 1
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidCpuAnalysisEncryptCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_CPU_ANALYSIS_ENCRYPT");
#endif
} // namespace fluid_cpu
} // namespace paddle_serving
} // namespace baidu
FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs})
target_include_directories(fluid_gpu_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_gpu_engine pdserving extern_paddle configure)
target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
install(TARGETS fluid_gpu_engine
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pthread.h>
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
DECLARE_int32(gpuid);
namespace baidu {
namespace paddle_serving {
namespace fluid_gpu {
using configure::SigmoidConf;
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
pthread_mutex_lock(&mutex);
}
~AutoLock() { pthread_mutex_unlock(&_mut); }
private:
pthread_mutex_t& _mut;
};
class GlobalPaddleCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalPaddleCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::Tensor;
using paddle_infer::CreatePredictor;
// data interface
class FluidFamilyCore {
public:
virtual ~FluidFamilyCore() {}
virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames();
}
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name);
}
virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames();
}
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name);
}
virtual bool Run() {
if (!_core->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor";
return false;
}
return true;
}
virtual int create(const predictor::InferEngineCreationParams& params) = 0;
virtual int clone(void* origin_core) {
if (origin_core == NULL) {
LOG(ERROR) << "origin paddle Predictor is null.";
return -1;
}
Predictor* p_predictor = (Predictor*)origin_core;
_core = p_predictor->Clone();
if (_core.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
return -1;
}
return 0;
}
virtual void* get() { return _core.get(); }
protected:
std::shared_ptr<Predictor> _core;
};
// infer interface
class FluidGpuAnalysisCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetParamsFile(data_path + "/__params__");
config.SetProgFile(data_path + "/__model__");
config.EnableUseGpu(100, FLAGS_gpuid);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class FluidGpuAnalysisDirCore : public FluidFamilyCore {
public:
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path;
return -1;
}
Config config;
config.SetModel(data_path);
config.EnableUseGpu(1500, FLAGS_gpuid);
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
int max_batch = 32;
int min_subgraph_size = 3;
if (params.use_trt()) {
config.EnableTensorRtEngine(1 << 20,
max_batch,
min_subgraph_size,
Config::Precision::kFloat32,
false,
false);
LOG(INFO) << "create TensorRT predictor";
} else {
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else {
config.SwitchIrOptim(false);
}
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core = CreatePredictor(config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class Parameter {
public:
Parameter() : _row(0), _col(0), _params(NULL) {}
~Parameter() {
LOG(INFO) << "before destroy Parameter, file_name[" << _file_name << "]";
destroy();
}
int init(int row, int col, const char* file_name) {
destroy();
_file_name = file_name;
_row = row;
_col = col;
_params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
if (_params == NULL) {
LOG(ERROR) << "Load " << _file_name << " malloc error.";
return -1;
}
VLOG(2) << "Load parameter file[" << _file_name << "] success.";
return 0;
}
void destroy() {
_row = 0;
_col = 0;
if (_params != NULL) {
free(_params);
_params = NULL;
}
}
int load() {
if (_params == NULL || _row <= 0 || _col <= 0) {
LOG(ERROR) << "load parameter error [not inited].";
return -1;
}
FILE* fs = fopen(_file_name.c_str(), "rb");
if (fs == NULL) {
LOG(ERROR) << "load " << _file_name << " fopen error.";
return -1;
}
static const uint32_t MODEL_FILE_HEAD_LEN = 16;
char head[MODEL_FILE_HEAD_LEN] = {0};
if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
destroy();
LOG(ERROR) << "Load " << _file_name << " read head error.";
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
uint32_t matrix_size = _row * _col;
if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
LOG(INFO) << "load " << _file_name << " read ok.";
return 0;
} else {
LOG(ERROR) << "load " << _file_name << " read error.";
destroy();
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
return 0;
}
public:
std::string _file_name;
int _row;
int _col;
float* _params;
};
class FluidGpuAnalysisEncryptCore : public FluidFamilyCore {
public:
void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
fin.seekg(0, std::ios::end);
contents->clear();
contents->resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(contents->at(0)), contents->size());
fin.close();
}
int create(const predictor::InferEngineCreationParams& params) {
std::string data_path = params.get_path();
if (access(data_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path note exits: "
<< data_path;
return -1;
}
std::string model_buffer, params_buffer, key_buffer;
ReadBinaryFile(data_path + "encrypt_model", &model_buffer);
ReadBinaryFile(data_path + "encrypt_params", &params_buffer);
ReadBinaryFile(data_path + "key", &key_buffer);
VLOG(2) << "prepare for encryption model";
auto cipher = paddle::MakeCipher("");
std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
std::string real_params_buffer = cipher->Decrypt(params_buffer, key_buffer);
Config analysis_config;
analysis_config.SetModelBuffer(&real_model_buffer[0],
real_model_buffer.size(),
&real_params_buffer[0],
real_params_buffer.size());
analysis_config.EnableUseGpu(100, FLAGS_gpuid);
analysis_config.SetCpuMathLibraryNumThreads(1);
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
analysis_config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance());
VLOG(2) << "decrypt model file sucess";
_core = CreatePredictor(analysis_config);
if (NULL == _core.get()) {
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
} // namespace fluid_gpu
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h"
#include "core/predictor/framework/factory.h"
DEFINE_int32(gpuid, 0, "GPU device id to use");
namespace baidu {
namespace paddle_serving {
namespace fluid_gpu {
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuAnalysisCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_ANALYSIS");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidGpuAnalysisDirCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_ANALYSIS_DIR");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine<
FluidGpuAnalysisEncryptCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_GPU_ANALYSIS_ENCRPT")
} // namespace fluid_gpu
} // namespace paddle_serving
} // namespace baidu
FILE(GLOB paddle_inference_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(paddle_inference_engine ${paddle_inference_engine_srcs})
target_include_directories(paddle_inference_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(paddle_inference_engine pdserving extern_paddle configure)
target_link_libraries(paddle_inference_engine pdserving paddle_inference -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
install(TARGETS paddle_inference_engine
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -17,275 +17,174 @@ ...@@ -17,275 +17,174 @@
#include <pthread.h> #include <pthread.h>
#include <fstream> #include <fstream>
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "core/configure/include/configure_parser.h" #include "core/configure/include/configure_parser.h"
#include "core/configure/inferencer_configure.pb.h" #include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/common/utils.h"
#include "core/predictor/framework/infer.h" #include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT #include "paddle_inference_api.h" // NOLINT
namespace baidu { namespace baidu {
namespace paddle_serving { namespace paddle_serving {
namespace fluid_arm { namespace inference {
class AutoLock {
public:
explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
pthread_mutex_lock(&mutex);
}
~AutoLock() { pthread_mutex_unlock(&_mut); }
private:
pthread_mutex_t& _mut;
};
class GlobalPaddleCreateMutex {
public:
pthread_mutex_t& mutex() { return _mut; }
static pthread_mutex_t& instance() {
static GlobalPaddleCreateMutex gmutex;
return gmutex.mutex();
}
private:
GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
pthread_mutex_t _mut;
};
using paddle_infer::Config; using paddle_infer::Config;
using paddle_infer::PrecisionType;
using paddle_infer::Predictor; using paddle_infer::Predictor;
using paddle_infer::Tensor; using paddle_infer::Tensor;
using paddle_infer::PrecisionType;
using paddle_infer::CreatePredictor; using paddle_infer::CreatePredictor;
// data interface DECLARE_int32(gpuid);
class FluidFamilyCore {
static const int max_batch = 32;
static const int min_subgraph_size = 3;
// Engine Base
class PaddleEngineBase {
public: public:
virtual ~FluidFamilyCore() {} virtual ~PaddleEngineBase() {}
virtual std::vector<std::string> GetInputNames() { virtual std::vector<std::string> GetInputNames() {
return _core->GetInputNames(); return _predictor->GetInputNames();
} }
virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) { virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
return _core->GetInputHandle(name); return _predictor->GetInputHandle(name);
} }
virtual std::vector<std::string> GetOutputNames() { virtual std::vector<std::string> GetOutputNames() {
return _core->GetOutputNames(); return _predictor->GetOutputNames();
} }
virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) { virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
return _core->GetOutputHandle(name); return _predictor->GetOutputHandle(name);
} }
virtual bool Run() { virtual bool Run() {
if (!_core->Run()) { if (!_predictor->Run()) {
LOG(ERROR) << "Failed call Run with paddle predictor"; LOG(ERROR) << "Failed call Run with paddle predictor";
return false; return false;
} }
return true; return true;
} }
virtual int create(const predictor::InferEngineCreationParams& params) = 0; virtual int create(const configure::EngineDesc& conf) = 0;
virtual int clone(void* origin_core) { virtual int clone(void* predictor) {
if (origin_core == NULL) { if (predictor == NULL) {
LOG(ERROR) << "origin paddle Predictor is null."; LOG(ERROR) << "origin paddle Predictor is null.";
return -1; return -1;
} }
Predictor* p_predictor = (Predictor*)origin_core; Predictor* prep = static_cast<Predictor*>(predictor);
_core = p_predictor->Clone(); _predictor = prep->Clone();
if (_core.get() == NULL) { if (_predictor.get() == NULL) {
LOG(ERROR) << "fail to clone paddle predictor: " << origin_core; LOG(ERROR) << "fail to clone paddle predictor: " << predictor;
return -1; return -1;
} }
return 0; return 0;
} }
virtual void* get() { return _core.get(); } virtual void* get() { return _predictor.get(); }
protected: protected:
std::shared_ptr<Predictor> _core; std::shared_ptr<Predictor> _predictor;
}; };
// infer interface // Paddle Inference Engine
class FluidArmAnalysisCore : public FluidFamilyCore { class PaddleInferenceEngine : public PaddleEngineBase {
public: public:
int create(const predictor::InferEngineCreationParams& params) { int create(const configure::EngineDesc& engine_conf) {
std::string data_path = params.get_path(); std::string model_path = engine_conf.model_dir();
if (access(data_path.c_str(), F_OK) == -1) { if (access(model_path.c_str(), F_OK) == -1) {
LOG(ERROR) << "create paddle predictor failed, path not exits: " LOG(ERROR) << "create paddle predictor failed, path not exits: "
<< data_path; << model_path;
return -1; return -1;
} }
Config config; Config config;
config.SetParamsFile(data_path + "/__params__"); // todo, auto config(zhangjun)
config.SetProgFile(data_path + "/__model__"); if (engine_conf.has_combined_model()) {
config.DisableGpu(); if (!engine_conf.combined_model()) {
config.SetCpuMathLibraryNumThreads(1); config.SetModel(model_path);
} else {
if (params.use_lite()) { config.SetParamsFile(model_path + "/__params__");
config.EnableLiteEngine(PrecisionType::kFloat32, true); config.SetProgFile(model_path + "/__model__");
} }
if (params.use_xpu()) {
config.EnableXpu(2 * 1024 * 1024);
}
if (params.enable_memory_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else { } else {
config.SwitchIrOptim(false); config.SetParamsFile(model_path + "/__params__");
config.SetProgFile(model_path + "/__model__");
} }
config.SwitchSpecifyInputNames(true); config.SwitchSpecifyInputNames(true);
AutoLock lock(GlobalPaddleCreateMutex::instance()); config.SetCpuMathLibraryNumThreads(1);
_core = CreatePredictor(config); if (engine_conf.has_use_gpu() && engine_conf.use_gpu()) {
if (NULL == _core.get()) { // 2000MB GPU memory
LOG(ERROR) << "create paddle predictor failed, path: " << data_path; config.EnableUseGpu(2000, FLAGS_gpuid);
return -1;
} }
VLOG(2) << "create paddle predictor sucess, path: " << data_path; if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
return 0; if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
} config.EnableUseGpu(2000, FLAGS_gpuid);
}; }
config.EnableTensorRtEngine(1 << 20,
class FluidArmAnalysisDirCore : public FluidFamilyCore { max_batch,
public: min_subgraph_size,
int create(const predictor::InferEngineCreationParams& params) { Config::Precision::kFloat32,
std::string data_path = params.get_path(); false,
if (access(data_path.c_str(), F_OK) == -1) { false);
LOG(ERROR) << "create paddle predictor failed, path not exits: " LOG(INFO) << "create TensorRT predictor";
<< data_path;
return -1;
} }
Config config; if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
config.SetModel(data_path);
config.DisableGpu();
config.SwitchSpecifyInputNames(true);
config.SetCpuMathLibraryNumThreads(1);
if (params.use_lite()) {
config.EnableLiteEngine(PrecisionType::kFloat32, true); config.EnableLiteEngine(PrecisionType::kFloat32, true);
} }
if (params.use_xpu()) { if (engine_conf.has_use_xpu() && engine_conf.use_xpu()) {
// 2 MB l3 cache
config.EnableXpu(2 * 1024 * 1024); config.EnableXpu(2 * 1024 * 1024);
} }
if (engine_conf.has_enable_ir_optimization() &&
if (params.enable_memory_optimization()) { !engine_conf.enable_ir_optimization()) {
config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
config.SwitchIrOptim(true);
} else {
config.SwitchIrOptim(false); config.SwitchIrOptim(false);
} else {
config.SwitchIrOptim(true);
} }
AutoLock lock(GlobalPaddleCreateMutex::instance()); if (engine_conf.has_enable_memory_optimization() &&
_core = CreatePredictor(config); engine_conf.enable_memory_optimization()) {
if (NULL == _core.get()) { config.EnableMemoryOptim();
LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
return -1;
}
VLOG(2) << "create paddle predictor sucess, path: " << data_path;
return 0;
}
};
class Parameter {
public:
Parameter() : _row(0), _col(0), _params(NULL) {}
~Parameter() {
VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
destroy();
}
int init(int row, int col, const char* file_name) {
destroy();
_file_name = file_name;
_row = row;
_col = col;
_params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
if (_params == NULL) {
LOG(ERROR) << "Load " << _file_name << " malloc error.";
return -1;
} }
VLOG(2) << "Load parameter file[" << _file_name << "] success.";
return 0;
}
void destroy() { if (engine_conf.has_encrypted_model() && engine_conf.encrypted_model()) {
_row = 0; // decrypt model
_col = 0; std::string model_buffer, params_buffer, key_buffer;
if (_params != NULL) { predictor::ReadBinaryFile(model_path + "encrypt_model", &model_buffer);
free(_params); predictor::ReadBinaryFile(model_path + "encrypt_params", &params_buffer);
_params = NULL; predictor::ReadBinaryFile(model_path + "key", &key_buffer);
}
}
int load() { auto cipher = paddle::MakeCipher("");
if (_params == NULL || _row <= 0 || _col <= 0) { std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
LOG(ERROR) << "load parameter error [not inited]."; std::string real_params_buffer =
return -1; cipher->Decrypt(params_buffer, key_buffer);
config.SetModelBuffer(&real_model_buffer[0],
real_model_buffer.size(),
&real_params_buffer[0],
real_params_buffer.size());
} }
FILE* fs = fopen(_file_name.c_str(), "rb"); predictor::AutoLock lock(predictor::GlobalCreateMutex::instance());
if (fs == NULL) { _predictor = CreatePredictor(config);
LOG(ERROR) << "load " << _file_name << " fopen error."; if (NULL == _predictor.get()) {
return -1; LOG(ERROR) << "create paddle predictor failed, path: " << model_path;
}
static const uint32_t MODEL_FILE_HEAD_LEN = 16;
char head[MODEL_FILE_HEAD_LEN] = {0};
if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
destroy();
LOG(ERROR) << "Load " << _file_name << " read head error.";
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1; return -1;
} }
uint32_t matrix_size = _row * _col; VLOG(2) << "create paddle predictor success, path: " << model_path;
if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
VLOG(2) << "load " << _file_name << " read ok.";
return 0;
} else {
LOG(ERROR) << "load " << _file_name << " read error.";
destroy();
if (fs != NULL) {
fclose(fs);
fs = NULL;
}
return -1;
}
return 0; return 0;
} }
public:
std::string _file_name;
int _row;
int _col;
float* _params;
}; };
} // namespace fluid_arm } // namespace inference
} // namespace paddle_serving } // namespace paddle_serving
} // namespace baidu } // namespace baidu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,24 +12,20 @@ ...@@ -12,24 +12,20 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h" #include "paddle_inference/paddle/include/paddle_engine.h"
#include "core/predictor/framework/factory.h" #include "core/predictor/framework/factory.h"
namespace baidu { namespace baidu {
namespace paddle_serving { namespace paddle_serving {
namespace fluid_arm { namespace inference {
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME( DEFINE_int32(gpuid, 0, "GPU device id to use");
::baidu::paddle_serving::predictor::FluidInferEngine<FluidArmAnalysisCore>,
::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS");
REGIST_FACTORY_OBJECT_IMPL_WITH_NAME( REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
::baidu::paddle_serving::predictor::FluidInferEngine< ::baidu::paddle_serving::predictor::FluidInferEngine<PaddleInferenceEngine>,
FluidArmAnalysisDirCore>,
::baidu::paddle_serving::predictor::InferEngine, ::baidu::paddle_serving::predictor::InferEngine,
"FLUID_ARM_ANALYSIS_DIR"); "PADDLE_INFER");
} // namespace fluid_arm } // namespace inference
} // namespace paddle_serving } // namespace paddle_serving
} // namespace baidu } // namespace baidu
if (CLIENT) if (CLIENT)
file(INSTALL pipeline DESTINATION paddle_serving_client) file(INSTALL pipeline DESTINATION paddle_serving_client)
file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py) file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py)
set(PY_FILES ${SERVING_CLIENT_PY_FILES}) set(PY_FILES ${SERVING_CLIENT_PY_FILES})
SET(PACKAGE_NAME "serving_client") SET(PACKAGE_NAME "serving_client")
set(SETUP_LOG_FILE "setup.py.client.log") set(SETUP_LOG_FILE "setup.py.client.log")
endif() endif()
if (SERVER) if (SERVER)
if (NOT WITH_GPU AND NOT WITH_LITE) SET(SERVER_PACKAGE_NAME "paddle-serving-server")
file(INSTALL pipeline DESTINATION paddle_serving_server) if (WITH_GPU)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py) set(SERVER_PACKAGE_NAME "paddle-serving-server-gpu")
else() elseif(WITH_XPU)
file(INSTALL pipeline DESTINATION paddle_serving_server_gpu) set(SERVER_PACKAGE_NAME "paddle-serving-server-xpu")
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server_gpu/*.py) endif()
endif() file(INSTALL pipeline DESTINATION paddle_serving_server)
set(PY_FILES ${SERVING_SERVER_PY_FILES}) file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
SET(PACKAGE_NAME "serving_server") set(PY_FILES ${SERVING_SERVER_PY_FILES})
set(SETUP_LOG_FILE "setup.py.server.log") set(SETUP_LOG_FILE "setup.py.server.log")
endif() endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/util.py configure_file(${CMAKE_CURRENT_SOURCE_DIR}/util.py
${CMAKE_CURRENT_BINARY_DIR}/util.py) ${CMAKE_CURRENT_BINARY_DIR}/util.py)
if (CLIENT) if (CLIENT)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py) ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../tools/python_tag.py configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../tools/python_tag.py
${CMAKE_CURRENT_BINARY_DIR}/python_tag.py) ${CMAKE_CURRENT_BINARY_DIR}/python_tag.py)
endif() endif()
if (APP) if (APP)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py) ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif() endif()
if (SERVER) if (SERVER)
if (NOT WITH_GPU AND NOT WITH_LITE) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server_gpu.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
endif() endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gen_version.py configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gen_version.py
...@@ -50,108 +45,73 @@ set (SERVING_CLIENT_CORE ${PADDLE_SERVING_BINARY_DIR}/core/general-client/*.so) ...@@ -50,108 +45,73 @@ set (SERVING_CLIENT_CORE ${PADDLE_SERVING_BINARY_DIR}/core/general-client/*.so)
message("python env: " ${py_env}) message("python env: " ${py_env})
if (APP) if (APP)
add_custom_command( add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/ COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "app" COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "app"
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES}) DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif() endif()
if (CLIENT) if (CLIENT)
add_custom_command( add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/ COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} python_tag.py COMMAND env ${py_env} ${PYTHON_EXECUTABLE} python_tag.py
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "client" COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "client"
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES}) DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp) add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif() endif()
if (SERVER) if (SERVER)
if(NOT WITH_GPU AND NOT WITH_LITE) # todo, generate suffix for cpu、gpu、arm
add_custom_command( if(WITH_TRT)
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp if(CUDA_VERSION EQUAL 10.1)
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/ set(VERSION_SUFFIX 101)
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "server" elseif(CUDA_VERSION EQUAL 10.2)
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel set(VERSION_SUFFIX 102)
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES}) elseif(CUDA_VERSION EQUAL 11.0)
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) set(VERSION_SUFFIX 11)
elseif(WITH_TRT)
if(CUDA_VERSION EQUAL 10.1)
set(SUFFIX 101)
elseif(CUDA_VERSION EQUAL 10.2)
set(SUFFIX 102)
elseif(CUDA_VERSION EQUAL 11.0)
set(SUFFIX 11)
endif()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" ${SUFFIX}
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_LITE)
if(WITH_XPU)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" arm-xpu
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
else()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" arm
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif()
else()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" ${CUDA_VERSION_MAJOR}
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif() endif()
endif()
if(WITH_LITE)
set(VERSION_SUFFIX 2)
endif()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server" ${VERSION_SUFFIX}
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif() endif()
set(SERVING_CLIENT_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(SERVING_CLIENT_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
set(SERVING_SERVER_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(SERVING_SERVER_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
if (CLIENT) if (CLIENT)
install(DIRECTORY ${SERVING_CLIENT_PYTHON_PACKAGE_DIR} install(DIRECTORY ${SERVING_CLIENT_PYTHON_PACKAGE_DIR}
DESTINATION opt/serving_client/share/wheels DESTINATION opt/serving_client/share/wheels
) )
endif() endif()
if (SERVER) if (SERVER)
install(DIRECTORY ${SERVING_SERVER_PYTHON_PACKAGE_DIR} install(DIRECTORY ${SERVING_SERVER_PYTHON_PACKAGE_DIR}
DESTINATION opt/serving_server/share/wheels DESTINATION opt/serving_server/share/wheels
) )
endif() endif()
if (CLIENT OR SERVER) if (CLIENT OR SERVER)
find_program(PATCHELF_EXECUTABLE patchelf) find_program(PATCHELF_EXECUTABLE patchelf)
if (NOT PATCHELF_EXECUTABLE) if (NOT PATCHELF_EXECUTABLE)
message(FATAL_ERROR "patchelf not found, please install it.\n" message(FATAL_ERROR "patchelf not found, please install it.\n"
"For Ubuntu, the command is: apt-get install -y patchelf.") "For Ubuntu, the command is: apt-get install -y patchelf.")
endif() endif()
endif() endif()
...@@ -49,7 +49,7 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #c ...@@ -49,7 +49,7 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #c
``` ```
Or, to start the GPU inference service, run Or, to start the GPU inference service, run
``` ```
python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0 python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
``` ```
### RPC Inference ### RPC Inference
......
...@@ -48,7 +48,7 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 # ...@@ -48,7 +48,7 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 #
``` ```
或者,启动gpu预测服务,执行 或者,启动gpu预测服务,执行
``` ```
python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务 python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务
``` ```
......
...@@ -12,7 +12,7 @@ else ...@@ -12,7 +12,7 @@ else
mkdir utilization mkdir utilization
fi fi
#start server #start server
$PYTHONROOT/bin/python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim > elog 2>&1 & $PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim > elog 2>&1 &
sleep 5 sleep 5
#warm up #warm up
......
export CUDA_VISIBLE_DEVICES=0,1,2,3 export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog & python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
export FLAGS_profile_client=1 export FLAGS_profile_client=1
export FLAGS_profile_server=1 export FLAGS_profile_server=1
sleep 5 sleep 5
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
import os import os
import sys import sys
from paddle_serving_server_gpu import OpMaker from paddle_serving_server import OpMaker
from paddle_serving_server_gpu import OpSeqMaker from paddle_serving_server import OpSeqMaker
from paddle_serving_server_gpu import Server from paddle_serving_server import Server
op_maker = OpMaker() op_maker = OpMaker()
read_op = op_maker.create('general_reader') read_op = op_maker.create('general_reader')
......
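For context, the imports above are typically assembled into a complete low-level server script along the lines of the sketch below; the general_reader/general_infer/general_response op chain, model directory, port, and device follow the public Paddle Serving examples and are assumptions here, not code from this diff.
```
from paddle_serving_server import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
read_op = op_maker.create('general_reader')
infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')

# Chain the ops into a sequential execution graph.
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(infer_op)
op_seq_maker.add_op(response_op)

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.load_model_config('bert_seq128_model')  # placeholder model directory
server.prepare_server(workdir='workdir', port=9292, device='gpu')
server.run_server()
```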
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
from paddle_serving_app.reader import ChineseBertReader from paddle_serving_app.reader import ChineseBertReader
import sys import sys
import os import os
......
...@@ -10,7 +10,7 @@ If you want to have more detection models, please refer to [Paddle Detection Mod ...@@ -10,7 +10,7 @@ If you want to have more detection models, please refer to [Paddle Detection Mod
### Start the service ### Start the service
``` ```
python -m paddle_serving_server_gpu.serve --model serving_server --port 9292 --gpu_id 0 python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0
``` ```
### Perform prediction ### Perform prediction
......
...@@ -10,7 +10,7 @@ sh get_data.sh ...@@ -10,7 +10,7 @@ sh get_data.sh
### 启动服务 ### 启动服务
``` ```
python -m paddle_serving_server_gpu.serve --model serving_server --port 9292 --gpu_id 0 python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0
``` ```
### 执行预测 ### 执行预测
......
...@@ -20,7 +20,7 @@ the directories like `ctr_serving_model` and `ctr_client_conf` will appear. ...@@ -20,7 +20,7 @@ the directories like `ctr_serving_model` and `ctr_client_conf` will appear.
``` ```
python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service
python -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0 python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0
``` ```
### RPC Infer ### RPC Infer
......
...@@ -20,7 +20,7 @@ mv models/ctr_serving_model . ...@@ -20,7 +20,7 @@ mv models/ctr_serving_model .
``` ```
python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务 python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务
python -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务 python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务
``` ```
### 执行预测 ### 执行预测
......
...@@ -12,7 +12,7 @@ tar -xzvf deeplabv3.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf deeplabv3.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model deeplabv3_server --gpu_ids 0 --port 9494 python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494
``` ```
### Client Prediction ### Client Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf deeplabv3.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf deeplabv3.tar.gz
### 启动服务端 ### 启动服务端
``` ```
python -m paddle_serving_server_gpu.serve --model deeplabv3_server --gpu_ids 0 --port 9494 python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494
``` ```
### 客户端预测 ### 客户端预测
......
...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf faster_rcnn_r50_fpn_1x_coco.tar tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT. If you want faster inference, please use `--use_trt`. This model supports TensorRT. If you want faster inference, please use `--use_trt`.
......
...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf faster_rcnn_r50_fpn_1x_coco.tar tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
......
...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf ppyolo_r50vd_dcn_1x_coco.tar tar xf ppyolo_r50vd_dcn_1x_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT. If you want faster inference, please use `--use_trt`. This model supports TensorRT. If you want faster inference, please use `--use_trt`.
......
...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf ppyolo_r50vd_dcn_1x_coco.tar tar xf ppyolo_r50vd_dcn_1x_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
......
...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf ttfnet_darknet53_1x_coco.tar tar xf ttfnet_darknet53_1x_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT. If you want faster inference, please use `--use_trt`. This model supports TensorRT. If you want faster inference, please use `--use_trt`.
......
...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf ttfnet_darknet53_1x_coco.tar tar xf ttfnet_darknet53_1x_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
......
...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### Start the service ### Start the service
``` ```
tar xf yolov3_darknet53_270e_coco.tar tar xf yolov3_darknet53_270e_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
This model supports TensorRT. If you want faster inference, please use `--use_trt`. This model supports TensorRT. If you want faster inference, please use `--use_trt`.
......
...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/ ...@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
### 启动服务 ### 启动服务
``` ```
tar xf yolov3_darknet53_270e_coco.tar tar xf yolov3_darknet53_270e_coco.tar
python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
``` ```
该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。 该模型支持TensorRT,如果想要更快的预测速度,可以开启`--use_trt`选项。
......
...@@ -26,7 +26,7 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_ ...@@ -26,7 +26,7 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_
``` ```
GPU Service GPU Service
``` ```
python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0 python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
``` ```
## Prediction ## Prediction
......
...@@ -24,7 +24,7 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_ ...@@ -24,7 +24,7 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_
``` ```
GPU预测服务 GPU预测服务
``` ```
python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0 python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
``` ```
## 预测 ## 预测
......
...@@ -15,9 +15,9 @@ ...@@ -15,9 +15,9 @@
import os import os
import sys import sys
from paddle_serving_server_gpu import OpMaker from paddle_serving_server import OpMaker
from paddle_serving_server_gpu import OpSeqMaker from paddle_serving_server import OpSeqMaker
from paddle_serving_server_gpu import MultiLangServer as Server from paddle_serving_server import MultiLangServer as Server
op_maker = OpMaker() op_maker = OpMaker()
read_op = op_maker.create('general_reader') read_op = op_maker.create('general_reader')
......
...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
## Start RPC Service ## Start RPC Service
``` ```
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
``` ```
## Prediction ## Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
## 启动RPC服务 ## 启动RPC服务
``` ```
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
``` ```
## 预测 ## 预测
......
...@@ -39,7 +39,7 @@ python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu ...@@ -39,7 +39,7 @@ python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu
``` ```
``` ```
python -m paddle_serving_server_gpu.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service
``` ```
The client sends an inference request The client sends an inference request
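The client-side call paired with this service is sketched below; the client config path, image file, and fetch name are assumptions modeled on the public image-classification example rather than content taken from this diff.
```
from paddle_serving_client import Client
from paddle_serving_app.reader import Sequential, File2Image, Resize, CenterCrop
from paddle_serving_app.reader import RGB2BGR, Transpose, Div, Normalize

# Standard ImageNet-style preprocessing pipeline.
seq = Sequential([
    File2Image(), Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
    Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], True)
])

client = Client()
client.load_client_config("ResNet50_vd_client_config/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9696"])

img = seq("daisy.jpg")  # placeholder image file
fetch_map = client.predict(feed={"image": img}, fetch=["score"])
print(fetch_map["score"])
```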
......
...@@ -39,7 +39,7 @@ python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu ...@@ -39,7 +39,7 @@ python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu
``` ```
``` ```
python -m paddle_serving_server_gpu.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务 python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务
``` ```
client端进行预测 client端进行预测
......
...@@ -2,7 +2,7 @@ rm profile_log* ...@@ -2,7 +2,7 @@ rm profile_log*
export CUDA_VISIBLE_DEVICES=0,1,2,3 export CUDA_VISIBLE_DEVICES=0,1,2,3
export FLAGS_profile_server=1 export FLAGS_profile_server=1
export FLAGS_profile_client=1 export FLAGS_profile_client=1
python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim 2> elog > stdlog & python -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim 2> elog > stdlog &
sleep 5 sleep 5
gpu_id=0 gpu_id=0
......
...@@ -25,7 +25,7 @@ device = sys.argv[2] ...@@ -25,7 +25,7 @@ device = sys.argv[2]
if device == "cpu": if device == "cpu":
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
else: else:
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
class ImageService(WebService): class ImageService(WebService):
......
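For reference, a WebService subclass of this kind is usually wired up roughly as follows; the service name, model directory, port, device, and the preprocess body are placeholders, not code from this diff.
```
from paddle_serving_server.web_service import WebService


class ImageService(WebService):
    def preprocess(self, feed=[], fetch=[]):
        # Placeholder hook: turn the incoming request payload into feed tensors here.
        return feed, fetch


image_service = ImageService(name="image")
image_service.load_model_config("serving_server")  # placeholder model directory
image_service.prepare_server(workdir="workdir", port=9393, device="gpu")
image_service.run_rpc_service()
image_service.run_web_service()
```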
...@@ -12,7 +12,7 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
``` ```
### Client Prediction ### Client Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393 python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
``` ```
### Client Prediction ### Client Prediction
......
...@@ -26,7 +26,7 @@ tar xf test_imgs.tar ...@@ -26,7 +26,7 @@ tar xf test_imgs.tar
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 python -m paddle_serving_server.serve --model ocr_det_model --port 9293
python ocr_web_server.py cpu python ocr_web_server.py cpu
#for gpu user #for gpu user
python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0 python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0
python ocr_web_server.py gpu python ocr_web_server.py gpu
``` ```
......
...@@ -25,7 +25,7 @@ tar xf test_imgs.tar ...@@ -25,7 +25,7 @@ tar xf test_imgs.tar
python -m paddle_serving_server.serve --model ocr_det_model --port 9293 python -m paddle_serving_server.serve --model ocr_det_model --port 9293
python ocr_web_server.py cpu python ocr_web_server.py cpu
#for gpu user #for gpu user
python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0 python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0
python ocr_web_server.py gpu python ocr_web_server.py gpu
``` ```
......
...@@ -22,7 +22,7 @@ from paddle_serving_app.reader import Sequential, ResizeByFactor ...@@ -22,7 +22,7 @@ from paddle_serving_app.reader import Sequential, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes from paddle_serving_app.reader import DBPostProcess, FilterBoxes
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
import time import time
......
...@@ -22,7 +22,7 @@ from paddle_serving_app.reader import Sequential, ResizeByFactor ...@@ -22,7 +22,7 @@ from paddle_serving_app.reader import Sequential, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes from paddle_serving_app.reader import DBPostProcess, FilterBoxes
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
import time import time
......
...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor ...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
from paddle_serving_app.local_predict import LocalPredictor from paddle_serving_app.local_predict import LocalPredictor
......
...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor ...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
import time import time
......
...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor ...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
import time import time
......
...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor ...@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import Div, Normalize, Transpose
from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
if sys.argv[1] == 'gpu': if sys.argv[1] == 'gpu':
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
elif sys.argv[1] == 'cpu': elif sys.argv[1] == 'cpu':
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
import time import time
......
import sys
import os
import yaml
import requests
import time
import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency
'''
2021-03-16 10:26:01,832 ==================== TRACER ======================
2021-03-16 10:26:01,838 Op(bert):
2021-03-16 10:26:01,838 in[5.7833 ms]
2021-03-16 10:26:01,838 prep[8.2001 ms]
2021-03-16 10:26:01,838 midp[198.79853333333332 ms]
2021-03-16 10:26:01,839 postp[0.8411 ms]
2021-03-16 10:26:01,839 out[0.9440666666666667 ms]
2021-03-16 10:26:01,839 idle[0.03135320683677345]
2021-03-16 10:26:01,839 DAGExecutor:
2021-03-16 10:26:01,839 Query count[30]
2021-03-16 10:26:01,839 QPS[3.0 q/s]
2021-03-16 10:26:01,839 Succ[1.0]
2021-03-16 10:26:01,839 Error req[]
2021-03-16 10:26:01,839 Latency:
2021-03-16 10:26:01,839 ave[237.85519999999997 ms]
2021-03-16 10:26:01,839 .50[179.937 ms]
2021-03-16 10:26:01,839 .60[179.994 ms]
2021-03-16 10:26:01,839 .70[180.515 ms]
2021-03-16 10:26:01,840 .80[180.735 ms]
2021-03-16 10:26:01,840 .90[182.275 ms]
2021-03-16 10:26:01,840 .95[182.789 ms]
2021-03-16 10:26:01,840 .99[1921.33 ms]
2021-03-16 10:26:01,840 Channel (server worker num[1]):
2021-03-16 10:26:01,840 chl0(In: ['@DAGExecutor'], Out: ['bert']) size[0/0]
2021-03-16 10:26:01,841 chl1(In: ['bert'], Out: ['@DAGExecutor']) size[0/0]
'''
def parse_benchmark(filein, fileout):
with open(filein, "r") as fin:
res = yaml.load(fin, Loader=yaml.SafeLoader)
del_list = []
for key in res["DAG"].keys():
if "call" in key:
del_list.append(key)
for key in del_list:
del res["DAG"][key]
with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device):
fin = open("config.yml", "r")
config = yaml.load(fin, Loader=yaml.SafeLoader)
fin.close()
config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu":
config["op"]["bert"]["local_service_conf"]["device_type"] = 1
config["op"]["bert"]["local_service_conf"]["devices"] = "2"
with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False)
def run_http(idx, batch_size):
print("start thread ({})".format(idx))
url = "http://127.0.0.1:18082/bert/prediction"
start = time.time()
with open("data-c.txt", 'r') as fin:
start = time.time()
lines = fin.readlines()
start_idx = 0
while start_idx < len(lines):
end_idx = min(len(lines), start_idx + batch_size)
feed = {}
for i in range(start_idx, end_idx):
feed[str(i - start_idx)] = lines[i]
keys = list(feed.keys())
values = [feed[x] for x in keys]
data = {"key": keys, "value": values}
r = requests.post(url=url, data=json.dumps(data))
start_idx += batch_size
if start_idx > 2000:
break
end = time.time()
return [[end - start]]
def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_http , thread, batch_size)
def run_rpc(thread, batch_size):
client = PipelineClient()
client.connect(['127.0.0.1:9998'])
with open("data-c.txt", 'r') as fin:
start = time.time()
lines = fin.readlines()
start_idx = 0
while start_idx < len(lines):
end_idx = min(len(lines), start_idx + batch_size)
feed = {}
for i in range(start_idx, end_idx):
feed[str(i - start_idx)] = lines[i]
ret = client.predict(feed_dict=feed, fetch=["res"])
start_idx += batch_size
if start_idx > 1000:
break
end = time.time()
return [[end - start]]
def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size)
if __name__ == "__main__":
if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3])
device = sys.argv[4]
gen_yml(device)
elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3])
batch_size = int(sys.argv[4])
if mode == "http":
multithread_http(thread, batch_size)
elif mode == "rpc":
multithread_rpc(thread, batch_size)
elif sys.argv[1] == "dump":
filein = sys.argv[2]
fileout = sys.argv[3]
parse_benchmark(filein, fileout)
export FLAGS_profile_pipeline=1
alias python3="python3.7"
modelname="bert"
# HTTP
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1 gpu
rm -rf profile_log_$modelname
for thread_num in 1 8 16
do
for batch_size in 1 10 100
do
echo "----Bert thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run http $thread_num $batch_size
python3 cpu_utilization.py >>profile_log_$modelname
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
python3 benchmark.py dump benchmark.log benchmark.tmp
mv benchmark.tmp benchmark.log
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
cat benchmark.log >> profile_log_$modelname
#rm -rf gpu_use.log gpu_utilization.log
done
done
# RPC
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1 gpu
for thread_num in 1 8 16
do
for batch_size in 1 10 100
do
echo "----Bert thread num: $thread_num batch size: $batch_size mode:rpc ----" >>profile_log_$modelname
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run rpc $thread_num $batch_size
python3 cpu_utilization.py >>profile_log_$modelname
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
python3 benchmark.py dump benchmark.log benchmark.tmp
mv benchmark.tmp benchmark.log
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
#rm -rf gpu_use.log gpu_utilization.log
cat benchmark.log >> profile_log_$modelname
done
done
dag:
is_thread_op: false
tracer:
interval_s: 10
http_port: 18082
op:
bert:
local_service_conf:
client_type: local_predictor
concurrency: 2
device_type: 1
devices: '2'
fetch_list:
- pooled_output
model_config: bert_seq128_model/
rpc_port: 9998
worker_num: 20
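As a quick sanity check of this configuration, a single HTTP request can be sent to the port and op name defined above (18082, `bert`); the key/value payload below mirrors `run_http()` in the benchmark script, and the sample sentence is only a placeholder.

```python
import json
import requests

# One sentence per numbered key, matching the feed convention in run_http()
url = "http://127.0.0.1:18082/bert/prediction"
data = {"key": ["0"], "value": ["A short test sentence for the BERT pipeline."]}
resp = requests.post(url=url, data=json.dumps(data))
print(resp.json())
```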
wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz
tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz
mv bert_chinese_L-12_H-768_A-12_model bert_seq128_model
mv bert_chinese_L-12_H-768_A-12_client bert_seq128_client
wget https://paddle-serving.bj.bcebos.com/bert_example/data-c.txt --no-check-certificate
wget https://paddle-serving.bj.bcebos.com/bert_example/vocab.txt --no-check-certificate
import sys
import os
import yaml
import requests
import time
import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np
client = PipelineClient()
client.connect(['127.0.0.1:9998'])
batch_size = 101
with open("data-c.txt", 'r') as fin:
lines = fin.readlines()
start_idx = 0
while start_idx < len(lines):
end_idx = min(len(lines), start_idx + batch_size)
feed = {}
for i in range(start_idx, end_idx):
feed[str(i - start_idx)] = lines[i]
ret = client.predict(feed_dict=feed, fetch=["res"])
print(ret)
start_idx += batch_size
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try:
from paddle_serving_server_gpu.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging
import numpy as np
import sys
from paddle_serving_app.reader import ChineseBertReader
_LOGGER = logging.getLogger()
class BertOp(Op):
def init_op(self):
self.reader = ChineseBertReader({
"vocab_file": "vocab.txt",
"max_seq_len": 128
})
def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items()
print("input dict", input_dict)
batch_size = len(input_dict.keys())
feed_res = []
for i in range(batch_size):
feed_dict = self.reader.process(input_dict[str(i)].encode("utf-8"))
for key in feed_dict.keys():
feed_dict[key] = np.array(feed_dict[key]).reshape((1, len(feed_dict[key]), 1))
feed_res.append(feed_dict)
feed_dict = {}
for key in feed_res[0].keys():
feed_dict[key] = np.concatenate([x[key] for x in feed_res], axis=0)
print(key, feed_dict[key].shape)
return feed_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, log_id):
fetch_dict["pooled_output"] = str(fetch_dict["pooled_output"])
return fetch_dict, None, ""
class BertService(WebService):
def get_pipeline_response(self, read_op):
bert_op = BertOp(name="bert", input_ops=[read_op])
return bert_op
bert_service = BertService(name="bert")
bert_service.prepare_pipeline_config("config2.yml")
bert_service.run_service()
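The batching done in `BertOp.preprocess` above can be illustrated in isolation; the feature name `input_ids` below is only a placeholder for whatever keys `ChineseBertReader.process` actually returns.

```python
import numpy as np

# Each sample's feature of length 128 is reshaped to (1, 128, 1), then the
# per-sample dicts are concatenated along axis 0 into one batched feed.
samples = [np.arange(128) for _ in range(3)]  # three dummy token-id sequences
feed_res = [{"input_ids": s.reshape((1, len(s), 1))} for s in samples]
feed_dict = {k: np.concatenate([d[k] for d in feed_res], axis=0) for k in feed_res[0]}
print(feed_dict["input_ids"].shape)  # (3, 128, 1)
```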
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try: try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError: except ImportError:
from paddle_serving_server.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import sys import sys
from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
try: try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError: except ImportError:
from paddle_serving_server.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
import logging import logging
......
...@@ -22,7 +22,7 @@ import logging ...@@ -22,7 +22,7 @@ import logging
try: try:
from paddle_serving_server.web_service import WebService from paddle_serving_server.web_service import WebService
except ImportError: except ImportError:
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
_LOGGER = logging.getLogger() _LOGGER = logging.getLogger()
user_handler = logging.StreamHandler() user_handler = logging.StreamHandler()
......
import sys
import os
import base64
import yaml
import requests
import time
import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency
def parse_benchmark(filein, fileout):
with open(filein, "r") as fin:
res = yaml.load(fin, Loader=yaml.SafeLoader)
del_list = []
for key in res["DAG"].keys():
if "call" in key:
del_list.append(key)
for key in del_list:
del res["DAG"][key]
with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False)
def gen_yml(device):
fin = open("config.yml", "r")
config = yaml.load(fin, Loader=yaml.SafeLoader)
fin.close()
config["dag"]["tracer"] = {"interval_s": 10}
if device == "gpu":
config["op"]["det"]["local_service_conf"]["device_type"] = 1
config["op"]["det"]["local_service_conf"]["devices"] = "2"
config["op"]["rec"]["local_service_conf"]["device_type"] = 1
config["op"]["rec"]["local_service_conf"]["devices"] = "2"
with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False)
def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8')
def run_http(idx, batch_size):
print("start thread ({})".format(idx))
url = "http://127.0.0.1:9999/ocr/prediction"
start = time.time()
test_img_dir = "imgs/"
for img_file in os.listdir(test_img_dir):
with open(os.path.join(test_img_dir, img_file), 'rb') as file:
image_data1 = file.read()
image = cv2_to_base64(image_data1)
data = {"key": ["image"], "value": [image]}
for i in range(100):
r = requests.post(url=url, data=json.dumps(data))
end = time.time()
return [[end - start]]
def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_http , thread, batch_size)
def run_rpc(thread, batch_size):
client = PipelineClient()
client.connect(['127.0.0.1:18090'])
start = time.time()
test_img_dir = "imgs/"
for img_file in os.listdir(test_img_dir):
with open(os.path.join(test_img_dir, img_file), 'rb') as file:
image_data = file.read()
image = cv2_to_base64(image_data)
for i in range(100):
ret = client.predict(feed_dict={"image": image}, fetch=["res"])
end = time.time()
return [[end - start]]
def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size)
if __name__ == "__main__":
if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3])
device = sys.argv[4]
gen_yml(device)
elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3])
batch_size = int(sys.argv[4])
if mode == "http":
multithread_http(thread, batch_size)
elif mode == "rpc":
multithread_rpc(thread, batch_size)
elif sys.argv[1] == "dump":
filein = sys.argv[2]
fileout = sys.argv[3]
parse_benchmark(filein, fileout)
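For a one-off request outside the benchmark loop, the RPC path used by `run_rpc()` above can be exercised directly; the image path is an assumed sample under the `imgs/` directory.

```python
import base64
from paddle_serving_server.pipeline import PipelineClient

client = PipelineClient()
client.connect(['127.0.0.1:18090'])  # rpc port used by run_rpc() above
with open("imgs/1.jpg", "rb") as f:  # assumed sample image
    image = base64.b64encode(f.read()).decode('utf8')
ret = client.predict(feed_dict={"image": image}, fetch=["res"])
print(ret)
```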
export FLAGS_profile_pipeline=1
alias python3="python3.7"
modelname="ocr"
# HTTP
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1 gpu
rm -rf profile_log_$modelname
for thread_num in 1 8 16
do
for batch_size in 1
do
echo "----Bert thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run http $thread_num $batch_size
python3 cpu_utilization.py >>profile_log_$modelname
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
python3 benchmark.py dump benchmark.log benchmark.tmp
mv benchmark.tmp benchmark.log
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
cat benchmark.log >> profile_log_$modelname
#rm -rf gpu_use.log gpu_utilization.log
done
done
# RPC
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1 gpu
for thread_num in 1 8 16
do
for batch_size in 1
do
echo "----Bert thread num: $thread_num batch size: $batch_size mode:rpc ----" >>profile_log_$modelname
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run rpc $thread_num $batch_size
python3 cpu_utilization.py >>profile_log_$modelname
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
python3 benchmark.py dump benchmark.log benchmark.tmp
mv benchmark.tmp benchmark.log
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
#rm -rf gpu_use.log gpu_utilization.log
cat benchmark.log >> profile_log_$modelname
done
done
...@@ -6,7 +6,7 @@ http_port: 9999 ...@@ -6,7 +6,7 @@ http_port: 9999
#worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own gRPC server and DAG #worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own gRPC server and DAG
##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the main thread's gRPC thread pool ##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the main thread's gRPC thread pool
worker_num: 1 worker_num: 5
#build_dag_each_worker: if False, the framework builds one DAG inside the process; if True, each worker process builds its own independent DAG #build_dag_each_worker: if False, the framework builds one DAG inside the process; if True, each worker process builds its own independent DAG
build_dag_each_worker: false build_dag_each_worker: false
...@@ -20,6 +20,9 @@ dag: ...@@ -20,6 +20,9 @@ dag:
#Profiling switch: True generates Timeline profiling data (with some performance overhead); False disables it #Profiling switch: True generates Timeline profiling data (with some performance overhead); False disables it
use_profile: false use_profile: false
tracer:
interval_s: 10
op: op:
det: det:
#Concurrency: thread-level concurrency when is_thread_op=True, otherwise process-level concurrency #Concurrency: thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
...@@ -37,7 +40,7 @@ op: ...@@ -37,7 +40,7 @@ op:
fetch_list: ["concat_1.tmp_0"] fetch_list: ["concat_1.tmp_0"]
#Device IDs: inference runs on CPU when devices is "" or unset; values like "0" or "0,1,2" select the GPU cards to use #Device IDs: inference runs on CPU when devices is "" or unset; values like "0" or "0,1,2" select the GPU cards to use
devices: "0" devices: "2"
rec: rec:
#Concurrency: thread-level concurrency when is_thread_op=True, otherwise process-level concurrency #Concurrency: thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
concurrency: 2 concurrency: 2
...@@ -61,4 +64,4 @@ op: ...@@ -61,4 +64,4 @@ op:
fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"] fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
#Device IDs: inference runs on CPU when devices is "" or unset; values like "0" or "0,1,2" select the GPU cards to use #Device IDs: inference runs on CPU when devices is "" or unset; values like "0" or "0,1,2" select the GPU cards to use
devices: "0" devices: "2"
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try: try:
from paddle_serving_server_gpu.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
except ImportError: except ImportError:
from paddle_serving_server.pipeline import PipelineClient from paddle_serving_server.pipeline import PipelineClient
import numpy as np import numpy as np
......
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try: try:
from paddle_serving_server.web_service import WebService, Op
except ImportError:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server_gpu.web_service import WebService, Op
except ImportError:
from paddle_serving_server.web_service import WebService, Op
import logging import logging
import numpy as np import numpy as np
import cv2 import cv2
...@@ -45,16 +45,19 @@ class DetOp(Op): ...@@ -45,16 +45,19 @@ class DetOp(Op):
def preprocess(self, input_dicts, data_id, log_id): def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items() (_, input_dict), = input_dicts.items()
data = base64.b64decode(input_dict["image"].encode('utf8')) imgs = []
data = np.fromstring(data, np.uint8) for key in input_dict.keys():
# Note: class variables(self.var) can only be used in process op mode data = base64.b64decode(input_dict[key].encode('utf8'))
self.im = cv2.imdecode(data, cv2.IMREAD_COLOR) data = np.fromstring(data, np.uint8)
self.ori_h, self.ori_w, _ = self.im.shape self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
det_img = self.det_preprocess(self.im) self.ori_h, self.ori_w, _ = self.im.shape
_, self.new_h, self.new_w = det_img.shape det_img = self.det_preprocess(self.im)
return {"image": det_img[np.newaxis, :].copy()}, False, None, "" _, self.new_h, self.new_w = det_img.shape
imgs.append(det_img[np.newaxis, :].copy())
return {"image": np.concatenate(imgs, axis=0)}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, log_id): def postprocess(self, input_dicts, fetch_dict, log_id):
# print(fetch_dict)
det_out = fetch_dict["concat_1.tmp_0"] det_out = fetch_dict["concat_1.tmp_0"]
ratio_list = [ ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
...@@ -62,7 +65,6 @@ class DetOp(Op): ...@@ -62,7 +65,6 @@ class DetOp(Op):
dt_boxes_list = self.post_func(det_out, [ratio_list]) dt_boxes_list = self.post_func(det_out, [ratio_list])
dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
out_dict = {"dt_boxes": dt_boxes, "image": self.im} out_dict = {"dt_boxes": dt_boxes, "image": self.im}
print("out dict", out_dict)
return out_dict, None, "" return out_dict, None, ""
...@@ -112,5 +114,5 @@ class OcrService(WebService): ...@@ -112,5 +114,5 @@ class OcrService(WebService):
uci_service = OcrService(name="ocr") uci_service = OcrService(name="ocr")
uci_service.prepare_pipeline_config("config.yml") uci_service.prepare_pipeline_config("config2.yml")
uci_service.run_service() uci_service.run_service()
import sys
import os
import yaml
import requests
import time
import json
try:
from paddle_serving_server_gpu.pipeline import PipelineClient
except ImportError:
from paddle_serving_server.pipeline import PipelineClient
import numpy as np
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency
def gen_yml():
fin = open("config.yml", "r")
config = yaml.load(fin, Loader=yaml.SafeLoader)
fin.close()
config["dag"]["tracer"] = {"interval_s": 5}
with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False)
def run_http(idx, batch_size):
print("start thread ({})".format(idx))
url = "http://127.0.0.1:18082/uci/prediction"
start = time.time()
value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"
all_value = ";".join([value for i in range(batch_size)])
data = {"key": ["x"], "value": [all_value]}
for i in range(1000):
r = requests.post(url=url, data=json.dumps(data))
print(r.json())
end = time.time()
return [[end - start]]
def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_http , thread, batch_size)
def run_rpc(thread, batch_size):
client = PipelineClient()
client.connect(['127.0.0.1:9998'])
start = time.time()
value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"
all_value = ";".join([value for i in range(batch_size)])
data = {"key": "x", "value": all_value}
for i in range(1000):
ret = client.predict(feed_dict={data["key"]: data["value"]}, fetch=["res"])
print(ret)
end = time.time()
return [[end - start]]
def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size)
if __name__ == "__main__":
if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3])
gen_yml()
elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3])
batch_size = int(sys.argv[4])
if mode == "http":
multithread_http(thread, batch_size)
elif mode == "rpc":
multithread_rpc(thread, batch_size)
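A single request in the format expected by `run_http()` above, and by `UciOp.preprocess` (which splits batches on `;` and features on `,`), can be issued as follows.

```python
import json
import requests

# Two 13-feature samples joined with ";", features separated by ","
value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"
data = {"key": ["x"], "value": [";".join([value, value])]}
resp = requests.post("http://127.0.0.1:18082/uci/prediction", data=json.dumps(data))
print(resp.json())
```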
# HTTP
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1
for thread_num in 1
do
for batch_size in 1
do
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run http $thread_num $batch_size
python3 cpu_utilization.py
echo "------------Fit a line pipeline benchmark (Thread: $thread_num) (BatchSize: $batch_size)"
tail -n 25 PipelineServingLogs/pipeline.tracer
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
done
done
# RPC
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1
for thread_num in 1
do
for batch_size in 1
do
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run rpc $thread_num $batch_size
python3 cpu_utilization.py
echo "------------Fit a line pipeline benchmark (Thread: $thread_num) (BatchSize: $batch_size)"
tail -n 25 PipelineServingLogs/pipeline.tracer
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
done
done
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try: try:
from paddle_serving_server_gpu.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
except ImportError: except ImportError:
from paddle_serving_server.web_service import WebService, Op from paddle_serving_server.web_service import WebService, Op
import logging import logging
...@@ -25,32 +25,24 @@ _LOGGER = logging.getLogger() ...@@ -25,32 +25,24 @@ _LOGGER = logging.getLogger()
class UciOp(Op): class UciOp(Op):
def init_op(self): def init_op(self):
self.separator = "," self.separator = ","
self.batch_separator = ";"
def preprocess(self, input_dicts, data_id, log_id): def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items() (_, input_dict), = input_dicts.items()
_LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format( _LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format(
log_id, input_dict)) log_id, input_dict))
x_value = input_dict["x"] x_value = input_dict["x"].split(self.batch_separator)
x_lst = []
for x_val in x_value:
x_lst.append(np.array([float(x.strip()) for x in x_val.split(self.separator)]).reshape(1, 13))
input_dict["x"] = np.concatenate(x_lst, axis=0)
proc_dict = {} proc_dict = {}
if sys.version_info.major == 2:
if isinstance(x_value, (str, unicode)):
input_dict["x"] = np.array(
[float(x.strip())
for x in x_value.split(self.separator)]).reshape(1, 13)
_LOGGER.error("input_dict:{}".format(input_dict))
else:
if isinstance(x_value, str):
input_dict["x"] = np.array(
[float(x.strip())
for x in x_value.split(self.separator)]).reshape(1, 13)
_LOGGER.error("input_dict:{}".format(input_dict))
return input_dict, False, None, "" return input_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, log_id): def postprocess(self, input_dicts, fetch_dict, log_id):
_LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format( _LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format(
log_id, fetch_dict)) log_id, fetch_dict))
fetch_dict["price"] = str(fetch_dict["price"][0][0]) fetch_dict["price"] = str(fetch_dict["price"])
return fetch_dict, None, "" return fetch_dict, None, ""
...@@ -61,5 +53,5 @@ class UciService(WebService): ...@@ -61,5 +53,5 @@ class UciService(WebService):
uci_service = UciService(name="uci") uci_service = UciService(name="uci")
uci_service.prepare_pipeline_config("config.yml") uci_service.prepare_pipeline_config("config2.yml")
uci_service.run_service() uci_service.run_service()
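What the rewritten `UciOp.preprocess` produces for a `;`-batched `x` string can be checked in isolation:

```python
import numpy as np

# Split batches on ";", features on ",", reshape each sample to (1, 13) and stack
x_value = "1,2,3,4,5,6,7,8,9,10,11,12,13;13,12,11,10,9,8,7,6,5,4,3,2,1".split(";")
x_lst = [np.array([float(v.strip()) for v in s.split(",")]).reshape(1, 13) for s in x_value]
x = np.concatenate(x_lst, axis=0)
print(x.shape)  # (2, 13) -- one row per ";"-separated sample
```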
...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
``` ```
### Client Prediction ### Client Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393 python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
``` ```
### Client Prediction ### Client Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf unet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf unet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model unet_model --gpu_ids 0 --port 9494 python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494
``` ```
### Client Prediction ### Client Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf unet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf unet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model unet_model --gpu_ids 0 --port 9494 python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494
``` ```
### Client Prediction ### Client Prediction
......
...@@ -15,7 +15,7 @@ sh get_data.sh ...@@ -15,7 +15,7 @@ sh get_data.sh
### Start server ### Start server
```shell ```shell
python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim
``` ```
### Client prediction ### Client prediction
......
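A hedged sketch of the client prediction step for the Lite/XPU service above (feed name `x`, fetch name `price`, as used elsewhere in this diff; the client-config path is an assumption):

```python
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")  # assumed path
client.connect(["127.0.0.1:9393"])

# One 13-feature sample with an explicit batch dimension
x = np.array([[0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
               -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]], dtype="float32")
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=True)
print(fetch_map)
```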
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from paddle_serving_server_gpu.web_service import WebService from paddle_serving_server.web_service import WebService
import numpy as np import numpy as np
......
...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
``` ```
### Client Prediction ### Client Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
### Start Service ### Start Service
``` ```
python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
``` ```
### Client Prediction ### Client Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
## Start RPC Service ## Start RPC Service
``` ```
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0
``` ```
## Prediction ## Prediction
......
...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz ...@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
## Start RPC Service ## Start RPC Service
``` ```
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0
``` ```
## Prediction ## Prediction
......
...@@ -34,10 +34,16 @@ def update_info(file_name, feature, info): ...@@ -34,10 +34,16 @@ def update_info(file_name, feature, info):
f.write(new_str) f.write(new_str)
if len(sys.argv) > 2: if len(sys.argv) > 2 and len(sys.argv[2]) > 0:
update_info("paddle_serving_server_gpu/version.py", "cuda_version", update_info("paddle_serving_server/version.py", "version_suffix",
sys.argv[2]) sys.argv[2])
package_name = '${SERVER_PACKAGE_NAME}'
if package_name.endswith('gpu'):
update_info("paddle_serving_server/version.py", "device_type", "1")
elif package_name.endswith('xpu'):
update_info("paddle_serving_server/version.py", "device_type", "2")
path = "paddle_serving_" + sys.argv[1] path = "paddle_serving_" + sys.argv[1]
commit_id = subprocess.check_output(['git', 'rev-parse', 'HEAD']) commit_id = subprocess.check_output(['git', 'rev-parse', 'HEAD'])
update_info(path + "/version.py", "commit_id", commit_id) update_info(path + "/version.py", "commit_id", commit_id)
...@@ -82,7 +82,10 @@ class LocalPredictor(object): ...@@ -82,7 +82,10 @@ class LocalPredictor(object):
f = open(client_config, 'r') f = open(client_config, 'r')
model_conf = google.protobuf.text_format.Merge( model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf) str(f.read()), model_conf)
config = AnalysisConfig(model_path) if os.path.exists(os.path.join(model_path, "__params__")):
config = AnalysisConfig(os.path.join(model_path, "__model__"), os.path.join(model_path, "__params__"))
else:
config = AnalysisConfig(model_path)
logger.info("load_model_config params: model_path:{}, use_gpu:{},\ logger.info("load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\ gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format( use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format(
......
...@@ -13,703 +13,9 @@ ...@@ -13,703 +13,9 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
import paddle_serving_client from . import version
import os
from .proto import sdk_configure_pb2 as sdk
from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format
import numpy as np
import requests
import json
import base64
import time
import sys
import grpc from . import client
from .proto import multi_lang_general_model_service_pb2 from .client import *
sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
int64_type = 0 __version__ = version.serving_client_version
float32_type = 1
int32_type = 2
int_type = set([int64_type, int32_type])
float_type = set([float32_type])
class _NOPProfiler(object):
def record(self, name):
pass
def print_profile(self):
pass
class _TimeProfiler(object):
def __init__(self):
self.pid = os.getpid()
self.print_head = 'PROFILE\tpid:{}\t'.format(self.pid)
self.time_record = [self.print_head]
def record(self, name):
self.time_record.append('{}:{} '.format(
name, int(round(time.time() * 1000000))))
def print_profile(self):
self.time_record.append('\n')
sys.stderr.write(''.join(self.time_record))
self.time_record = [self.print_head]
_is_profile = int(os.environ.get('FLAGS_profile_client', 0))
_Profiler = _TimeProfiler if _is_profile else _NOPProfiler
class SDKConfig(object):
def __init__(self):
self.sdk_desc = sdk.SDKConf()
self.tag_list = []
self.cluster_list = []
self.variant_weight_list = []
self.rpc_timeout_ms = 20000
self.load_balance_strategy = "la"
def add_server_variant(self, tag, cluster, variant_weight):
self.tag_list.append(tag)
self.cluster_list.append(cluster)
self.variant_weight_list.append(variant_weight)
def set_load_banlance_strategy(self, strategy):
self.load_balance_strategy = strategy
def gen_desc(self, rpc_timeout_ms):
predictor_desc = sdk.Predictor()
predictor_desc.name = "general_model"
predictor_desc.service_name = \
"baidu.paddle_serving.predictor.general_model.GeneralModelService"
predictor_desc.endpoint_router = "WeightedRandomRender"
predictor_desc.weighted_random_render_conf.variant_weight_list = "|".join(
self.variant_weight_list)
for idx, tag in enumerate(self.tag_list):
variant_desc = sdk.VariantConf()
variant_desc.tag = tag
variant_desc.naming_conf.cluster = "list://{}".format(",".join(
self.cluster_list[idx]))
predictor_desc.variants.extend([variant_desc])
self.sdk_desc.predictors.extend([predictor_desc])
self.sdk_desc.default_variant_conf.tag = "default"
self.sdk_desc.default_variant_conf.connection_conf.connect_timeout_ms = 2000
self.sdk_desc.default_variant_conf.connection_conf.rpc_timeout_ms = rpc_timeout_ms
self.sdk_desc.default_variant_conf.connection_conf.connect_retry_count = 2
self.sdk_desc.default_variant_conf.connection_conf.max_connection_per_host = 100
self.sdk_desc.default_variant_conf.connection_conf.hedge_request_timeout_ms = -1
self.sdk_desc.default_variant_conf.connection_conf.hedge_fetch_retry_count = 2
self.sdk_desc.default_variant_conf.connection_conf.connection_type = "pooled"
self.sdk_desc.default_variant_conf.naming_conf.cluster_filter_strategy = "Default"
self.sdk_desc.default_variant_conf.naming_conf.load_balance_strategy = "la"
self.sdk_desc.default_variant_conf.rpc_parameter.compress_type = 0
self.sdk_desc.default_variant_conf.rpc_parameter.package_size = 20
self.sdk_desc.default_variant_conf.rpc_parameter.protocol = "baidu_std"
self.sdk_desc.default_variant_conf.rpc_parameter.max_channel_per_request = 3
return self.sdk_desc
class Client(object):
def __init__(self):
self.feed_names_ = []
self.fetch_names_ = []
self.client_handle_ = None
self.feed_shapes_ = {}
self.feed_types_ = {}
self.feed_names_to_idx_ = {}
self.pid = os.getpid()
self.predictor_sdk_ = None
self.producers = []
self.consumer = None
self.profile_ = _Profiler()
self.all_numpy_input = True
self.has_numpy_input = False
self.rpc_timeout_ms = 20000
from .serving_client import PredictorRes
self.predictorres_constructor = PredictorRes
def load_client_config(self, path):
from .serving_client import PredictorClient
model_conf = m_config.GeneralModelConfig()
f = open(path, 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
# load configuraion here
# get feed vars, fetch vars
# get feed shapes, feed types
# map feed names to index
self.client_handle_ = PredictorClient()
self.client_handle_.init(path)
if "FLAGS_max_body_size" not in os.environ:
os.environ["FLAGS_max_body_size"] = str(512 * 1024 * 1024)
read_env_flags = ["profile_client", "profile_server", "max_body_size"]
self.client_handle_.init_gflags([sys.argv[
0]] + ["--tryfromenv=" + ",".join(read_env_flags)])
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.feed_names_to_idx_ = {}
self.fetch_names_to_type_ = {}
self.fetch_names_to_idx_ = {}
self.lod_tensor_set = set()
self.feed_tensor_len = {}
self.key = None
for i, var in enumerate(model_conf.feed_var):
self.feed_names_to_idx_[var.alias_name] = i
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set.add(var.alias_name)
else:
counter = 1
for dim in self.feed_shapes_[var.alias_name]:
counter *= dim
self.feed_tensor_len[var.alias_name] = counter
for i, var in enumerate(model_conf.fetch_var):
self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set.add(var.alias_name)
return
def add_variant(self, tag, cluster, variant_weight):
if self.predictor_sdk_ is None:
self.predictor_sdk_ = SDKConfig()
self.predictor_sdk_.add_server_variant(tag, cluster,
str(variant_weight))
def set_rpc_timeout_ms(self, rpc_timeout):
if not isinstance(rpc_timeout, int):
raise ValueError("rpc_timeout must be int type.")
else:
self.rpc_timeout_ms = rpc_timeout
def use_key(self, key_filename):
with open(key_filename, "rb") as f:
self.key = f.read()
def get_serving_port(self, endpoints):
if self.key is not None:
req = json.dumps({"key": base64.b64encode(self.key).decode()})
else:
req = json.dumps({})
r = requests.post("http://" + endpoints[0], req)
result = r.json()
print(result)
if "endpoint_list" not in result:
raise ValueError("server not ready")
else:
endpoints = [
endpoints[0].split(":")[0] + ":" +
str(result["endpoint_list"][0])
]
return endpoints
def connect(self, endpoints=None, encryption=False):
# check whether current endpoint is available
# init from client config
# create predictor here
if endpoints is None:
if self.predictor_sdk_ is None:
raise ValueError(
"You must set the endpoints parameter or use add_variant function to create a variant."
)
else:
if encryption:
endpoints = self.get_serving_port(endpoints)
if self.predictor_sdk_ is None:
self.add_variant('default_tag_{}'.format(id(self)), endpoints,
100)
else:
print(
"parameter endpoints({}) will not take effect, because you use the add_variant function.".
format(endpoints))
sdk_desc = self.predictor_sdk_.gen_desc(self.rpc_timeout_ms)
self.client_handle_.create_predictor_by_desc(sdk_desc.SerializeToString(
))
def get_feed_names(self):
return self.feed_names_
def get_fetch_names(self):
return self.fetch_names_
def shape_check(self, feed, key):
if key in self.lod_tensor_set:
return
if isinstance(feed[key],
list) and len(feed[key]) != self.feed_tensor_len[key]:
raise ValueError("The shape of feed tensor {} not match.".format(
key))
if type(feed[key]).__module__ == np.__name__ and np.size(feed[
key]) != self.feed_tensor_len[key]:
#raise SystemExit("The shape of feed tensor {} not match.".format(
# key))
pass
def predict(self,
feed=None,
fetch=None,
batch=False,
need_variant_tag=False,
log_id=0):
self.profile_.record('py_prepro_0')
if feed is None or fetch is None:
raise ValueError("You should specify feed and fetch for prediction")
fetch_list = []
if isinstance(fetch, str):
fetch_list = [fetch]
elif isinstance(fetch, list):
fetch_list = fetch
else:
raise ValueError("Fetch only accepts string and list of string")
feed_batch = []
if isinstance(feed, dict):
feed_batch.append(feed)
elif isinstance(feed, list):
feed_batch = feed
else:
raise ValueError("Feed only accepts dict and list of dict")
int_slot_batch = []
float_slot_batch = []
int_feed_names = []
float_feed_names = []
int_shape = []
int_lod_slot_batch = []
float_lod_slot_batch = []
float_shape = []
fetch_names = []
counter = 0
batch_size = len(feed_batch)
for key in fetch_list:
if key in self.fetch_names_:
fetch_names.append(key)
if len(fetch_names) == 0:
raise ValueError(
"Fetch names should not be empty or out of saved fetch list.")
return {}
for i, feed_i in enumerate(feed_batch):
int_slot = []
float_slot = []
int_lod_slot = []
float_lod_slot = []
for key in feed_i:
if ".lod" not in key and key not in self.feed_names_:
raise ValueError("Wrong feed name: {}.".format(key))
if ".lod" in key:
continue
#if not isinstance(feed_i[key], np.ndarray):
self.shape_check(feed_i, key)
if self.feed_types_[key] in int_type:
if i == 0:
int_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = feed_i[key][np.newaxis, :]
if isinstance(feed_i[key], np.ndarray):
shape_lst.extend(list(feed_i[key].shape))
int_shape.append(shape_lst)
else:
int_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
int_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
int_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray):
int_slot.append(feed_i[key])
self.has_numpy_input = True
else:
int_slot.append(feed_i[key])
self.all_numpy_input = False
elif self.feed_types_[key] in float_type:
if i == 0:
float_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = feed_i[key][np.newaxis, :]
if isinstance(feed_i[key], np.ndarray):
shape_lst.extend(list(feed_i[key].shape))
float_shape.append(shape_lst)
else:
float_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
float_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
float_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray):
float_slot.append(feed_i[key])
self.has_numpy_input = True
else:
float_slot.append(feed_i[key])
self.all_numpy_input = False
int_slot_batch.append(int_slot)
float_slot_batch.append(float_slot)
int_lod_slot_batch.append(int_lod_slot)
float_lod_slot_batch.append(float_lod_slot)
self.profile_.record('py_prepro_1')
self.profile_.record('py_client_infer_0')
result_batch_handle = self.predictorres_constructor()
if self.all_numpy_input:
res = self.client_handle_.numpy_predict(
float_slot_batch, float_feed_names, float_shape,
float_lod_slot_batch, int_slot_batch, int_feed_names, int_shape,
int_lod_slot_batch, fetch_names, result_batch_handle, self.pid,
log_id)
elif self.has_numpy_input == False:
raise ValueError(
"Please make sure all of your inputs are numpy array")
else:
raise ValueError(
"Please make sure the inputs are all in list type or all in numpy.array type"
)
self.profile_.record('py_client_infer_1')
self.profile_.record('py_postpro_0')
if res == -1:
return None
multi_result_map = []
model_engine_names = result_batch_handle.get_engine_names()
for mi, engine_name in enumerate(model_engine_names):
result_map = {}
# result map needs to be a numpy array
for i, name in enumerate(fetch_names):
if self.fetch_names_to_type_[name] == int64_type:
# result_map[name] will be py::array(numpy array)
result_map[name] = result_batch_handle.get_int64_by_name(
mi, name)
shape = result_batch_handle.get_shape(mi, name)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == float32_type:
result_map[name] = result_batch_handle.get_float_by_name(
mi, name)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == int32_type:
# result_map[name] will be py::array(numpy array)
result_map[name] = result_batch_handle.get_int32_by_name(
mi, name)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
multi_result_map.append(result_map)
ret = None
if len(model_engine_names) == 1:
# If only one model result is returned, the format of ret is result_map
ret = multi_result_map[0]
else:
# If multiple model results are returned, the format of ret is {name: result_map}
ret = {
engine_name: multi_result_map[mi]
for mi, engine_name in enumerate(model_engine_names)
}
self.profile_.record('py_postpro_1')
self.profile_.print_profile()
# When using the A/B test, the tag of variant needs to be returned
return ret if not need_variant_tag else [
ret, result_batch_handle.variant_tag()
]
def release(self):
self.client_handle_.destroy_predictor()
self.client_handle_ = None
class MultiLangClient(object):
def __init__(self):
self.channel_ = None
self.stub_ = None
self.rpc_timeout_s_ = 2
self.profile_ = _Profiler()
def add_variant(self, tag, cluster, variant_weight):
# TODO
raise Exception("cannot support ABtest yet")
def set_rpc_timeout_ms(self, rpc_timeout):
if self.stub_ is None:
raise Exception("set timeout must be set after connect.")
if not isinstance(rpc_timeout, int):
# for bclient
raise ValueError("rpc_timeout must be int type.")
self.rpc_timeout_s_ = rpc_timeout / 1000.0
timeout_req = multi_lang_general_model_service_pb2.SetTimeoutRequest()
timeout_req.timeout_ms = rpc_timeout
resp = self.stub_.SetTimeout(timeout_req)
return resp.err_code == 0
def connect(self, endpoints):
# https://github.com/tensorflow/serving/issues/1382
options = [('grpc.max_receive_message_length', 512 * 1024 * 1024),
('grpc.max_send_message_length', 512 * 1024 * 1024),
('grpc.lb_policy_name', 'round_robin')]
# TODO: weight round robin
g_endpoint = 'ipv4:{}'.format(','.join(endpoints))
self.channel_ = grpc.insecure_channel(g_endpoint, options=options)
self.stub_ = multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelServiceStub(
self.channel_)
# get client model config
get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest(
)
resp = self.stub_.GetClientConfig(get_client_config_req)
model_config_str = resp.client_config_str
self._parse_model_config(model_config_str)
def _flatten_list(self, nested_list):
for item in nested_list:
if isinstance(item, (list, tuple)):
for sub_item in self._flatten_list(item):
yield sub_item
else:
yield item
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
self.lod_tensor_set_ = set()
for i, var in enumerate(model_conf.feed_var):
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
else:
counter = 1
for dim in self.feed_shapes_[var.alias_name]:
counter *= dim
for i, var in enumerate(model_conf.fetch_var):
self.fetch_types_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _pack_inference_request(self, feed, fetch, is_python, log_id):
req = multi_lang_general_model_service_pb2.InferenceRequest()
req.fetch_var_names.extend(fetch)
req.is_python = is_python
req.log_id = log_id
feed_var_names = []
for key in feed.keys():
if '.lod' not in key:
feed_var_names.append(key)
req.feed_var_names.extend(feed_var_names)
inst = multi_lang_general_model_service_pb2.FeedInst()
for name in req.feed_var_names:
tensor = multi_lang_general_model_service_pb2.Tensor()
var = feed[name]
v_type = self.feed_types_[name]
if is_python:
data = None
if isinstance(var, list):
if v_type == 0: # int64
data = np.array(var, dtype="int64")
elif v_type == 1: # float32
data = np.array(var, dtype="float32")
elif v_type == 2: # int32
data = np.array(var, dtype="int32")
else:
raise Exception("error tensor value type.")
elif isinstance(var, np.ndarray):
data = var
if v_type == 0:
if data.dtype != 'int64':
data = data.astype("int64")
elif v_type == 1:
if data.dtype != 'float32':
data = data.astype("float32")
elif v_type == 2:
if data.dtype != 'int32':
data = data.astype("int32")
else:
raise Exception("error tensor value type.")
else:
raise Exception("var must be list or ndarray.")
tensor.data = data.tobytes()
tensor.shape.extend(list(var.shape))
if "{}.lod".format(name) in feed.keys():
tensor.lod.extend(feed["{}.lod".format(name)])
inst.tensor_array.append(tensor)
req.insts.append(inst)
return req
def _unpack_inference_response(self, resp, fetch, is_python,
need_variant_tag):
if resp.err_code != 0:
return None
tag = resp.tag
multi_result_map = {}
for model_result in resp.outputs:
inst = model_result.insts[0]
result_map = {}
for i, name in enumerate(fetch):
var = inst.tensor_array[i]
v_type = self.fetch_types_[name]
                if is_python:
                    if v_type == 0:  # int64
                        result_map[name] = np.frombuffer(
                            var.data, dtype="int64")
                    elif v_type == 1:  # float32
                        result_map[name] = np.frombuffer(
                            var.data, dtype="float32")
                    elif v_type == 2:  # int32
                        result_map[name] = np.frombuffer(
                            var.data, dtype="int32")
                    else:
                        raise Exception("error type.")
                else:
                    if v_type == 0:  # int64
                        result_map[name] = np.array(
                            list(var.int64_data), dtype="int64")
                    elif v_type == 1:  # float32
                        result_map[name] = np.array(
                            list(var.float_data), dtype="float32")
                    elif v_type == 2:  # int32
                        result_map[name] = np.array(
                            list(var.int_data), dtype="int32")
                    else:
                        raise Exception("error type.")
result_map[name].shape = list(var.shape)
if name in self.lod_tensor_set_:
result_map["{}.lod".format(name)] = np.array(list(var.lod))
multi_result_map[model_result.engine_name] = result_map
ret = None
if len(resp.outputs) == 1:
ret = list(multi_result_map.values())[0]
else:
ret = multi_result_map
ret["serving_status_code"] = 0
return ret if not need_variant_tag else [ret, tag]
def _done_callback_func(self, fetch, is_python, need_variant_tag):
def unpack_resp(resp):
return self._unpack_inference_response(resp, fetch, is_python,
need_variant_tag)
return unpack_resp
def get_feed_names(self):
return self.feed_names_
def predict(self,
feed,
fetch,
batch=True,
need_variant_tag=False,
asyn=False,
is_python=True,
log_id=0):
if isinstance(feed, dict) is False:
raise ValueError("Type Error. grpc feed must be dict.")
if batch is False:
for key in feed:
if ".lod" not in key:
feed[key] = feed[key][np.newaxis, :]
if not asyn:
try:
self.profile_.record('py_prepro_0')
req = self._pack_inference_request(
feed, fetch, is_python=is_python, log_id=log_id)
self.profile_.record('py_prepro_1')
self.profile_.record('py_client_infer_0')
resp = self.stub_.Inference(req, timeout=self.rpc_timeout_s_)
self.profile_.record('py_client_infer_1')
self.profile_.record('py_postpro_0')
ret = self._unpack_inference_response(
resp,
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag)
self.profile_.record('py_postpro_1')
self.profile_.print_profile()
return ret
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
else:
req = self._pack_inference_request(
feed, fetch, is_python=is_python, log_id=log_id)
call_future = self.stub_.Inference.future(
req, timeout=self.rpc_timeout_s_)
return MultiLangPredictFuture(
call_future,
self._done_callback_func(
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag))
class MultiLangPredictFuture(object):
def __init__(self, call_future, callback_func):
self.call_future_ = call_future
self.callback_func_ = callback_func
def result(self):
try:
resp = self.call_future_.result()
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
return self.callback_func_(resp)
def add_done_callback(self, fn):
def __fn__(call_future):
assert call_future == self.call_future_
fn(self)
self.call_future_.add_done_callback(__fn__)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import paddle_serving_client
import os
from .proto import sdk_configure_pb2 as sdk
from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format
import numpy as np
import requests
import json
import base64
import time
import sys
import grpc
from .proto import multi_lang_general_model_service_pb2
sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
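# Tensor type codes used throughout this module. They mirror the integer
# feed_type / fetch_type values stored in the general model config:
# 0 -> int64, 1 -> float32, 2 -> int32.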
int64_type = 0
float32_type = 1
int32_type = 2
int_type = set([int64_type, int32_type])
float_type = set([float32_type])
class _NOPProfiler(object):
def record(self, name):
pass
def print_profile(self):
pass
class _TimeProfiler(object):
def __init__(self):
self.pid = os.getpid()
self.print_head = 'PROFILE\tpid:{}\t'.format(self.pid)
self.time_record = [self.print_head]
def record(self, name):
self.time_record.append('{}:{} '.format(
name, int(round(time.time() * 1000000))))
def print_profile(self):
self.time_record.append('\n')
sys.stderr.write(''.join(self.time_record))
self.time_record = [self.print_head]
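# Client-side profiling is opt-in: set FLAGS_profile_client=1 in the
# environment to record timestamps; otherwise the no-op profiler is used.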
_is_profile = int(os.environ.get('FLAGS_profile_client', 0))
_Profiler = _TimeProfiler if _is_profile else _NOPProfiler
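# SDKConfig builds the brpc SDK configuration (sdk_configure proto):
# add_server_variant() registers weighted variants (tag, cluster, weight),
# and gen_desc() serializes them, together with connection and RPC defaults,
# into the predictor description consumed by the C++ client.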
class SDKConfig(object):
def __init__(self):
self.sdk_desc = sdk.SDKConf()
self.tag_list = []
self.cluster_list = []
self.variant_weight_list = []
self.rpc_timeout_ms = 20000
self.load_balance_strategy = "la"
def add_server_variant(self, tag, cluster, variant_weight):
self.tag_list.append(tag)
self.cluster_list.append(cluster)
self.variant_weight_list.append(variant_weight)
def set_load_banlance_strategy(self, strategy):
self.load_balance_strategy = strategy
def gen_desc(self, rpc_timeout_ms):
predictor_desc = sdk.Predictor()
predictor_desc.name = "general_model"
predictor_desc.service_name = \
"baidu.paddle_serving.predictor.general_model.GeneralModelService"
predictor_desc.endpoint_router = "WeightedRandomRender"
predictor_desc.weighted_random_render_conf.variant_weight_list = "|".join(
self.variant_weight_list)
for idx, tag in enumerate(self.tag_list):
variant_desc = sdk.VariantConf()
variant_desc.tag = tag
variant_desc.naming_conf.cluster = "list://{}".format(",".join(
self.cluster_list[idx]))
predictor_desc.variants.extend([variant_desc])
self.sdk_desc.predictors.extend([predictor_desc])
self.sdk_desc.default_variant_conf.tag = "default"
self.sdk_desc.default_variant_conf.connection_conf.connect_timeout_ms = 2000
self.sdk_desc.default_variant_conf.connection_conf.rpc_timeout_ms = rpc_timeout_ms
self.sdk_desc.default_variant_conf.connection_conf.connect_retry_count = 2
self.sdk_desc.default_variant_conf.connection_conf.max_connection_per_host = 100
self.sdk_desc.default_variant_conf.connection_conf.hedge_request_timeout_ms = -1
self.sdk_desc.default_variant_conf.connection_conf.hedge_fetch_retry_count = 2
self.sdk_desc.default_variant_conf.connection_conf.connection_type = "pooled"
self.sdk_desc.default_variant_conf.naming_conf.cluster_filter_strategy = "Default"
self.sdk_desc.default_variant_conf.naming_conf.load_balance_strategy = "la"
self.sdk_desc.default_variant_conf.rpc_parameter.compress_type = 0
self.sdk_desc.default_variant_conf.rpc_parameter.package_size = 20
self.sdk_desc.default_variant_conf.rpc_parameter.protocol = "baidu_std"
self.sdk_desc.default_variant_conf.rpc_parameter.max_channel_per_request = 3
return self.sdk_desc
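# Client is the brpc client wrapper: it drives the C++ PredictorClient
# binding (serving_client), loads the client-side model config, and feeds
# numpy inputs to the serving endpoint through numpy_predict().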
class Client(object):
def __init__(self):
self.feed_names_ = []
self.fetch_names_ = []
self.client_handle_ = None
self.feed_shapes_ = {}
self.feed_types_ = {}
self.feed_names_to_idx_ = {}
self.pid = os.getpid()
self.predictor_sdk_ = None
self.producers = []
self.consumer = None
self.profile_ = _Profiler()
self.all_numpy_input = True
self.has_numpy_input = False
self.rpc_timeout_ms = 20000
from .serving_client import PredictorRes
self.predictorres_constructor = PredictorRes
def load_client_config(self, path):
from .serving_client import PredictorClient
model_conf = m_config.GeneralModelConfig()
        with open(path, 'r') as f:
            model_conf = google.protobuf.text_format.Merge(
                str(f.read()), model_conf)
        # load configuration here
# get feed vars, fetch vars
# get feed shapes, feed types
# map feed names to index
self.client_handle_ = PredictorClient()
self.client_handle_.init(path)
if "FLAGS_max_body_size" not in os.environ:
os.environ["FLAGS_max_body_size"] = str(512 * 1024 * 1024)
read_env_flags = ["profile_client", "profile_server", "max_body_size"]
self.client_handle_.init_gflags([sys.argv[
0]] + ["--tryfromenv=" + ",".join(read_env_flags)])
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.feed_names_to_idx_ = {}
self.fetch_names_to_type_ = {}
self.fetch_names_to_idx_ = {}
self.lod_tensor_set = set()
self.feed_tensor_len = {}
self.key = None
for i, var in enumerate(model_conf.feed_var):
self.feed_names_to_idx_[var.alias_name] = i
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set.add(var.alias_name)
else:
counter = 1
for dim in self.feed_shapes_[var.alias_name]:
counter *= dim
self.feed_tensor_len[var.alias_name] = counter
for i, var in enumerate(model_conf.fetch_var):
self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set.add(var.alias_name)
return
def add_variant(self, tag, cluster, variant_weight):
if self.predictor_sdk_ is None:
self.predictor_sdk_ = SDKConfig()
self.predictor_sdk_.add_server_variant(tag, cluster,
str(variant_weight))
def set_rpc_timeout_ms(self, rpc_timeout):
if not isinstance(rpc_timeout, int):
raise ValueError("rpc_timeout must be int type.")
else:
self.rpc_timeout_ms = rpc_timeout
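    # Encrypted-model flow: use_key() reads the local key file and
    # get_serving_port() posts it (base64-encoded) to the management endpoint
    # to look up the real serving port before connect() is called.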
def use_key(self, key_filename):
with open(key_filename, "rb") as f:
self.key = f.read()
def get_serving_port(self, endpoints):
if self.key is not None:
req = json.dumps({"key": base64.b64encode(self.key).decode()})
else:
req = json.dumps({})
r = requests.post("http://" + endpoints[0], req)
result = r.json()
print(result)
if "endpoint_list" not in result:
raise ValueError("server not ready")
else:
endpoints = [
endpoints[0].split(":")[0] + ":" +
str(result["endpoint_list"][0])
]
return endpoints
def connect(self, endpoints=None, encryption=False):
# check whether current endpoint is available
# init from client config
# create predictor here
if endpoints is None:
if self.predictor_sdk_ is None:
raise ValueError(
"You must set the endpoints parameter or use add_variant function to create a variant."
)
else:
if encryption:
endpoints = self.get_serving_port(endpoints)
if self.predictor_sdk_ is None:
self.add_variant('default_tag_{}'.format(id(self)), endpoints,
100)
else:
print(
"parameter endpoints({}) will not take effect, because you use the add_variant function.".
format(endpoints))
sdk_desc = self.predictor_sdk_.gen_desc(self.rpc_timeout_ms)
self.client_handle_.create_predictor_by_desc(sdk_desc.SerializeToString(
))
def get_feed_names(self):
return self.feed_names_
def get_fetch_names(self):
return self.fetch_names_
def shape_check(self, feed, key):
if key in self.lod_tensor_set:
return
if isinstance(feed[key],
list) and len(feed[key]) != self.feed_tensor_len[key]:
raise ValueError("The shape of feed tensor {} not match.".format(
key))
if type(feed[key]).__module__ == np.__name__ and np.size(feed[
key]) != self.feed_tensor_len[key]:
#raise SystemExit("The shape of feed tensor {} not match.".format(
# key))
pass
def predict(self,
feed=None,
fetch=None,
batch=False,
need_variant_tag=False,
log_id=0):
self.profile_.record('py_prepro_0')
if feed is None or fetch is None:
raise ValueError("You should specify feed and fetch for prediction")
fetch_list = []
if isinstance(fetch, str):
fetch_list = [fetch]
elif isinstance(fetch, list):
fetch_list = fetch
else:
raise ValueError("Fetch only accepts string and list of string")
feed_batch = []
if isinstance(feed, dict):
feed_batch.append(feed)
elif isinstance(feed, list):
feed_batch = feed
else:
raise ValueError("Feed only accepts dict and list of dict")
int_slot_batch = []
float_slot_batch = []
int_feed_names = []
float_feed_names = []
int_shape = []
int_lod_slot_batch = []
float_lod_slot_batch = []
float_shape = []
fetch_names = []
counter = 0
batch_size = len(feed_batch)
for key in fetch_list:
if key in self.fetch_names_:
fetch_names.append(key)
        if len(fetch_names) == 0:
            raise ValueError(
                "Fetch names should not be empty or out of the saved fetch list.")
for i, feed_i in enumerate(feed_batch):
int_slot = []
float_slot = []
int_lod_slot = []
float_lod_slot = []
for key in feed_i:
if ".lod" not in key and key not in self.feed_names_:
raise ValueError("Wrong feed name: {}.".format(key))
if ".lod" in key:
continue
#if not isinstance(feed_i[key], np.ndarray):
self.shape_check(feed_i, key)
if self.feed_types_[key] in int_type:
if i == 0:
int_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = feed_i[key][np.newaxis, :]
if isinstance(feed_i[key], np.ndarray):
shape_lst.extend(list(feed_i[key].shape))
int_shape.append(shape_lst)
else:
int_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
int_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
int_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray):
int_slot.append(feed_i[key])
self.has_numpy_input = True
else:
int_slot.append(feed_i[key])
self.all_numpy_input = False
elif self.feed_types_[key] in float_type:
if i == 0:
float_feed_names.append(key)
shape_lst = []
if batch == False:
feed_i[key] = feed_i[key][np.newaxis, :]
if isinstance(feed_i[key], np.ndarray):
shape_lst.extend(list(feed_i[key].shape))
float_shape.append(shape_lst)
else:
float_shape.append(self.feed_shapes_[key])
if "{}.lod".format(key) in feed_i:
float_lod_slot_batch.append(feed_i["{}.lod".format(
key)])
else:
float_lod_slot_batch.append([])
if isinstance(feed_i[key], np.ndarray):
float_slot.append(feed_i[key])
self.has_numpy_input = True
else:
float_slot.append(feed_i[key])
self.all_numpy_input = False
int_slot_batch.append(int_slot)
float_slot_batch.append(float_slot)
int_lod_slot_batch.append(int_lod_slot)
float_lod_slot_batch.append(float_lod_slot)
self.profile_.record('py_prepro_1')
self.profile_.record('py_client_infer_0')
result_batch_handle = self.predictorres_constructor()
if self.all_numpy_input:
res = self.client_handle_.numpy_predict(
float_slot_batch, float_feed_names, float_shape,
float_lod_slot_batch, int_slot_batch, int_feed_names, int_shape,
int_lod_slot_batch, fetch_names, result_batch_handle, self.pid,
log_id)
        elif self.has_numpy_input == False:
            raise ValueError(
                "Please make sure all of your inputs are numpy arrays.")
        else:
            raise ValueError(
                "Please make sure the inputs are either all lists or all numpy arrays."
            )
self.profile_.record('py_client_infer_1')
self.profile_.record('py_postpro_0')
if res == -1:
return None
multi_result_map = []
model_engine_names = result_batch_handle.get_engine_names()
for mi, engine_name in enumerate(model_engine_names):
result_map = {}
# result map needs to be a numpy array
for i, name in enumerate(fetch_names):
if self.fetch_names_to_type_[name] == int64_type:
# result_map[name] will be py::array(numpy array)
result_map[name] = result_batch_handle.get_int64_by_name(
mi, name)
shape = result_batch_handle.get_shape(mi, name)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == float32_type:
result_map[name] = result_batch_handle.get_float_by_name(
mi, name)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
elif self.fetch_names_to_type_[name] == int32_type:
# result_map[name] will be py::array(numpy array)
result_map[name] = result_batch_handle.get_int32_by_name(
mi, name)
if result_map[name].size == 0:
raise ValueError(
"Failed to fetch, maybe the type of [{}]"
" is wrong, please check the model file".format(
name))
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
tmp_lod = result_batch_handle.get_lod(mi, name)
if np.size(tmp_lod) > 0:
result_map["{}.lod".format(name)] = tmp_lod
multi_result_map.append(result_map)
ret = None
if len(model_engine_names) == 1:
# If only one model result is returned, the format of ret is result_map
ret = multi_result_map[0]
else:
# If multiple model results are returned, the format of ret is {name: result_map}
ret = {
engine_name: multi_result_map[mi]
for mi, engine_name in enumerate(model_engine_names)
}
self.profile_.record('py_postpro_1')
self.profile_.print_profile()
# When using the A/B test, the tag of variant needs to be returned
return ret if not need_variant_tag else [
ret, result_batch_handle.variant_tag()
]
def release(self):
self.client_handle_.destroy_predictor()
self.client_handle_ = None
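# Minimal usage sketch for the brpc Client above (illustrative only: the
# config path, endpoint and feed/fetch names are placeholders):
#
#   client = Client()
#   client.load_client_config("serving_client_conf.prototxt")
#   client.connect(["127.0.0.1:9292"])
#   fetch_map = client.predict(
#       feed={"x": np.ones((1, 13), dtype="float32")}, fetch=["price"],
#       batch=True)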
class MultiLangClient(object):
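    # gRPC-based client: connect() retrieves the client-side model config
    # from a MultiLangServer, and predict() supports synchronous calls as
    # well as asynchronous ones (asyn=True) that return a MultiLangPredictFuture.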
def __init__(self):
self.channel_ = None
self.stub_ = None
self.rpc_timeout_s_ = 2
self.profile_ = _Profiler()
def add_variant(self, tag, cluster, variant_weight):
# TODO
raise Exception("cannot support ABtest yet")
def set_rpc_timeout_ms(self, rpc_timeout):
if self.stub_ is None:
raise Exception("set timeout must be set after connect.")
if not isinstance(rpc_timeout, int):
# for bclient
raise ValueError("rpc_timeout must be int type.")
self.rpc_timeout_s_ = rpc_timeout / 1000.0
timeout_req = multi_lang_general_model_service_pb2.SetTimeoutRequest()
timeout_req.timeout_ms = rpc_timeout
resp = self.stub_.SetTimeout(timeout_req)
return resp.err_code == 0
def connect(self, endpoints):
# https://github.com/tensorflow/serving/issues/1382
options = [('grpc.max_receive_message_length', 512 * 1024 * 1024),
('grpc.max_send_message_length', 512 * 1024 * 1024),
('grpc.lb_policy_name', 'round_robin')]
# TODO: weight round robin
g_endpoint = 'ipv4:{}'.format(','.join(endpoints))
self.channel_ = grpc.insecure_channel(g_endpoint, options=options)
self.stub_ = multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelServiceStub(
self.channel_)
# get client model config
get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest(
)
resp = self.stub_.GetClientConfig(get_client_config_req)
model_config_str = resp.client_config_str
self._parse_model_config(model_config_str)
def _flatten_list(self, nested_list):
for item in nested_list:
if isinstance(item, (list, tuple)):
for sub_item in self._flatten_list(item):
yield sub_item
else:
yield item
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
self.lod_tensor_set_ = set()
for i, var in enumerate(model_conf.feed_var):
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
else:
counter = 1
for dim in self.feed_shapes_[var.alias_name]:
counter *= dim
for i, var in enumerate(model_conf.fetch_var):
self.fetch_types_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _pack_inference_request(self, feed, fetch, is_python, log_id):
req = multi_lang_general_model_service_pb2.InferenceRequest()
req.fetch_var_names.extend(fetch)
req.is_python = is_python
req.log_id = log_id
feed_var_names = []
for key in feed.keys():
if '.lod' not in key:
feed_var_names.append(key)
req.feed_var_names.extend(feed_var_names)
inst = multi_lang_general_model_service_pb2.FeedInst()
for name in req.feed_var_names:
tensor = multi_lang_general_model_service_pb2.Tensor()
var = feed[name]
v_type = self.feed_types_[name]
if is_python:
data = None
if isinstance(var, list):
if v_type == 0: # int64
data = np.array(var, dtype="int64")
elif v_type == 1: # float32
data = np.array(var, dtype="float32")
elif v_type == 2: # int32
data = np.array(var, dtype="int32")
else:
raise Exception("error tensor value type.")
elif isinstance(var, np.ndarray):
data = var
if v_type == 0:
if data.dtype != 'int64':
data = data.astype("int64")
elif v_type == 1:
if data.dtype != 'float32':
data = data.astype("float32")
elif v_type == 2:
if data.dtype != 'int32':
data = data.astype("int32")
else:
raise Exception("error tensor value type.")
else:
raise Exception("var must be list or ndarray.")
tensor.data = data.tobytes()
tensor.shape.extend(list(var.shape))
if "{}.lod".format(name) in feed.keys():
tensor.lod.extend(feed["{}.lod".format(name)])
inst.tensor_array.append(tensor)
req.insts.append(inst)
return req
def _unpack_inference_response(self, resp, fetch, is_python,
need_variant_tag):
if resp.err_code != 0:
return None
tag = resp.tag
multi_result_map = {}
for model_result in resp.outputs:
inst = model_result.insts[0]
result_map = {}
for i, name in enumerate(fetch):
var = inst.tensor_array[i]
v_type = self.fetch_types_[name]
                if is_python:
                    if v_type == 0:  # int64
                        result_map[name] = np.frombuffer(
                            var.data, dtype="int64")
                    elif v_type == 1:  # float32
                        result_map[name] = np.frombuffer(
                            var.data, dtype="float32")
                    elif v_type == 2:  # int32
                        result_map[name] = np.frombuffer(
                            var.data, dtype="int32")
                    else:
                        raise Exception("error type.")
                else:
                    if v_type == 0:  # int64
                        result_map[name] = np.array(
                            list(var.int64_data), dtype="int64")
                    elif v_type == 1:  # float32
                        result_map[name] = np.array(
                            list(var.float_data), dtype="float32")
                    elif v_type == 2:  # int32
                        result_map[name] = np.array(
                            list(var.int_data), dtype="int32")
                    else:
                        raise Exception("error type.")
result_map[name].shape = list(var.shape)
if name in self.lod_tensor_set_:
result_map["{}.lod".format(name)] = np.array(list(var.lod))
multi_result_map[model_result.engine_name] = result_map
ret = None
if len(resp.outputs) == 1:
ret = list(multi_result_map.values())[0]
else:
ret = multi_result_map
ret["serving_status_code"] = 0
return ret if not need_variant_tag else [ret, tag]
def _done_callback_func(self, fetch, is_python, need_variant_tag):
def unpack_resp(resp):
return self._unpack_inference_response(resp, fetch, is_python,
need_variant_tag)
return unpack_resp
def get_feed_names(self):
return self.feed_names_
def predict(self,
feed,
fetch,
batch=True,
need_variant_tag=False,
asyn=False,
is_python=True,
log_id=0):
if isinstance(feed, dict) is False:
raise ValueError("Type Error. grpc feed must be dict.")
if batch is False:
for key in feed:
if ".lod" not in key:
feed[key] = feed[key][np.newaxis, :]
if not asyn:
try:
self.profile_.record('py_prepro_0')
req = self._pack_inference_request(
feed, fetch, is_python=is_python, log_id=log_id)
self.profile_.record('py_prepro_1')
self.profile_.record('py_client_infer_0')
resp = self.stub_.Inference(req, timeout=self.rpc_timeout_s_)
self.profile_.record('py_client_infer_1')
self.profile_.record('py_postpro_0')
ret = self._unpack_inference_response(
resp,
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag)
self.profile_.record('py_postpro_1')
self.profile_.print_profile()
return ret
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
else:
req = self._pack_inference_request(
feed, fetch, is_python=is_python, log_id=log_id)
call_future = self.stub_.Inference.future(
req, timeout=self.rpc_timeout_s_)
return MultiLangPredictFuture(
call_future,
self._done_callback_func(
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag))
class MultiLangPredictFuture(object):
def __init__(self, call_future, callback_func):
self.call_future_ = call_future
self.callback_func_ = callback_func
def result(self):
try:
resp = self.call_future_.result()
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
return self.callback_func_(resp)
def add_done_callback(self, fn):
def __fn__(call_future):
assert call_future == self.call_future_
fn(self)
self.call_future_.add_done_callback(__fn__)
...@@ -13,737 +13,22 @@ ...@@ -13,737 +13,22 @@
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
import os from . import monitor
from .proto import server_configure_pb2 as server_sdk from . import rpc_service
from .proto import general_model_config_pb2 as m_config from . import serve
import google.protobuf.text_format from . import version
import tarfile
import socket
import paddle_serving_server as paddle_serving_server
from .version import serving_server_version
from contextlib import closing
import collections
import shutil
import numpy as np
import grpc
from .proto import multi_lang_general_model_service_pb2
import sys
if sys.platform.startswith('win') is False:
import fcntl
sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
from multiprocessing import Pool, Process
from concurrent import futures
__all__ = ["version", "server", "serve", "monitor", "rpc_service", "dag"]
class OpMaker(object): from paddle_serving_server import (
def __init__(self): version,
self.op_dict = { server,
"general_infer": "GeneralInferOp", serve,
"general_reader": "GeneralReaderOp", monitor,
"general_response": "GeneralResponseOp", rpc_service,
"general_text_reader": "GeneralTextReaderOp", dag, )
"general_text_response": "GeneralTextResponseOp",
"general_single_kv": "GeneralSingleKVOp",
"general_dist_kv_infer": "GeneralDistKVInferOp",
"general_dist_kv_quant_infer": "GeneralDistKVQuantInferOp",
"general_copy": "GeneralCopyOp"
}
self.node_name_suffix_ = collections.defaultdict(int)
def create(self, node_type, engine_name=None, inputs=[], outputs=[]): from .dag import *
if node_type not in self.op_dict: from .server import *
raise Exception("Op type {} is not supported right now".format(
node_type))
node = server_sdk.DAGNode()
# node.name will be used as the infer engine name
if engine_name:
node.name = engine_name
else:
node.name = '{}_{}'.format(node_type,
self.node_name_suffix_[node_type])
self.node_name_suffix_[node_type] += 1
node.type = self.op_dict[node_type] __version__ = version.serving_server_version
if inputs:
for dep_node_str in inputs:
dep_node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(dep_node_str, dep_node)
dep = server_sdk.DAGNodeDependency()
dep.name = dep_node.name
dep.mode = "RO"
node.dependencies.extend([dep])
# Because the return value will be used as the key value of the
# dict, and the proto object is variable which cannot be hashed,
# so it is processed into a string. This has little effect on
# overall efficiency.
return google.protobuf.text_format.MessageToString(node)
class OpSeqMaker(object):
def __init__(self):
self.workflow = server_sdk.Workflow()
self.workflow.name = "workflow1"
self.workflow.workflow_type = "Sequence"
def add_op(self, node_str):
node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(node_str, node)
if len(node.dependencies) > 1:
raise Exception(
'Set more than one predecessor for op in OpSeqMaker is not allowed.'
)
if len(self.workflow.nodes) >= 1:
if len(node.dependencies) == 0:
dep = server_sdk.DAGNodeDependency()
dep.name = self.workflow.nodes[-1].name
dep.mode = "RO"
node.dependencies.extend([dep])
elif len(node.dependencies) == 1:
if node.dependencies[0].name != self.workflow.nodes[-1].name:
raise Exception(
'You must add op in order in OpSeqMaker. The previous op is {}, but the current op is followed by {}.'
.format(node.dependencies[0].name, self.workflow.nodes[
-1].name))
self.workflow.nodes.extend([node])
def get_op_sequence(self):
workflow_conf = server_sdk.WorkflowConf()
workflow_conf.workflows.extend([self.workflow])
return workflow_conf
class OpGraphMaker(object):
def __init__(self):
self.workflow = server_sdk.Workflow()
self.workflow.name = "workflow1"
# Currently, SDK only supports "Sequence"
self.workflow.workflow_type = "Sequence"
def add_op(self, node_str):
node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(node_str, node)
self.workflow.nodes.extend([node])
def get_op_graph(self):
workflow_conf = server_sdk.WorkflowConf()
workflow_conf.workflows.extend([self.workflow])
return workflow_conf
class Server(object):
def __init__(self):
self.server_handle_ = None
self.infer_service_conf = None
self.model_toolkit_conf = None
self.resource_conf = None
self.memory_optimization = False
self.ir_optimization = False
self.model_conf = None
self.workflow_fn = "workflow.prototxt"
self.resource_fn = "resource.prototxt"
self.infer_service_fn = "infer_service.prototxt"
self.model_toolkit_fn = "model_toolkit.prototxt"
self.general_model_config_fn = "general_model.prototxt"
self.cube_config_fn = "cube.conf"
self.workdir = ""
self.max_concurrency = 0
self.num_threads = 4
self.port = 8080
self.reload_interval_s = 10
self.max_body_size = 64 * 1024 * 1024
self.module_path = os.path.dirname(paddle_serving_server.__file__)
self.cur_path = os.getcwd()
self.use_local_bin = False
self.mkl_flag = False
self.encryption_model = False
self.product_name = None
self.container_id = None
self.model_config_paths = None # for multi-model in a workflow
def get_fetch_list(self):
fetch_names = [var.alias_name for var in self.model_conf.fetch_var]
return fetch_names
def set_max_concurrency(self, concurrency):
self.max_concurrency = concurrency
def set_num_threads(self, threads):
self.num_threads = threads
def set_max_body_size(self, body_size):
if body_size >= self.max_body_size:
self.max_body_size = body_size
else:
print(
"max_body_size is less than default value, will use default value in service."
)
def set_port(self, port):
self.port = port
def set_reload_interval(self, interval):
self.reload_interval_s = interval
def set_op_sequence(self, op_seq):
self.workflow_conf = op_seq
def set_op_graph(self, op_graph):
self.workflow_conf = op_graph
def set_memory_optimize(self, flag=False):
self.memory_optimization = flag
def set_ir_optimize(self, flag=False):
self.ir_optimization = flag
def use_encryption_model(self, flag=False):
self.encryption_model = flag
def set_product_name(self, product_name=None):
if product_name == None:
raise ValueError("product_name can't be None.")
self.product_name = product_name
def set_container_id(self, container_id):
if container_id == None:
raise ValueError("container_id can't be None.")
self.container_id = container_id
def check_local_bin(self):
if "SERVING_BIN" in os.environ:
self.use_local_bin = True
self.bin_path = os.environ["SERVING_BIN"]
def _prepare_engine(self, model_config_paths, device):
if self.model_toolkit_conf == None:
self.model_toolkit_conf = server_sdk.ModelToolkitConf()
for engine_name, model_config_path in model_config_paths.items():
engine = server_sdk.EngineDesc()
engine.name = engine_name
engine.reloadable_meta = model_config_path + "/fluid_time_file"
os.system("touch {}".format(engine.reloadable_meta))
engine.reloadable_type = "timestamp_ne"
engine.runtime_thread_num = 0
engine.batch_infer_size = 0
engine.enable_batch_align = 0
engine.model_data_path = model_config_path
engine.enable_memory_optimization = self.memory_optimization
engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False
engine.force_update_static_cache = False
if os.path.exists('{}/__params__'.format(model_config_path)):
suffix = ""
else:
suffix = "_DIR"
if device == "cpu":
if self.encryption_model:
engine.type = "FLUID_CPU_ANALYSIS_ENCRYPT"
else:
engine.type = "FLUID_CPU_ANALYSIS" + suffix
elif device == "gpu":
if self.encryption_model:
engine.type = "FLUID_GPU_ANALYSIS_ENCRYPT"
else:
engine.type = "FLUID_GPU_ANALYSIS" + suffix
self.model_toolkit_conf.engines.extend([engine])
def _prepare_infer_service(self, port):
if self.infer_service_conf == None:
self.infer_service_conf = server_sdk.InferServiceConf()
self.infer_service_conf.port = port
infer_service = server_sdk.InferService()
infer_service.name = "GeneralModelService"
infer_service.workflows.extend(["workflow1"])
self.infer_service_conf.services.extend([infer_service])
def _prepare_resource(self, workdir, cube_conf):
self.workdir = workdir
if self.resource_conf == None:
with open("{}/{}".format(workdir, self.general_model_config_fn),
"w") as fout:
fout.write(str(self.model_conf))
self.resource_conf = server_sdk.ResourceConf()
for workflow in self.workflow_conf.workflows:
for node in workflow.nodes:
if "dist_kv" in node.name:
self.resource_conf.cube_config_path = workdir
self.resource_conf.cube_config_file = self.cube_config_fn
if cube_conf == None:
raise ValueError(
"Please set the path of cube.conf while use dist_kv op."
)
shutil.copy(cube_conf, workdir)
if "quant" in node.name:
self.resource_conf.cube_quant_bits = 8
self.resource_conf.model_toolkit_path = workdir
self.resource_conf.model_toolkit_file = self.model_toolkit_fn
self.resource_conf.general_model_path = workdir
self.resource_conf.general_model_file = self.general_model_config_fn
if self.product_name != None:
self.resource_conf.auth_product_name = self.product_name
if self.container_id != None:
self.resource_conf.auth_container_id = self.container_id
def _write_pb_str(self, filepath, pb_obj):
with open(filepath, "w") as fout:
fout.write(str(pb_obj))
def load_model_config(self, model_config_paths):
        # At present, Serving needs to configure the model path in the
        # resource.prototxt file to determine the input and output format
        # of the workflow, and to ensure that the input and output of
        # multiple models are the same.
workflow_oi_config_path = None
if isinstance(model_config_paths, str):
# If there is only one model path, use the default infer_op.
# Because there are several infer_op type, we need to find
# it from workflow_conf.
default_engine_names = [
'general_infer_0', 'general_dist_kv_infer_0',
'general_dist_kv_quant_infer_0'
]
engine_name = None
for node in self.workflow_conf.workflows[0].nodes:
if node.name in default_engine_names:
engine_name = node.name
break
if engine_name is None:
raise Exception(
"You have set the engine_name of Op. Please use the form {op: model_path} to configure model path"
)
self.model_config_paths = {engine_name: model_config_paths}
workflow_oi_config_path = self.model_config_paths[engine_name]
elif isinstance(model_config_paths, dict):
self.model_config_paths = {}
for node_str, path in model_config_paths.items():
node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(node_str, node)
self.model_config_paths[node.name] = path
print("You have specified multiple model paths, please ensure "
"that the input and output of multiple models are the same.")
workflow_oi_config_path = list(self.model_config_paths.items())[0][
1]
else:
raise Exception("The type of model_config_paths must be str or "
"dict({op: model_path}), not {}.".format(
type(model_config_paths)))
self.model_conf = m_config.GeneralModelConfig()
f = open(
"{}/serving_server_conf.prototxt".format(workflow_oi_config_path),
'r')
self.model_conf = google.protobuf.text_format.Merge(
str(f.read()), self.model_conf)
# check config here
# print config here
def use_mkl(self, flag):
self.mkl_flag = flag
def get_device_version(self):
avx_flag = False
mkl_flag = self.mkl_flag
openblas_flag = False
r = os.system("cat /proc/cpuinfo | grep avx > /dev/null 2>&1")
if r == 0:
avx_flag = True
if avx_flag:
if mkl_flag:
device_version = "serving-cpu-avx-mkl-"
else:
device_version = "serving-cpu-avx-openblas-"
else:
if mkl_flag:
print(
"Your CPU does not support AVX, server will running with noavx-openblas mode."
)
device_version = "serving-cpu-noavx-openblas-"
return device_version
def download_bin(self):
os.chdir(self.module_path)
need_download = False
device_version = self.get_device_version()
folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz"
bin_url = "https://paddle-serving.bj.bcebos.com/bin/" + tar_name
self.server_path = os.path.join(self.module_path, folder_name)
#acquire lock
version_file = open("{}/version.py".format(self.module_path), "r")
fcntl.flock(version_file, fcntl.LOCK_EX)
if not os.path.exists(self.server_path):
            print('First time run, downloading PaddleServing components ...')
r = os.system('wget ' + bin_url + ' --no-check-certificate')
if r != 0:
if os.path.exists(tar_name):
os.remove(tar_name)
raise SystemExit(
'Download failed, please check your network or permission of {}.'
.format(self.module_path))
else:
try:
print('Decompressing files ..')
tar = tarfile.open(tar_name)
tar.extractall()
tar.close()
except:
                    # remove any partially extracted files before aborting
                    if os.path.exists(self.server_path):
                        shutil.rmtree(self.server_path)
raise SystemExit(
'Decompressing failed, please check your permission of {} or disk space left.'
.format(self.module_path))
finally:
os.remove(tar_name)
#release lock
version_file.close()
os.chdir(self.cur_path)
self.bin_path = self.server_path + "/serving"
def prepare_server(self,
workdir=None,
port=9292,
device="cpu",
cube_conf=None):
if workdir == None:
workdir = "./tmp"
os.system("mkdir {}".format(workdir))
else:
os.system("mkdir {}".format(workdir))
os.system("touch {}/fluid_time_file".format(workdir))
if not self.port_is_available(port):
raise SystemExit("Port {} is already used".format(port))
self.set_port(port)
self._prepare_resource(workdir, cube_conf)
self._prepare_engine(self.model_config_paths, device)
self._prepare_infer_service(port)
self.workdir = workdir
infer_service_fn = "{}/{}".format(workdir, self.infer_service_fn)
workflow_fn = "{}/{}".format(workdir, self.workflow_fn)
resource_fn = "{}/{}".format(workdir, self.resource_fn)
model_toolkit_fn = "{}/{}".format(workdir, self.model_toolkit_fn)
self._write_pb_str(infer_service_fn, self.infer_service_conf)
self._write_pb_str(workflow_fn, self.workflow_conf)
self._write_pb_str(resource_fn, self.resource_conf)
self._write_pb_str(model_toolkit_fn, self.model_toolkit_conf)
def port_is_available(self, port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
return True
else:
return False
def run_server(self):
# just run server with system command
# currently we do not load cube
self.check_local_bin()
if not self.use_local_bin:
self.download_bin()
else:
print("Use local bin : {}".format(self.bin_path))
command = "{} " \
"-enable_model_toolkit " \
"-inferservice_path {} " \
"-inferservice_file {} " \
"-max_concurrency {} " \
"-num_threads {} " \
"-port {} " \
"-reload_interval_s {} " \
"-resource_path {} " \
"-resource_file {} " \
"-workflow_path {} " \
"-workflow_file {} " \
"-bthread_concurrency {} " \
"-max_body_size {} ".format(
self.bin_path,
self.workdir,
self.infer_service_fn,
self.max_concurrency,
self.num_threads,
self.port,
self.reload_interval_s,
self.workdir,
self.resource_fn,
self.workdir,
self.workflow_fn,
self.num_threads,
self.max_body_size)
print("Going to Run Command")
print(command)
os.system(command)
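# Minimal launch sketch using the classes above (model path, workdir and
# port are placeholders):
#
#   op_maker = OpMaker()
#   op_seq_maker = OpSeqMaker()
#   op_seq_maker.add_op(op_maker.create('general_reader'))
#   op_seq_maker.add_op(op_maker.create('general_infer'))
#   op_seq_maker.add_op(op_maker.create('general_response'))
#   server = Server()
#   server.set_op_sequence(op_seq_maker.get_op_sequence())
#   server.load_model_config("serving_server_model")
#   server.prepare_server(workdir="workdir", port=9292, device="cpu")
#   server.run_server()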
class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
MultiLangGeneralModelServiceServicer):
def __init__(self, model_config_path, is_multi_model, endpoints):
self.is_multi_model_ = is_multi_model
self.model_config_path_ = model_config_path
self.endpoints_ = endpoints
with open(self.model_config_path_) as f:
self.model_config_str_ = str(f.read())
self._parse_model_config(self.model_config_str_)
self._init_bclient(self.model_config_path_, self.endpoints_)
def _init_bclient(self, model_config_path, endpoints, timeout_ms=None):
from paddle_serving_client import Client
self.bclient_ = Client()
if timeout_ms is not None:
self.bclient_.set_rpc_timeout_ms(timeout_ms)
self.bclient_.load_client_config(model_config_path)
self.bclient_.connect(endpoints)
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
self.lod_tensor_set_ = set()
for i, var in enumerate(model_conf.feed_var):
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
for i, var in enumerate(model_conf.fetch_var):
self.fetch_types_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _flatten_list(self, nested_list):
for item in nested_list:
if isinstance(item, (list, tuple)):
for sub_item in self._flatten_list(item):
yield sub_item
else:
yield item
def _unpack_inference_request(self, request):
feed_names = list(request.feed_var_names)
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
log_id = request.log_id
feed_batch = []
for feed_inst in request.insts:
feed_dict = {}
for idx, name in enumerate(feed_names):
var = feed_inst.tensor_array[idx]
v_type = self.feed_types_[name]
data = None
if is_python:
if v_type == 0: # int64
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1: # float32
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2: # int32
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
else:
if v_type == 0: # int64
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2: # int32
data = np.array(list(var.int_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
feed_dict[name] = data
if len(var.lod) > 0:
feed_dict["{}.lod".format(name)] = var.lod
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python, log_id
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
if ret is None:
resp.err_code = 1
return resp
results, tag = ret
resp.tag = tag
resp.err_code = 0
if not self.is_multi_model_:
results = {'general_infer_0': results}
for model_name, model_result in results.items():
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = model_result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 1: # float32
tensor.float_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 2: # int32
tensor.int_data.extend(model_result[name].reshape(-1)
.tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape))
if "{}.lod".format(name) in model_result:
tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
model_output.engine_name = model_name
resp.outputs.append(model_output)
return resp
def SetTimeout(self, request, context):
        # This process and the Inference process cannot operate at the same time.
        # For performance reasons, no thread lock is added for now.
timeout_ms = request.timeout_ms
self._init_bclient(self.model_config_path_, self.endpoints_, timeout_ms)
resp = multi_lang_general_model_service_pb2.SimpleResponse()
resp.err_code = 0
return resp
def Inference(self, request, context):
feed_batch, fetch_names, is_python, log_id = \
self._unpack_inference_request(request)
ret = self.bclient_.predict(
feed=feed_batch,
fetch=fetch_names,
batch=True,
need_variant_tag=True,
log_id=log_id)
return self._pack_inference_response(ret, fetch_names, is_python)
def GetClientConfig(self, request, context):
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
resp.client_config_str = self.model_config_str_
return resp
class MultiLangServer(object):
def __init__(self):
self.bserver_ = Server()
self.worker_num_ = 4
self.body_size_ = 64 * 1024 * 1024
self.concurrency_ = 100000
self.is_multi_model_ = False # for model ensemble
def set_max_concurrency(self, concurrency):
self.concurrency_ = concurrency
self.bserver_.set_max_concurrency(concurrency)
def set_num_threads(self, threads):
self.worker_num_ = threads
self.bserver_.set_num_threads(threads)
def set_max_body_size(self, body_size):
self.bserver_.set_max_body_size(body_size)
if body_size >= self.body_size_:
self.body_size_ = body_size
else:
print(
"max_body_size is less than default value, will use default value in service."
)
def use_encryption_model(self, flag=False):
self.encryption_model = flag
def set_port(self, port):
self.gport_ = port
def set_reload_interval(self, interval):
self.bserver_.set_reload_interval(interval)
def set_op_sequence(self, op_seq):
self.bserver_.set_op_sequence(op_seq)
def set_op_graph(self, op_graph):
self.bserver_.set_op_graph(op_graph)
def set_memory_optimize(self, flag=False):
self.bserver_.set_memory_optimize(flag)
def set_ir_optimize(self, flag=False):
self.bserver_.set_ir_optimize(flag)
def set_op_sequence(self, op_seq):
self.bserver_.set_op_sequence(op_seq)
def use_mkl(self, flag):
self.bserver_.use_mkl(flag)
def load_model_config(self, server_config_paths, client_config_path=None):
self.bserver_.load_model_config(server_config_paths)
if client_config_path is None:
if isinstance(server_config_paths, dict):
self.is_multi_model_ = True
client_config_path = '{}/serving_server_conf.prototxt'.format(
list(server_config_paths.items())[0][1])
else:
client_config_path = '{}/serving_server_conf.prototxt'.format(
server_config_paths)
self.bclient_config_path_ = client_config_path
def prepare_server(self,
workdir=None,
port=9292,
device="cpu",
cube_conf=None):
if not self._port_is_available(port):
raise SystemExit("Prot {} is already used".format(port))
default_port = 12000
self.port_list_ = []
for i in range(1000):
if default_port + i != port and self._port_is_available(default_port
+ i):
self.port_list_.append(default_port + i)
break
self.bserver_.prepare_server(
workdir=workdir,
port=self.port_list_[0],
device=device,
cube_conf=cube_conf)
self.set_port(port)
def _launch_brpc_service(self, bserver):
bserver.run_server()
def _port_is_available(self, port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
return result != 0
def run_server(self):
p_bserver = Process(
target=self._launch_brpc_service, args=(self.bserver_, ))
p_bserver.start()
options = [('grpc.max_send_message_length', self.body_size_),
('grpc.max_receive_message_length', self.body_size_)]
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=self.worker_num_),
options=options,
maximum_concurrent_rpcs=self.concurrency_)
multi_lang_general_model_service_pb2_grpc.add_MultiLangGeneralModelServiceServicer_to_server(
MultiLangServerServiceServicer(
self.bclient_config_path_, self.is_multi_model_,
["0.0.0.0:{}".format(self.port_list_[0])]), server)
server.add_insecure_port('[::]:{}'.format(self.gport_))
server.start()
p_bserver.join()
server.wait_for_termination()
from .proto import server_configure_pb2 as server_sdk
import google.protobuf.text_format
import collections
class OpMaker(object):
def __init__(self):
self.op_dict = {
"general_infer": "GeneralInferOp",
"general_reader": "GeneralReaderOp",
"general_response": "GeneralResponseOp",
"general_text_reader": "GeneralTextReaderOp",
"general_text_response": "GeneralTextResponseOp",
"general_single_kv": "GeneralSingleKVOp",
"general_dist_kv_infer": "GeneralDistKVInferOp",
"general_dist_kv": "GeneralDistKVOp"
}
self.node_name_suffix_ = collections.defaultdict(int)
def create(self, node_type, engine_name=None, inputs=[], outputs=[]):
if node_type not in self.op_dict:
raise Exception("Op type {} is not supported right now".format(
node_type))
node = server_sdk.DAGNode()
# node.name will be used as the infer engine name
if engine_name:
node.name = engine_name
else:
node.name = '{}_{}'.format(node_type,
self.node_name_suffix_[node_type])
self.node_name_suffix_[node_type] += 1
node.type = self.op_dict[node_type]
if inputs:
for dep_node_str in inputs:
dep_node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(dep_node_str, dep_node)
dep = server_sdk.DAGNodeDependency()
dep.name = dep_node.name
dep.mode = "RO"
node.dependencies.extend([dep])
        # The return value will be used as a dict key, and the proto object
        # is mutable and cannot be hashed, so it is serialized to a string.
        # This has little effect on overall efficiency.
return google.protobuf.text_format.MessageToString(node)
class OpSeqMaker(object):
def __init__(self):
self.workflow = server_sdk.Workflow()
self.workflow.name = "workflow1"
self.workflow.workflow_type = "Sequence"
def add_op(self, node_str):
node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(node_str, node)
if len(node.dependencies) > 1:
raise Exception(
'Set more than one predecessor for op in OpSeqMaker is not allowed.'
)
if len(self.workflow.nodes) >= 1:
if len(node.dependencies) == 0:
dep = server_sdk.DAGNodeDependency()
dep.name = self.workflow.nodes[-1].name
dep.mode = "RO"
node.dependencies.extend([dep])
elif len(node.dependencies) == 1:
if node.dependencies[0].name != self.workflow.nodes[-1].name:
raise Exception(
'You must add op in order in OpSeqMaker. The previous op is {}, but the current op is followed by {}.'
.format(node.dependencies[0].name, self.workflow.nodes[
-1].name))
self.workflow.nodes.extend([node])
def get_op_sequence(self):
workflow_conf = server_sdk.WorkflowConf()
workflow_conf.workflows.extend([self.workflow])
return workflow_conf
class OpGraphMaker(object):
def __init__(self):
self.workflow = server_sdk.Workflow()
self.workflow.name = "workflow1"
# Currently, SDK only supports "Sequence"
self.workflow.workflow_type = "Sequence"
def add_op(self, node_str):
node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(node_str, node)
self.workflow.nodes.extend([node])
def get_op_graph(self):
workflow_conf = server_sdk.WorkflowConf()
workflow_conf.workflows.extend([self.workflow])
return workflow_conf
...@@ -28,7 +28,6 @@ import logging ...@@ -28,7 +28,6 @@ import logging
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
class Monitor(object): class Monitor(object):
''' '''
Monitor base class. It is used to monitor the remote model, pull and update the local model. Monitor base class. It is used to monitor the remote model, pull and update the local model.
......
import sys
import os
import numpy as np
import google.protobuf.text_format
from .proto import general_model_config_pb2 as m_config
from .proto import multi_lang_general_model_service_pb2
sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
MultiLangGeneralModelServiceServicer):
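    # Bridges the gRPC multi-language service to the local brpc endpoint:
    # each request is unpacked into numpy feed dicts, forwarded through an
    # internal paddle_serving_client.Client, and the results are packed back
    # into an InferenceResponse.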
def __init__(self, model_config_path, is_multi_model, endpoints):
self.is_multi_model_ = is_multi_model
self.model_config_path_ = model_config_path
self.endpoints_ = endpoints
with open(self.model_config_path_) as f:
self.model_config_str_ = str(f.read())
self._parse_model_config(self.model_config_str_)
self._init_bclient(self.model_config_path_, self.endpoints_)
def _init_bclient(self, model_config_path, endpoints, timeout_ms=None):
from paddle_serving_client import Client
self.bclient_ = Client()
if timeout_ms is not None:
self.bclient_.set_rpc_timeout_ms(timeout_ms)
self.bclient_.load_client_config(model_config_path)
self.bclient_.connect(endpoints)
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
self.lod_tensor_set_ = set()
for i, var in enumerate(model_conf.feed_var):
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
for i, var in enumerate(model_conf.fetch_var):
self.fetch_types_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _flatten_list(self, nested_list):
for item in nested_list:
if isinstance(item, (list, tuple)):
for sub_item in self._flatten_list(item):
yield sub_item
else:
yield item
def _unpack_inference_request(self, request):
feed_names = list(request.feed_var_names)
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
log_id = request.log_id
feed_batch = []
for feed_inst in request.insts:
feed_dict = {}
for idx, name in enumerate(feed_names):
var = feed_inst.tensor_array[idx]
v_type = self.feed_types_[name]
data = None
if is_python:
if v_type == 0:
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1:
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2:
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
else:
if v_type == 0: # int64
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2:
data = np.array(list(var.int_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
feed_dict[name] = data
if len(var.lod) > 0:
feed_dict["{}.lod".format(name)] = var.lod
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python, log_id
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
if ret is None:
resp.err_code = 1
return resp
results, tag = ret
resp.tag = tag
resp.err_code = 0
if not self.is_multi_model_:
results = {'general_infer_0': results}
for model_name, model_result in results.items():
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = model_result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 1: # float32
tensor.float_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 2: # int32
tensor.int_data.extend(model_result[name].reshape(-1)
.tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape))
if "{}.lod".format(name) in model_result:
tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
model_output.engine_name = model_name
resp.outputs.append(model_output)
return resp
def SetTimeout(self, request, context):
# This process and the Inference process cannot run at the same time.
# For performance reasons, no thread lock is added for now.
timeout_ms = request.timeout_ms
self._init_bclient(self.model_config_path_, self.endpoints_, timeout_ms)
resp = multi_lang_general_model_service_pb2.SimpleResponse()
resp.err_code = 0
return resp
def Inference(self, request, context):
feed_batch, fetch_names, is_python, log_id \
= self._unpack_inference_request(request)
ret = self.bclient_.predict(
feed=feed_batch,
fetch=fetch_names,
batch=True,
need_variant_tag=True,
log_id=log_id)
return self._pack_inference_response(ret, fetch_names, is_python)
def GetClientConfig(self, request, context):
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
resp.client_config_str = self.model_config_str_
return resp
\ No newline at end of file
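# A minimal client-side sketch (not from the original sources) of calling the
# MultiLangGeneralModelService defined above over gRPC. The stub and request
# message names are assumed from standard grpc codegen and the servicer code
# (which builds InferenceResponse); the endpoint and the feed/fetch names "x"
# and "price" are hypothetical:
#
#   import grpc
#   import numpy as np
#   from paddle_serving_server.proto import (
#       multi_lang_general_model_service_pb2 as pb2,
#       multi_lang_general_model_service_pb2_grpc as pb2_grpc)
#
#   channel = grpc.insecure_channel("127.0.0.1:9393")
#   stub = pb2_grpc.MultiLangGeneralModelServiceStub(channel)
#   req = pb2.InferenceRequest()
#   req.feed_var_names.append("x")
#   req.fetch_var_names.append("price")
#   req.is_python = True           # tensors are passed as raw bytes
#   tensor = req.insts.add().tensor_array.add()
#   tensor.data = np.array([0.1] * 13, dtype="float32").tobytes()
#   tensor.shape.extend([1, 13])
#   resp = stub.Inference(req)     # resp.outputs holds one ModelOutput per engine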
...@@ -18,12 +18,11 @@ Usage: ...@@ -18,12 +18,11 @@ Usage:
python -m paddle_serving_server.serve --model ./serving_server_model --port 9292 python -m paddle_serving_server.serve --model ./serving_server_model --port 9292
""" """
import argparse import argparse
import sys import os
import json import json
import base64 import base64
import time import time
from multiprocessing import Process from multiprocessing import Pool, Process
from .web_service import WebService, port_is_available
from flask import Flask, request from flask import Flask, request
import sys import sys
if sys.version_info.major == 2: if sys.version_info.major == 2:
...@@ -32,23 +31,26 @@ elif sys.version_info.major == 3: ...@@ -32,23 +31,26 @@ elif sys.version_info.major == 3:
from http.server import BaseHTTPRequestHandler, HTTPServer from http.server import BaseHTTPRequestHandler, HTTPServer
def parse_args(): # pylint: disable=doc-string-missing def serve_args():
parser = argparse.ArgumentParser("serve") parser = argparse.ArgumentParser("serve")
parser.add_argument( parser.add_argument(
"--thread", type=int, default=10, help="Concurrency of server") "--thread", type=int, default=2, help="Concurrency of server")
parser.add_argument( parser.add_argument(
"--model", type=str, default="", help="Model for serving") "--port", type=int, default=9292, help="Port of the starting gpu")
parser.add_argument( parser.add_argument(
"--port", type=int, default=9292, help="Port the server") "--device", type=str, default="gpu", help="Type of device")
parser.add_argument("--gpu_ids", type=str, default="", help="gpu ids")
parser.add_argument( parser.add_argument(
"--name", type=str, default="None", help="Web service name") "--model", type=str, default="", help="Model for serving")
parser.add_argument( parser.add_argument(
"--workdir", "--workdir",
type=str, type=str,
default="workdir", default="workdir",
help="Working dir of current service") help="Working dir of current service")
parser.add_argument( parser.add_argument(
"--device", type=str, default="cpu", help="Type of device") "--name", type=str, default="None", help="Default service name")
parser.add_argument(
"--use_mkl", default=False, action="store_true", help="Use MKL")
parser.add_argument( parser.add_argument(
"--mem_optim_off", "--mem_optim_off",
default=False, default=False,
...@@ -56,8 +58,6 @@ def parse_args(): # pylint: disable=doc-string-missing ...@@ -56,8 +58,6 @@ def parse_args(): # pylint: disable=doc-string-missing
help="Memory optimize") help="Memory optimize")
parser.add_argument( parser.add_argument(
"--ir_optim", default=False, action="store_true", help="Graph optimize") "--ir_optim", default=False, action="store_true", help="Graph optimize")
parser.add_argument(
"--use_mkl", default=False, action="store_true", help="Use MKL")
parser.add_argument( parser.add_argument(
"--max_body_size", "--max_body_size",
type=int, type=int,
...@@ -73,6 +73,12 @@ def parse_args(): # pylint: disable=doc-string-missing ...@@ -73,6 +73,12 @@ def parse_args(): # pylint: disable=doc-string-missing
default=False, default=False,
action="store_true", action="store_true",
help="Use Multi-language-service") help="Use Multi-language-service")
parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument(
"--use_lite", default=False, action="store_true", help="Use PaddleLite")
parser.add_argument(
"--use_xpu", default=False, action="store_true", help="Use XPU")
parser.add_argument( parser.add_argument(
"--product_name", "--product_name",
type=str, type=str,
...@@ -138,6 +144,116 @@ def start_standard_model(serving_port): # pylint: disable=doc-string-missing ...@@ -138,6 +144,116 @@ def start_standard_model(serving_port): # pylint: disable=doc-string-missing
server.run_server() server.run_server()
def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-string-missing
workdir = args.workdir
gpuid = int(gpuid)
device = "gpu"
if gpuid == -1:
device = "cpu"
elif gpuid >= 0:
port = port + index
thread_num = args.thread
model = args.model
mem_optim = args.mem_optim_off is False
ir_optim = args.ir_optim
use_mkl = args.use_mkl
max_body_size = args.max_body_size
use_multilang = args.use_multilang
if gpuid >= 0:
workdir = "{}_{}".format(args.workdir, gpuid)
if model == "":
print("You must specify your serving model")
exit(-1)
import paddle_serving_server as serving
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = serving.OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
if use_multilang:
server = serving.MultiLangServer()
else:
server = serving.Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.use_mkl(use_mkl)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.set_max_body_size(max_body_size)
if args.use_trt:
server.set_trt()
if args.use_lite:
server.set_lite()
server.set_device(device)
if args.use_xpu:
server.set_xpu()
if args.product_name is not None:
server.set_product_name(args.product_name)
if args.container_id is not None:
server.set_container_id(args.container_id)
server.load_model_config(model)
server.prepare_server(
workdir=workdir,
port=port,
device=device,
use_encryption_model=args.use_encryption_model)
if gpuid >= 0:
server.set_gpuid(gpuid)
server.run_server()
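# Example invocations (illustrative; the model path is hypothetical, the flags
# are the ones defined in serve_args above):
#   CPU:  python -m paddle_serving_server.serve --model uci_housing_model --port 9292
#   GPU:  python -m paddle_serving_server.serve --model uci_housing_model --port 9292 --gpu_ids 0,1
# With --gpu_ids 0,1, start_multi_card below forks one start_gpu_card_model
# worker per card, listening on ports 9292 and 9293 (port + index).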
def start_multi_card(args, serving_port=None): # pylint: disable=doc-string-missing
gpus = ""
if serving_port is None:
serving_port = args.port
if args.gpu_ids == "":
gpus = []
else:
gpus = args.gpu_ids.split(",")
if "CUDA_VISIBLE_DEVICES" in os.environ:
env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
for ids in gpus:
if int(ids) >= len(env_gpus):
print(
" Max index of gpu_ids out of range, the number of CUDA_VISIBLE_DEVICES is {}."
.format(len(env_gpus)))
exit(-1)
else:
env_gpus = []
if args.use_lite:
print("run using paddle-lite.")
start_gpu_card_model(-1, -1, serving_port, args)
elif len(gpus) <= 0:
print("gpu_ids not set, going to run cpu service.")
start_gpu_card_model(-1, -1, serving_port, args)
else:
gpu_processes = []
for i, gpu_id in enumerate(gpus):
p = Process(
target=start_gpu_card_model,
args=(
i,
gpu_id,
serving_port,
args, ))
gpu_processes.append(p)
for p in gpu_processes:
p.start()
for p in gpu_processes:
p.join()
class MainService(BaseHTTPRequestHandler): class MainService(BaseHTTPRequestHandler):
def get_available_port(self): def get_available_port(self):
default_port = 12000 default_port = 12000
...@@ -146,7 +262,7 @@ class MainService(BaseHTTPRequestHandler): ...@@ -146,7 +262,7 @@ class MainService(BaseHTTPRequestHandler):
return default_port + i return default_port + i
def start_serving(self): def start_serving(self):
start_standard_model(serving_port) start_multi_card(args, serving_port)
def get_key(self, post_data): def get_key(self, post_data):
if "key" not in post_data: if "key" not in post_data:
...@@ -207,9 +323,9 @@ class MainService(BaseHTTPRequestHandler): ...@@ -207,9 +323,9 @@ class MainService(BaseHTTPRequestHandler):
if __name__ == "__main__": if __name__ == "__main__":
args = serve_args()
args = parse_args()
if args.name == "None": if args.name == "None":
from .web_service import port_is_available
if args.use_encryption_model: if args.use_encryption_model:
p_flag = False p_flag = False
p = None p = None
...@@ -220,27 +336,39 @@ if __name__ == "__main__": ...@@ -220,27 +336,39 @@ if __name__ == "__main__":
) )
server.serve_forever() server.serve_forever()
else: else:
start_standard_model(args.port) start_multi_card(args)
else: else:
service = WebService(name=args.name) from .web_service import WebService
service.load_model_config(args.model) web_service = WebService(name=args.name)
service.prepare_server( web_service.load_model_config(args.model)
workdir=args.workdir, port=args.port, device=args.device) gpu_ids = args.gpu_ids
service.run_rpc_service() if gpu_ids == "":
if "CUDA_VISIBLE_DEVICES" in os.environ:
gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"]
if len(gpu_ids) > 0:
web_service.set_gpus(gpu_ids)
web_service.prepare_server(
workdir=args.workdir,
port=args.port,
device=args.device,
use_lite=args.use_lite,
use_xpu=args.use_xpu,
ir_optim=args.ir_optim)
web_service.run_rpc_service()
app_instance = Flask(__name__) app_instance = Flask(__name__)
@app_instance.before_first_request @app_instance.before_first_request
def init(): def init():
service._launch_web_service() web_service._launch_web_service()
service_name = "/" + service.name + "/prediction" service_name = "/" + web_service.name + "/prediction"
@app_instance.route(service_name, methods=["POST"]) @app_instance.route(service_name, methods=["POST"])
def run(): def run():
return service.get_prediction(request) return web_service.get_prediction(request)
app_instance.run(host="0.0.0.0", app_instance.run(host="0.0.0.0",
port=service.port, port=web_service.port,
threaded=False, threaded=False,
processes=4) processes=4)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -11,188 +11,32 @@ ...@@ -11,188 +11,32 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# pylint: disable=doc-string-missing
import os import os
import tarfile
import socket
import paddle_serving_server as paddle_serving_server
from .proto import server_configure_pb2 as server_sdk from .proto import server_configure_pb2 as server_sdk
from .proto import general_model_config_pb2 as m_config from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format import google.protobuf.text_format
import tarfile
import socket
import paddle_serving_server_gpu as paddle_serving_server
import time import time
from .version import serving_server_version from .version import serving_server_version, version_suffix, device_type
from contextlib import closing from contextlib import closing
import argparse import argparse
import collections
import sys import sys
if sys.platform.startswith('win') is False: if sys.platform.startswith('win') is False:
import fcntl import fcntl
import shutil import shutil
import platform
import numpy as np import numpy as np
import grpc import grpc
from .proto import multi_lang_general_model_service_pb2
import sys import sys
sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
from multiprocessing import Pool, Process from multiprocessing import Pool, Process
from concurrent import futures from concurrent import futures
def serve_args():
parser = argparse.ArgumentParser("serve")
parser.add_argument(
"--thread", type=int, default=2, help="Concurrency of server")
parser.add_argument(
"--model", type=str, default="", help="Model for serving")
parser.add_argument(
"--port", type=int, default=9292, help="Port of the starting gpu")
parser.add_argument(
"--workdir",
type=str,
default="workdir",
help="Working dir of current service")
parser.add_argument(
"--device", type=str, default="gpu", help="Type of device")
parser.add_argument("--gpu_ids", type=str, default="", help="gpu ids")
parser.add_argument(
"--name", type=str, default="None", help="Default service name")
parser.add_argument(
"--mem_optim_off",
default=False,
action="store_true",
help="Memory optimize")
parser.add_argument(
"--ir_optim", default=False, action="store_true", help="Graph optimize")
parser.add_argument(
"--max_body_size",
type=int,
default=512 * 1024 * 1024,
help="Limit sizes of messages")
parser.add_argument(
"--use_encryption_model",
default=False,
action="store_true",
help="Use encryption model")
parser.add_argument(
"--use_multilang",
default=False,
action="store_true",
help="Use Multi-language-service")
parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument(
"--use_lite", default=False, action="store_true", help="Use PaddleLite")
parser.add_argument(
"--use_xpu", default=False, action="store_true", help="Use XPU")
parser.add_argument(
"--product_name",
type=str,
default=None,
help="product_name for authentication")
parser.add_argument(
"--container_id",
type=str,
default=None,
help="container_id for authentication")
return parser.parse_args()
class OpMaker(object):
def __init__(self):
self.op_dict = {
"general_infer": "GeneralInferOp",
"general_reader": "GeneralReaderOp",
"general_response": "GeneralResponseOp",
"general_text_reader": "GeneralTextReaderOp",
"general_text_response": "GeneralTextResponseOp",
"general_single_kv": "GeneralSingleKVOp",
"general_dist_kv_infer": "GeneralDistKVInferOp",
"general_dist_kv": "GeneralDistKVOp"
}
self.node_name_suffix_ = collections.defaultdict(int)
def create(self, node_type, engine_name=None, inputs=[], outputs=[]):
if node_type not in self.op_dict:
raise Exception("Op type {} is not supported right now".format(
node_type))
node = server_sdk.DAGNode()
# node.name will be used as the infer engine name
if engine_name:
node.name = engine_name
else:
node.name = '{}_{}'.format(node_type,
self.node_name_suffix_[node_type])
self.node_name_suffix_[node_type] += 1
node.type = self.op_dict[node_type]
if inputs:
for dep_node_str in inputs:
dep_node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(dep_node_str, dep_node)
dep = server_sdk.DAGNodeDependency()
dep.name = dep_node.name
dep.mode = "RO"
node.dependencies.extend([dep])
# Because the return value will be used as the key value of the
# dict, and the proto object is variable which cannot be hashed,
# so it is processed into a string. This has little effect on
# overall efficiency.
return google.protobuf.text_format.MessageToString(node)
class OpSeqMaker(object):
def __init__(self):
self.workflow = server_sdk.Workflow()
self.workflow.name = "workflow1"
self.workflow.workflow_type = "Sequence"
def add_op(self, node_str):
node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(node_str, node)
if len(node.dependencies) > 1:
raise Exception(
'Set more than one predecessor for op in OpSeqMaker is not allowed.'
)
if len(self.workflow.nodes) >= 1:
if len(node.dependencies) == 0:
dep = server_sdk.DAGNodeDependency()
dep.name = self.workflow.nodes[-1].name
dep.mode = "RO"
node.dependencies.extend([dep])
elif len(node.dependencies) == 1:
if node.dependencies[0].name != self.workflow.nodes[-1].name:
raise Exception(
'You must add op in order in OpSeqMaker. The previous op is {}, but the current op is followed by {}.'
.format(node.dependencies[0].name, self.workflow.nodes[
-1].name))
self.workflow.nodes.extend([node])
def get_op_sequence(self):
workflow_conf = server_sdk.WorkflowConf()
workflow_conf.workflows.extend([self.workflow])
return workflow_conf
class OpGraphMaker(object):
def __init__(self):
self.workflow = server_sdk.Workflow()
self.workflow.name = "workflow1"
# Currently, SDK only supports "Sequence"
self.workflow.workflow_type = "Sequence"
def add_op(self, node_str):
node = server_sdk.DAGNode()
google.protobuf.text_format.Parse(node_str, node)
self.workflow.nodes.extend([node])
def get_op_graph(self):
workflow_conf = server_sdk.WorkflowConf()
workflow_conf.workflows.extend([self.workflow])
return workflow_conf
class Server(object): class Server(object):
def __init__(self): def __init__(self):
self.server_handle_ = None self.server_handle_ = None
...@@ -217,6 +61,7 @@ class Server(object): ...@@ -217,6 +61,7 @@ class Server(object):
self.module_path = os.path.dirname(paddle_serving_server.__file__) self.module_path = os.path.dirname(paddle_serving_server.__file__)
self.cur_path = os.getcwd() self.cur_path = os.getcwd()
self.use_local_bin = False self.use_local_bin = False
self.mkl_flag = False
self.device = "cpu" self.device = "cpu"
self.gpuid = 0 self.gpuid = 0
self.use_trt = False self.use_trt = False
...@@ -317,31 +162,20 @@ class Server(object): ...@@ -317,31 +162,20 @@ class Server(object):
engine.runtime_thread_num = 0 engine.runtime_thread_num = 0
engine.batch_infer_size = 0 engine.batch_infer_size = 0
engine.enable_batch_align = 0 engine.enable_batch_align = 0
engine.model_data_path = model_config_path engine.model_dir = model_config_path
engine.enable_memory_optimization = self.memory_optimization engine.enable_memory_optimization = self.memory_optimization
engine.enable_ir_optimization = self.ir_optimization engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False
engine.force_update_static_cache = False
engine.use_trt = self.use_trt engine.use_trt = self.use_trt
engine.use_lite = self.use_lite
engine.use_xpu = self.use_xpu
if os.path.exists('{}/__params__'.format(model_config_path)): if os.path.exists('{}/__params__'.format(model_config_path)):
suffix = "" engine.combined_model = True
else: else:
suffix = "_DIR" engine.combined_model = False
if device == "arm": if use_encryption_model:
engine.use_lite = self.use_lite engine.encrypted_model = True
engine.use_xpu = self.use_xpu engine.type = "PADDLE_INFER"
if device == "cpu":
if use_encryption_model:
engine.type = "FLUID_CPU_ANALYSIS_ENCRPT"
else:
engine.type = "FLUID_CPU_ANALYSIS" + suffix
elif device == "gpu":
if use_encryption_model:
engine.type = "FLUID_GPU_ANALYSIS_ENCRPT"
else:
engine.type = "FLUID_GPU_ANALYSIS" + suffix
elif device == "arm":
engine.type = "FLUID_ARM_ANALYSIS" + suffix
self.model_toolkit_conf.engines.extend([engine]) self.model_toolkit_conf.engines.extend([engine])
def _prepare_infer_service(self, port): def _prepare_infer_service(self, port):
...@@ -432,26 +266,53 @@ class Server(object): ...@@ -432,26 +266,53 @@ class Server(object):
# check config here # check config here
# print config here # print config here
def use_mkl(self, flag):
self.mkl_flag = flag
def get_device_version(self):
avx_flag = False
mkl_flag = self.mkl_flag
openblas_flag = False
r = os.system("cat /proc/cpuinfo | grep avx > /dev/null 2>&1")
if r == 0:
avx_flag = True
if avx_flag:
if mkl_flag:
device_version = "cpu-avx-mkl"
else:
device_version = "cpu-avx-openblas"
else:
if mkl_flag:
print(
"Your CPU does not support AVX, server will running with noavx-openblas mode."
)
device_version = "cpu-noavx-openblas"
return device_version
def get_serving_bin_name(self):
if device_type == "0":
device_version = self.get_device_version()
elif device_type == "1":
if version_suffix == "101" or version_suffix == "102":
device_version = "gpu-" + version_suffix
else:
device_version = "gpu-cuda" + version_suffix
elif device_type == "2":
device_version = "xpu-" + platform.machine()
return device_version
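# Illustrative mapping (not part of the original code): on an AVX-capable CPU
# with use_mkl(True) and device_type "0", get_serving_bin_name() returns
# "cpu-avx-mkl", so download_bin() below fetches a tarball named like
# https://paddle-serving.bj.bcebos.com/bin/serving-cpu-avx-mkl-<version>.tar.gz
# With device_type "1" and version_suffix "102", the name becomes "gpu-102".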
def download_bin(self): def download_bin(self):
os.chdir(self.module_path) os.chdir(self.module_path)
need_download = False need_download = False
#acquire lock #acquire lock
version_file = open("{}/version.py".format(self.module_path), "r") version_file = open("{}/version.py".format(self.module_path), "r")
import re
for line in version_file.readlines(): folder_name = "serving-%s-%s" % (self.get_serving_bin_name(),
if re.match("cuda_version", line): serving_server_version)
cuda_version = line.split("\"")[1] tar_name = "%s.tar.gz" % folder_name
if cuda_version == "101" or cuda_version == "102": bin_url = "https://paddle-serving.bj.bcebos.com/bin/%s" % tar_name
device_version = "serving-gpu-" + cuda_version + "-"
elif cuda_version == "arm" or cuda_version == "arm-xpu":
device_version = "serving-" + cuda_version + "-"
else:
device_version = "serving-gpu-cuda" + cuda_version + "-"
folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz"
bin_url = "https://paddle-serving.bj.bcebos.com/bin/" + tar_name
self.server_path = os.path.join(self.module_path, folder_name) self.server_path = os.path.join(self.module_path, folder_name)
download_flag = "{}/{}.is_download".format(self.module_path, download_flag = "{}/{}.is_download".format(self.module_path,
...@@ -503,9 +364,9 @@ class Server(object): ...@@ -503,9 +364,9 @@ class Server(object):
cube_conf=None): cube_conf=None):
if workdir == None: if workdir == None:
workdir = "./tmp" workdir = "./tmp"
os.system("mkdir {}".format(workdir)) os.system("mkdir -p {}".format(workdir))
else: else:
os.system("mkdir {}".format(workdir)) os.system("mkdir -p {}".format(workdir))
os.system("touch {}/fluid_time_file".format(workdir)) os.system("touch {}/fluid_time_file".format(workdir))
if not self.port_is_available(port): if not self.port_is_available(port):
...@@ -614,157 +475,6 @@ class Server(object): ...@@ -614,157 +475,6 @@ class Server(object):
os.system(command) os.system(command)
class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
MultiLangGeneralModelServiceServicer):
def __init__(self, model_config_path, is_multi_model, endpoints):
self.is_multi_model_ = is_multi_model
self.model_config_path_ = model_config_path
self.endpoints_ = endpoints
with open(self.model_config_path_) as f:
self.model_config_str_ = str(f.read())
self._parse_model_config(self.model_config_str_)
self._init_bclient(self.model_config_path_, self.endpoints_)
def _init_bclient(self, model_config_path, endpoints, timeout_ms=None):
from paddle_serving_client import Client
self.bclient_ = Client()
if timeout_ms is not None:
self.bclient_.set_rpc_timeout_ms(timeout_ms)
self.bclient_.load_client_config(model_config_path)
self.bclient_.connect(endpoints)
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
self.fetch_types_ = {}
self.lod_tensor_set_ = set()
for i, var in enumerate(model_conf.feed_var):
self.feed_types_[var.alias_name] = var.feed_type
self.feed_shapes_[var.alias_name] = var.shape
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
for i, var in enumerate(model_conf.fetch_var):
self.fetch_types_[var.alias_name] = var.fetch_type
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _flatten_list(self, nested_list):
for item in nested_list:
if isinstance(item, (list, tuple)):
for sub_item in self._flatten_list(item):
yield sub_item
else:
yield item
def _unpack_inference_request(self, request):
feed_names = list(request.feed_var_names)
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
log_id = request.log_id
feed_batch = []
for feed_inst in request.insts:
feed_dict = {}
for idx, name in enumerate(feed_names):
var = feed_inst.tensor_array[idx]
v_type = self.feed_types_[name]
data = None
if is_python:
if v_type == 0:
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1:
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2:
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
else:
if v_type == 0: # int64
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2:
data = np.array(list(var.int_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
feed_dict[name] = data
if len(var.lod) > 0:
feed_dict["{}.lod".format(name)] = var.lod
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python, log_id
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
if ret is None:
resp.err_code = 1
return resp
results, tag = ret
resp.tag = tag
resp.err_code = 0
if not self.is_multi_model_:
results = {'general_infer_0': results}
for model_name, model_result in results.items():
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = model_result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 1: # float32
tensor.float_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 2: # int32
tensor.int_data.extend(model_result[name].reshape(-1)
.tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape))
if "{}.lod".format(name) in model_result:
tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
model_output.engine_name = model_name
resp.outputs.append(model_output)
return resp
def SetTimeout(self, request, context):
# This process and the Inference process cannot run at the same time.
# For performance reasons, no thread lock is added for now.
timeout_ms = request.timeout_ms
self._init_bclient(self.model_config_path_, self.endpoints_, timeout_ms)
resp = multi_lang_general_model_service_pb2.SimpleResponse()
resp.err_code = 0
return resp
def Inference(self, request, context):
feed_batch, fetch_names, is_python, log_id \
= self._unpack_inference_request(request)
ret = self.bclient_.predict(
feed=feed_batch,
fetch=fetch_names,
batch=True,
need_variant_tag=True,
log_id=log_id)
return self._pack_inference_response(ret, fetch_names, is_python)
def GetClientConfig(self, request, context):
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
resp.client_config_str = self.model_config_str_
return resp
class MultiLangServer(object): class MultiLangServer(object):
def __init__(self): def __init__(self):
self.bserver_ = Server() self.bserver_ = Server()
...@@ -808,6 +518,9 @@ class MultiLangServer(object): ...@@ -808,6 +518,9 @@ class MultiLangServer(object):
def set_op_graph(self, op_graph): def set_op_graph(self, op_graph):
self.bserver_.set_op_graph(op_graph) self.bserver_.set_op_graph(op_graph)
def use_mkl(self, flag):
self.bserver_.use_mkl(flag)
def set_memory_optimize(self, flag=False): def set_memory_optimize(self, flag=False):
self.bserver_.set_memory_optimize(flag) self.bserver_.set_memory_optimize(flag)
......
...@@ -11,8 +11,11 @@ ...@@ -11,8 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Paddle Serving Client version string """ """ Paddle Serving Server version string """
serving_client_version = "0.0.0" serving_client_version = "0.0.0"
serving_server_version = "0.0.0" serving_server_version = "0.0.0"
module_proto_version = "0.0.0" module_proto_version = "0.0.0"
version_suffix = ""
device_type = "0"
cuda_version = "9"
commit_id = "" commit_id = ""
...@@ -15,16 +15,19 @@ ...@@ -15,16 +15,19 @@
# pylint: disable=doc-string-missing # pylint: disable=doc-string-missing
from flask import Flask, request, abort from flask import Flask, request, abort
from multiprocessing import Pool, Process
from paddle_serving_server import OpMaker, OpSeqMaker, Server
from paddle_serving_client import Client
from contextlib import closing from contextlib import closing
from multiprocessing import Pool, Process, Queue
from paddle_serving_client import Client
from paddle_serving_server import OpMaker, OpSeqMaker, Server
from paddle_serving_server.serve import start_multi_card
import socket import socket
import sys
import numpy as np import numpy as np
import paddle_serving_server as serving
from paddle_serving_server import pipeline from paddle_serving_server import pipeline
from paddle_serving_server.pipeline import Op from paddle_serving_server.pipeline import Op
def port_is_available(port): def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2) sock.settimeout(2)
...@@ -34,13 +37,15 @@ def port_is_available(port): ...@@ -34,13 +37,15 @@ def port_is_available(port):
else: else:
return False return False
class WebService(object): class WebService(object):
def __init__(self, name="default_service"): def __init__(self, name="default_service"):
self.name = name self.name = name
# pipeline # pipeline
self._server = pipeline.PipelineServer(self.name) self._server = pipeline.PipelineServer(self.name)
self.gpus = [] # deprecated
self.rpc_service_list = [] # deprecated
def get_pipeline_response(self, read_op): def get_pipeline_response(self, read_op):
return None return None
...@@ -77,58 +82,115 @@ class WebService(object): ...@@ -77,58 +82,115 @@ class WebService(object):
self.feed_vars = {var.name: var for var in model_conf.feed_var} self.feed_vars = {var.name: var for var in model_conf.feed_var}
self.fetch_vars = {var.name: var for var in model_conf.fetch_var} self.fetch_vars = {var.name: var for var in model_conf.fetch_var}
def _launch_rpc_service(self): def set_gpus(self, gpus):
op_maker = OpMaker() print("This API will be deprecated later. Please do not use it")
self.gpus = [int(x) for x in gpus.split(",")]
def default_rpc_service(self,
workdir="conf",
port=9292,
gpuid=0,
thread_num=2,
mem_optim=True,
use_lite=False,
use_xpu=False,
ir_optim=False):
device = "gpu"
if gpuid == -1:
if use_lite:
device = "arm"
else:
device = "cpu"
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader') read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer') general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response') general_response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker() op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op) op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op) op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op) op_seq_maker.add_op(general_response_op)
server = Server() server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(16) server.set_num_threads(thread_num)
server.set_memory_optimize(self.mem_optim) server.set_memory_optimize(mem_optim)
server.set_ir_optimize(self.ir_optim) server.set_ir_optimize(ir_optim)
server.set_device(device)
if use_lite:
server.set_lite()
if use_xpu:
server.set_xpu()
server.load_model_config(self.model_config) server.load_model_config(self.model_config)
server.prepare_server( if gpuid >= 0:
workdir=self.workdir, port=self.port_list[0], device=self.device) server.set_gpuid(gpuid)
server.run_server() server.prepare_server(workdir=workdir, port=port, device=device)
return server
def port_is_available(self, port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: def _launch_rpc_service(self, service_idx):
sock.settimeout(2) self.rpc_service_list[service_idx].run_server()
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
return True
else:
return False
def prepare_server(self, def prepare_server(self,
workdir="", workdir="",
port=9393, port=9393,
device="cpu", device="gpu",
mem_optim=True, use_lite=False,
ir_optim=False): use_xpu=False,
ir_optim=False,
gpuid=0,
mem_optim=True):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
self.workdir = workdir self.workdir = workdir
self.port = port self.port = port
self.device = device self.device = device
default_port = 12000 self.gpuid = gpuid
self.port_list = [] self.port_list = []
self.mem_optim = mem_optim default_port = 12000
self.ir_optim = ir_optim
for i in range(1000): for i in range(1000):
if port_is_available(default_port + i): if port_is_available(default_port + i):
self.port_list.append(default_port + i) self.port_list.append(default_port + i)
if len(self.port_list) > len(self.gpus):
break break
if len(self.gpus) == 0:
# init cpu service
self.rpc_service_list.append(
self.default_rpc_service(
self.workdir,
self.port_list[0],
-1,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
else:
for i, gpuid in enumerate(self.gpus):
self.rpc_service_list.append(
self.default_rpc_service(
"{}_{}".format(self.workdir, i),
self.port_list[i],
gpuid,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
def _launch_web_service(self): def _launch_web_service(self):
gpu_num = len(self.gpus)
self.client = Client() self.client = Client()
self.client.load_client_config("{}/serving_server_conf.prototxt".format( self.client.load_client_config("{}/serving_server_conf.prototxt".format(
self.model_config)) self.model_config))
self.client.connect(["0.0.0.0:{}".format(self.port_list[0])]) endpoints = ""
if gpu_num > 0:
for i in range(gpu_num):
endpoints += "127.0.0.1:{},".format(self.port_list[i])
else:
endpoints = "127.0.0.1:{}".format(self.port_list[0])
self.client.connect([endpoints])
def get_prediction(self, request): def get_prediction(self, request):
if not request.json: if not request.json:
...@@ -158,8 +220,12 @@ class WebService(object): ...@@ -158,8 +220,12 @@ class WebService(object):
print("web service address:") print("web service address:")
print("http://{}:{}/{}/prediction".format(localIP, self.port, print("http://{}:{}/{}/prediction".format(localIP, self.port,
self.name)) self.name))
p_rpc = Process(target=self._launch_rpc_service) server_pros = []
p_rpc.start() for i, service in enumerate(self.rpc_service_list):
p = Process(target=self._launch_rpc_service, args=(i, ))
server_pros.append(p)
for p in server_pros:
p.start()
app_instance = Flask(__name__) app_instance = Flask(__name__)
...@@ -175,7 +241,9 @@ class WebService(object): ...@@ -175,7 +241,9 @@ class WebService(object):
self.app_instance = app_instance self.app_instance = app_instance
def run_debugger_service(self): # TODO: maybe rename this API to run_local_predictor?
def run_debugger_service(self, gpu=False):
print("This API will be deprecated later. Please do not use it")
import socket import socket
localIP = socket.gethostbyname(socket.gethostname()) localIP = socket.gethostbyname(socket.gethostname())
print("web service address:") print("web service address:")
...@@ -185,7 +253,7 @@ class WebService(object): ...@@ -185,7 +253,7 @@ class WebService(object):
@app_instance.before_first_request @app_instance.before_first_request
def init(): def init():
self._launch_local_predictor() self._launch_local_predictor(gpu)
service_name = "/" + self.name + "/prediction" service_name = "/" + self.name + "/prediction"
...@@ -195,11 +263,11 @@ class WebService(object): ...@@ -195,11 +263,11 @@ class WebService(object):
self.app_instance = app_instance self.app_instance = app_instance
def _launch_local_predictor(self): def _launch_local_predictor(self, gpu):
from paddle_serving_app.local_predict import LocalPredictor from paddle_serving_app.local_predict import LocalPredictor
self.client = LocalPredictor() self.client = LocalPredictor()
self.client.load_model_config( self.client.load_model_config(
"{}".format(self.model_config), use_gpu=False) "{}".format(self.model_config), use_gpu=True, gpu_id=self.gpus[0])
def run_web_service(self): def run_web_service(self):
print("This API will be deprecated later. Please do not use it") print("This API will be deprecated later. Please do not use it")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
Start monitor with one line command
Example:
python -m paddle_serving_server.monitor
"""
import os
import time
import argparse
import subprocess
import datetime
import shutil
import tarfile
import logging
_LOGGER = logging.getLogger(__name__)
class Monitor(object):
'''
Monitor base class. It is used to monitor the remote model, pull and update the local model.
'''
def __init__(self, interval):
self._remote_path = None
self._remote_model_name = None
self._remote_donefile_name = None
self._local_path = None
self._local_model_name = None
self._local_timestamp_file = None
self._interval = interval
self._remote_donefile_timestamp = None
self._local_tmp_path = None
self._unpacked_filename = None
def set_remote_path(self, remote_path):
self._remote_path = remote_path
def set_remote_model_name(self, model_name):
self._remote_model_name = model_name
def set_remote_donefile_name(self, donefile_name):
self._remote_donefile_name = donefile_name
def set_local_path(self, local_path):
self._local_path = local_path
def set_local_model_name(self, model_name):
self._local_model_name = model_name
def set_local_timestamp_file(self, timestamp_file):
self._local_timestamp_file = timestamp_file
def set_local_tmp_path(self, tmp_path):
self._local_tmp_path = tmp_path
def set_unpacked_filename(self, unpacked_filename):
self._unpacked_filename = unpacked_filename
def _check_param_help(self, param_name, param_value):
return "Please check the {}({}) parameter.".format(param_name,
param_value)
def _check_params(self, params):
for param in params:
if getattr(self, param, None) is None:
raise Exception('{} not set.'.format(param))
def _print_params(self, params_name):
self._check_params(params_name)
for name in params_name:
_LOGGER.info('{}: {}'.format(name, getattr(self, name)))
def _decompress_model_file(self, local_tmp_path, model_name,
unpacked_filename):
if unpacked_filename is None:
_LOGGER.debug('remote file({}) is already unpacked.'.format(
model_name))
return model_name
tar_model_path = os.path.join(local_tmp_path, model_name)
_LOGGER.info("try to unpack remote file({})".format(tar_model_path))
if not tarfile.is_tarfile(tar_model_path):
raise Exception('not a tar packaged file type. {}'.format(
self._check_param_help('remote_model_name', model_name)))
try:
_LOGGER.info('unpack remote file({}).'.format(model_name))
tar = tarfile.open(tar_model_path)
tar.extractall(local_tmp_path)
tar.close()
except Exception:
raise Exception(
'Decompressing failed, maybe no disk space left. {}'.format(
self._check_param_help('local_tmp_path', local_tmp_path)))
finally:
os.remove(tar_model_path)
_LOGGER.debug('remove packed file({}).'.format(tar_model_path))
_LOGGER.info('using unpacked filename: {}.'.format(
unpacked_filename))
if not os.path.exists(
os.path.join(local_tmp_path, unpacked_filename)):
raise Exception('file not exist. {}'.format(
self._check_param_help('unpacked_filename',
unpacked_filename)))
return unpacked_filename
def run(self):
'''
Monitor the remote model by polling and update the local model.
'''
params = [
'_remote_path', '_remote_model_name', '_remote_donefile_name',
'_local_model_name', '_local_path', '_local_timestamp_file',
'_local_tmp_path', '_interval'
]
self._print_params(params)
local_tmp_path = os.path.join(self._local_path, self._local_tmp_path)
_LOGGER.info('local_tmp_path: {}'.format(local_tmp_path))
if not os.path.exists(local_tmp_path):
_LOGGER.info('mkdir: {}'.format(local_tmp_path))
os.makedirs(local_tmp_path)
while True:
[flag, timestamp] = self._exist_remote_file(
self._remote_path, self._remote_donefile_name, local_tmp_path)
if flag:
if self._remote_donefile_timestamp is None or \
timestamp != self._remote_donefile_timestamp:
_LOGGER.info('donefile({}) changed.'.format(
self._remote_donefile_name))
self._remote_donefile_timestamp = timestamp
self._pull_remote_dir(self._remote_path,
self._remote_model_name,
local_tmp_path)
_LOGGER.info('pull remote model({}).'.format(
self._remote_model_name))
unpacked_filename = self._decompress_model_file(
local_tmp_path, self._remote_model_name,
self._unpacked_filename)
self._update_local_model(local_tmp_path, unpacked_filename,
self._local_path,
self._local_model_name)
_LOGGER.info('update local model({}).'.format(
self._local_model_name))
self._update_local_donefile(self._local_path,
self._local_model_name,
self._local_timestamp_file)
_LOGGER.info('update model timestamp({}).'.format(
self._local_timestamp_file))
else:
_LOGGER.info('remote({}) has no donefile.'.format(
self._remote_path))
_LOGGER.info('sleep {}s.'.format(self._interval))
time.sleep(self._interval)
def _exist_remote_file(self, path, filename, local_tmp_path):
raise Exception('This function must be inherited.')
def _pull_remote_dir(self, remote_path, dirname, local_tmp_path):
raise Exception('This function must be inherited.')
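# A minimal subclass sketch (not part of the original code) showing the two
# hooks a concrete monitor must implement; a local directory stands in for
# the remote store purely for illustration (os and shutil are imported above):
#
#   class LocalDirMonitor(Monitor):
#       def _exist_remote_file(self, path, filename, local_tmp_path):
#           fullpath = os.path.join(path, filename)
#           if os.path.exists(fullpath):
#               return [True, os.path.getmtime(fullpath)]
#           return [False, None]
#
#       def _pull_remote_dir(self, remote_path, dirname, local_tmp_path):
#           dst = os.path.join(local_tmp_path, dirname)
#           if os.path.exists(dst):
#               shutil.rmtree(dst)
#           shutil.copytree(os.path.join(remote_path, dirname), dst)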
def _update_local_model(self, local_tmp_path, remote_model_name, local_path,
local_model_name):
tmp_model_path = os.path.join(local_tmp_path, remote_model_name)
local_model_path = os.path.join(local_path, local_model_name)
cmd = 'cp -r {}/* {}'.format(tmp_model_path, local_model_path)
_LOGGER.debug('update model cmd: {}'.format(cmd))
if os.system(cmd) != 0:
raise Exception('update local model failed.')
def _update_local_donefile(self, local_path, local_model_name,
local_timestamp_file):
donefile_path = os.path.join(local_path, local_model_name,
local_timestamp_file)
cmd = 'touch {}'.format(donefile_path)
_LOGGER.debug('update timestamp cmd: {}'.format(cmd))
if os.system(cmd) != 0:
raise Exception('update local donefile failed.')
class HadoopMonitor(Monitor):
''' Monitor HDFS or AFS by Hadoop-client. '''
def __init__(self, hadoop_bin, fs_name='', fs_ugi='', interval=10):
super(HadoopMonitor, self).__init__(interval)
self._hadoop_bin = hadoop_bin
self._fs_name = fs_name
self._fs_ugi = fs_ugi
self._print_params(['_hadoop_bin', '_fs_name', '_fs_ugi'])
self._cmd_prefix = '{} fs '.format(self._hadoop_bin)
if self._fs_name:
self._cmd_prefix += '-D fs.default.name={} '.format(self._fs_name)
if self._fs_ugi:
self._cmd_prefix += '-D hadoop.job.ugi={} '.format(self._fs_ugi)
_LOGGER.info('Hadoop prefix cmd: {}'.format(self._cmd_prefix))
def _exist_remote_file(self, path, filename, local_tmp_path):
remote_filepath = os.path.join(path, filename)
cmd = '{} -ls {} 2>/dev/null'.format(self._cmd_prefix, remote_filepath)
_LOGGER.debug('check cmd: {}'.format(cmd))
[status, output] = subprocess.getstatusoutput(cmd)
_LOGGER.debug('resp: {}'.format(output))
if status == 0:
[_, _, _, _, _, mdate, mtime, _] = output.split('\n')[-1].split()
timestr = mdate + mtime
return [True, timestr]
else:
return [False, None]
def _pull_remote_dir(self, remote_path, dirname, local_tmp_path):
# remove old file before pull remote dir
local_dirpath = os.path.join(local_tmp_path, dirname)
if os.path.exists(local_dirpath):
_LOGGER.info('remove old temporary model file({}).'.format(dirname))
if self._unpacked_filename is None:
# the remote file is model folder.
shutil.rmtree(local_dirpath)
else:
# the remote file is a packed model file
os.remove(local_dirpath)
remote_dirpath = os.path.join(remote_path, dirname)
cmd = '{} -get {} {} 2>/dev/null'.format(self._cmd_prefix,
remote_dirpath, local_dirpath)
_LOGGER.debug('pull cmd: {}'.format(cmd))
if os.system(cmd) != 0:
raise Exception('pull remote dir failed. {}'.format(
self._check_param_help('remote_model_name', dirname)))
class FTPMonitor(Monitor):
''' FTP Monitor. '''
def __init__(self, host, port, username="", password="", interval=10):
super(FTPMonitor, self).__init__(interval)
import ftplib
self._ftp = ftplib.FTP()
self._ftp_host = host
self._ftp_port = port
self._ftp_username = username
self._ftp_password = password
self._ftp.connect(self._ftp_host, self._ftp_port)
self._ftp.login(self._ftp_username, self._ftp_password)
self._print_params(
['_ftp_host', '_ftp_port', '_ftp_username', '_ftp_password'])
def _exist_remote_file(self, path, filename, local_tmp_path):
import ftplib
try:
_LOGGER.debug('cwd: {}'.format(path))
self._ftp.cwd(path)
timestamp = self._ftp.voidcmd('MDTM {}'.format(filename))[4:].strip(
)
return [True, timestamp]
except ftplib.error_perm:
_LOGGER.debug('remote file({}) not exist.'.format(filename))
return [False, None]
def _download_remote_file(self,
remote_path,
remote_filename,
local_tmp_path,
overwrite=True):
local_fullpath = os.path.join(local_tmp_path, remote_filename)
if not overwrite and os.path.isfile(local_fullpath):
return
else:
with open(local_fullpath, 'wb') as f:
_LOGGER.debug('cwd: {}'.format(remote_path))
self._ftp.cwd(remote_path)
_LOGGER.debug('download remote file({})'.format(
remote_filename))
self._ftp.retrbinary('RETR {}'.format(remote_filename), f.write)
def _download_remote_files(self,
remote_path,
remote_dirname,
local_tmp_path,
overwrite=True):
import ftplib
remote_dirpath = os.path.join(remote_path, remote_dirname)
# Check whether remote_dirpath is a file or a folder
try:
_LOGGER.debug('cwd: {}'.format(remote_dirpath))
self._ftp.cwd(remote_dirpath)
_LOGGER.debug('{} is folder.'.format(remote_dirname))
local_dirpath = os.path.join(local_tmp_path, remote_dirname)
if not os.path.exists(local_dirpath):
_LOGGER.info('mkdir: {}'.format(local_dirpath))
os.mkdir(local_dirpath)
output = []
self._ftp.dir(output.append)
for line in output:
[attr, _, _, _, _, _, _, _, name] = line.split()
if attr[0] == 'd':
self._download_remote_files(
os.path.join(remote_path, remote_dirname), name,
os.path.join(local_tmp_path, remote_dirname), overwrite)
else:
self._download_remote_file(remote_dirpath, name,
local_dirpath, overwrite)
except ftplib.error_perm:
_LOGGER.debug('{} is file.'.format(remote_dirname))
self._download_remote_file(remote_path, remote_dirname,
local_tmp_path, overwrite)
return
def _pull_remote_dir(self, remote_path, dirname, local_tmp_path):
self._download_remote_files(
remote_path, dirname, local_tmp_path, overwrite=True)
class GeneralMonitor(Monitor):
''' General Monitor. '''
def __init__(self, host, interval=10):
super(GeneralMonitor, self).__init__(interval)
self._general_host = host
self._print_params(['_general_host'])
def _get_local_file_timestamp(self, filename):
return os.path.getmtime(filename)
def _exist_remote_file(self, remote_path, filename, local_tmp_path):
remote_filepath = os.path.join(remote_path, filename)
url = '{}/{}'.format(self._general_host, remote_filepath)
_LOGGER.debug('remote file url: {}'.format(url))
# only for check donefile, which is not a folder.
cmd = 'wget -nd -N -P {} {} &>/dev/null'.format(local_tmp_path, url)
_LOGGER.debug('wget cmd: {}'.format(cmd))
if os.system(cmd) != 0:
_LOGGER.debug('remote file({}) not exist.'.format(remote_filepath))
return [False, None]
else:
timestamp = self._get_local_file_timestamp(
os.path.join(local_tmp_path, filename))
return [True, timestamp]
def _pull_remote_dir(self, remote_path, dirname, local_tmp_path):
remote_dirpath = os.path.join(remote_path, dirname)
url = '{}/{}'.format(self._general_host, remote_dirpath)
_LOGGER.debug('remote file url: {}'.format(url))
if self._unpacked_filename is None:
# the remote file is model folder.
cmd = 'wget -nH -r -P {} {} &>/dev/null'.format(
os.path.join(local_tmp_path, dirname), url)
else:
# the remote file is a packed model file
cmd = 'wget -nd -N -P {} {} &>/dev/null'.format(local_tmp_path, url)
_LOGGER.debug('wget cmd: {}'.format(cmd))
if os.system(cmd) != 0:
raise Exception('pull remote dir failed. {}'.format(
self._check_param_help('remote_model_name', dirname)))
def parse_args():
""" parse args.
Returns:
parser.parse_args().
"""
parser = argparse.ArgumentParser(description="Monitor")
parser.add_argument(
"--type", type=str, default='general', help="Type of remote server")
parser.add_argument(
"--remote_path",
type=str,
required=True,
help="The base path for the remote")
parser.add_argument(
"--remote_model_name",
type=str,
required=True,
help="The model name to be pulled from the remote")
parser.add_argument(
"--remote_donefile_name",
type=str,
required=True,
help="The donefile name that marks the completion of the remote model update"
)
parser.add_argument(
"--local_path", type=str, required=True, help="Local work path")
parser.add_argument(
"--local_model_name", type=str, required=True, help="Local model name")
parser.add_argument(
"--local_timestamp_file",
type=str,
default='fluid_time_file',
help="The timestamp file used locally for hot loading, The file is considered to be placed in the `local_path/local_model_name` folder."
)
parser.add_argument(
"--local_tmp_path",
type=str,
default='_serving_monitor_tmp',
help="The path of the folder where temporary files are stored locally. If it does not exist, it will be created automatically"
)
parser.add_argument(
"--unpacked_filename",
type=str,
default=None,
help="If the model of the remote production is a packaged file, the unpacked file name should be set. Currently, only tar packaging format is supported."
)
parser.add_argument(
"--interval",
type=int,
default=10,
help="The polling interval in seconds")
parser.add_argument(
"--debug", action='store_true', help="If set, output more details")
parser.set_defaults(debug=False)
# general monitor
parser.add_argument("--general_host", type=str, help="General remote host")
# ftp monitor
parser.add_argument("--ftp_host", type=str, help="FTP remote host")
parser.add_argument("--ftp_port", type=int, help="FTP remote port")
parser.add_argument(
"--ftp_username",
type=str,
default='',
help="FTP username. Not used if anonymous access.")
parser.add_argument(
"--ftp_password",
type=str,
default='',
help="FTP password. Not used if anonymous access")
# afs/hdfs monitor
parser.add_argument(
"--hadoop_bin", type=str, help="Path of Hadoop binary file")
parser.add_argument(
"--fs_name",
type=str,
default='',
help="AFS/HDFS fs_name. Not used if set in Hadoop-client.")
parser.add_argument(
"--fs_ugi",
type=str,
default='',
help="AFS/HDFS fs_ugi, Not used if set in Hadoop-client")
return parser.parse_args()
def get_monitor(mtype):
""" generator monitor instance.
Args:
mtype: type of monitor
Returns:
monitor instance.
"""
if mtype == 'ftp':
return FTPMonitor(
args.ftp_host,
args.ftp_port,
username=args.ftp_username,
password=args.ftp_password,
interval=args.interval)
elif mtype == 'general':
return GeneralMonitor(args.general_host, interval=args.interval)
elif mtype == 'afs' or mtype == 'hdfs':
return HadoopMonitor(
args.hadoop_bin, args.fs_name, args.fs_ugi, interval=args.interval)
else:
raise Exception('unsupported monitor type.')
def start_monitor(monitor, args):
monitor.set_remote_path(args.remote_path)
monitor.set_remote_model_name(args.remote_model_name)
monitor.set_remote_donefile_name(args.remote_donefile_name)
monitor.set_local_path(args.local_path)
monitor.set_local_model_name(args.local_model_name)
monitor.set_local_timestamp_file(args.local_timestamp_file)
monitor.set_local_tmp_path(args.local_tmp_path)
monitor.set_unpacked_filename(args.unpacked_filename)
monitor.run()
if __name__ == "__main__":
args = parse_args()
if args.debug:
logging.basicConfig(
format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
datefmt='%Y-%m-%d %H:%M',
level=logging.DEBUG)
else:
logging.basicConfig(
format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
datefmt='%Y-%m-%d %H:%M',
level=logging.INFO)
monitor = get_monitor(args.type)
start_monitor(monitor, args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
Host a trained paddle model with one line command
Example:
python -m paddle_serving_server.serve --model ./serving_server_model --port 9292
"""
import argparse
import os
import json
import base64
import time
from multiprocessing import Pool, Process
from paddle_serving_server_gpu import serve_args
from flask import Flask, request
import sys
if sys.version_info.major == 2:
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
elif sys.version_info.major == 3:
from http.server import BaseHTTPRequestHandler, HTTPServer
def start_gpu_card_model(index, gpuid, port, args): # pylint: disable=doc-string-missing
gpuid = int(gpuid)
device = "gpu"
if gpuid == -1:
device = "cpu"
elif gpuid >= 0:
port = port + index
thread_num = args.thread
model = args.model
mem_optim = args.mem_optim_off is False
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_multilang = args.use_multilang
workdir = args.workdir
if gpuid >= 0:
workdir = "{}_{}".format(args.workdir, gpuid)
if model == "":
print("You must specify your serving model")
exit(-1)
import paddle_serving_server_gpu as serving
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = serving.OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
if use_multilang:
server = serving.MultiLangServer()
else:
server = serving.Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.set_max_body_size(max_body_size)
if args.use_trt:
server.set_trt()
if args.use_lite:
server.set_lite()
device = "arm"
server.set_device(device)
if args.use_xpu:
server.set_xpu()
if args.product_name is not None:
server.set_product_name(args.product_name)
if args.container_id is not None:
server.set_container_id(args.container_id)
server.load_model_config(model)
server.prepare_server(
workdir=workdir,
port=port,
device=device,
use_encryption_model=args.use_encryption_model)
if gpuid >= 0:
server.set_gpuid(gpuid)
server.run_server()
def start_multi_card(args, serving_port=None): # pylint: disable=doc-string-missing
gpus = ""
if serving_port is None:
serving_port = args.port
if args.gpu_ids == "":
gpus = []
else:
gpus = args.gpu_ids.split(",")
if "CUDA_VISIBLE_DEVICES" in os.environ:
env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
for ids in gpus:
if int(ids) >= len(env_gpus):
print(
" Max index of gpu_ids out of range, the number of CUDA_VISIBLE_DEVICES is {}."
.format(len(env_gpus)))
exit(-1)
else:
env_gpus = []
if args.use_lite:
print("run arm server.")
start_gpu_card_model(-1, -1, serving_port, args)
elif len(gpus) <= 0:
print("gpu_ids not set, going to run cpu service.")
start_gpu_card_model(-1, -1, serving_port, args)
else:
gpu_processes = []
for i, gpu_id in enumerate(gpus):
p = Process(
target=start_gpu_card_model,
args=(
i,
gpu_id,
serving_port,
args, ))
gpu_processes.append(p)
for p in gpu_processes:
p.start()
for p in gpu_processes:
p.join()
class MainService(BaseHTTPRequestHandler):
def get_available_port(self):
default_port = 12000
for i in range(1000):
if port_is_available(default_port + i):
return default_port + i
def start_serving(self):
start_multi_card(args, serving_port)
def get_key(self, post_data):
if "key" not in post_data:
return False
else:
key = base64.b64decode(post_data["key"].encode())
with open(args.model + "/key", "wb") as f:
f.write(key)
return True
def check_key(self, post_data):
if "key" not in post_data:
return False
else:
key = base64.b64decode(post_data["key"].encode())
with open(args.model + "/key", "rb") as f:
cur_key = f.read()
return (key == cur_key)
def start(self, post_data):
post_data = json.loads(post_data)
global p_flag
if not p_flag:
if args.use_encryption_model:
print("waiting key for model")
if not self.get_key(post_data):
print("not found key in request")
return False
global serving_port
global p
serving_port = self.get_available_port()
p = Process(target=self.start_serving)
p.start()
time.sleep(3)
if p.is_alive():
p_flag = True
else:
return False
else:
if p.is_alive():
if not self.check_key(post_data):
return False
else:
return False
return True
def do_POST(self):
content_length = int(self.headers['Content-Length'])
post_data = self.rfile.read(content_length)
if self.start(post_data):
response = {"endpoint_list": [serving_port]}
else:
response = {"message": "start serving failed"}
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
self.wfile.write(json.dumps(response).encode())
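# For reference, a minimal client for the key-exchange protocol implemented by
# MainService could look like the sketch below. The helper name and the use of
# the `requests` package are illustrative only; the JSON fields simply mirror
# what get_key()/do_POST() read above.
#
#     import base64, json, requests
#
#     def send_model_key(host, port, key_path):
#         with open(key_path, "rb") as f:
#             key = base64.b64encode(f.read()).decode()
#         resp = requests.post("http://{}:{}".format(host, port),
#                              data=json.dumps({"key": key}))
#         return json.loads(resp.text)  # e.g. {"endpoint_list": [12000]}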
if __name__ == "__main__":
args = serve_args()
if args.name == "None":
from .web_service import port_is_available
if args.use_encryption_model:
p_flag = False
p = None
serving_port = 0
server = HTTPServer(('localhost', int(args.port)), MainService)
print(
'Starting encryption server, waiting for key from client, use <Ctrl-C> to stop'
)
server.serve_forever()
else:
start_multi_card(args)
else:
from .web_service import WebService
web_service = WebService(name=args.name)
web_service.load_model_config(args.model)
gpu_ids = args.gpu_ids
if gpu_ids == "":
if "CUDA_VISIBLE_DEVICES" in os.environ:
gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"]
if len(gpu_ids) > 0:
web_service.set_gpus(gpu_ids)
web_service.prepare_server(
workdir=args.workdir,
port=args.port,
device=args.device,
use_lite=args.use_lite,
use_xpu=args.use_xpu,
ir_optim=args.ir_optim)
web_service.run_rpc_service()
app_instance = Flask(__name__)
@app_instance.before_first_request
def init():
web_service._launch_web_service()
service_name = "/" + web_service.name + "/prediction"
@app_instance.route(service_name, methods=["POST"])
def run():
return web_service.get_prediction(request)
app_instance.run(host="0.0.0.0",
port=web_service.port,
threaded=False,
processes=4)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Paddle Serving Client version string """
serving_client_version = "0.0.0"
serving_server_version = "0.0.0"
module_proto_version = "0.0.0"
cuda_version = "9"
commit_id = ""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!flask/bin/python
# pylint: disable=doc-string-missing
from flask import Flask, request, abort
from contextlib import closing
from multiprocessing import Pool, Process, Queue
from paddle_serving_client import Client
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server
from paddle_serving_server_gpu.serve import start_multi_card
import socket
import sys
import numpy as np
import paddle_serving_server_gpu as serving
from paddle_serving_server_gpu import pipeline
from paddle_serving_server_gpu.pipeline import Op
def port_is_available(port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
return True
else:
return False
class WebService(object):
def __init__(self, name="default_service"):
self.name = name
# pipeline
self._server = pipeline.PipelineServer(self.name)
self.gpus = [] # deprecated
self.rpc_service_list = [] # deprecated
def get_pipeline_response(self, read_op):
return None
def prepare_pipeline_config(self, yaml_file):
# build dag
read_op = pipeline.RequestOp()
last_op = self.get_pipeline_response(read_op)
if not isinstance(last_op, Op):
raise ValueError("The return value type of `get_pipeline_response` "
"function is not Op type, please check function "
"`get_pipeline_response`.")
response_op = pipeline.ResponseOp(input_ops=[last_op])
self._server.set_response_op(response_op)
self._server.prepare_server(yaml_file)
def run_service(self):
self._server.run_server()
def load_model_config(self, model_config):
print("This API will be deprecated later. Please do not use it")
self.model_config = model_config
import os
from .proto import general_model_config_pb2 as m_config
import google.protobuf.text_format
if os.path.isdir(model_config):
client_config = "{}/serving_server_conf.prototxt".format(
model_config)
elif os.path.isfile(model_config):
client_config = model_config
model_conf = m_config.GeneralModelConfig()
        with open(client_config, 'r') as f:
            model_conf = google.protobuf.text_format.Merge(
                str(f.read()), model_conf)
self.feed_vars = {var.name: var for var in model_conf.feed_var}
self.fetch_vars = {var.name: var for var in model_conf.fetch_var}
def set_gpus(self, gpus):
print("This API will be deprecated later. Please do not use it")
self.gpus = [int(x) for x in gpus.split(",")]
def default_rpc_service(self,
workdir="conf",
port=9292,
gpuid=0,
thread_num=2,
mem_optim=True,
use_lite=False,
use_xpu=False,
ir_optim=False):
device = "gpu"
if gpuid == -1:
if use_lite:
device = "arm"
else:
device = "cpu"
op_maker = serving.OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.set_device(device)
if use_lite:
server.set_lite()
if use_xpu:
server.set_xpu()
server.load_model_config(self.model_config)
if gpuid >= 0:
server.set_gpuid(gpuid)
server.prepare_server(workdir=workdir, port=port, device=device)
return server
def _launch_rpc_service(self, service_idx):
self.rpc_service_list[service_idx].run_server()
def port_is_available(self, port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
return True
else:
return False
def prepare_server(self,
workdir="",
port=9393,
device="gpu",
use_lite=False,
use_xpu=False,
ir_optim=False,
gpuid=0,
mem_optim=True):
print("This API will be deprecated later. Please do not use it")
self.workdir = workdir
self.port = port
self.device = device
self.gpuid = gpuid
self.port_list = []
default_port = 12000
for i in range(1000):
if port_is_available(default_port + i):
self.port_list.append(default_port + i)
if len(self.port_list) > len(self.gpus):
break
if len(self.gpus) == 0:
# init cpu service
self.rpc_service_list.append(
self.default_rpc_service(
self.workdir,
self.port_list[0],
-1,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
else:
for i, gpuid in enumerate(self.gpus):
self.rpc_service_list.append(
self.default_rpc_service(
"{}_{}".format(self.workdir, i),
self.port_list[i],
gpuid,
thread_num=2,
mem_optim=mem_optim,
use_lite=use_lite,
use_xpu=use_xpu,
ir_optim=ir_optim))
def _launch_web_service(self):
gpu_num = len(self.gpus)
self.client = Client()
self.client.load_client_config("{}/serving_server_conf.prototxt".format(
self.model_config))
endpoints = ""
if gpu_num > 0:
for i in range(gpu_num):
endpoints += "127.0.0.1:{},".format(self.port_list[i])
else:
endpoints = "127.0.0.1:{}".format(self.port_list[0])
self.client.connect([endpoints])
def get_prediction(self, request):
if not request.json:
abort(400)
if "fetch" not in request.json:
abort(400)
try:
feed, fetch, is_batch = self.preprocess(request.json["feed"],
request.json["fetch"])
if isinstance(feed, dict) and "fetch" in feed:
del feed["fetch"]
if len(feed) == 0:
raise ValueError("empty input")
fetch_map = self.client.predict(
feed=feed, fetch=fetch, batch=is_batch)
result = self.postprocess(
feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
result = {"result": result}
except ValueError as err:
result = {"result": str(err)}
return result
def run_rpc_service(self):
print("This API will be deprecated later. Please do not use it")
import socket
localIP = socket.gethostbyname(socket.gethostname())
print("web service address:")
print("http://{}:{}/{}/prediction".format(localIP, self.port,
self.name))
server_pros = []
for i, service in enumerate(self.rpc_service_list):
p = Process(target=self._launch_rpc_service, args=(i, ))
server_pros.append(p)
for p in server_pros:
p.start()
app_instance = Flask(__name__)
@app_instance.before_first_request
def init():
self._launch_web_service()
service_name = "/" + self.name + "/prediction"
@app_instance.route(service_name, methods=["POST"])
def run():
return self.get_prediction(request)
self.app_instance = app_instance
# TODO: maybe change another API name: maybe run_local_predictor?
def run_debugger_service(self, gpu=False):
print("This API will be deprecated later. Please do not use it")
import socket
localIP = socket.gethostbyname(socket.gethostname())
print("web service address:")
print("http://{}:{}/{}/prediction".format(localIP, self.port,
self.name))
app_instance = Flask(__name__)
@app_instance.before_first_request
def init():
self._launch_local_predictor(gpu)
service_name = "/" + self.name + "/prediction"
@app_instance.route(service_name, methods=["POST"])
def run():
return self.get_prediction(request)
self.app_instance = app_instance
def _launch_local_predictor(self, gpu):
from paddle_serving_app.local_predict import LocalPredictor
self.client = LocalPredictor()
        self.client.load_model_config(
            "{}".format(self.model_config),
            use_gpu=gpu,
            gpu_id=self.gpus[0] if self.gpus else 0)
def run_web_service(self):
print("This API will be deprecated later. Please do not use it")
self.app_instance.run(host="0.0.0.0", port=self.port, threaded=True)
def get_app_instance(self):
return self.app_instance
def preprocess(self, feed=[], fetch=[]):
print("This API will be deprecated later. Please do not use it")
is_batch = True
feed_dict = {}
for var_name in self.feed_vars.keys():
feed_dict[var_name] = []
for feed_ins in feed:
for key in feed_ins:
feed_dict[key].append(
np.array(feed_ins[key]).reshape(
list(self.feed_vars[key].shape))[np.newaxis, :])
feed = {}
for key in feed_dict:
feed[key] = np.concatenate(feed_dict[key], axis=0)
return feed, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map=None):
print("This API will be deprecated later. Please do not use it")
for key in fetch_map:
fetch_map[key] = fetch_map[key].tolist()
return fetch_map
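# A typical way to stand up this WebService, shown as a hedged sketch: the
# service name "uci" is illustrative and the model directory mirrors the
# fit_a_line example used by the CI scripts later in this repository.
#
#     web_service = WebService(name="uci")
#     web_service.load_model_config("uci_housing_model")
#     web_service.set_gpus("0")
#     web_service.prepare_server(workdir="workdir", port=9393, device="gpu")
#     web_service.run_rpc_service()
#     web_service.run_web_service()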
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import os import os
import logging import logging
import multiprocessing import multiprocessing
#from paddle_serving_server_gpu import OpMaker, OpSeqMaker #from paddle_serving_server import OpMaker, OpSeqMaker
#from paddle_serving_server_gpu import Server as GpuServer #from paddle_serving_server import Server as GpuServer
#from paddle_serving_server import Server as CpuServer #from paddle_serving_server import Server as CpuServer
from . import util from . import util
#from paddle_serving_app.local_predict import LocalPredictor #from paddle_serving_app.local_predict import LocalPredictor
...@@ -235,7 +235,7 @@ class LocalServiceHandler(object): ...@@ -235,7 +235,7 @@ class LocalServiceHandler(object):
server = Server() server = Server()
else: else:
#gpu or arm #gpu or arm
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server from paddle_serving_server import OpMaker, OpSeqMaker, Server
op_maker = OpMaker() op_maker = OpMaker()
read_op = op_maker.create('general_reader') read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer') general_infer_op = op_maker.create('general_infer')
......
...@@ -26,10 +26,11 @@ from time import time as _time ...@@ -26,10 +26,11 @@ from time import time as _time
import time import time
import threading import threading
import multiprocessing import multiprocessing
import copy
_LOGGER = logging.getLogger(__name__) _LOGGER = logging.getLogger(__name__)
_LOGGER.propagate = False _LOGGER.propagate = False
_is_profile = int(os.environ.get('FLAGS_profile_pipeline', 0))
class PerformanceTracer(object): class PerformanceTracer(object):
def __init__(self, is_thread_mode, interval_s, server_worker_num): def __init__(self, is_thread_mode, interval_s, server_worker_num):
...@@ -48,6 +49,8 @@ class PerformanceTracer(object): ...@@ -48,6 +49,8 @@ class PerformanceTracer(object):
self._channels = [] self._channels = []
# The size of data in Channel will not exceed server_worker_num # The size of data in Channel will not exceed server_worker_num
self._server_worker_num = server_worker_num self._server_worker_num = server_worker_num
if _is_profile:
self.profile_dict = {}
def data_buffer(self): def data_buffer(self):
return self._data_buffer return self._data_buffer
...@@ -82,7 +85,7 @@ class PerformanceTracer(object): ...@@ -82,7 +85,7 @@ class PerformanceTracer(object):
item = self._data_buffer.get_nowait() item = self._data_buffer.get_nowait()
name = item["name"] name = item["name"]
actions = item["actions"] actions = item["actions"]
if name == "DAG": if name == "DAG":
succ = item["succ"] succ = item["succ"]
req_id = item["id"] req_id = item["id"]
...@@ -106,9 +109,9 @@ class PerformanceTracer(object): ...@@ -106,9 +109,9 @@ class PerformanceTracer(object):
for action, costs in op_cost[name].items(): for action, costs in op_cost[name].items():
op_cost[name][action] = sum(costs) / (1e3 * len(costs)) op_cost[name][action] = sum(costs) / (1e3 * len(costs))
tot_cost += op_cost[name][action] tot_cost += op_cost[name][action]
if name != "DAG": if name != "DAG":
_LOGGER.info("Op({}):".format(name)) _LOGGER.info("Op({}):".format(name))
for action in all_actions: for action in all_actions:
if action in op_cost[name]: if action in op_cost[name]:
_LOGGER.info("\t{}[{} ms]".format( _LOGGER.info("\t{}[{} ms]".format(
...@@ -118,7 +121,9 @@ class PerformanceTracer(object): ...@@ -118,7 +121,9 @@ class PerformanceTracer(object):
calcu_cost += op_cost[name][action] calcu_cost += op_cost[name][action]
_LOGGER.info("\tidle[{}]".format(1 - 1.0 * calcu_cost / _LOGGER.info("\tidle[{}]".format(1 - 1.0 * calcu_cost /
tot_cost)) tot_cost))
if _is_profile:
self.profile_dict = copy.deepcopy(op_cost)
if "DAG" in op_cost: if "DAG" in op_cost:
calls = list(op_cost["DAG"].values()) calls = list(op_cost["DAG"].values())
calls.sort() calls.sort()
...@@ -137,7 +142,17 @@ class PerformanceTracer(object): ...@@ -137,7 +142,17 @@ class PerformanceTracer(object):
for latency in latencys: for latency in latencys:
_LOGGER.info("\t\t.{}[{} ms]".format(latency, calls[int( _LOGGER.info("\t\t.{}[{} ms]".format(latency, calls[int(
tot * latency / 100.0)])) tot * latency / 100.0)]))
if _is_profile:
self.profile_dict["DAG"]["query_count"] = tot
self.profile_dict["DAG"]["qps"] = qps
self.profile_dict["DAG"]["succ"] = 1 - 1.0 * err_count / tot
self.profile_dict["DAG"]["avg"] = ave_cost
for latency in latencys:
self.profile_dict["DAG"][str(latency)] = calls[int(tot * latency / 100.0)]
if _is_profile:
import yaml
with open("benchmark.log", "w") as fout:
yaml.dump(self.profile_dict, fout, default_flow_style=False)
# channel # channel
_LOGGER.info("Channel (server worker num[{}]):".format( _LOGGER.info("Channel (server worker num[{}]):".format(
self._server_worker_num)) self._server_worker_num))
......
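# Note: the hunk above reads FLAGS_profile_pipeline once at import time, so the
# benchmark.log dump is only produced when the variable is exported before the
# pipeline modules are imported. A minimal sketch of enabling it from a launcher:
#
#     import os
#     os.environ["FLAGS_profile_pipeline"] = "1"   # set before importing pipeline code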
...@@ -2,14 +2,15 @@ numpy>=1.12, <=1.16.4 ; python_version<"3.5" ...@@ -2,14 +2,15 @@ numpy>=1.12, <=1.16.4 ; python_version<"3.5"
shapely==1.7.0 shapely==1.7.0
wheel>=0.34.0, <0.35.0 wheel>=0.34.0, <0.35.0
setuptools>=44.1.0 setuptools>=44.1.0
opencv-python==4.2.0.32
google>=2.0.3 google>=2.0.3
opencv-python==4.2.0.32
protobuf>=3.12.2 protobuf>=3.12.2
grpcio-tools>=1.28.1 grpcio-tools>=1.28.1
grpcio>=1.28.1 grpcio>=1.28.1
func-timeout>=4.3.5 func-timeout>=4.3.5
pyyaml>=1.3.0 pyyaml>=1.3.0
sentencepiece==0.1.92
flask>=1.1.2 flask>=1.1.2
ujson>=2.0.3 ujson>=2.0.3
sentencepiece==0.1.92; platform_machine != "aarch64"
sentencepiece; platform_machine == "aarch64"
opencv-python==4.2.0.32; platform_machine != "aarch64"
opencv-python; platform_machine == "aarch64"
...@@ -2,14 +2,13 @@ numpy>=1.12, <=1.16.4 ; python_version<"3.5" ...@@ -2,14 +2,13 @@ numpy>=1.12, <=1.16.4 ; python_version<"3.5"
shapely==1.7.0 shapely==1.7.0
wheel>=0.34.0, <0.35.0 wheel>=0.34.0, <0.35.0
setuptools>=44.1.0 setuptools>=44.1.0
opencv-python==4.2.0.32
google>=2.0.3 google>=2.0.3
opencv-python==4.2.0.32 opencv-python==4.2.0.32
protobuf>=3.12.2 protobuf>=3.12.2
grpcio-tools>=1.33.2
grpcio>=1.33.2
func-timeout>=4.3.5 func-timeout>=4.3.5
pyyaml>=1.3.0 pyyaml>=1.3.0
sentencepiece==0.1.83
flask>=1.1.2 flask>=1.1.2
ujson>=2.0.3 ujson>=2.0.3
grpcio-tools>=1.33.2
grpcio>=1.33.2
sentencepiece==0.1.83
...@@ -41,8 +41,13 @@ if '${PACK}' == 'ON': ...@@ -41,8 +41,13 @@ if '${PACK}' == 'ON':
copy_lib() copy_lib()
REQUIRED_PACKAGES = [ REQUIRED_PACKAGES = [
'six >= 1.10.0', 'sentencepiece<=0.1.83', 'opencv-python<=4.2.0.32', 'pillow', 'six >= 1.10.0',
'pyclipper', 'shapely' 'pillow',
'pyclipper', 'shapely',
'sentencepiece<=0.1.83; platform_machine != "aarch64"',
'sentencepiece; platform_machine == "aarch64"',
'opencv-python<=4.2.0.32; platform_machine != "aarch64"',
'opencv-python; platform_machine == "aarch64"',
] ]
packages=['paddle_serving_app', packages=['paddle_serving_app',
......
...@@ -19,11 +19,15 @@ from __future__ import print_function ...@@ -19,11 +19,15 @@ from __future__ import print_function
from setuptools import setup, Distribution, Extension from setuptools import setup, Distribution, Extension
from setuptools import find_packages from setuptools import find_packages
from setuptools import setup from setuptools import setup
from paddle_serving_server.version import serving_server_version from paddle_serving_server.version import serving_server_version, version_suffix
import util import util
max_version, mid_version, min_version = util.python_version() package_version = serving_server_version.replace('-', '')
if version_suffix != "":
version_suffix = "post" + version_suffix
package_version = package_version + "." + version_suffix
max_version, mid_version, min_version = util.python_version()
# gen pipeline proto code # gen pipeline proto code
util.gen_pipeline_code("paddle_serving_server") util.gen_pipeline_code("paddle_serving_server")
...@@ -55,8 +59,8 @@ package_dir={'paddle_serving_server': ...@@ -55,8 +59,8 @@ package_dir={'paddle_serving_server':
package_data={'paddle_serving_server': ['pipeline/gateway/libproxy_server.so'],} package_data={'paddle_serving_server': ['pipeline/gateway/libproxy_server.so'],}
setup( setup(
name='paddle-serving-server', name='${SERVER_PACKAGE_NAME}',
version=serving_server_version.replace('-', ''), version= package_version,
description= description=
('Paddle Serving Package for saved model with PaddlePaddle'), ('Paddle Serving Package for saved model with PaddlePaddle'),
url='https://github.com/PaddlePaddle/Serving', url='https://github.com/PaddlePaddle/Serving',
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup for pip package."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from setuptools import setup, Distribution, Extension
from setuptools import find_packages
from setuptools import setup
from paddle_serving_server_gpu.version import serving_server_version, cuda_version
import util
if cuda_version != "trt":
cuda_version = "post" + cuda_version
max_version, mid_version, min_version = util.python_version()
# gen pipeline proto code
util.gen_pipeline_code("paddle_serving_server_gpu")
REQUIRED_PACKAGES = [
'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio <= 1.33.2', 'grpcio-tools <= 1.33.2',
'flask >= 1.1.1', 'func_timeout', 'pyyaml'
]
packages=['paddle_serving_server_gpu',
'paddle_serving_server_gpu.proto',
'paddle_serving_server_gpu.pipeline',
'paddle_serving_server_gpu.pipeline.proto',
'paddle_serving_server_gpu.pipeline.gateway',
'paddle_serving_server_gpu.pipeline.gateway.proto']
package_dir={'paddle_serving_server_gpu':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu',
'paddle_serving_server_gpu.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto',
'paddle_serving_server_gpu.pipeline':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline',
'paddle_serving_server_gpu.pipeline.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/proto',
'paddle_serving_server_gpu.pipeline.gateway':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/gateway',
'paddle_serving_server_gpu.pipeline.gateway.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/gateway/proto'}
package_data={'paddle_serving_server_gpu': ['pipeline/gateway/libproxy_server.so'],}
setup(
name='paddle-serving-server-gpu',
version=serving_server_version.replace('-', '') + "." + cuda_version,
description=
('Paddle Serving Package for saved model with PaddlePaddle'),
url='https://github.com/PaddlePaddle/Serving',
author='PaddlePaddle Author',
author_email='guru4elephant@gmail.com',
install_requires=REQUIRED_PACKAGES,
packages=packages,
package_data=package_data,
package_dir=package_dir,
# PyPI package information.
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
license='Apache 2.0',
keywords=('paddle-serving serving-server deployment industrial easy-to-use'))
...@@ -41,24 +41,24 @@ include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../kvdb/include) ...@@ -41,24 +41,24 @@ include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../kvdb/include)
include(op/CMakeLists.txt) include(op/CMakeLists.txt)
include(proto/CMakeLists.txt) include(proto/CMakeLists.txt)
add_executable(serving ${serving_srcs}) add_executable(serving ${serving_srcs})
add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid add_dependencies(serving pdcodegen paddle_inference_engine pdserving paddle_inference
opencv_imgcodecs cube-api) opencv_imgcodecs cube-api)
if (WITH_GPU) if (WITH_GPU)
add_dependencies(serving fluid_gpu_engine) add_dependencies(serving paddle_inference_engine)
endif() endif()
target_include_directories(serving PUBLIC target_include_directories(serving PUBLIC
${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor ${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
) )
if(WITH_GPU) if(WITH_GPU)
target_link_libraries(serving -Wl,--whole-archive fluid_gpu_engine target_link_libraries(serving -Wl,--whole-archive paddle_inference_engine
-Wl,--no-whole-archive) -Wl,--no-whole-archive)
endif() endif()
target_link_libraries(serving -Wl,--whole-archive fluid_cpu_engine target_link_libraries(serving -Wl,--whole-archive paddle_inference_engine
-Wl,--no-whole-archive) -Wl,--no-whole-archive)
target_link_libraries(serving paddle_fluid ${paddle_depend_libs}) target_link_libraries(serving paddle_inference ${paddle_depend_libs})
target_link_libraries(serving opencv_imgcodecs target_link_libraries(serving opencv_imgcodecs
${opencv_depend_libs}) ${opencv_depend_libs})
......
...@@ -18,16 +18,16 @@ include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../kvdb/include) ...@@ -18,16 +18,16 @@ include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../kvdb/include)
include(op/CMakeLists.txt) include(op/CMakeLists.txt)
include(proto/CMakeLists.txt) include(proto/CMakeLists.txt)
add_executable(elastic_serving ${serving_srcs}) add_executable(elastic_serving ${serving_srcs})
add_dependencies(elastic_serving pdcodegen fluid_cpu_engine pdserving paddle_fluid cube-api) add_dependencies(elastic_serving pdcodegen paddle_inference_engine pdserving paddle_inference cube-api)
target_include_directories(elastic_serving PUBLIC target_include_directories(elastic_serving PUBLIC
${CMAKE_CURRENT_BINARY_DIR}/../../predictor ${CMAKE_CURRENT_BINARY_DIR}/../../predictor
) )
target_link_libraries(elastic_serving -Wl,--whole-archive fluid_cpu_engine target_link_libraries(elastic_serving -Wl,--whole-archive paddle_inference_engine
-Wl,--no-whole-archive) -Wl,--no-whole-archive)
target_link_libraries(elastic_serving paddle_fluid ${paddle_depend_libs}) target_link_libraries(elastic_serving paddle_inference ${paddle_depend_libs})
target_link_libraries(elastic_serving pdserving) target_link_libraries(elastic_serving pdserving)
target_link_libraries(elastic_serving cube-api) target_link_libraries(elastic_serving cube-api)
......
#!/bin/bash
echo "################################################################"
echo "# #"
echo "# #"
echo "# #"
echo "# Paddle Serving begin run with python2.7.15!! #"
echo "# #"
echo "# #"
echo "# #"
echo "################################################################"
export GOPATH=$HOME/go
export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
export CUDA_INCLUDE_DIRS=/usr/local/cuda-10.2/include
export PYTHONROOT=/usr/local/python2.7.15/
go env -w GO111MODULE=on
go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
go get -u google.golang.org/grpc@v1.33.0
build_path=/workspace/Serving
build_whl_list=(build_gpu_server build_client build_cpu_server build_app)
rpc_model_list=(grpc_impl pipeline_imagenet bert_rpc_gpu bert_rpc_cpu faster_rcnn_model_rpc ResNet50_rpc lac_rpc \
cnn_rpc bow_rpc lstm_rpc fit_a_line_rpc cascade_rcnn_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \
ocr_rpc criteo_ctr_rpc_cpu criteo_ctr_rpc_gpu yolov4_rpc_gpu)
http_model_list=(fit_a_line_http lac_http cnn_http bow_http lstm_http ResNet50_http bert_http)
function setproxy(){
export http_proxy=${proxy}
export https_proxy=${proxy}
}
function unsetproxy(){
unset http_proxy
unset https_proxy
}
function kill_server_process(){
kill `ps -ef|grep serving|awk '{print $2}'`
}
function check() {
cd ${build_path}
if [ ! -f paddle_serving_app* ]; then
echo "paddle_serving_app is compiled failed, please check your pull request"
exit 1
elif [ ! -f paddle_serving_server-* ]; then
echo "paddle_serving_server-cpu is compiled failed, please check your pull request"
exit 1
elif [ ! -f paddle_serving_server_* ]; then
echo "paddle_serving_server_gpu is compiled failed, please check your pull request"
exit 1
elif [ ! -f paddle_serving_client* ]; then
echo "paddle_serving_server_client is compiled failed, please check your pull request"
exit 1
else
echo "paddle serving Build Passed"
fi
}
function check_result() {
if [ $? -ne 0 ];then
echo -e "\033[4;31;42m$1 model runs failed, please check your pull request or modify test case! \033[0m"
exit 1
else
echo -e "\033[4;37;42m$1 model runs successfully, congratulations! \033[0m"
fi
}
function before_hook(){
setproxy
cd ${build_path}/python
pip2.7 install --upgrade pip
pip2.7 install opencv-python==4.2.0.32 requests
pip2.7 install -r requirements.txt
echo "before hook configuration is successful.... "
}
function run_env(){
setproxy
pip2.7 install --upgrade nltk==3.4
pip2.7 install --upgrade scipy==1.2.1
pip2.7 install --upgrade setuptools
pip2.7 install paddlehub ujson paddlepaddle==2.0.0
echo "run env configuration is successful.... "
}
function run_gpu_env(){
cd ${build_path}
export LD_LIBRARY_PATH=/usr/local/python2.7.15/lib/python2.7/site-packages/paddle/libs/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/workspace/Serving/build_gpu/third_party/install/Paddle/lib/:/workspace/Serving/build_gpu/third_party/Paddle/src/extern_paddle/third_party/install/mklml/lib/:/workspace/Serving/build_gpu/third_party/Paddle/src/extern_paddle/third_party/install/mkldnn/lib/:$LD_LIBRARY_PATH
export SERVING_BIN=${build_path}/build_gpu/core/general-server/serving
echo "run gpu env configuration is successful.... "
}
function run_cpu_env(){
cd ${build_path}
export LD_LIBRARY_PATH=/usr/local/python2.7.15/lib/python2.7/site-packages/paddle/libs/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/workspace/Serving/build_cpu/third_party/install/Paddle/lib/:$LD_LIBRARY_PATH
export SERVING_BIN=${build_path}/build_cpu/core/general-server/serving
echo "run cpu env configuration is successful.... "
}
function build_gpu_server() {
setproxy
cd ${build_path}
git submodule update --init --recursive
if [ -d build_gpu ];then
rm -rf build_gpu
fi
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DSERVER=ON \
-DTENSORRT_ROOT=/usr \
-DWITH_GPU=ON ..
make -j18
make -j18
make install -j18
pip2.7 uninstall paddle-serving-server-gpu -y
pip2.7 install ${build_path}/build/python/dist/*
cp ${build_path}/build/python/dist/* ../
cp -r ${build_path}/build/ ${build_path}/build_gpu
}
function build_client() {
setproxy
cd ${build_path}
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCLIENT=ON ..
make -j18
make -j18
cp ${build_path}/build/python/dist/* ../
pip2.7 uninstall paddle-serving-client -y
pip2.7 install ${build_path}/build/python/dist/*
}
function build_cpu_server(){
setproxy
cd ${build_path}
if [ -d build_cpu ];then
rm -rf build_cpu
fi
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
pwd
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DWITH_GPU=OFF \
-DSERVER=ON ..
make -j18
make -j18
make install -j18
cp ${build_path}/build/python/dist/* ../
pip2.7 uninstall paddle-serving-server -y
pip2.7 install ${build_path}/build/python/dist/*
cp -r ${build_path}/build/ ${build_path}/build_cpu
}
function build_app() {
setproxy
cd ${build_path}
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCMAKE_INSTALL_PREFIX=./output -DAPP=ON ..
make
cp ${build_path}/build/python/dist/* ../
pip2.7 uninstall paddle-serving-app -y
pip2.7 install ${build_path}/build/python/dist/*
}
function bert_rpc_gpu(){
run_gpu_env
setproxy
cd ${build_path}/python/examples/bert
sh get_data.sh >/dev/null 2>&1
sed -i 's/9292/8860/g' bert_client.py
sed -i '$aprint(result)' bert_client.py
cp -r /root/.cache/dist_data/serving/bert/bert_seq128_* ./
ls -hlst
python2.7 -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 8860 --gpu_ids 0 > bert_rpc_gpu 2>&1 &
sleep 15
head data-c.txt | python2.7 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
cat bert_rpc_gpu
check_result $FUNCNAME
kill_server_process
}
function bert_rpc_cpu(){
run_cpu_env
setproxy
cd ${build_path}/python/examples/bert
sed -i 's/8860/8861/g' bert_client.py
python2.7 -m paddle_serving_server.serve --model bert_seq128_model/ --port 8861 > bert_rpc_cpu 2>&1 &
sleep 3
cp data-c.txt.1 data-c.txt
head data-c.txt | python2.7 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
cat bert_rpc_cpu
check_result $FUNCNAME
kill_server_process
}
function criteo_ctr_with_cube_rpc(){
setproxy
run_cpu_env
cd ${build_path}/python/examples/criteo_ctr_with_cube
ln -s /root/.cache/dist_data/serving/criteo_ctr_with_cube/raw_data ./
sed -i "s/9292/8888/g" test_server.py
sed -i "s/9292/8888/g" test_client.py
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz >/dev/null 2>&1
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz >/dev/null 2>&1
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh > haha 2>&1 &
sleep 5
python2.7 test_server.py ctr_serving_model_kv > criteo_ctr_rpc 2>&1 &
sleep 5
python2.7 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
cat criteo_ctr_rpc
check_result $FUNCNAME
kill `ps -ef|grep cube|awk '{print $2}'`
kill_server_process
}
function pipeline_imagenet(){
run_gpu_env
setproxy
cd ${build_path}/python/examples/pipeline/imagenet
cp -r /root/.cache/dist_data/serving/imagenet/* ./
ls -a
python2.7 resnet50_web_service.py > pipelog 2>&1 &
sleep 5
python2.7 pipeline_rpc_client.py
# check_result $FUNCNAME
kill_server_process
}
function ResNet50_rpc(){
run_gpu_env
setproxy
cd ${build_path}/python/examples/imagenet
cp -r /root/.cache/dist_data/serving/imagenet/* ./
sed -i 's/9696/8863/g' resnet50_rpc_client.py
python2.7 -m paddle_serving_server_gpu.serve --model ResNet50_vd_model --port 8863 --gpu_ids 0 > ResNet50_rpc 2>&1 &
sleep 5
python2.7 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt
tail ResNet50_rpc
check_result $FUNCNAME
kill_server_process
sleep 5
}
function ResNet101_rpc(){
run_gpu_env
setproxy
cd ${build_path}/python/examples/imagenet
sed -i 's/9292/8864/g' image_rpc_client.py
python2.7 -m paddle_serving_server_gpu.serve --model ResNet101_vd_model --port 8864 --gpu_ids 0 > ResNet101_rpc 2>&1 &
sleep 5
python2.7 image_rpc_client.py ResNet101_vd_client_config/serving_client_conf.prototxt
tail ResNet101_rpc
kill_server_process
check_result $FUNCNAME
}
function cnn_rpc(){
setproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
cp -r /root/.cache/dist_data/serving/imdb/* ./
tar xf imdb_model.tar.gz && tar xf text_classification_data.tar.gz
sed -i 's/9292/8865/g' test_client.py
python2.7 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 8865 > cnn_rpc 2>&1 &
sleep 5
head test_data/part-0 | python2.7 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
tail cnn_rpc
check_result $FUNCNAME
kill_server_process
}
function bow_rpc(){
setproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
sed -i 's/8865/8866/g' test_client.py
python2.7 -m paddle_serving_server.serve --model imdb_bow_model/ --port 8866 > bow_rpc 2>&1 &
sleep 5
head test_data/part-0 | python2.7 test_client.py imdb_bow_client_conf/serving_client_conf.prototxt imdb.vocab
tail bow_rpc
check_result $FUNCNAME
kill_server_process
sleep 5
}
function lstm_rpc(){
setproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
sed -i 's/8866/8867/g' test_client.py
python2.7 -m paddle_serving_server.serve --model imdb_lstm_model/ --port 8867 > lstm_rpc 2>&1 &
sleep 5
head test_data/part-0 | python2.7 test_client.py imdb_lstm_client_conf/serving_client_conf.prototxt imdb.vocab
tail lstm_rpc
check_result $FUNCNAME
kill_server_process
kill `ps -ef|grep imdb|awk '{print $2}'`
}
function lac_rpc(){
setproxy
run_cpu_env
cd ${build_path}/python/examples/lac
python2.7 -m paddle_serving_app.package --get_model lac >/dev/null 2>&1
tar xf lac.tar.gz
sed -i 's/9292/8868/g' lac_client.py
python2.7 -m paddle_serving_server.serve --model lac_model/ --port 8868 > lac_rpc 2>&1 &
sleep 5
echo "我爱北京天安门" | python2.7 lac_client.py lac_client/serving_client_conf.prototxt lac_dict/
tail lac_rpc
check_result $FUNCNAME
kill_server_process
}
function fit_a_line_rpc(){
setproxy
run_cpu_env
cd ${build_path}/python/examples/fit_a_line
sh get_data.sh >/dev/null 2>&1
sed -i 's/9393/8869/g' test_client.py
python2.7 -m paddle_serving_server.serve --model uci_housing_model --port 8869 > line_rpc 2>&1 &
sleep 5
python2.7 test_client.py uci_housing_client/serving_client_conf.prototxt
tail line_rpc
check_result $FUNCNAME
kill_server_process
}
function faster_rcnn_model_rpc(){
run_gpu_env
setproxy
cd ${build_path}/python/examples/faster_rcnn_model
cp -r /root/.cache/dist_data/serving/faster_rcnn/faster_rcnn_model.tar.gz ./
tar xf faster_rcnn_model.tar.gz
wget https://paddle-serving.bj.bcebos.com/pddet_demo/infer_cfg.yml >/dev/null 2>&1
mv faster_rcnn_model/pddet* ./
sed -i 's/9494/8870/g' test_client.py
python2.7 -m paddle_serving_server_gpu.serve --model pddet_serving_model --port 8870 --gpu_id 0 > faster_rcnn_rpc 2>&1 &
sleep 3
python2.7 test_client.py pddet_client_conf/serving_client_conf.prototxt infer_cfg.yml 000000570688.jpg
tail faster_rcnn_rpc
check_result $FUNCNAME
kill_server_process
}
function cascade_rcnn_rpc(){
setproxy
run_gpu_env
cd ${build_path}/python/examples/cascade_rcnn
cp -r /root/.cache/dist_data/serving/cascade_rcnn/cascade_rcnn_r50_fpx_1x_serving.tar.gz ./
tar xf cascade_rcnn_r50_fpx_1x_serving.tar.gz
sed -i "s/9292/8879/g" test_client.py
python2.7 -m paddle_serving_server_gpu.serve --model serving_server --port 8879 --gpu_id 0 > rcnn_rpc 2>&1 &
ls -hlst
sleep 5
python2.7 test_client.py
tail rcnn_rpc
check_result $FUNCNAME
kill_server_process
}
function deeplabv3_rpc() {
setproxy
run_gpu_env
cd ${build_path}/python/examples/deeplabv3
cp -r /root/.cache/dist_data/serving/deeplabv3/deeplabv3.tar.gz ./
tar xf deeplabv3.tar.gz
sed -i "s/9494/8880/g" deeplabv3_client.py
python2.7 -m paddle_serving_server_gpu.serve --model deeplabv3_server --gpu_ids 0 --port 8880 > deeplab_rpc 2>&1 &
sleep 5
python2.7 deeplabv3_client.py
tail deeplab_rpc
check_result $FUNCNAME
kill_server_process
}
function mobilenet_rpc() {
setproxy
run_gpu_env
cd ${build_path}/python/examples/mobilenet
python2.7 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet >/dev/null 2>&1
tar xf mobilenet_v2_imagenet.tar.gz
sed -i "s/9393/8881/g" mobilenet_tutorial.py
python2.7 -m paddle_serving_server_gpu.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 8881 > mobilenet_rpc 2>&1 &
sleep 5
python2.7 mobilenet_tutorial.py
tail mobilenet_rpc
check_result $FUNCNAME
kill_server_process
}
function unet_rpc() {
setproxy
run_gpu_env
cd ${build_path}/python/examples/unet_for_image_seg
python2.7 -m paddle_serving_app.package --get_model unet >/dev/null 2>&1
tar xf unet.tar.gz
sed -i "s/9494/8882/g" seg_client.py
python2.7 -m paddle_serving_server_gpu.serve --model unet_model --gpu_ids 0 --port 8882 > unet_rpc 2>&1 &
sleep 5
python2.7 seg_client.py
tail unet_rpc
check_result $FUNCNAME
kill_server_process
}
function resnetv2_rpc() {
setproxy
run_gpu_env
cd ${build_path}/python/examples/resnet_v2_50
cp /root/.cache/dist_data/serving/resnet_v2_50/resnet_v2_50_imagenet.tar.gz ./
tar xf resnet_v2_50_imagenet.tar.gz
sed -i 's/9393/8883/g' resnet50_v2_tutorial.py
python2.7 -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 8883 > v2_log 2>&1 &
sleep 10
python2.7 resnet50_v2_tutorial.py
tail v2_log
check_result $FUNCNAME
kill_server_process
}
function ocr_rpc() {
setproxy
run_cpu_env
cd ${build_path}/python/examples/ocr
cp -r /root/.cache/dist_data/serving/ocr/test_imgs ./
python2.7 -m paddle_serving_app.package --get_model ocr_rec >/dev/null 2>&1
tar xf ocr_rec.tar.gz
sed -i 's/9292/8884/g' test_ocr_rec_client.py
python2.7 -m paddle_serving_server.serve --model ocr_rec_model --port 8884 > ocr_rpc 2>&1 &
sleep 5
python2.7 test_ocr_rec_client.py
tail ocr_rpc
check_result $FUNCNAME
kill_server_process
}
function criteo_ctr_rpc_cpu() {
setproxy
run_cpu_env
cd ${build_path}/python/examples/criteo_ctr
sed -i "s/9292/8885/g" test_client.py
ln -s /root/.cache/dist_data/serving/criteo_ctr_with_cube/raw_data ./
wget https://paddle-serving.bj.bcebos.com/criteo_ctr_example/criteo_ctr_demo_model.tar.gz >/dev/null 2>&1
tar xf criteo_ctr_demo_model.tar.gz
mv models/ctr_client_conf .
mv models/ctr_serving_model .
python2.7 -m paddle_serving_server.serve --model ctr_serving_model/ --port 8885 > criteo_ctr_cpu_rpc 2>&1 &
sleep 5
python2.7 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0
tail criteo_ctr_cpu_rpc
check_result $FUNCNAME
kill_server_process
}
function criteo_ctr_rpc_gpu() {
setproxy
run_gpu_env
cd ${build_path}/python/examples/criteo_ctr
sed -i "s/8885/8886/g" test_client.py
python2.7 -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 8886 --gpu_ids 0 > criteo_ctr_gpu_rpc 2>&1 &
sleep 5
python2.7 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/
tail criteo_ctr_gpu_rpc
check_result $FUNCNAME
kill_server_process
}
function yolov4_rpc_gpu() {
setproxy
run_gpu_env
cd ${build_path}/python/examples/yolov4
sed -i "s/9393/8887/g" test_client.py
cp -r /root/.cache/dist_data/serving/yolov4/yolov4.tar.gz ./
tar xf yolov4.tar.gz
python2.7 -m paddle_serving_server_gpu.serve --model yolov4_model --port 8887 --gpu_ids 0 > yolov4_rpc_log 2>&1 &
sleep 5
python2.7 test_client.py 000000570688.jpg
tail yolov4_rpc_log
# check_result $FUNCNAME
kill_server_process
}
function senta_rpc_cpu() {
setproxy
run_gpu_env
cd ${build_path}/python/examples/senta
sed -i "s/9393/8887/g" test_client.py
cp -r /data/.cache/dist_data/serving/yolov4/yolov4.tar.gz ./
tar xf yolov4.tar.gz
python2.7 -m paddle_serving_server_gpu.serve --model yolov4_model --port 8887 --gpu_ids 0 > yolov4_rpc_log 2>&1 &
sleep 5
python2.7 test_client.py 000000570688.jpg
tail yolov4_rpc_log
check_result $FUNCNAME
kill_server_process
}
function fit_a_line_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/fit_a_line
sed -i "s/9292/8871/g" test_server.py
python2.7 test_server.py > http_log2 2>&1 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://${host}:8871/uci/prediction
check_result $FUNCNAME
kill_server_process
}
function lac_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/lac
python2.7 lac_web_service.py lac_model/ lac_workdir 8872 > http_lac_log2 2>&1 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://${host}:8872/lac/prediction
check_result $FUNCNAME
kill_server_process
}
function cnn_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
python2.7 text_classify_service.py imdb_cnn_model/ workdir/ 8873 imdb.vocab > cnn_http 2>&1 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://${host}:8873/imdb/prediction
check_result $FUNCNAME
kill_server_process
}
function bow_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
python2.7 text_classify_service.py imdb_bow_model/ workdir/ 8874 imdb.vocab > bow_http 2>&1 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://${host}:8874/imdb/prediction
check_result $FUNCNAME
kill_server_process
}
function lstm_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
python2.7 text_classify_service.py imdb_bow_model/ workdir/ 8875 imdb.vocab > bow_http 2>&1 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://${host}:8875/imdb/prediction
check_result $FUNCNAME
kill_server_process
}
function ResNet50_http() {
unsetproxy
run_gpu_env
    cd ${build_path}/python/examples/imagenet
python2.7 resnet50_web_service.py ResNet50_vd_model gpu 8876 > resnet50_http 2>&1 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://${host}:8876/image/prediction
check_result $FUNCNAME
kill_server_process
}
bert_http(){
run_gpu_env
unsetproxy
cd ${build_path}/python/examples/bert
cp data-c.txt.1 data-c.txt
cp vocab.txt.1 vocab.txt
export CUDA_VISIBLE_DEVICES=0
python2.7 bert_web_service.py bert_seq128_model/ 8878 > bert_http 2>&1 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://${host}:8878/bert/prediction
check_result $FUNCNAME
kill_server_process
}
grpc_impl(){
run_gpu_env
cd ${build_path}/python/examples/grpc_impl_example/fit_a_line
sh get_data.sh >/dev/null 2>&1
python2.7 test_server.py uci_housing_model/ > grpclog 2>&1 &
sleep 5
echo "sync predict"
python2.7 test_sync_client.py
echo "async predict"
python2.7 test_asyn_client.py
echo "batch predict"
python2.7 test_batch_client.py
echo "timeout predict"
python2.7 test_timeout_client.py
# check_result $FUNCNAME
kill_server_process
}
function build_all_whl(){
for whl in ${build_whl_list[@]}
do
echo "===========${whl} begin build==========="
$whl
sleep 3
echo "===========${whl} build over ==========="
done
}
function run_rpc_models(){
for model in ${rpc_model_list[@]}
do
echo "===========${model} run begin==========="
$model
sleep 3
echo "===========${model} run end ==========="
done
}
function run_http_models(){
for model in ${http_model_list[@]}
do
echo "===========${model} run begin==========="
$model
sleep 3
echo "===========${model} run end ==========="
done
}
function end_hook(){
cd ${build_path}
kill_server_process
kill `ps -ef|grep python|awk '{print $2}'`
sleep 5
echo "===========files==========="
ls -hlst
echo "=========== end ==========="
}
function main() {
before_hook
build_all_whl
check
run_env
run_rpc_models
# run_http_models
end_hook
}
main $@
#!/bin/bash
echo "################################################################"
echo "# #"
echo "# #"
echo "# #"
echo "# Paddle Serving begin run with python3.6.8! #"
echo "# #"
echo "# #"
echo "# #"
echo "################################################################"
export GOPATH=$HOME/go
export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
export CUDA_INCLUDE_DIRS=/usr/local/cuda-10.2/include
export PYTHONROOT=/usr/local
go env -w GO111MODULE=on
go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
go get -u google.golang.org/grpc@v1.33.0
build_path=/workspace/Serving/
build_whl_list=(build_gpu_server build_client build_cpu_server build_app)
rpc_model_list=(grpc_impl pipeline_imagenet bert_rpc_gpu bert_rpc_cpu ResNet50_rpc lac_rpc \
cnn_rpc bow_rpc lstm_rpc fit_a_line_rpc deeplabv3_rpc mobilenet_rpc unet_rpc resnetv2_rpc \
criteo_ctr_rpc_cpu criteo_ctr_rpc_gpu ocr_rpc yolov4_rpc_gpu)
http_model_list=(fit_a_line_http lac_http cnn_http bow_http lstm_http ResNet50_http bert_http)
function setproxy(){
export http_proxy=${proxy}
export https_proxy=${proxy}
}
function unsetproxy(){
unset http_proxy
unset https_proxy
}
function kill_server_process(){
kill `ps -ef|grep $1 |awk '{print $2}'`
kill `ps -ef|grep serving |awk '{print $2}'`
}
function check() {
cd ${build_path}
if [ ! -f paddle_serving_app* ]; then
echo "paddle_serving_app is compiled failed, please check your pull request"
exit 1
elif [ ! -f paddle_serving_server-* ]; then
echo "paddle_serving_server-cpu is compiled failed, please check your pull request"
exit 1
elif [ ! -f paddle_serving_server_* ]; then
echo "paddle_serving_server_gpu is compiled failed, please check your pull request"
exit 1
elif [ ! -f paddle_serving_client* ]; then
echo "paddle_serving_server_client is compiled failed, please check your pull request"
exit 1
else
echo "paddle serving build passed"
fi
}
function check_result() {
if [ $? -ne 0 ];then
echo -e "\033[4;31;42m$1 model runs failed, please check your pull request or modify test case! \033[0m"
exit 1
else
echo -e "\033[4;37;42m$1 model runs successfully, congratulations! \033[0m"
fi
}
function before_hook(){
setproxy
cd ${build_path}/python
pip3.6 install --upgrade pip
pip3.6 install requests
pip3.6 install -r requirements.txt
pip3.6 install numpy==1.16.4
echo "before hook configuration is successful.... "
}
function run_env(){
setproxy
pip3.6 install --upgrade nltk==3.4
pip3.6 install --upgrade scipy==1.2.1
pip3.6 install --upgrade setuptools==41.0.0
pip3.6 install paddlehub ujson paddlepaddle==2.0.0
echo "run env configuration is successful.... "
}
function run_gpu_env(){
cd ${build_path}
export LD_LIBRARY_PATH=/usr/local/lib64/python3.6/site-packages/paddle/libs/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/workspace/Serving/build_gpu/third_party/install/Paddle/lib/:/workspace/Serving/build_gpu/third_party/Paddle/src/extern_paddle/third_party/install/mklml/lib/:/workspace/Serving/build_gpu/third_party/Paddle/src/extern_paddle/third_party/install/mkldnn/lib/:$LD_LIBRARY_PATH
export SERVING_BIN=${build_path}/build_gpu/core/general-server/serving
echo "run gpu env configuration is successful.... "
}
function run_cpu_env(){
cd ${build_path}
export LD_LIBRARY_PATH=/usr/local/lib64/python3.6/site-packages/paddle/libs/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/workspace/Serving/build_cpu/third_party/install/Paddle/lib/:$LD_LIBRARY_PATH
export SERVING_BIN=${build_path}/build_cpu/core/general-server/serving
echo "run cpu env configuration is successful.... "
}
function build_gpu_server() {
setproxy
cd ${build_path}
git submodule update --init --recursive
if [ -d build_gpu ];then
rm -rf build_gpu
fi
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.6m/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython3.6.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.6 \
-DSERVER=ON \
-DTENSORRT_ROOT=/usr \
-DWITH_GPU=ON ..
make -j18
make -j18
make install -j18
pip3.6 uninstall paddle-serving-server-gpu -y
pip3.6 install ${build_path}/build/python/dist/*
cp ${build_path}/build/python/dist/* ../
cp -r ${build_path}/build/ ${build_path}/build_gpu
}
function build_client() {
setproxy
cd ${build_path}
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.6m/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython3.6.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.6 \
-DCLIENT=ON ..
make -j18
make -j18
cp ${build_path}/build/python/dist/* ../
pip3.6 uninstall paddle-serving-client -y
pip3.6 install ${build_path}/build/python/dist/*
}
function build_cpu_server(){
setproxy
cd ${build_path}
if [ -d build_cpu ];then
rm -rf build_cpu
fi
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.6m/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython3.6.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.6 \
-DWITH_GPU=OFF \
-DSERVER=ON ..
make -j18
make -j18
make install -j18
cp ${build_path}/build/python/dist/* ../
pip3.6 uninstall paddle-serving-server -y
pip3.6 install ${build_path}/build/python/dist/*
cp -r ${build_path}/build/ ${build_path}/build_cpu
}
function build_app() {
setproxy
pip3.6 install paddlehub ujson Pillow
pip3.6 install paddlepaddle==2.0.0
cd ${build_path}
if [ -d build ];then
rm -rf build
fi
mkdir build && cd build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python3.6m/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython3.6.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.6 \
-DCMAKE_INSTALL_PREFIX=./output -DAPP=ON ..
make
cp ${build_path}/build/python/dist/* ../
pip3.6 uninstall paddle-serving-app -y
pip3.6 install ${build_path}/build/python/dist/*
}
function bert_rpc_gpu(){
run_gpu_env
unsetproxy
cd ${build_path}/python/examples/bert
sh get_data.sh >/dev/null 2>&1
sed -i 's/9292/8860/g' bert_client.py
sed -i '$aprint(result)' bert_client.py
cp -r /root/.cache/dist_data/serving/bert/bert_seq128_* ./
ls -hlst
python3.6 -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 8860 --gpu_ids 0 &
sleep 15
nvidia-smi
head data-c.txt | python3.6 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
function bert_rpc_cpu(){
run_cpu_env
unsetproxy
cd ${build_path}/python/examples/bert
sed -i 's/8860/8861/g' bert_client.py
python3.6 -m paddle_serving_server.serve --model bert_seq128_model/ --port 8861 &
sleep 3
cp data-c.txt.1 data-c.txt
head data-c.txt | python3.6 bert_client.py --model bert_seq128_client/serving_client_conf.prototxt
check_result $FUNCNAME
kill_server_process serving
}
function criteo_ctr_with_cube_rpc(){
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/criteo_ctr_with_cube
ln -s /root/.cache/dist_data/serving/criteo_ctr_with_cube/raw_data ./
sed -i "s/9292/8888/g" test_server.py
sed -i "s/9292/8888/g" test_client.py
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz >/dev/null 2>&1
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz >/dev/null 2>&1
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh > haha 2>&1 &
sleep 5
python3.6 test_server.py ctr_serving_model_kv &
sleep 5
python3.6 test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
check_result $FUNCNAME
kill `ps -ef|grep cube|awk '{print $2}'`
kill_server_process test_server
}
function pipeline_imagenet(){
run_gpu_env
unsetproxy
cd ${build_path}/python/examples/pipeline/imagenet
cp -r /root/.cache/dist_data/serving/imagenet/* ./
ls -a
python3.6 resnet50_web_service.py &
sleep 5
nvidia-smi
python3.6 pipeline_rpc_client.py
nvidia-smi
# check_result $FUNCNAME
kill_server_process resnet50_web_service
}
function ResNet50_rpc(){
run_gpu_env
unsetproxy
cd ${build_path}/python/examples/imagenet
cp -r /root/.cache/dist_data/serving/imagenet/* ./
sed -i 's/9696/8863/g' resnet50_rpc_client.py
python3.6 -m paddle_serving_server_gpu.serve --model ResNet50_vd_model --port 8863 --gpu_ids 0 &
sleep 5
nvidia-smi
python3.6 resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
function ResNet101_rpc(){
run_gpu_env
unsetproxy
cd ${build_path}/python/examples/imagenet
sed -i "22cclient.connect(['${host}:8864'])" image_rpc_client.py
python3.6 -m paddle_serving_server_gpu.serve --model ResNet101_vd_model --port 8864 --gpu_ids 0 &
sleep 5
nvidia-smi
python3.6 image_rpc_client.py ResNet101_vd_client_config/serving_client_conf.prototxt
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
sleep 5
}
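# IMDB sentiment RPC tests on CPU: CNN (8865), BOW (8866) and LSTM (8867) models.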
function cnn_rpc(){
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
cp -r /root/.cache/dist_data/serving/imdb/* ./
tar xf imdb_model.tar.gz && tar xf text_classification_data.tar.gz
sed -i 's/9292/8865/g' test_client.py
python3.6 -m paddle_serving_server.serve --model imdb_cnn_model/ --port 8865 &
sleep 5
head test_data/part-0 | python3.6 test_client.py imdb_cnn_client_conf/serving_client_conf.prototxt imdb.vocab
check_result $FUNCNAME
kill_server_process serving
}
function bow_rpc(){
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
sed -i 's/8865/8866/g' test_client.py
python3.6 -m paddle_serving_server.serve --model imdb_bow_model/ --port 8866 &
sleep 5
head test_data/part-0 | python3.6 test_client.py imdb_bow_client_conf/serving_client_conf.prototxt imdb.vocab
check_result $FUNCNAME
kill_server_process serving
}
function lstm_rpc(){
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
sed -i 's/8866/8867/g' test_client.py
python3.6 -m paddle_serving_server.serve --model imdb_lstm_model/ --port 8867 &
sleep 5
head test_data/part-0 | python3.6 test_client.py imdb_lstm_client_conf/serving_client_conf.prototxt imdb.vocab
check_result $FUNCNAME
kill_server_process serving
}
function lac_rpc(){
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/lac
python3.6 -m paddle_serving_app.package --get_model lac >/dev/null 2>&1
tar xf lac.tar.gz
sed -i 's/9292/8868/g' lac_client.py
python3.6 -m paddle_serving_server.serve --model lac_model/ --port 8868 &
sleep 5
echo "我爱北京天安门" | python3.6 lac_client.py lac_client/serving_client_conf.prototxt lac_dict/
check_result $FUNCNAME
kill_server_process serving
}
function fit_a_line_rpc(){
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/fit_a_line
sh get_data.sh >/dev/null 2>&1
sed -i 's/9393/8869/g' test_client.py
python3.6 -m paddle_serving_server.serve --model uci_housing_model --port 8869 &
sleep 5
python3.6 test_client.py uci_housing_client/serving_client_conf.prototxt
check_result $FUNCNAME
kill_server_process serving
}
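# Detection RPC tests on GPU: Faster R-CNN on port 8870 and Cascade R-CNN on port 8879.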
function faster_rcnn_model_rpc(){
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/faster_rcnn
cp -r /root/.cache/dist_data/serving/faster_rcnn/faster_rcnn_model.tar.gz ./
tar xf faster_rcnn_model.tar.gz
wget https://paddle-serving.bj.bcebos.com/pddet_demo/infer_cfg.yml >/dev/null 2>&1
mv faster_rcnn_model/pddet* ./
sed -i 's/9494/8870/g' test_client.py
python3.6 -m paddle_serving_server_gpu.serve --model pddet_serving_model --port 8870 --gpu_ids 0 --thread 2 &
echo "faster rcnn running ..."
nvidia-smi
sleep 5
python3.6 test_client.py pddet_client_conf/serving_client_conf.prototxt infer_cfg.yml 000000570688.jpg
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
function cascade_rcnn_rpc(){
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/cascade_rcnn
cp -r /root/.cache/dist_data/serving/cascade_rcnn/cascade_rcnn_r50_fpx_1x_serving.tar.gz ./
tar xf cascade_rcnn_r50_fpx_1x_serving.tar.gz
sed -i "s/9292/8879/g" test_client.py
python3.6 -m paddle_serving_server_gpu.serve --model serving_server --port 8879 --gpu_ids 0 --thread 2 &
sleep 5
nvidia-smi
python3.6 test_client.py
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
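# Single-model GPU RPC tests: DeepLabv3 (8880), MobileNetV2 (8881), U-Net (8882) and ResNet-v2-50 (8883).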
function deeplabv3_rpc() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/deeplabv3
cp -r /root/.cache/dist_data/serving/deeplabv3/deeplabv3.tar.gz ./
tar xf deeplabv3.tar.gz
sed -i "s/9494/8880/g" deeplabv3_client.py
python3.6 -m paddle_serving_server_gpu.serve --model deeplabv3_server --gpu_ids 0 --port 8880 --thread 2 &
sleep 5
nvidia-smi
python3.6 deeplabv3_client.py
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
function mobilenet_rpc() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/mobilenet
python3.6 -m paddle_serving_app.package --get_model mobilenet_v2_imagenet >/dev/null 2>&1
tar xf mobilenet_v2_imagenet.tar.gz
sed -i "s/9393/8881/g" mobilenet_tutorial.py
python3.6 -m paddle_serving_server_gpu.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 8881 &
sleep 5
nvidia-smi
python3.6 mobilenet_tutorial.py
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
function unet_rpc() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/unet_for_image_seg
python3.6 -m paddle_serving_app.package --get_model unet >/dev/null 2>&1
tar xf unet.tar.gz
sed -i "s/9494/8882/g" seg_client.py
python3.6 -m paddle_serving_server_gpu.serve --model unet_model --gpu_ids 0 --port 8882 &
sleep 5
nvidia-smi
python3.6 seg_client.py
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
function resnetv2_rpc() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/resnet_v2_50
cp /root/.cache/dist_data/serving/resnet_v2_50/resnet_v2_50_imagenet.tar.gz ./
tar xf resnet_v2_50_imagenet.tar.gz
sed -i 's/9393/8883/g' resnet50_v2_tutorial.py
python3.6 -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 8883 &
sleep 10
nvidia-smi
python3.6 resnet50_v2_tutorial.py
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
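# OCR recognition RPC test on CPU, port 8884.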
function ocr_rpc() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/ocr
cp -r /root/.cache/dist_data/serving/ocr/test_imgs ./
python3.6 -m paddle_serving_app.package --get_model ocr_rec >/dev/null 2>&1
tar xf ocr_rec.tar.gz
sed -i 's/9292/8884/g' test_ocr_rec_client.py
python3.6 -m paddle_serving_server.serve --model ocr_rec_model --port 8884 &
sleep 5
python3.6 test_ocr_rec_client.py
# check_result $FUNCNAME
kill_server_process serving
}
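# Criteo CTR RPC tests without cube: CPU on port 8885, GPU on port 8886.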
function criteo_ctr_rpc_cpu() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/criteo_ctr
sed -i "s/9292/8885/g" test_client.py
ln -s /root/.cache/dist_data/serving/criteo_ctr_with_cube/raw_data ./
wget https://paddle-serving.bj.bcebos.com/criteo_ctr_example/criteo_ctr_demo_model.tar.gz >/dev/null 2>&1
tar xf criteo_ctr_demo_model.tar.gz
mv models/ctr_client_conf .
mv models/ctr_serving_model .
python3.6 -m paddle_serving_server.serve --model ctr_serving_model/ --port 8885 &
sleep 5
python3.6 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0
check_result $FUNCNAME
kill_server_process serving
}
function criteo_ctr_rpc_gpu() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/criteo_ctr
sed -i "s/8885/8886/g" test_client.py
wget https://paddle-serving.bj.bcebos.com/criteo_ctr_example/criteo_ctr_demo_model.tar.gz >/dev/null 2>&1
python3.6 -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 8886 --gpu_ids 0 &
sleep 5
nvidia-smi
python3.6 test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/
nvidia-smi
check_result $FUNCNAME
kill $(ps -ef | grep ctr | grep -v grep | awk '{print $2}')
kill_server_process serving
}
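# YOLOv4 RPC test on GPU, port 8887.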
function yolov4_rpc_gpu() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/yolov4
sed -i "s/9393/8887/g" test_client.py
cp -r /root/.cache/dist_data/serving/yolov4/yolov4.tar.gz ./
tar xf yolov4.tar.gz
python3.6 -m paddle_serving_server_gpu.serve --model yolov4_model --port 8887 --gpu_ids 0 &
nvidia-smi
sleep 5
python3.6 test_client.py 000000570688.jpg
nvidia-smi
# check_result $FUNCNAME
kill_server_process serving
}
function senta_rpc_cpu() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/senta
sed -i "s/9393/8887/g" test_client.py
cp -r /data/.cache/dist_data/serving/yolov4/yolov4.tar.gz ./
tar xf yolov4.tar.gz
python3.6 -m paddle_serving_server_gpu.serve --model yolov4_model --port 8887 --gpu_ids 0 &
nvidia-smi
sleep 5
python3.6 test_client.py 000000570688.jpg
nvidia-smi
check_result $FUNCNAME
kill_server_process serving
}
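# HTTP web service tests: each starts a web server and checks its prediction endpoint with curl.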
function fit_a_line_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/fit_a_line
sed -i "s/9292/8871/g" test_server.py
python3.6 test_server.py &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' http://${host}:8871/uci/prediction
check_result $FUNCNAME
kill_server_process test_server
}
function lac_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/lac
python3.6 lac_web_service.py lac_model/ lac_workdir 8872 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "我爱北京天安门"}], "fetch":["word_seg"]}' http://${host}:8872/lac/prediction
check_result $FUNCNAME
kill_server_process lac_web_service
}
function cnn_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
python3.6 text_classify_service.py imdb_cnn_model/ workdir/ 8873 imdb.vocab &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://${host}:8873/imdb/prediction
check_result $FUNCNAME
kill_server_process text_classify_service
}
function bow_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
python3.6 text_classify_service.py imdb_bow_model/ workdir/ 8874 imdb.vocab &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://${host}:8874/imdb/prediction
check_result $FUNCNAME
kill_server_process text_classify_service
}
function lstm_http() {
unsetproxy
run_cpu_env
cd ${build_path}/python/examples/imdb
python3.6 text_classify_service.py imdb_lstm_model/ workdir/ 8875 imdb.vocab &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "i am very sad | 0"}], "fetch":["prediction"]}' http://${host}:8875/imdb/prediction
check_result $FUNCNAME
kill $(ps -ef | grep imdb | grep -v grep | awk '{print $2}')
kill_server_process text_classify_service
}
function ResNet50_http() {
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/imagenet
python3.6 resnet50_web_service.py ResNet50_vd_model gpu 8876 &
sleep 10
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"image": "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"}], "fetch": ["score"]}' http://${host}:8876/image/prediction
check_result $FUNCNAME
kill_server_process resnet50_web_service
}
function bert_http(){
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/bert
cp data-c.txt.1 data-c.txt
cp vocab.txt.1 vocab.txt
export CUDA_VISIBLE_DEVICES=0
python3.6 bert_web_service.py bert_seq128_model/ 8878 &
sleep 5
curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"words": "hello"}], "fetch":["pooled_output"]}' http://127.0.0.1:8878/bert/prediction
check_result $FUNCNAME
kill_server_process bert_web_service
}
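# gRPC client examples for fit_a_line: sync, async, batch and timeout clients against a local test server.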
function grpc_impl(){
unsetproxy
run_gpu_env
cd ${build_path}/python/examples/grpc_impl_example/fit_a_line
sh get_data.sh >/dev/null 2>&1
python3.6 test_server.py uci_housing_model/ &
sleep 5
echo "sync predict"
python3.6 test_sync_client.py
echo "async predict"
python3.6 test_asyn_client.py
echo "batch predict"
python3.6 test_batch_client.py
echo "timeout predict"
python3.6 test_timeout_client.py
# check_result $FUNCNAME
kill_server_process test_server
}
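# Driver helpers: iterate over build_whl_list, rpc_model_list and http_model_list and invoke each entry.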
function build_all_whl(){
for whl in ${build_whl_list[@]}
do
echo "===========${whl} begin build==========="
$whl
sleep 3
echo "===========${whl} build over ==========="
done
}
function run_rpc_models(){
for model in ${rpc_model_list[@]}
do
echo "===========${model} run begin==========="
$model
sleep 3
echo "===========${model} run end ==========="
done
}
function run_http_models(){
for model in ${http_model_list[@]}
do
echo "===========${model} run begin==========="
$model
sleep 3
echo "===========${model} run end ==========="
done
}
function end_hook(){
cd ${build_path}
kill_server_process
kill $(ps -ef | grep python | grep -v grep | awk '{print $2}')
sleep 5
echo "===========files==========="
ls -hlst
echo "=========== end ==========="
}
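# Entry point: build the wheels, prepare the environment, run the RPC model tests and clean up.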
function main() {
before_hook
build_all_whl
check
run_env
run_rpc_models
# run_http_models
end_hook
}
main "$@"