Merge pull request #32 from PaddlePaddle/develop

Sync codes

Merge pull request #32 from PaddlePaddle/develop
Sync codes
7b77852b · TeslaZhao · GitHub · d85a7733 · 592fe770 · 7b77852b
119 changed files
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ We consider deploying deep learning inference service online to be a user-facing

 <h2 align="center">AIStudio Turorial</h2>

-Here we provide tutorial on AIStudio(Chinese Version) [AIStudio教程-Paddle Serving服务化部署框架](https://aistudio.baidu.com/aistudio/projectdetail/1550674)
+Here we provide a tutorial on AIStudio (Chinese version): [AIStudio教程-Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/1555945)

 The tutorial provides 
 <ul>
@@ -85,14 +85,14 @@ We **highly recommend** you to **run Paddle Serving in Docker**, please visit [R
 ```
 # Run CPU Docker
 docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
+docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel bash
 docker exec -it test bash
 git clone https://github.com/PaddlePaddle/Serving
 ```
 ```
 # Run GPU Docker
 nvidia-docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
-nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
+nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel bash
 nvidia-docker exec -it test bash
 git clone https://github.com/PaddlePaddle/Serving
 ```

--- a/README_CN.md
+++ b/README_CN.md
@@ -53,7 +53,7 @@ Paddle Serving 旨在帮助深度学习开发者轻易部署在线预测服务

 <h2 align="center">教程</h2>

-Paddle Serving开发者为您提供了简单易用的[AIStudio教程-Paddle Serving服务化部署框架](https://aistudio.baidu.com/aistudio/projectdetail/1550674)
+Paddle Serving开发者为您提供了简单易用的[AIStudio教程-Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/1555945)

 教程提供了如下内容

@@ -86,14 +86,14 @@ Paddle Serving开发者为您提供了简单易用的[AIStudio教程-Paddle Serv
 ```
 # 启动 CPU Docker
 docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel
+docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-devel bash
 docker exec -it test bash
 git clone https://github.com/PaddlePaddle/Serving
 ```
 ```
 # 启动 GPU Docker
 nvidia-docker pull registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
-nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel
+nvidia-docker run -p 9292:9292 --name test -dit registry.baidubce.com/paddlepaddle/serving:0.5.0-cuda10.2-cudnn8-devel bash
 nvidia-docker exec -it test bash
 git clone https://github.com/PaddlePaddle/Serving
 ```

--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -18,7 +18,7 @@ SET(PADDLE_SOURCES_DIR ${THIRD_PARTY_PATH}/Paddle)
 SET(PADDLE_DOWNLOAD_DIR ${PADDLE_SOURCES_DIR}/src/extern_paddle)
 SET(PADDLE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/Paddle/)
 SET(PADDLE_INCLUDE_DIR "${PADDLE_INSTALL_DIR}/include" CACHE PATH "PaddlePaddle include directory." FORCE)
-SET(PADDLE_LIBRARIES "${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a" CACHE FILEPATH "Paddle library." FORCE)
+SET(PADDLE_LIBRARIES "${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a" CACHE FILEPATH "Paddle library." FORCE)

 message("paddle install dir: " ${PADDLE_INSTALL_DIR})

@@ -31,7 +31,7 @@ message( "WITH_GPU = ${WITH_GPU}")
 # Paddle Version should be one of:
 # latest: latest develop build
 # version number like 1.5.2
-SET(PADDLE_VERSION "2.0.0")
+SET(PADDLE_VERSION "2.0.1")
 if (WITH_GPU)
    if(CUDA_VERSION EQUAL 11.0)
        set(CUDA_SUFFIX "cuda11-cudnn8-avx-mkl")
@@ -55,9 +55,9 @@ if (WITH_GPU)
    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-${CUDA_SUFFIX}")
 elseif (WITH_LITE)
    if (WITH_XPU)
-        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm-xpu")
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}-xpu")
    else()
-        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-arm")
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}")
    endif()
 else()
    if (WITH_AVX)
@@ -139,8 +139,8 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
 ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)

-ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a)
+ADD_LIBRARY(paddle_inference STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET paddle_inference PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_inference.a)

 if (WITH_TRT)
    ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)

--- a/core/configure/CMakeLists.txt
+++ b/core/configure/CMakeLists.txt
 if (SERVER OR CLIENT)
-LIST(APPEND protofiles
-        ${CMAKE_CURRENT_LIST_DIR}/proto/server_configure.proto
-        ${CMAKE_CURRENT_LIST_DIR}/proto/sdk_configure.proto
-        ${CMAKE_CURRENT_LIST_DIR}/proto/inferencer_configure.proto
-	${CMAKE_CURRENT_LIST_DIR}/proto/general_model_config.proto
-)
-
-PROTOBUF_GENERATE_CPP(configure_proto_srcs configure_proto_hdrs ${protofiles})
-list(APPEND configure_srcs ${configure_proto_srcs})
-
-list(APPEND configure_srcs ${CMAKE_CURRENT_LIST_DIR}/src/configure_parser.cpp)
-
-add_library(configure ${configure_srcs})
-add_dependencies(configure brpc)
-
-install(TARGETS configure 
-        ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
-        )
-
-install(FILES ${CMAKE_CURRENT_LIST_DIR}/include/configure_parser.h
-        DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure/include)
-
-FILE(GLOB inc ${CMAKE_CURRENT_BINARY_DIR}/*.pb.h)
-
-install(FILES ${inc}
-        DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure)
+  # Build the `configure` library from the protobuf-generated sources plus the
+  # hand-written parser, then install the archive and its headers.
+  list(APPEND protofiles
+    ${CMAKE_CURRENT_LIST_DIR}/proto/server_configure.proto
+    ${CMAKE_CURRENT_LIST_DIR}/proto/sdk_configure.proto
+    ${CMAKE_CURRENT_LIST_DIR}/proto/inferencer_configure.proto
+    ${CMAKE_CURRENT_LIST_DIR}/proto/general_model_config.proto)
+
+  # Generated .pb.cc files go first in the source list, followed by the parser.
+  protobuf_generate_cpp(configure_proto_srcs configure_proto_hdrs ${protofiles})
+  list(APPEND configure_srcs
+    ${configure_proto_srcs}
+    ${CMAKE_CURRENT_LIST_DIR}/src/configure_parser.cpp)
+
+  add_library(configure ${configure_srcs})
+  # Build-order dependency only: brpc must be built before `configure`.
+  add_dependencies(configure brpc)
+
+  install(
+    TARGETS configure
+    ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib)
+
+  install(
+    FILES ${CMAKE_CURRENT_LIST_DIR}/include/configure_parser.h
+    DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure/include)
+
+  # NOTE(review): this glob is evaluated at configure time, so it only picks up
+  # *.pb.h files that already exist in the build directory -- confirm the
+  # generated headers are present before the install list is computed.
+  file(GLOB inc ${CMAKE_CURRENT_BINARY_DIR}/*.pb.h)
+
+  install(
+    FILES ${inc}
+    DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/configure)
 endif()

 if (WITH_PYTHON)

-py_proto_compile(general_model_config_py_proto SRCS proto/general_model_config.proto)
-add_custom_target(general_model_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-add_dependencies(general_model_config_py_proto general_model_config_py_proto_init)
-
-py_grpc_proto_compile(multi_lang_general_model_service_py_proto SRCS proto/multi_lang_general_model_service.proto)
-add_custom_target(multi_lang_general_model_service_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-add_dependencies(multi_lang_general_model_service_py_proto multi_lang_general_model_service_py_proto_init)
-
-if (CLIENT)
-py_proto_compile(sdk_configure_py_proto SRCS proto/sdk_configure.proto)
-add_custom_target(sdk_configure_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
-add_custom_command(TARGET sdk_configure_py_proto POST_BUILD
-		COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-		COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-		COMMENT "Copy generated python proto into directory paddle_serving_client/proto."
-		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-
-add_custom_command(TARGET general_model_config_py_proto POST_BUILD
-                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
-                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-
-add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
-                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
-                COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
-                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
-
-if (APP)
-add_custom_command(TARGET general_model_config_py_proto POST_BUILD
-                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
-                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
-                COMMENT "Copy generated general_model_config proto file into directory paddle_serving_app/proto."
-                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
-
-if (SERVER)
-py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
-add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-add_dependencies(server_config_py_proto server_config_py_proto_init)
-if (NOT WITH_GPU AND NOT WITH_LITE)
-add_custom_command(TARGET server_config_py_proto POST_BUILD
-		COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-		COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-		COMMENT "Copy generated python proto into directory paddle_serving_server/proto."
-		WORKING_DIRECTORY ${CMAKE_CURRENT_BINRARY_DIR})
-
-add_custom_command(TARGET general_model_config_py_proto POST_BUILD
-		COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-		COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-		COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
-		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-
-add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
-                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
-                COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
-                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-else()
-add_custom_command(TARGET server_config_py_proto POST_BUILD
-		COMMAND ${CMAKE_COMMAND} -E make_directory
-        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-		COMMAND cp -f *.py
-        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-		COMMENT "Copy generated python proto into directory
-        paddle_serving_server_gpu/proto."
-		WORKING_DIRECTORY ${CMAKE_CURRENT_BINRARY_DIR})
-
-add_custom_command(TARGET general_model_config_py_proto POST_BUILD
-		COMMAND ${CMAKE_COMMAND} -E make_directory
-        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-		COMMAND cp -f *.py
-        ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-		COMMENT "Copy generated general_model_config proto file into directory
-        paddle_serving_server_gpu/proto."
-		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-
-add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
-                COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-                COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto
-                COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server_gpu/proto."
-                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
-endif()
+  # Python proto targets shared by every Python package built below.
+  # py_proto_compile / py_grpc_proto_compile are project helpers defined
+  # elsewhere; each call here declares the named target from one .proto file.
+  # The matching *_init target runs on every build (ALL) and touches
+  # __init__.py; the proto target is made to depend on it.
+  py_proto_compile(general_model_config_py_proto
+    SRCS proto/general_model_config.proto)
+  add_custom_target(general_model_config_py_proto_init ALL
+    COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+  add_dependencies(general_model_config_py_proto
+    general_model_config_py_proto_init)
+
+  py_grpc_proto_compile(multi_lang_general_model_service_py_proto
+    SRCS proto/multi_lang_general_model_service.proto)
+  add_custom_target(multi_lang_general_model_service_py_proto_init ALL
+    COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+  add_dependencies(multi_lang_general_model_service_py_proto
+    multi_lang_general_model_service_py_proto_init)
+  
+  if (CLIENT)
+    # Client-only proto target and its __init__.py helper target.
+    py_proto_compile(sdk_configure_py_proto
+      SRCS proto/sdk_configure.proto)
+    add_custom_target(sdk_configure_py_proto_init ALL
+      COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+    add_dependencies(sdk_configure_py_proto sdk_configure_py_proto_init)
+
+    # After each proto target is built, create the paddle_serving_client/proto
+    # directory and copy every *.py file found in this build directory into it.
+    add_custom_command(
+      TARGET sdk_configure_py_proto POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+      COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+      COMMENT "Copy generated python proto into directory paddle_serving_client/proto."
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+    add_custom_command(
+      TARGET general_model_config_py_proto POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+      COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+      COMMENT "Copy generated general_model_config proto file into directory paddle_serving_client/proto."
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+    add_custom_command(
+      TARGET multi_lang_general_model_service_py_proto POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+      COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto
+      COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_client/proto."
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+  
+  if (APP)
+    # After general_model_config_py_proto is built, create the
+    # paddle_serving_app/proto directory and copy every *.py file found in
+    # this build directory into it.
+    add_custom_command(
+      TARGET general_model_config_py_proto POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
+      COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_app/proto
+      COMMENT "Copy generated general_model_config proto file into directory paddle_serving_app/proto."
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+  
+  if (SERVER)
+    # Server-only proto target and its __init__.py helper target.
+    py_proto_compile(server_config_py_proto SRCS proto/server_configure.proto)
+    add_custom_target(server_config_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+    add_dependencies(server_config_py_proto server_config_py_proto_init)
+
+    # After each proto target is built, create the paddle_serving_server/proto
+    # directory and copy every *.py file found in this build directory into it.
+    #
+    # The working directory below used to be spelled CMAKE_CURRENT_BINRARY_DIR,
+    # an undefined variable that expands to nothing; the command then relied on
+    # CMake's default working directory. It is now named explicitly, matching
+    # the sibling commands.
+    add_custom_command(TARGET server_config_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+        COMMENT "Copy generated python proto into directory paddle_serving_server/proto."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+    add_custom_command(TARGET general_model_config_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+        COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+    add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+        COMMAND cp -f *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
+        COMMENT "Copy generated multi_lang_general_model_service proto file into directory paddle_serving_server/proto."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()

 endif()
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -20,7 +20,7 @@ message EngineDesc {
  required string type = 2;
  required string reloadable_meta = 3;
  required string reloadable_type = 4;
-  required string model_data_path = 5;
+  required string model_dir = 5;
  required int32 runtime_thread_num = 6;
  required int32 batch_infer_size = 7;
  required int32 enable_batch_align = 8;
@@ -41,12 +41,13 @@ message EngineDesc {
  optional SparseParamServiceType sparse_param_service_type = 11;
  optional string sparse_param_service_table_name = 12;
  optional bool enable_memory_optimization = 13;
-  optional bool static_optimization = 14;
-  optional bool force_update_static_cache = 15;
-  optional bool enable_ir_optimization = 16;
-  optional bool use_trt = 17;
-  optional bool use_lite = 18;
-  optional bool use_xpu = 19;
+  optional bool enable_ir_optimization = 14;
+  optional bool use_trt = 15;
+  optional bool use_lite = 16;
+  optional bool use_xpu = 17;
+  optional bool use_gpu = 18;
+  optional bool combined_model = 19;
+  optional bool encrypted_model = 20;
 };

 // model_toolkit conf

--- a/core/configure/tests/test_configure.cpp
+++ b/core/configure/tests/test_configure.cpp
@@ -69,8 +69,6 @@ int test_write_conf() {
  engine->set_sparse_param_service_type(EngineDesc::LOCAL);
  engine->set_sparse_param_service_table_name("local_kv");
  engine->set_enable_memory_optimization(true);
-  engine->set_static_optimization(false);
-  engine->set_force_update_static_cache(false);

  int ret = baidu::paddle_serving::configure::write_proto_conf(
      &model_toolkit_conf, output_dir, model_toolkit_conf_file);

--- a/core/general-client/CMakeLists.txt
+++ b/core/general-client/CMakeLists.txt
 if(CLIENT)
 add_subdirectory(pybind11)
 pybind11_add_module(serving_client src/general_model.cpp src/pybind_general_model.cpp)
-add_dependencies(serving_client sdk_cpp)
 target_link_libraries(serving_client PRIVATE -Wl,--whole-archive utils sdk-cpp pybind python -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -Wl,-rpath,'$ORIGIN'/lib)
 endif()
--- a/core/general-server/CMakeLists.txt
+++ b/core/general-server/CMakeLists.txt
@@ -2,33 +2,25 @@ include_directories(SYSTEM  ${CMAKE_CURRENT_LIST_DIR}/../../)
 include(op/CMakeLists.txt)
 include(proto/CMakeLists.txt)
 add_executable(serving ${serving_srcs})
-add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid cube-api utils)
+add_dependencies(serving pdcodegen paddle_inference_engine pdserving paddle_inference cube-api utils)
+
 if (WITH_GPU)
-    add_dependencies(serving fluid_gpu_engine)
+    add_dependencies(serving paddle_inference_engine)
 endif()

 if (WITH_LITE)
-    add_dependencies(serving fluid_arm_engine)
+    add_dependencies(serving paddle_inference_engine)
 endif()

 target_include_directories(serving PUBLIC
        ${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
-        )
-    include_directories(${CUDNN_ROOT}/include/)
-if(WITH_GPU)
-    target_link_libraries(serving -Wl,--whole-archive fluid_gpu_engine
-            -Wl,--no-whole-archive)
-endif()
-
-if(WITH_LITE)
-    target_link_libraries(serving -Wl,--whole-archive fluid_arm_engine
-            -Wl,--no-whole-archive)
-endif()
+)
+include_directories(${CUDNN_ROOT}/include/)

-target_link_libraries(serving -Wl,--whole-archive fluid_cpu_engine
+target_link_libraries(serving -Wl,--whole-archive paddle_inference_engine
        -Wl,--no-whole-archive)

-target_link_libraries(serving paddle_fluid ${paddle_depend_libs})
+target_link_libraries(serving paddle_inference ${paddle_depend_libs})
 target_link_libraries(serving brpc)
 target_link_libraries(serving protobuf)
 target_link_libraries(serving pdserving)

--- a/core/predictor/CMakeLists.txt
+++ b/core/predictor/CMakeLists.txt
@@ -12,12 +12,12 @@ set_source_files_properties(
        ${pdserving_srcs}
        PROPERTIES
        COMPILE_FLAGS  "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure extern_paddle paddle_fluid)
+add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure extern_paddle paddle_inference)
 if (WITH_TRT)
    add_definitions(-DWITH_TRT)
 endif()
 target_link_libraries(pdserving
-        brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz paddle_fluid ${paddle_depend_libs})
+        brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz paddle_inference ${paddle_depend_libs})
 # install
 install(TARGETS pdserving
        RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin

--- a/core/predictor/common/utils.h
+++ b/core/predictor/common/utils.h
@@ -14,6 +14,7 @@

 #pragma once
 #include <string>
+#include <fstream>
 #include "core/predictor/common/inner_common.h"
 #include "core/predictor/common/macros.h"

@@ -148,6 +149,16 @@ class IsDerivedFrom {
  }
 };

+// Reads the entire file `filename` into `*contents` as raw bytes.
+//
+// `*contents` is always cleared first. It is left empty when the file cannot
+// be opened, when its size cannot be determined, or when the file is empty.
+// If fewer bytes than expected are read, `*contents` is truncated to the
+// number of bytes actually read.
+static void ReadBinaryFile(const std::string& filename, std::string* contents) {
+  contents->clear();
+
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  if (!fin.is_open()) {
+    return;
+  }
+
+  // Seek to the end to learn the size. tellg() reports -1 on failure, which
+  // must not be fed to resize().
+  fin.seekg(0, std::ios::end);
+  const std::streamoff size = fin.tellg();
+  if (size <= 0) {
+    return;
+  }
+
+  contents->resize(static_cast<size_t>(size));
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(*contents)[0], static_cast<std::streamsize>(size));
+
+  // Drop any unread tail so callers never see padding bytes after a short read.
+  contents->resize(static_cast<size_t>(fin.gcount()));
+  // The stream closes itself when `fin` goes out of scope.
+}
+
 }  // namespace predictor
 }  // namespace paddle_serving
 }  // namespace baidu
--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
@@ -16,6 +16,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <pthread.h>
 #include <string>
 #include <utility>
 #include <vector>
@@ -29,83 +30,29 @@ namespace predictor {

 using configure::ModelToolkitConf;

-class InferEngineCreationParams {
+class AutoLock {
 public:
-  InferEngineCreationParams() {
-    _path = "";
-    _enable_memory_optimization = false;
-    _enable_ir_optimization = false;
-    _static_optimization = false;
-    _force_update_static_cache = false;
-    _use_trt = false;
-    _use_lite = false;
-    _use_xpu = false;
+  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
+    pthread_mutex_lock(&mutex);
  }
+  ~AutoLock() { pthread_mutex_unlock(&_mut); }

-  void set_path(const std::string& path) { _path = path; }
-
-  void set_enable_memory_optimization(bool enable_memory_optimization) {
-    _enable_memory_optimization = enable_memory_optimization;
-  }
-
-  void set_enable_ir_optimization(bool enable_ir_optimization) {
-    _enable_ir_optimization = enable_ir_optimization;
-  }
-
-  void set_use_trt(bool use_trt) { _use_trt = use_trt; }
-
-  void set_use_lite(bool use_lite) { _use_lite = use_lite; }
-
-  void set_use_xpu(bool use_xpu) { _use_xpu = use_xpu; }
-
-  bool enable_memory_optimization() const {
-    return _enable_memory_optimization;
-  }
-
-  bool enable_ir_optimization() const { return _enable_ir_optimization; }
-
-  bool use_trt() const { return _use_trt; }
-
-  bool use_lite() const { return _use_lite; }
-
-  bool use_xpu() const { return _use_xpu; }
-
-  void set_static_optimization(bool static_optimization = false) {
-    _static_optimization = static_optimization;
-  }
-
-  void set_force_update_static_cache(bool force_update_static_cache = false) {
-    _force_update_static_cache = force_update_static_cache;
-  }
-
-  bool static_optimization() const { return _static_optimization; }
-
-  bool force_update_static_cache() const { return _force_update_static_cache; }
+ private:
+  pthread_mutex_t& _mut;
+};

-  std::string get_path() const { return _path; }
+class GlobalCreateMutex {
+ public:
+  pthread_mutex_t& mutex() { return _mut; }

-  void dump() const {
-    LOG(INFO) << "InferEngineCreationParams: "
-              << "model_path = " << _path << ", "
-              << "enable_memory_optimization = " << _enable_memory_optimization
-              << ", "
-              << "enable_tensorrt = " << _use_trt << ", "
-              << "enable_lite = " << _use_lite << ", "
-              << "enable_xpu = " << _use_xpu << ", "
-              << "enable_ir_optimization = " << _enable_ir_optimization << ", "
-              << "static_optimization = " << _static_optimization << ", "
-              << "force_update_static_cache = " << _force_update_static_cache;
+  static pthread_mutex_t& instance() {
+    static GlobalCreateMutex gmutex;
+    return gmutex.mutex();
  }

 private:
-  std::string _path;
-  bool _enable_memory_optimization;
-  bool _enable_ir_optimization;
-  bool _static_optimization;
-  bool _force_update_static_cache;
-  bool _use_trt;
-  bool _use_lite;
-  bool _use_xpu;
+  GlobalCreateMutex() { pthread_mutex_init(&_mut, NULL); }
+  pthread_mutex_t _mut;
 };

 class InferEngine {
@@ -152,57 +99,19 @@ class ReloadableInferEngine : public InferEngine {
    uint64_t last_revision;
  };

-  virtual int load(const InferEngineCreationParams& params) = 0;
+  virtual int load(const configure::EngineDesc& conf) = 0;

  int proc_initialize_impl(const configure::EngineDesc& conf, bool version) {
    _reload_tag_file = conf.reloadable_meta();
    _reload_mode_tag = conf.reloadable_type();
-    _model_data_path = conf.model_data_path();
+    _model_data_path = conf.model_dir();
    _infer_thread_num = conf.runtime_thread_num();
    _infer_batch_size = conf.batch_infer_size();
    _infer_batch_align = conf.enable_batch_align();

-    bool enable_memory_optimization = false;
-    if (conf.has_enable_memory_optimization()) {
-      enable_memory_optimization = conf.enable_memory_optimization();
-    }
-
-    bool static_optimization = false;
-    if (conf.has_static_optimization()) {
-      static_optimization = conf.static_optimization();
-    }
-
-    bool force_update_static_cache = false;
-    if (conf.has_force_update_static_cache()) {
-      force_update_static_cache = conf.force_update_static_cache();
-    }
+    _conf = conf;

-    if (conf.has_enable_ir_optimization()) {
-      _infer_engine_params.set_enable_ir_optimization(
-          conf.enable_ir_optimization());
-    }
-
-    _infer_engine_params.set_path(_model_data_path);
-    if (enable_memory_optimization) {
-      _infer_engine_params.set_enable_memory_optimization(true);
-      _infer_engine_params.set_static_optimization(static_optimization);
-      _infer_engine_params.set_force_update_static_cache(
-          force_update_static_cache);
-    }
-
-    if (conf.has_use_trt()) {
-      _infer_engine_params.set_use_trt(conf.use_trt());
-    }
-
-    if (conf.has_use_lite()) {
-      _infer_engine_params.set_use_lite(conf.use_lite());
-    }
-
-    if (conf.has_use_xpu()) {
-      _infer_engine_params.set_use_xpu(conf.use_xpu());
-    }
-
-    if (!check_need_reload() || load(_infer_engine_params) != 0) {
+    if (!check_need_reload() || load(conf) != 0) {
      LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
      return -1;
    }
@@ -230,7 +139,6 @@ class ReloadableInferEngine : public InferEngine {
    if (_infer_thread_num > 0) {
      return 0;
    }
-
    return thrd_initialize_impl();
  }

@@ -254,13 +162,13 @@ class ReloadableInferEngine : public InferEngine {
  int reload() {
    if (check_need_reload()) {
      LOG(WARNING) << "begin reload model[" << _model_data_path << "].";
-      return load(_infer_engine_params);
+      return load(_conf);
    }
    return 0;
  }

  uint64_t version() const { return _version; }
-
+  
  uint32_t thread_num() const { return _infer_thread_num; }

 private:
@@ -322,7 +230,7 @@ class ReloadableInferEngine : public InferEngine {

 protected:
  std::string _model_data_path;
-  InferEngineCreationParams _infer_engine_params;
+  configure::EngineDesc _conf;

 private:
  std::string _reload_tag_file;
@@ -361,25 +269,25 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
    return ReloadableInferEngine::proc_initialize(conf, version);
  }

-  virtual int load(const InferEngineCreationParams& params) {
+  virtual int load(const configure::EngineDesc& conf) {
    if (_reload_vec.empty()) {
      return 0;
    }

    for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) {
-      if (load_data(_reload_vec[ti], params) != 0) {
+      if (load_data(_reload_vec[ti], conf) != 0) {
        LOG(ERROR) << "Failed reload engine model: " << ti;
        return -1;
      }
    }

-    LOG(WARNING) << "Succ load engine, path: " << params.get_path();
+    LOG(WARNING) << "Succ load engine, path: " << conf.model_dir();

    return 0;
  }

  int load_data(ModelData<EngineCore>* md,
-                const InferEngineCreationParams& params) {
+                const configure::EngineDesc& conf) {
    uint32_t next_idx = (md->current_idx + 1) % 2;
    if (md->cores[next_idx]) {
      delete md->cores[next_idx];
@@ -387,9 +295,9 @@ class DBReloadableInferEngine : public ReloadableInferEngine {

    md->cores[next_idx] = new (std::nothrow) EngineCore;

-    params.dump();
-    if (!md->cores[next_idx] || md->cores[next_idx]->create(params) != 0) {
-      LOG(ERROR) << "Failed create model, path: " << params.get_path();
+    //params.dump();
+    if (!md->cores[next_idx] || md->cores[next_idx]->create(conf) != 0) {
+      LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
      return -1;
    }
    md->current_idx = next_idx;
@@ -400,9 +308,9 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
    // memory pool to be inited in non-serving-threads

    ModelData<EngineCore>* md = new (std::nothrow) ModelData<EngineCore>;
-    if (!md || load_data(md, _infer_engine_params) != 0) {
+    if (!md || load_data(md, _conf) != 0) {
      LOG(ERROR) << "Failed create thread data from "
-                 << _infer_engine_params.get_path();
+                 << _conf.model_dir();
      return -1;
    }

@@ -458,16 +366,16 @@ class CloneDBReloadableInferEngine
    return DBReloadableInferEngine<EngineCore>::proc_initialize(conf, version);
  }

-  virtual int load(const InferEngineCreationParams& params) {
+  virtual int load(const configure::EngineDesc& conf) {
    // 加载进程级模型数据
    if (!_pd ||
-        DBReloadableInferEngine<EngineCore>::load_data(_pd, params) != 0) {
-      LOG(ERROR) << "Failed to create common model from [" << params.get_path()
+        DBReloadableInferEngine<EngineCore>::load_data(_pd, conf) != 0) {
+      LOG(ERROR) << "Failed to create common model from [" << conf.model_dir()
                 << "].";
      return -1;
    }
    LOG(WARNING) << "Succ load common model[" << _pd->cores[_pd->current_idx]
-                 << "], path[" << params.get_path() << "].";
+                 << "], path[" << conf.model_dir() << "].";

    if (DBReloadableInferEngine<EngineCore>::_reload_vec.empty()) {
      return 0;
@@ -483,7 +391,7 @@ class CloneDBReloadableInferEngine
      }
    }

-    LOG(WARNING) << "Succ load clone model, path[" << params.get_path() << "]";
+    LOG(WARNING) << "Succ load clone model, path[" << conf.model_dir() << "]";

    return 0;
  }
@@ -527,18 +435,18 @@ class CloneDBReloadableInferEngine
      _pd;  // 进程级EngineCore，多个线程级EngineCore共用该对象的模型数据
 };

-template <typename FluidFamilyCore>
+template <typename PaddleInferenceCore>
 #ifdef WITH_TRT
-class FluidInferEngine : public DBReloadableInferEngine<FluidFamilyCore> {
+class FluidInferEngine : public DBReloadableInferEngine<PaddleInferenceCore> {
 #else
-class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
+class FluidInferEngine : public CloneDBReloadableInferEngine<PaddleInferenceCore> {
 #endif
 public:  // NOLINT
  FluidInferEngine() {}
  ~FluidInferEngine() {}
  std::vector<std::string> GetInputNames() {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
    }
@@ -546,8 +454,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
  }

  std::vector<std::string> GetOutputNames() {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
    }
@@ -556,8 +464,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {

  std::unique_ptr<paddle_infer::Tensor> GetInputHandle(
      const std::string& name) {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetInputHandle()";
    }
@@ -566,8 +474,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {

  std::unique_ptr<paddle_infer::Tensor> GetOutputHandle(
      const std::string& name) {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in GetOutputHandle()";
    }
@@ -575,8 +483,8 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
  }

  int infer_impl() {
-    FluidFamilyCore* core =
-        DBReloadableInferEngine<FluidFamilyCore>::get_core();
+    PaddleInferenceCore* core =
+        DBReloadableInferEngine<PaddleInferenceCore>::get_core();
    if (!core || !core->get()) {
      LOG(ERROR) << "Failed get fluid core in infer_impl()";
      return -1;

--- a/doc/COMPILE.md
+++ b/doc/COMPILE.md
@@ -77,7 +77,7 @@ export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.8
 ## Install Python dependencies

 ```shell
-pip install -r python/requirements.txt
+pip install -r python/requirements.txt -i https://mirror.baidu.com/pypi/simple
 ```

 If you use other Python version, please use the right `pip` accordingly.
@@ -123,14 +123,13 @@ Compared with CPU environment, GPU environment needs to refer to the following t
 **It should be noted that the following table is used as a reference for non-Docker compilation environment. The Docker compilation environment has been configured with relevant parameters and does not need to be specified in cmake process. **

 | cmake environment variable | meaning | GPU environment considerations | whether Docker environment is needed |
-|-----------------------|------------------------- ------------|-------------------------------|----- ---------------|
-| CUDA_TOOLKIT_ROOT_DIR | cuda installation path, usually /usr/local/cuda | Required for all environments | No
-(/usr/local/cuda) |
+|-----------------------|-------------------------------------|-------------------------------|--------------------|
+| CUDA_TOOLKIT_ROOT_DIR | cuda installation path, usually /usr/local/cuda | Required for all environments | No (/usr/local/cuda) |
 | CUDNN_LIBRARY | The directory where libcudnn.so.* is located, usually /usr/local/cuda/lib64/ | Required for all environments | No (/usr/local/cuda/lib64/) |
 | CUDA_CUDART_LIBRARY | The directory where libcudart.so.* is located, usually /usr/local/cuda/lib64/ | Required for all environments | No (/usr/local/cuda/lib64/) |
 | TENSORRT_ROOT | The upper level directory of the directory where libnvinfer.so.* is located, depends on the TensorRT installation directory | Cuda 9.0/10.0 does not need, other needs | No (/usr) |

-If not in Docker environment, users can refer to the following execution methods. The specific path is subject to the current environment, and the code is only for reference.
+If not in Docker environment, users can refer to the following execution methods. The specific path is subject to the current environment, and the code is only for reference. TENSORRT_LIBRARY_PATH is related to the TensorRT version and should be set according to the actual situation. For example, in the cuda10.1 environment, the TensorRT version is 6.0 (/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/), and in the cuda10.2 environment, the TensorRT version is 7.1 (/usr/local/TensorRT-7.1.3.4/targets/x86_64-linux-gnu/).

 ``` shell
 export CUDA_PATH='/usr/local/cuda'
@@ -145,7 +144,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
    -DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH}
+    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
    -DSERVER=ON \
    -DWITH_GPU=ON ..
 make -j10

--- a/doc/COMPILE_CN.md
+++ b/doc/COMPILE_CN.md
@@ -76,7 +76,7 @@ export PYTHON_EXECUTABLE=$PYTHONROOT/bin/python3.8
 ## 安装Python依赖

 ```shell
-pip install -r python/requirements.txt
+pip install -r python/requirements.txt -i https://mirror.baidu.com/pypi/simple
 ```

 如果使用其他Python版本，请使用对应版本的`pip`。
@@ -128,7 +128,7 @@ make -j10
 | CUDA_CUDART_LIBRARY   | libcudart.so.*所在目录，通常为/usr/local/cuda/lib64/ | 全部环境都需要                | 否(/usr/local/cuda/lib64/)                 |
 | TENSORRT_ROOT         | libnvinfer.so.*所在目录的上一级目录，取决于TensorRT安装目录 | Cuda 9.0/10.0不需要，其他需要 | 否(/usr)                 |

-非Docker环境下，用户可以参考如下执行方式，具体的路径以当时环境为准，代码仅作为参考。
+非Docker环境下，用户可以参考如下执行方式，具体的路径以当时环境为准，代码仅作为参考。TENSORRT_LIBRARY_PATH和TensorRT版本有关，要根据实际情况设置。例如在cuda10.1环境下TensorRT版本是6.0(/usr/local/TensorRT-6.0.1.5/targets/x86_64-linux-gnu/)，在cuda10.2环境下TensorRT版本是7.1（/usr/local/TensorRT-7.1.3.4/targets/x86_64-linux-gnu/）。

 ``` shell
 export CUDA_PATH='/usr/local/cuda'
@@ -143,7 +143,7 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
    -DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH}
+    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
    -DSERVER=ON \
    -DWITH_GPU=ON ..
 make -j10
@@ -159,7 +159,7 @@ make -j10
 mkdir client-build && cd client-build
 cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
    -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-    -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \    
+    -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
    -DCLIENT=ON ..
 make -j10
 ```

--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -6,17 +6,17 @@

 #### Q: Paddle Serving 、Paddle Inference、PaddleHub Serving三者的区别及联系？

-**A:** paddle serving是远程服务，即发起预测的设备（手机、浏览器、客户端等）与实际预测的硬件不在一起。	paddle inference是一个library，适合嵌入到一个大系统中保证预测效率，paddle serving调用了paddle       inference做远程服务。paddlehub serving可以认为是一个示例，都会使用paddle serving作为统一预测服务入口。如果在web端交互，一般是调用远程服务的形式，可以使用paddle serving的web service搭建。
+**A:** paddle serving是远程服务，即发起预测的设备（手机、浏览器、客户端等）与实际预测的硬件不在一起。paddle inference是一个library，适合嵌入到一个大系统中保证预测效率，paddle serving调用了paddle inference做远程服务。paddlehub serving可以认为是一个示例，都会使用paddle serving作为统一预测服务入口。如果在web端交互，一般是调用远程服务的形式，可以使用paddle serving的web service搭建。

 #### Q: paddle-serving是否支持Int32支持

 **A:** 在protobuf定feed_type和fetch_type编号与数据类型对应如下

-     0-int64
-
-	  1-float32
-
-	  2-int32
+     0-int64
+    
+      1-float32
+    
+      2-int32

 #### Q: paddle-serving是否支持windows和Linux环境下的多线程调用 

@@ -37,6 +37,7 @@
 ## 安装问题

 #### Q: pip install安装whl包过程，报错信息如下：
+
 ```
 Collecting opencv-python
  Using cached opencv-python-4.3.0.38.tar.gz (88.0 MB)
@@ -69,9 +70,11 @@ Collecting opencv-python
      s = list(pattern)
  TypeError: 'NoneType' object is not iterable
 ```
+
 **A:** 指定opencv-python版本安装，pip install opencv-python==4.2.0.32，再安装whl包

 #### Q: pip3 install whl包过程报错信息如下：
+
 ```
    Complete output from command python setup.py egg_info:
    Found cython-generated files...
@@ -80,13 +83,16 @@ Collecting opencv-python
    ----------------------------------------
 Command "python setup.py egg_info" failed with error code 1 in /tmp/pip-install-taoxz02y/grpcio/
 ```
+
 **A:** 需要升级pip3，再重新执行安装命令。
+
 ```
 pip3 install --upgrade pip
 pip3 install --upgrade setuptools
 ```

 #### Q: 运行过程中报错，信息如下：
+
 ```
 Traceback (most recent call last):
  File "../../deploy/serving/test_client.py", line 18, in <module>
@@ -97,7 +103,9 @@ Traceback (most recent call last):
    from shapely.geometry import Polygon
 ImportError: No module named shapely.geometry
 ```
+
 **A:** 有2种方法，第一种通过pip/pip3安装shapely，第二种通过pip/pip3安装所有依赖组件。
+
 ```
 方法1：
 pip install shapely==1.7.0
@@ -116,7 +124,69 @@ pip install -r python/requirements.txt

 **A:** 没有安装JDK，或者JAVA_HOME路径配置错误（正确配置是JDK路径，常见错误配置成JRE路径，例如正确路径参考JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.262.b10-0.el7_8.x86_64/"）。Java JDK安装参考https://segmentfault.com/a/1190000015389941

+## 环境问题
+
+#### Q：使用过程中出现CXXABI错误。
+
+这个问题出现的原因是Python使用的gcc版本和Serving所需的gcc版本对不上。对于Docker用户，推荐使用[Docker容器](./RUN_IN_DOCKER_CN.md)，由于Docker容器内的Python版本与Serving在发布前都做过适配，这样就不会出现类似的错误。如果是其他开发环境，首先需要确保开发环境中具备GCC 8.2，如果没有gcc 8.2，参考安装方式
+
+```bash
+wget -q https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz 
+tar -xvf gcc-8.2.0.tar.xz && \
+cd gcc-8.2.0 && \
+unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \
+./contrib/download_prerequisites && \
+cd .. && mkdir temp_gcc82 && cd temp_gcc82 && \
+../gcc-8.2.0/configure --prefix=/usr/local/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \
+make -j8 && make install
+cd .. && rm -rf temp_gcc82
+cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} && 
+ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \
+ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \
+cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path}
+```
+
+假如已经有了GCC 8.2，可以自行安装Python，此外我们也提供了两个GCC 8.2编译的[Python2.7](https://paddle-serving.bj.bcebos.com/others/Python2.7.17-gcc82.tar) 和 [Python3.6](https://paddle-serving.bj.bcebos.com/others/Python3.6.10-gcc82.tar) 。下载解压后，需要将对应的目录设置为`PYTHONROOT`，并设置`PATH`和`LD_LIBRARY_PATH`。
+
+```bash
+export PYTHONROOT=/path/of/python # 对应解压后的Python目录
+export PATH=$PYTHONROOT/bin:$PATH
+export LD_LIBRARY_PATH=$PYTHONROOT/lib:$LD_LIBRARY_PATH
+```
+
+#### Q：遇到libstdc++.so.6的版本不够的问题
+
+触发该问题的原因在于，编译Paddle Serving相关可执行程序和动态库，所采用的是GCC 8.2(Cuda 9.0和10.0的Server可执行程序受限Cuda兼容性采用GCC 4.8编译)。Python在调用的过程中，有可能链接到了其他GCC版本的 `libstdc++.so`。 需要做的就是首先确保所在环境具备GCC 8.2，其次将GCC8.2的`libstdc++.so.*`拷贝到某个目录例如`/home/libstdcpp`下。最后`export LD_LIBRARY_PATH=/home/libstdcpp:$LD_LIBRARY_PATH` 即可。
+
+#### Q: 遇到OPENSSL_1.0.1EC 符号找不到的问题。
+
+目前Serving的可执行程序和客户端动态库需要链接1.0.2k版本的openssl动态库。如果环境当中没有，可以执行
+
+```bash
+wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && \
+    tar xf centos_ssl.tar && rm -rf centos_ssl.tar && \
+    mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && \
+    ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && \
+    ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && \
+    ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && \
+    ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so
+```
+
+其中`/usr/lib` 可以换成其他目录，并确保该目录在`LD_LIBRARY_PATH`下。

+### GPU相关环境问题
+
+#### Q：需要做哪些检查确保Serving可以运行在GPU环境
+
+**注：如果是使用Serving提供的镜像不需要做下列检查，如果是其他开发环境可以参考以下指导。**
+
+首先需要确保`nvidia-smi`可用，其次需要确保所需的动态库so文件在`LD_LIBRARY_PATH`所在的目录（包括系统lib库）。
+
+（1）Cuda显卡驱动：文件名通常为 `libcuda.so.$DRIVER_VERSION` 例如驱动版本为440.10.15，文件名就是`libcuda.so.440.10.15`。
+
+（2）Cuda和Cudnn动态库：文件名通常为 `libcudart.so.$CUDA_VERSION`，和 `libcudnn.so.$CUDNN_VERSION`。例如Cuda9就是 `libcudart.so.9.0`，Cudnn7就是 `libcudnn.so.7`。Cuda和Cudnn与Serving的版本匹配参见[Serving所有镜像列表](DOCKER_IMAGES_CN.md#%E9%99%84%E5%BD%95%E6%89%80%E6%9C%89%E9%95%9C%E5%83%8F%E5%88%97%E8%A1%A8).
+
+（3）Cuda10.1及更高版本需要TensorRT。安装TensorRT相关文件的脚本参考 [install_trt.sh](../tools/dockerfile/build_scripts/install_trt.sh).

 ## 部署问题

@@ -154,7 +224,7 @@ InvalidArgumentError: Device id must be less than GPU count, but received id is:

 **A:**:1)使用[GPU docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md#gpunvidia-docker)解决环境问题

-	   2)修改anaconda的虚拟环境下安装的python的gcc版本[参考](https://www.jianshu.com/p/c498b3d86f77) 
+       2)修改anaconda的虚拟环境下安装的python的gcc版本[参考](https://www.jianshu.com/p/c498b3d86f77) 

 #### Q: paddle-serving是否支持本地离线安装 

@@ -221,9 +291,10 @@ client端的日志直接打印到标准输出。

 **A:** 1)警告是glog组件打印的，告知glog初始化之前日志打印在STDERR

-	   2)一般采用GLOG_v方式启动服务同时设置日志级别。
+       2)一般采用GLOG_v方式启动服务同时设置日志级别。

 例如：
+
 ```
 GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999 
 ```

--- a/paddle_inference/CMakeLists.txt
+++ b/paddle_inference/CMakeLists.txt
@@ -13,13 +13,5 @@
 # limitations under the License

 if (NOT CLIENT_ONLY)
-    add_subdirectory(inferencer-fluid-cpu)
-    
-    if (WITH_GPU)
-        add_subdirectory(inferencer-fluid-gpu)
-    endif()
-    
-    if (WITH_LITE)
-        add_subdirectory(inferencer-fluid-arm)
-    endif()
+    add_subdirectory(paddle)
 endif()
--- a/paddle_inference/inferencer-fluid-arm/CMakeLists.txt
+++ b/paddle_inference/inferencer-fluid-arm/CMakeLists.txt
-FILE(GLOB fluid_arm_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
-add_library(fluid_arm_engine ${fluid_arm_engine_srcs})
-target_include_directories(fluid_arm_engine PUBLIC
-        ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
-add_dependencies(fluid_arm_engine pdserving extern_paddle configure)
-target_link_libraries(fluid_arm_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
-
-install(TARGETS fluid_arm_engine 
-        ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
-        )
--- a/paddle_inference/inferencer-fluid-cpu/CMakeLists.txt
+++ b/paddle_inference/inferencer-fluid-cpu/CMakeLists.txt
-FILE(GLOB fluid_cpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
-add_library(fluid_cpu_engine ${fluid_cpu_engine_srcs})
-target_include_directories(fluid_cpu_engine PUBLIC
-        ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
-add_dependencies(fluid_cpu_engine pdserving extern_paddle configure)
-target_link_libraries(fluid_cpu_engine pdserving paddle_fluid -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
-
-install(TARGETS fluid_cpu_engine 
-        ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
-        )
--- a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
+++ b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <pthread.h>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include "core/configure/include/configure_parser.h"
-#include "core/configure/inferencer_configure.pb.h"
-#include "core/predictor/framework/infer.h"
-#include "paddle_inference_api.h"  // NOLINT
-
-namespace baidu {
-namespace paddle_serving {
-namespace fluid_cpu {
-
-class AutoLock {
- public:
-  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
-    pthread_mutex_lock(&mutex);
-  }
-
-  ~AutoLock() { pthread_mutex_unlock(&_mut); }
-
- private:
-  pthread_mutex_t& _mut;
-};
-
-class GlobalPaddleCreateMutex {
- public:
-  pthread_mutex_t& mutex() { return _mut; }
-
-  static pthread_mutex_t& instance() {
-    static GlobalPaddleCreateMutex gmutex;
-    return gmutex.mutex();
-  }
-
- private:
-  GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
-
-  pthread_mutex_t _mut;
-};
-
-using paddle_infer::Config;
-using paddle_infer::Predictor;
-using paddle_infer::Tensor;
-using paddle_infer::CreatePredictor;
-
-// data interface
-class FluidFamilyCore {
- public:
-  virtual ~FluidFamilyCore() {}
-  virtual std::vector<std::string> GetInputNames() {
-    return _core->GetInputNames();
-  }
-
-  virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
-    return _core->GetInputHandle(name);
-  }
-
-  virtual std::vector<std::string> GetOutputNames() {
-    return _core->GetOutputNames();
-  }
-
-  virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
-    return _core->GetOutputHandle(name);
-  }
-
-  virtual bool Run() {
-    if (!_core->Run()) {
-      LOG(ERROR) << "Failed call Run with paddle predictor";
-      return false;
-    }
-    return true;
-  }
-
-  virtual int create(const predictor::InferEngineCreationParams& params) = 0;
-
-  virtual int clone(void* origin_core) {
-    if (origin_core == NULL) {
-      LOG(ERROR) << "origin paddle Predictor is null.";
-      return -1;
-    }
-    Predictor* p_predictor = (Predictor*)origin_core;
-    _core = p_predictor->Clone();
-    if (_core.get() == NULL) {
-      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
-      return -1;
-    }
-    return 0;
-  }
-
-  virtual void* get() { return _core.get(); }
-
- protected:
-  std::shared_ptr<Predictor> _core;
-};
-
-// infer interface
-class FluidCpuAnalysisCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    Config config;
-    config.SetParamsFile(data_path + "/__params__");
-    config.SetProgFile(data_path + "/__model__");
-    config.DisableGpu();
-    config.SetCpuMathLibraryNumThreads(1);
-
-    if (params.enable_memory_optimization()) {
-      config.EnableMemoryOptim();
-    }
-
-    config.SwitchSpecifyInputNames(true);
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = CreatePredictor(config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidCpuAnalysisDirCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    Config config;
-    config.SetModel(data_path);
-    config.DisableGpu();
-    config.SwitchSpecifyInputNames(true);
-    config.SetCpuMathLibraryNumThreads(1);
-
-    if (params.enable_memory_optimization()) {
-      config.EnableMemoryOptim();
-    }
-
-    if (params.enable_ir_optimization()) {
-      config.SwitchIrOptim(true);
-    } else {
-      config.SwitchIrOptim(false);
-    }
-
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = CreatePredictor(config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class Parameter {
- public:
-  Parameter() : _row(0), _col(0), _params(NULL) {}
-  ~Parameter() {
-    VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
-    destroy();
-  }
-
-  int init(int row, int col, const char* file_name) {
-    destroy();
-    _file_name = file_name;
-    _row = row;
-    _col = col;
-    _params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
-    if (_params == NULL) {
-      LOG(ERROR) << "Load " << _file_name << " malloc error.";
-      return -1;
-    }
-    VLOG(2) << "Load parameter file[" << _file_name << "] success.";
-    return 0;
-  }
-
-  void destroy() {
-    _row = 0;
-    _col = 0;
-    if (_params != NULL) {
-      free(_params);
-      _params = NULL;
-    }
-  }
-
-  int load() {
-    if (_params == NULL || _row <= 0 || _col <= 0) {
-      LOG(ERROR) << "load parameter error [not inited].";
-      return -1;
-    }
-
-    FILE* fs = fopen(_file_name.c_str(), "rb");
-    if (fs == NULL) {
-      LOG(ERROR) << "load " << _file_name << " fopen error.";
-      return -1;
-    }
-    static const uint32_t MODEL_FILE_HEAD_LEN = 16;
-    char head[MODEL_FILE_HEAD_LEN] = {0};
-    if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
-      destroy();
-      LOG(ERROR) << "Load " << _file_name << " read head error.";
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      return -1;
-    }
-
-    uint32_t matrix_size = _row * _col;
-    if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      VLOG(2) << "load " << _file_name << " read ok.";
-      return 0;
-    } else {
-      LOG(ERROR) << "load " << _file_name << " read error.";
-      destroy();
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      return -1;
-    }
-    return 0;
-  }
-
- public:
-  std::string _file_name;
-  int _row;
-  int _col;
-  float* _params;
-};
-
-class FluidCpuAnalysisEncryptCore : public FluidFamilyCore {
- public:
-  void ReadBinaryFile(const std::string& filename, std::string* contents) {
-    std::ifstream fin(filename, std::ios::in | std::ios::binary);
-    fin.seekg(0, std::ios::end);
-    contents->clear();
-    contents->resize(fin.tellg());
-    fin.seekg(0, std::ios::beg);
-    fin.read(&(contents->at(0)), contents->size());
-    fin.close();
-  }
-
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path note exits: "
-                 << data_path;
-      return -1;
-    }
-
-    std::string model_buffer, params_buffer, key_buffer;
-    ReadBinaryFile(data_path + "encrypt_model", &model_buffer);
-    ReadBinaryFile(data_path + "encrypt_params", &params_buffer);
-    ReadBinaryFile(data_path + "key", &key_buffer);
-
-    VLOG(2) << "prepare for encryption model";
-
-    auto cipher = paddle::MakeCipher("");
-    std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
-    std::string real_params_buffer = cipher->Decrypt(params_buffer, key_buffer);
-
-    Config analysis_config;
-    // paddle::AnalysisConfig analysis_config;
-    analysis_config.SetModelBuffer(&real_model_buffer[0],
-                                   real_model_buffer.size(),
-                                   &real_params_buffer[0],
-                                   real_params_buffer.size());
-    analysis_config.DisableGpu();
-    analysis_config.SetCpuMathLibraryNumThreads(1);
-    if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim();
-    }
-    analysis_config.SwitchSpecifyInputNames(true);
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    VLOG(2) << "decrypt model file sucess";
-    _core = CreatePredictor(analysis_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-}  // namespace fluid_cpu
-}  // namespace paddle_serving
-}  // namespace baidu
--- a/paddle_inference/inferencer-fluid-cpu/src/fluid_cpu_engine.cpp
+++ b/paddle_inference/inferencer-fluid-cpu/src/fluid_cpu_engine.cpp
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h"
-#include "core/predictor/framework/factory.h"
-
-namespace baidu {
-namespace paddle_serving {
-namespace fluid_cpu {
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidCpuAnalysisCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_CPU_ANALYSIS");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidCpuAnalysisDirCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_CPU_ANALYSIS_DIR");
-
-#if 1
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidCpuAnalysisEncryptCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_CPU_ANALYSIS_ENCRYPT");
-#endif
-}  // namespace fluid_cpu
-}  // namespace paddle_serving
-}  // namespace baidu
--- a/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt
+++ b/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt
-FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
-add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs})
-target_include_directories(fluid_gpu_engine PUBLIC
-        ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
-
-add_dependencies(fluid_gpu_engine pdserving extern_paddle configure)
-target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
-
-install(TARGETS fluid_gpu_engine 
-        ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
-        )
--- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <pthread.h>
-#include <fstream>
-#include <map>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "core/configure/include/configure_parser.h"
-#include "core/configure/inferencer_configure.pb.h"
-#include "core/predictor/framework/infer.h"
-#include "paddle_inference_api.h"  // NOLINT
-
-DECLARE_int32(gpuid);
-
-namespace baidu {
-namespace paddle_serving {
-namespace fluid_gpu {
-
-using configure::SigmoidConf;
-
-class AutoLock {
- public:
-  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
-    pthread_mutex_lock(&mutex);
-  }
-
-  ~AutoLock() { pthread_mutex_unlock(&_mut); }
-
- private:
-  pthread_mutex_t& _mut;
-};
-
-class GlobalPaddleCreateMutex {
- public:
-  pthread_mutex_t& mutex() { return _mut; }
-
-  static pthread_mutex_t& instance() {
-    static GlobalPaddleCreateMutex gmutex;
-    return gmutex.mutex();
-  }
-
- private:
-  GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
-
-  pthread_mutex_t _mut;
-};
-
-using paddle_infer::Config;
-using paddle_infer::Predictor;
-using paddle_infer::Tensor;
-using paddle_infer::CreatePredictor;
-
-// data interface
-class FluidFamilyCore {
- public:
-  virtual ~FluidFamilyCore() {}
-  virtual std::vector<std::string> GetInputNames() {
-    return _core->GetInputNames();
-  }
-
-  virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
-    return _core->GetInputHandle(name);
-  }
-
-  virtual std::vector<std::string> GetOutputNames() {
-    return _core->GetOutputNames();
-  }
-
-  virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
-    return _core->GetOutputHandle(name);
-  }
-
-  virtual bool Run() {
-    if (!_core->Run()) {
-      LOG(ERROR) << "Failed call Run with paddle predictor";
-      return false;
-    }
-    return true;
-  }
-
-  virtual int create(const predictor::InferEngineCreationParams& params) = 0;
-
-  virtual int clone(void* origin_core) {
-    if (origin_core == NULL) {
-      LOG(ERROR) << "origin paddle Predictor is null.";
-      return -1;
-    }
-    Predictor* p_predictor = (Predictor*)origin_core;
-    _core = p_predictor->Clone();
-    if (_core.get() == NULL) {
-      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
-      return -1;
-    }
-    return 0;
-  }
-
-  virtual void* get() { return _core.get(); }
-
- protected:
-  std::shared_ptr<Predictor> _core;
-};
-
-// infer interface
-class FluidGpuAnalysisCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    Config config;
-    config.SetParamsFile(data_path + "/__params__");
-    config.SetProgFile(data_path + "/__model__");
-    config.EnableUseGpu(100, FLAGS_gpuid);
-    config.SetCpuMathLibraryNumThreads(1);
-
-    if (params.enable_memory_optimization()) {
-      config.EnableMemoryOptim();
-    }
-
-    config.SwitchSpecifyInputNames(true);
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = CreatePredictor(config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidGpuAnalysisDirCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
-    }
-
-    Config config;
-    config.SetModel(data_path);
-    config.EnableUseGpu(1500, FLAGS_gpuid);
-    config.SwitchSpecifyInputNames(true);
-    config.SetCpuMathLibraryNumThreads(1);
-
-    if (params.enable_memory_optimization()) {
-      config.EnableMemoryOptim();
-    }
-    int max_batch = 32;
-    int min_subgraph_size = 3;
-    if (params.use_trt()) {
-      config.EnableTensorRtEngine(1 << 20,
-                                  max_batch,
-                                  min_subgraph_size,
-                                  Config::Precision::kFloat32,
-                                  false,
-                                  false);
-      LOG(INFO) << "create TensorRT predictor";
-    } else {
-      if (params.enable_memory_optimization()) {
-        config.EnableMemoryOptim();
-      }
-
-      if (params.enable_ir_optimization()) {
-        config.SwitchIrOptim(true);
-      } else {
-        config.SwitchIrOptim(false);
-      }
-    }
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = CreatePredictor(config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class Parameter {
- public:
-  Parameter() : _row(0), _col(0), _params(NULL) {}
-  ~Parameter() {
-    LOG(INFO) << "before destroy Parameter, file_name[" << _file_name << "]";
-    destroy();
-  }
-
-  int init(int row, int col, const char* file_name) {
-    destroy();
-    _file_name = file_name;
-    _row = row;
-    _col = col;
-    _params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
-    if (_params == NULL) {
-      LOG(ERROR) << "Load " << _file_name << " malloc error.";
-      return -1;
-    }
-    VLOG(2) << "Load parameter file[" << _file_name << "] success.";
-    return 0;
-  }
-
-  void destroy() {
-    _row = 0;
-    _col = 0;
-    if (_params != NULL) {
-      free(_params);
-      _params = NULL;
-    }
-  }
-
-  int load() {
-    if (_params == NULL || _row <= 0 || _col <= 0) {
-      LOG(ERROR) << "load parameter error [not inited].";
-      return -1;
-    }
-
-    FILE* fs = fopen(_file_name.c_str(), "rb");
-    if (fs == NULL) {
-      LOG(ERROR) << "load " << _file_name << " fopen error.";
-      return -1;
-    }
-    static const uint32_t MODEL_FILE_HEAD_LEN = 16;
-    char head[MODEL_FILE_HEAD_LEN] = {0};
-    if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
-      destroy();
-      LOG(ERROR) << "Load " << _file_name << " read head error.";
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      return -1;
-    }
-
-    uint32_t matrix_size = _row * _col;
-    if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      LOG(INFO) << "load " << _file_name << " read ok.";
-      return 0;
-    } else {
-      LOG(ERROR) << "load " << _file_name << " read error.";
-      destroy();
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      return -1;
-    }
-    return 0;
-  }
-
- public:
-  std::string _file_name;
-  int _row;
-  int _col;
-  float* _params;
-};
-
-class FluidGpuAnalysisEncryptCore : public FluidFamilyCore {
- public:
-  void ReadBinaryFile(const std::string& filename, std::string* contents) {
-    std::ifstream fin(filename, std::ios::in | std::ios::binary);
-    fin.seekg(0, std::ios::end);
-    contents->clear();
-    contents->resize(fin.tellg());
-    fin.seekg(0, std::ios::beg);
-    fin.read(&(contents->at(0)), contents->size());
-    fin.close();
-  }
-
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path note exits: "
-                 << data_path;
-      return -1;
-    }
-
-    std::string model_buffer, params_buffer, key_buffer;
-    ReadBinaryFile(data_path + "encrypt_model", &model_buffer);
-    ReadBinaryFile(data_path + "encrypt_params", &params_buffer);
-    ReadBinaryFile(data_path + "key", &key_buffer);
-
-    VLOG(2) << "prepare for encryption model";
-
-    auto cipher = paddle::MakeCipher("");
-    std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
-    std::string real_params_buffer = cipher->Decrypt(params_buffer, key_buffer);
-
-    Config analysis_config;
-    analysis_config.SetModelBuffer(&real_model_buffer[0],
-                                   real_model_buffer.size(),
-                                   &real_params_buffer[0],
-                                   real_params_buffer.size());
-    analysis_config.EnableUseGpu(100, FLAGS_gpuid);
-    analysis_config.SetCpuMathLibraryNumThreads(1);
-    if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim();
-    }
-    analysis_config.SwitchSpecifyInputNames(true);
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    VLOG(2) << "decrypt model file sucess";
-    _core = CreatePredictor(analysis_config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-}  // namespace fluid_gpu
-}  // namespace paddle_serving
-}  // namespace baidu
--- a/paddle_inference/inferencer-fluid-gpu/src/fluid_gpu_engine.cpp
+++ b/paddle_inference/inferencer-fluid-gpu/src/fluid_gpu_engine.cpp
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h"
-#include "core/predictor/framework/factory.h"
-
-DEFINE_int32(gpuid, 0, "GPU device id to use");
-
-namespace baidu {
-namespace paddle_serving {
-namespace fluid_gpu {
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuAnalysisCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_GPU_ANALYSIS");
-
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidGpuAnalysisDirCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_GPU_ANALYSIS_DIR");
-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidGpuAnalysisEncryptCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_GPU_ANALYSIS_ENCRPT")
-
-}  // namespace fluid_gpu
-}  // namespace paddle_serving
-}  // namespace baidu
--- a/paddle_inference/paddle/CMakeLists.txt
+++ b/paddle_inference/paddle/CMakeLists.txt
+# Collect every engine source under src/ and build them into one library.
+file(GLOB paddle_inference_engine_srcs
+     ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
+add_library(paddle_inference_engine
+            ${paddle_inference_engine_srcs})
+
+# Headers come from the Paddle install tree inside the build directory.
+target_include_directories(paddle_inference_engine
+        PUBLIC
+        ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
+
+# Build-order only: these targets must be built before the engine library.
+add_dependencies(paddle_inference_engine
+        pdserving
+        extern_paddle
+        configure)
+
+# Serving core, Paddle inference library, then the system libraries.
+target_link_libraries(paddle_inference_engine
+        pdserving
+        paddle_inference
+        -lpthread
+        -lcrypto
+        -lm
+        -lrt
+        -lssl
+        -ldl
+        -lz)
+
+# Only the ARCHIVE artifact of the target is installed.
+install(TARGETS paddle_inference_engine
+        ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib)
--- a/paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h
+++ b/paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -17,275 +17,174 @@
 #include <pthread.h>
 #include <fstream>
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 #include "core/configure/include/configure_parser.h"
 #include "core/configure/inferencer_configure.pb.h"
+#include "core/predictor/common/utils.h"
 #include "core/predictor/framework/infer.h"
 #include "paddle_inference_api.h"  // NOLINT

 namespace baidu {
 namespace paddle_serving {
-namespace fluid_arm {
-
-class AutoLock {
- public:
-  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
-    pthread_mutex_lock(&mutex);
-  }
-
-  ~AutoLock() { pthread_mutex_unlock(&_mut); }
-
- private:
-  pthread_mutex_t& _mut;
-};
-
-class GlobalPaddleCreateMutex {
- public:
-  pthread_mutex_t& mutex() { return _mut; }
-
-  static pthread_mutex_t& instance() {
-    static GlobalPaddleCreateMutex gmutex;
-    return gmutex.mutex();
-  }
-
- private:
-  GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
-
-  pthread_mutex_t _mut;
-};
+namespace inference {

 using paddle_infer::Config;
+using paddle_infer::PrecisionType;
 using paddle_infer::Predictor;
 using paddle_infer::Tensor;
-using paddle_infer::PrecisionType;
 using paddle_infer::CreatePredictor;

-// data interface
-class FluidFamilyCore {
+DECLARE_int32(gpuid);
+
+static const int max_batch = 32;
+static const int min_subgraph_size = 3;
+
+// Engine Base
+class PaddleEngineBase {
 public:
-  virtual ~FluidFamilyCore() {}
+  virtual ~PaddleEngineBase() {}
  virtual std::vector<std::string> GetInputNames() {
-    return _core->GetInputNames();
+    return _predictor->GetInputNames();
  }

  virtual std::unique_ptr<Tensor> GetInputHandle(const std::string& name) {
-    return _core->GetInputHandle(name);
+    return _predictor->GetInputHandle(name);
  }

  virtual std::vector<std::string> GetOutputNames() {
-    return _core->GetOutputNames();
+    return _predictor->GetOutputNames();
  }

  virtual std::unique_ptr<Tensor> GetOutputHandle(const std::string& name) {
-    return _core->GetOutputHandle(name);
+    return _predictor->GetOutputHandle(name);
  }

  virtual bool Run() {
-    if (!_core->Run()) {
+    if (!_predictor->Run()) {
      LOG(ERROR) << "Failed call Run with paddle predictor";
      return false;
    }
    return true;
  }

-  virtual int create(const predictor::InferEngineCreationParams& params) = 0;
+  virtual int create(const configure::EngineDesc& conf) = 0;

-  virtual int clone(void* origin_core) {
-    if (origin_core == NULL) {
+  virtual int clone(void* predictor) {
+    if (predictor == NULL) {
      LOG(ERROR) << "origin paddle Predictor is null.";
      return -1;
    }
-    Predictor* p_predictor = (Predictor*)origin_core;
-    _core = p_predictor->Clone();
-    if (_core.get() == NULL) {
-      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
+    Predictor* prep = static_cast<Predictor*>(predictor);
+    _predictor = prep->Clone();
+    if (_predictor.get() == NULL) {
+      LOG(ERROR) << "fail to clone paddle predictor: " << predictor;
      return -1;
    }
    return 0;
  }

-  virtual void* get() { return _core.get(); }
+  virtual void* get() { return _predictor.get(); }

 protected:
-  std::shared_ptr<Predictor> _core;
+  std::shared_ptr<Predictor> _predictor;
 };

-// infer interface
-class FluidArmAnalysisCore : public FluidFamilyCore {
+// Paddle Inference Engine
+//
+// Single engine that replaces the per-device "Fluid*AnalysisCore" classes:
+// every option (model layout, GPU / TensorRT / Lite / XPU, IR and memory
+// optimization, encrypted model) is read from one configure::EngineDesc.
+class PaddleInferenceEngine : public PaddleEngineBase {
  public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
+  // Builds the underlying paddle_infer::Predictor from `engine_conf`.
+  //
+  // Returns 0 on success, or -1 when model_dir is not accessible or
+  // CreatePredictor() yields a null predictor.
+  int create(const configure::EngineDesc& engine_conf) {
+    std::string model_path = engine_conf.model_dir();
+    if (access(model_path.c_str(), F_OK) == -1) {
       LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
+                 << model_path;
       return -1;
     }

     Config config;
-    config.SetParamsFile(data_path + "/__params__");
-    config.SetProgFile(data_path + "/__model__");
-    config.DisableGpu();
-    config.SetCpuMathLibraryNumThreads(1);
-
-    if (params.use_lite()) {
-      config.EnableLiteEngine(PrecisionType::kFloat32, true);
-    }
-
-    if (params.use_xpu()) {
-      config.EnableXpu(2 * 1024 * 1024);
-    }
-
-    if (params.enable_memory_optimization()) {
-      config.EnableMemoryOptim();
-    }
-
-    if (params.enable_ir_optimization()) {
-      config.SwitchIrOptim(true);
+    // todo, auto config(zhangjun)
+    // combined_model explicitly false -> load the directory as a whole;
+    // true or unset -> load the __model__ / __params__ file pair.
+    if (engine_conf.has_combined_model()) {
+      if (!engine_conf.combined_model()) {
+        config.SetModel(model_path);
+      } else {
+        config.SetParamsFile(model_path + "/__params__");
+        config.SetProgFile(model_path + "/__model__");
+      }
     } else {
-      config.SwitchIrOptim(false);
+      config.SetParamsFile(model_path + "/__params__");
+      config.SetProgFile(model_path + "/__model__");
     }

     config.SwitchSpecifyInputNames(true);
-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = CreatePredictor(config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
+    config.SetCpuMathLibraryNumThreads(1);
+    if (engine_conf.has_use_gpu() && engine_conf.use_gpu()) {
+      // 2000MB GPU memory
+      config.EnableUseGpu(2000, FLAGS_gpuid);
     }

-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class FluidArmAnalysisDirCore : public FluidFamilyCore {
- public:
-  int create(const predictor::InferEngineCreationParams& params) {
-    std::string data_path = params.get_path();
-    if (access(data_path.c_str(), F_OK) == -1) {
-      LOG(ERROR) << "create paddle predictor failed, path not exits: "
-                 << data_path;
-      return -1;
+    if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
+      // TensorRT is enabled on top of the GPU backend, so turn the GPU on
+      // here when use_gpu was not requested above.
+      if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
+        config.EnableUseGpu(2000, FLAGS_gpuid);
+      }
+      config.EnableTensorRtEngine(1 << 20,
+                                  max_batch,
+                                  min_subgraph_size,
+                                  Config::Precision::kFloat32,
+                                  false,
+                                  false);
+      LOG(INFO) << "create TensorRT predictor";
     }

-    Config config;
-    config.SetModel(data_path);
-    config.DisableGpu();
-    config.SwitchSpecifyInputNames(true);
-    config.SetCpuMathLibraryNumThreads(1);
-
-    if (params.use_lite()) {
+    if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
       config.EnableLiteEngine(PrecisionType::kFloat32, true);
     }

-    if (params.use_xpu()) {
+    if (engine_conf.has_use_xpu() && engine_conf.use_xpu()) {
+      // 2 MB l3 cache
       config.EnableXpu(2 * 1024 * 1024);
     }
-
-    if (params.enable_memory_optimization()) {
-      config.EnableMemoryOptim();
-    }
-
-    if (params.enable_ir_optimization()) {
-      config.SwitchIrOptim(true);
-    } else {
+    // IR optimization stays on unless the config explicitly disables it.
+    if (engine_conf.has_enable_ir_optimization() &&
+        !engine_conf.enable_ir_optimization()) {
       config.SwitchIrOptim(false);
+    } else {
+      config.SwitchIrOptim(true);
     }

-    AutoLock lock(GlobalPaddleCreateMutex::instance());
-    _core = CreatePredictor(config);
-    if (NULL == _core.get()) {
-      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
-      return -1;
-    }
-
-    VLOG(2) << "create paddle predictor sucess, path: " << data_path;
-    return 0;
-  }
-};
-
-class Parameter {
- public:
-  Parameter() : _row(0), _col(0), _params(NULL) {}
-  ~Parameter() {
-    VLOG(2) << "before destroy Parameter, file_name[" << _file_name << "]";
-    destroy();
-  }
-
-  int init(int row, int col, const char* file_name) {
-    destroy();
-    _file_name = file_name;
-    _row = row;
-    _col = col;
-    _params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
-    if (_params == NULL) {
-      LOG(ERROR) << "Load " << _file_name << " malloc error.";
-      return -1;
+    if (engine_conf.has_enable_memory_optimization() &&
+        engine_conf.enable_memory_optimization()) {
+      config.EnableMemoryOptim();
     }
-    VLOG(2) << "Load parameter file[" << _file_name << "] success.";
-    return 0;
-  }

-  void destroy() {
-    _row = 0;
-    _col = 0;
-    if (_params != NULL) {
-      free(_params);
-      _params = NULL;
-    }
-  }
+    if (engine_conf.has_encrypted_model() && engine_conf.encrypted_model()) {
+      // decrypt model
+      // The file names are joined with an explicit '/' like every other
+      // path in this method, so a model_dir without a trailing slash still
+      // resolves to files inside the directory.
+      std::string model_buffer, params_buffer, key_buffer;
+      predictor::ReadBinaryFile(model_path + "/encrypt_model", &model_buffer);
+      predictor::ReadBinaryFile(model_path + "/encrypt_params",
+                                &params_buffer);
+      predictor::ReadBinaryFile(model_path + "/key", &key_buffer);

-  int load() {
-    if (_params == NULL || _row <= 0 || _col <= 0) {
-      LOG(ERROR) << "load parameter error [not inited].";
-      return -1;
+      auto cipher = paddle::MakeCipher("");
+      std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
+      std::string real_params_buffer =
+          cipher->Decrypt(params_buffer, key_buffer);
+      config.SetModelBuffer(&real_model_buffer[0],
+                            real_model_buffer.size(),
+                            &real_params_buffer[0],
+                            real_params_buffer.size());
     }

-    FILE* fs = fopen(_file_name.c_str(), "rb");
-    if (fs == NULL) {
-      LOG(ERROR) << "load " << _file_name << " fopen error.";
-      return -1;
-    }
-    static const uint32_t MODEL_FILE_HEAD_LEN = 16;
-    char head[MODEL_FILE_HEAD_LEN] = {0};
-    if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
-      destroy();
-      LOG(ERROR) << "Load " << _file_name << " read head error.";
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
+    // Predictor creation is serialized through the global create mutex.
+    predictor::AutoLock lock(predictor::GlobalCreateMutex::instance());
+    _predictor = CreatePredictor(config);
+    if (NULL == _predictor.get()) {
+      LOG(ERROR) << "create paddle predictor failed, path: " << model_path;
       return -1;
     }

-    uint32_t matrix_size = _row * _col;
-    if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      VLOG(2) << "load " << _file_name << " read ok.";
-      return 0;
-    } else {
-      LOG(ERROR) << "load " << _file_name << " read error.";
-      destroy();
-      if (fs != NULL) {
-        fclose(fs);
-        fs = NULL;
-      }
-      return -1;
-    }
+    VLOG(2) << "create paddle predictor sucess, path: " << model_path;
     return 0;
   }
-
- public:
-  std::string _file_name;
-  int _row;
-  int _col;
-  float* _params;
 };

-}  // namespace fluid_arm
+}  // namespace inference
 }  // namespace paddle_serving
 }  // namespace baidu
--- a/paddle_inference/inferencer-fluid-arm/src/fluid_arm_engine.cpp
+++ b/paddle_inference/inferencer-fluid-arm/src/fluid_arm_engine.cpp
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,24 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle_inference/inferencer-fluid-arm/include/fluid_arm_engine.h"
+#include "paddle_inference/paddle/include/paddle_engine.h"
 #include "core/predictor/framework/factory.h"

 namespace baidu {
 namespace paddle_serving {
-namespace fluid_arm {
+namespace inference {

-REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidArmAnalysisCore>,
-    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_ARM_ANALYSIS");
+DEFINE_int32(gpuid, 0, "GPU device id to use");

 REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
-    ::baidu::paddle_serving::predictor::FluidInferEngine<
-        FluidArmAnalysisDirCore>,
+    ::baidu::paddle_serving::predictor::FluidInferEngine<PaddleInferenceEngine>,
    ::baidu::paddle_serving::predictor::InferEngine,
-    "FLUID_ARM_ANALYSIS_DIR");
+    "PADDLE_INFER");

-}  // namespace fluid_arm
+}  // namespace inference
 }  // namespace paddle_serving
 }  // namespace baidu
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
 if (CLIENT)
-    file(INSTALL pipeline DESTINATION paddle_serving_client)
-    file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py)
-    set(PY_FILES ${SERVING_CLIENT_PY_FILES})
-    SET(PACKAGE_NAME "serving_client")
-    set(SETUP_LOG_FILE "setup.py.client.log")
+  file(INSTALL pipeline DESTINATION paddle_serving_client)
+  file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py)
+  set(PY_FILES ${SERVING_CLIENT_PY_FILES})
+  SET(PACKAGE_NAME "serving_client")
+  set(SETUP_LOG_FILE "setup.py.client.log")
 endif()

 if (SERVER)
-    if (NOT WITH_GPU AND NOT WITH_LITE)
-        file(INSTALL pipeline DESTINATION paddle_serving_server)
-        file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
-    else()
-        file(INSTALL pipeline DESTINATION paddle_serving_server_gpu)
-        file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server_gpu/*.py)
-    endif()
-        set(PY_FILES ${SERVING_SERVER_PY_FILES})
-        SET(PACKAGE_NAME "serving_server")
-        set(SETUP_LOG_FILE "setup.py.server.log")
+  SET(SERVER_PACKAGE_NAME "paddle-serving-server")
+  if (WITH_GPU) 
+    set(SERVER_PACKAGE_NAME "paddle-serving-server-gpu")
+  elseif(WITH_XPU)
+    set(SERVER_PACKAGE_NAME "paddle-serving-server-xpu")
+  endif()
+  file(INSTALL pipeline DESTINATION paddle_serving_server)
+  file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
+  set(PY_FILES ${SERVING_SERVER_PY_FILES})
+  set(SETUP_LOG_FILE "setup.py.server.log")
 endif()

 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/util.py
    ${CMAKE_CURRENT_BINARY_DIR}/util.py)
 if (CLIENT)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in
    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../tools/python_tag.py
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../tools/python_tag.py
    ${CMAKE_CURRENT_BINARY_DIR}/python_tag.py)
 endif()

 if (APP)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.app.in
    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 endif()

 if (SERVER)
-    if (NOT WITH_GPU AND NOT WITH_LITE)
-        configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
-            ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
-    else()
-        configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server_gpu.in
-            ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
-    endif()
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.server.in
+    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 endif()

 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gen_version.py
@@ -50,108 +45,73 @@ set (SERVING_CLIENT_CORE ${PADDLE_SERVING_BINARY_DIR}/core/general-client/*.so)
 message("python env: " ${py_env})

 if (APP)
-add_custom_command(
-        OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-        COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/
-        COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "app"
-        COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-        DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES})
-add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+  add_custom_command(
+    OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "app"
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES})
+  add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
 endif()

 if (CLIENT)
-add_custom_command(
-	OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-	COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/
-	COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so
+  add_custom_command(
+    OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/
+    COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} python_tag.py
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "client"
-	COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-	DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES})
-add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES})
+  add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
 endif()

 if (SERVER)
-    if(NOT WITH_GPU AND NOT WITH_LITE)
-        add_custom_command(
-            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-            COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "server"
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
-        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
-    elseif(WITH_TRT)
-        if(CUDA_VERSION EQUAL 10.1)
-            set(SUFFIX 101)
-        elseif(CUDA_VERSION EQUAL 10.2)
-            set(SUFFIX 102)
-        elseif(CUDA_VERSION EQUAL 11.0)
-            set(SUFFIX 11)
-
-        endif()
-        add_custom_command(
-            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-            COMMAND cp -r
-            ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
-            "server_gpu"  ${SUFFIX}
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
-        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
-    elseif(WITH_LITE)
-        if(WITH_XPU)
-            add_custom_command(
-                OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-                COMMAND cp -r
-                ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
-                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
-                "server_gpu" arm-xpu 
-                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-                DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
-            add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
-        else()
-            add_custom_command(
-                OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-                COMMAND cp -r
-                ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
-                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
-                "server_gpu" arm 
-                COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-                DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
-            add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
-        endif()
-    else()
-        add_custom_command(
-            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
-            COMMAND cp -r
-            ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
-            "server_gpu" ${CUDA_VERSION_MAJOR}
-            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
-        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+  # todo, generate suffix for cpu、gpu、arm
+  if(WITH_TRT)
+    if(CUDA_VERSION EQUAL 10.1)
+      set(VERSION_SUFFIX 101)
+    elseif(CUDA_VERSION EQUAL 10.2)
+      set(VERSION_SUFFIX 102)
+    elseif(CUDA_VERSION EQUAL 11.0)
+      set(VERSION_SUFFIX 11)
    endif()
+  endif()
+
+  if(WITH_LITE)
+    set(VERSION_SUFFIX 2)
+  endif()
+
+  add_custom_command(
+    OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
+    COMMAND cp -r
+    ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
+    "server" ${VERSION_SUFFIX}
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
+  add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
 endif()

 set(SERVING_CLIENT_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 set(SERVING_SERVER_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)

 if (CLIENT)
-install(DIRECTORY ${SERVING_CLIENT_PYTHON_PACKAGE_DIR}
+  install(DIRECTORY ${SERVING_CLIENT_PYTHON_PACKAGE_DIR}
    DESTINATION opt/serving_client/share/wheels
-)
+  )
 endif()

 if (SERVER)
-install(DIRECTORY ${SERVING_SERVER_PYTHON_PACKAGE_DIR}
-    DESTINATION opt/serving_server/share/wheels
-)
+  install(DIRECTORY ${SERVING_SERVER_PYTHON_PACKAGE_DIR}
+      DESTINATION opt/serving_server/share/wheels
+  )
 endif()

 if (CLIENT OR SERVER)
-find_program(PATCHELF_EXECUTABLE patchelf)
-if (NOT PATCHELF_EXECUTABLE)
-  message(FATAL_ERROR "patchelf not found, please install it.\n"
-         "For Ubuntu, the command is: apt-get install -y patchelf.")
-endif()
+  find_program(PATCHELF_EXECUTABLE patchelf)
+  if (NOT PATCHELF_EXECUTABLE)
+    message(FATAL_ERROR "patchelf not found, please install it.\n"
+           "For Ubuntu, the command is: apt-get install -y patchelf.")
+  endif()
 endif()
--- a/python/examples/bert/README.md
+++ b/python/examples/bert/README.md
@@ -49,7 +49,7 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292  #c
 ```
 Or, start the GPU inference service by running:
 ```
-python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
+python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #launch gpu inference service at GPU 0
 ```

 ### RPC Inference

--- a/python/examples/bert/README_CN.md
+++ b/python/examples/bert/README_CN.md
@@ -48,7 +48,7 @@ python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292  #
 ```
 或者，启动gpu预测服务，执行
 ```
-python -m paddle_serving_server_gpu.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务
+python -m paddle_serving_server.serve --model bert_seq128_model/ --port 9292 --gpu_ids 0 #在gpu 0上启动gpu预测服务

 ```


--- a/python/examples/bert/benchmark.sh
+++ b/python/examples/bert/benchmark.sh
@@ -12,7 +12,7 @@ else
    mkdir utilization
 fi
 #start server
-$PYTHONROOT/bin/python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim  --ir_optim >  elog  2>&1 &
+$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim  --ir_optim >  elog  2>&1 &
 sleep 5

 #warm up

--- a/python/examples/bert/benchmark_with_profile.sh
+++ b/python/examples/bert/benchmark_with_profile.sh
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -m paddle_serving_server_gpu.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+python -m paddle_serving_server.serve --model bert_seq20_model/ --port 9295 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
 export FLAGS_profile_client=1
 export FLAGS_profile_server=1
 sleep 5

--- a/python/examples/bert/bert_gpu_server.py
+++ b/python/examples/bert/bert_gpu_server.py
@@ -14,9 +14,9 @@

 import os
 import sys
-from paddle_serving_server_gpu import OpMaker
-from paddle_serving_server_gpu import OpSeqMaker
-from paddle_serving_server_gpu import Server
+from paddle_serving_server import OpMaker
+from paddle_serving_server import OpSeqMaker
+from paddle_serving_server import Server

 op_maker = OpMaker()
 read_op = op_maker.create('general_reader')

--- a/python/examples/bert/bert_web_service_gpu.py
+++ b/python/examples/bert/bert_web_service_gpu.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=doc-string-missing
-from paddle_serving_server_gpu.web_service import WebService
+from paddle_serving_server.web_service import WebService
 from paddle_serving_app.reader import ChineseBertReader
 import sys
 import os

--- a/python/examples/cascade_rcnn/README.md
+++ b/python/examples/cascade_rcnn/README.md
@@ -10,7 +10,7 @@ If you want to have more detection models, please refer to [Paddle Detection Mod

 ### Start the service
 ```
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9292 --gpu_id 0
+python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0
 ```

 ### Perform prediction

--- a/python/examples/cascade_rcnn/README_CN.md
+++ b/python/examples/cascade_rcnn/README_CN.md
@@ -10,7 +10,7 @@ sh get_data.sh

 ### 启动服务
 ```
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9292 --gpu_id 0
+python -m paddle_serving_server.serve --model serving_server --port 9292 --gpu_id 0
 ```

 ### 执行预测

--- a/python/examples/criteo_ctr/README.md
+++ b/python/examples/criteo_ctr/README.md
@@ -20,7 +20,7 @@ the directories like `ctr_serving_model` and `ctr_client_conf` will appear.

 ```
 python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #CPU RPC Service
-python -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0
+python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #RPC Service on GPU 0
 ```

 ### RPC Infer

--- a/python/examples/criteo_ctr/README_CN.md
+++ b/python/examples/criteo_ctr/README_CN.md
@@ -20,7 +20,7 @@ mv models/ctr_serving_model .

 ```
 python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 #启动CPU预测服务
-python -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务
+python -m paddle_serving_server.serve --model ctr_serving_model/ --port 9292 --gpu_ids 0 #在GPU 0上启动预测服务
 ```

 ### 执行预测

--- a/python/examples/deeplabv3/README.md
+++ b/python/examples/deeplabv3/README.md
@@ -12,7 +12,7 @@ tar -xzvf deeplabv3.tar.gz
 ### Start Service

 ```
-python -m paddle_serving_server_gpu.serve --model deeplabv3_server --gpu_ids 0 --port 9494
+python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494
 ```

 ### Client Prediction

--- a/python/examples/deeplabv3/README_CN.md
+++ b/python/examples/deeplabv3/README_CN.md
@@ -12,7 +12,7 @@ tar -xzvf deeplabv3.tar.gz
 ### 启动服务端

 ```
-python -m paddle_serving_server_gpu.serve --model deeplabv3_server --gpu_ids 0 --port 9494
+python -m paddle_serving_server.serve --model deeplabv3_server --gpu_ids 0 --port 9494
 ```

 ### 客户端预测

--- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
+++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### Start the service
 ```
 tar xf faster_rcnn_r50_fpn_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```

 This model support TensorRT, if you want a faster inference, please use `--use_trt`. 

--- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
+++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### 启动服务
 ```
 tar xf faster_rcnn_r50_fpn_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 该模型支持TensorRT，如果想要更快的预测速度，可以开启`--use_trt`选项。


--- a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md
+++ b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README.md
@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### Start the service
 ```
 tar xf ppyolo_r50vd_dcn_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```

 This model support TensorRT, if you want a faster inference, please use `--use_trt`.

--- a/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md
+++ b/python/examples/detection/ppyolo_r50vd_dcn_1x_coco/README_CN.md
@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### 启动服务
 ```
 tar xf ppyolo_r50vd_dcn_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```

 该模型支持TensorRT，如果想要更快的预测速度，可以开启`--use_trt`选项。

--- a/python/examples/detection/ttfnet_darknet53_1x_coco/README.md
+++ b/python/examples/detection/ttfnet_darknet53_1x_coco/README.md
@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### Start the service
 ```
 tar xf ttfnet_darknet53_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 This model support TensorRT, if you want a faster inference, please use `--use_trt`.


--- a/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md
+++ b/python/examples/detection/ttfnet_darknet53_1x_coco/README_CN.md
@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### 启动服务
 ```
 tar xf ttfnet_darknet53_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```

 该模型支持TensorRT，如果想要更快的预测速度，可以开启`--use_trt`选项。

--- a/python/examples/detection/yolov3_darknet53_270e_coco/README.md
+++ b/python/examples/detection/yolov3_darknet53_270e_coco/README.md
@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### Start the service
 ```
 tar xf yolov3_darknet53_270e_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```

 This model support TensorRT, if you want a faster inference, please use `--use_trt`.

--- a/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md
+++ b/python/examples/detection/yolov3_darknet53_270e_coco/README_CN.md
@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### 启动服务
 ```
 tar xf yolov3_darknet53_270e_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```

 该模型支持TensorRT，如果想要更快的预测速度，可以开启`--use_trt`选项。

--- a/python/examples/encryption/README.md
+++ b/python/examples/encryption/README.md
@@ -26,7 +26,7 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_
 ```
 GPU Service
 ```
-python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
+python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
 ```

 ## Prediction

--- a/python/examples/encryption/README_CN.md
+++ b/python/examples/encryption/README_CN.md
@@ -24,7 +24,7 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_
 ```
 GPU预测服务
 ```
-python -m paddle_serving_server_gpu.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
+python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_encryption_model --gpu_ids 0
 ```

 ## 预测

--- a/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py
@@ -15,9 +15,9 @@

 import os
 import sys
-from paddle_serving_server_gpu import OpMaker
-from paddle_serving_server_gpu import OpSeqMaker
-from paddle_serving_server_gpu import MultiLangServer as Server
+from paddle_serving_server import OpMaker
+from paddle_serving_server import OpSeqMaker
+from paddle_serving_server import MultiLangServer as Server

 op_maker = OpMaker()
 read_op = op_maker.create('general_reader')

--- a/python/examples/grpc_impl_example/yolov4/README.md
+++ b/python/examples/grpc_impl_example/yolov4/README.md
@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
 ## Start RPC Service

 ```
-python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
+python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
 ```

 ## Prediction

--- a/python/examples/grpc_impl_example/yolov4/README_CN.md
+++ b/python/examples/grpc_impl_example/yolov4/README_CN.md
@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
 ## 启动RPC服务

 ```
-python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
+python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
 ```

 ## 预测

--- a/python/examples/imagenet/README.md
+++ b/python/examples/imagenet/README.md
@@ -39,7 +39,7 @@ python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu
 ```

 ```
-python -m paddle_serving_server_gpu.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service
+python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu inference service
 ```

 client send inference request

--- a/python/examples/imagenet/README_CN.md
+++ b/python/examples/imagenet/README_CN.md
@@ -39,7 +39,7 @@ python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 #cpu
 ```

 ```
-python -m paddle_serving_server_gpu.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务
+python -m paddle_serving_server.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0 #gpu预测服务
 ```

 client端进行预测

--- a/python/examples/imagenet/benchmark.sh
+++ b/python/examples/imagenet/benchmark.sh
@@ -2,7 +2,7 @@ rm profile_log*
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 export FLAGS_profile_server=1
 export FLAGS_profile_client=1
-python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim  2> elog > stdlog &
+python -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim  2> elog > stdlog &

 sleep 5
 gpu_id=0

--- a/python/examples/imagenet/resnet50_web_service.py
+++ b/python/examples/imagenet/resnet50_web_service.py
@@ -25,7 +25,7 @@ device = sys.argv[2]
 if device == "cpu":
    from paddle_serving_server.web_service import WebService
 else:
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService


 class ImageService(WebService):

--- a/python/examples/mobilenet/README.md
+++ b/python/examples/mobilenet/README.md
@@ -12,7 +12,7 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz
 ### Start Service

 ```
-python -m paddle_serving_server_gpu.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
+python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
 ```

 ### Client Prediction

--- a/python/examples/mobilenet/README_CN.md
+++ b/python/examples/mobilenet/README_CN.md
@@ -12,7 +12,7 @@ tar -xzvf mobilenet_v2_imagenet.tar.gz
 ### 启动服务端

 ```
-python -m paddle_serving_server_gpu.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
+python -m paddle_serving_server.serve --model mobilenet_v2_imagenet_model --gpu_ids 0 --port 9393
 ```

 ### 客户端预测

--- a/python/examples/ocr/README.md
+++ b/python/examples/ocr/README.md
@@ -26,7 +26,7 @@ tar xf test_imgs.tar
 python -m paddle_serving_server.serve --model ocr_det_model --port 9293
 python ocr_web_server.py cpu
 #for gpu user
-python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0
+python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0
 python ocr_web_server.py gpu
 ```


--- a/python/examples/ocr/README_CN.md
+++ b/python/examples/ocr/README_CN.md
@@ -25,7 +25,7 @@ tar xf test_imgs.tar
 python -m paddle_serving_server.serve --model ocr_det_model --port 9293
 python ocr_web_server.py cpu
 #for gpu user
-python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0
+python -m paddle_serving_server.serve --model ocr_det_model --port 9293 --gpu_id 0
 python ocr_web_server.py gpu
 ```


--- a/python/examples/ocr/det_debugger_server.py
+++ b/python/examples/ocr/det_debugger_server.py
@@ -22,7 +22,7 @@ from paddle_serving_app.reader import Sequential, ResizeByFactor
 from paddle_serving_app.reader import Div, Normalize, Transpose
 from paddle_serving_app.reader import DBPostProcess, FilterBoxes
 if sys.argv[1] == 'gpu':
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService
 elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
 import time

--- a/python/examples/ocr/det_web_server.py
+++ b/python/examples/ocr/det_web_server.py
@@ -22,7 +22,7 @@ from paddle_serving_app.reader import Sequential, ResizeByFactor
 from paddle_serving_app.reader import Div, Normalize, Transpose
 from paddle_serving_app.reader import DBPostProcess, FilterBoxes
 if sys.argv[1] == 'gpu':
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService
 elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
 import time

--- a/python/examples/ocr/ocr_debugger_server.py
+++ b/python/examples/ocr/ocr_debugger_server.py
@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
 from paddle_serving_app.reader import Div, Normalize, Transpose
 from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
 if sys.argv[1] == 'gpu':
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService
 elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
 from paddle_serving_app.local_predict import LocalPredictor

--- a/python/examples/ocr/ocr_web_server.py
+++ b/python/examples/ocr/ocr_web_server.py
@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
 from paddle_serving_app.reader import Div, Normalize, Transpose
 from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
 if sys.argv[1] == 'gpu':
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService
 elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
 import time

--- a/python/examples/ocr/rec_debugger_server.py
+++ b/python/examples/ocr/rec_debugger_server.py
@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
 from paddle_serving_app.reader import Div, Normalize, Transpose
 from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
 if sys.argv[1] == 'gpu':
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService
 elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
 import time

--- a/python/examples/ocr/rec_web_server.py
+++ b/python/examples/ocr/rec_web_server.py
@@ -23,7 +23,7 @@ from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
 from paddle_serving_app.reader import Div, Normalize, Transpose
 from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
 if sys.argv[1] == 'gpu':
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService
 elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
 import time

--- a/python/examples/pipeline/bert/benchmark.py
+++ b/python/examples/pipeline/bert/benchmark.py
+import sys
+import os
+import yaml
+import requests
+import time
+import json
+try:
+    from paddle_serving_server_gpu.pipeline import PipelineClient
+except ImportError:
+    from paddle_serving_server.pipeline import PipelineClient
+import numpy as np
+from paddle_serving_client.utils import MultiThreadRunner
+from paddle_serving_client.utils import benchmark_args, show_latency
+'''
+2021-03-16 10:26:01,832 ==================== TRACER ======================
+2021-03-16 10:26:01,838 Op(bert):
+2021-03-16 10:26:01,838 	in[5.7833 ms]
+2021-03-16 10:26:01,838 	prep[8.2001 ms]
+2021-03-16 10:26:01,838 	midp[198.79853333333332 ms]
+2021-03-16 10:26:01,839 	postp[0.8411 ms]
+2021-03-16 10:26:01,839 	out[0.9440666666666667 ms]
+2021-03-16 10:26:01,839 	idle[0.03135320683677345]
+2021-03-16 10:26:01,839 DAGExecutor:
+2021-03-16 10:26:01,839 	Query count[30]
+2021-03-16 10:26:01,839 	QPS[3.0 q/s]
+2021-03-16 10:26:01,839 	Succ[1.0]
+2021-03-16 10:26:01,839 	Error req[]
+2021-03-16 10:26:01,839 	Latency:
+2021-03-16 10:26:01,839 		ave[237.85519999999997 ms]
+2021-03-16 10:26:01,839 		.50[179.937 ms]
+2021-03-16 10:26:01,839 		.60[179.994 ms]
+2021-03-16 10:26:01,839 		.70[180.515 ms]
+2021-03-16 10:26:01,840 		.80[180.735 ms]
+2021-03-16 10:26:01,840 		.90[182.275 ms]
+2021-03-16 10:26:01,840 		.95[182.789 ms]
+2021-03-16 10:26:01,840 		.99[1921.33 ms]
+2021-03-16 10:26:01,840 Channel (server worker num[1]):
+2021-03-16 10:26:01,840 	chl0(In: ['@DAGExecutor'], Out: ['bert']) size[0/0]
+2021-03-16 10:26:01,841 	chl1(In: ['bert'], Out: ['@DAGExecutor']) size[0/0]
+'''
+def parse_benchmark(filein, fileout):
+    """Copy a YAML benchmark report, dropping the per-call entries of its DAG section.
+
+    Args:
+        filein: path of the YAML report to read; it must contain a top-level
+            "DAG" mapping.
+        fileout: path the filtered report is written to (block-style YAML).
+    """
+    with open(filein, "r") as fin:
+        # safe_load instead of yaml.load(fin): calling yaml.load without an
+        # explicit Loader is deprecated since PyYAML 5.1 and raises TypeError
+        # on PyYAML >= 6.
+        res = yaml.safe_load(fin)
+    # Collect the keys first and delete afterwards: a dict must not change
+    # size while it is being iterated.
+    del_list = [key for key in res["DAG"] if "call" in key]
+    for key in del_list:
+        del res["DAG"][key]
+    with open(fileout, "w") as fout:
+        yaml.dump(res, fout, default_flow_style=False)
+
+def gen_yml(device):
+    """Derive config2.yml from config.yml for a benchmark run.
+
+    The tracer interval is always set to 10 seconds. When `device` is "gpu",
+    the bert op's local service is additionally set to device_type 1 on
+    device "2".
+
+    Args:
+        device: "gpu" to apply the GPU overrides; any other value leaves the
+            device settings of config.yml untouched.
+    """
+    # The context manager closes the file even when parsing fails. safe_load
+    # replaces yaml.load(fin), which needs an explicit Loader on PyYAML >= 6.
+    with open("config.yml", "r") as fin:
+        config = yaml.safe_load(fin)
+    config["dag"]["tracer"] = {"interval_s": 10}
+    if device == "gpu":
+        service_conf = config["op"]["bert"]["local_service_conf"]
+        service_conf["device_type"] = 1
+        service_conf["devices"] = "2"
+    with open("config2.yml", "w") as fout:
+        yaml.dump(config, fout, default_flow_style=False)
+
+def run_http(idx, batch_size):
+    """POST the lines of data-c.txt to the HTTP endpoint in batches and time it.
+
+    Args:
+        idx: worker index, used only in the start-up message.
+        batch_size: number of lines sent per request.
+
+    Returns:
+        [[elapsed_seconds]], measured from just before the file is read until
+        the last request has returned.
+    """
+    print("start thread ({})".format(idx))
+    url = "http://127.0.0.1:18082/bert/prediction"
+    with open("data-c.txt", 'r') as fin:
+        start = time.time()
+        lines = fin.readlines()
+        start_idx = 0
+        while start_idx < len(lines):
+            end_idx = min(len(lines), start_idx + batch_size)
+            # Keys are positions inside the batch ("0", "1", ...), not line
+            # numbers of the file.
+            feed = {str(i - start_idx): lines[i] for i in range(start_idx, end_idx)}
+            data = {"key": list(feed.keys()), "value": list(feed.values())}
+            # NOTE(review): the response is not inspected, so failed requests
+            # are still counted in the measured time.
+            requests.post(url=url, data=json.dumps(data))
+            start_idx += batch_size
+            # Stop once the cursor has moved past line 2000.
+            if start_idx > 2000:
+                break
+        end = time.time()
+    return [[end - start]]
+
+def multithread_http(thread, batch_size):
+    """Hand run_http, `thread` and `batch_size` to MultiThreadRunner.run.
+
+    Whatever the runner returns is discarded; this function returns None.
+    """
+    runner = MultiThreadRunner()
+    runner.run(run_http, thread, batch_size)
+
+def run_rpc(thread, batch_size):
+    """Send the lines of data-c.txt to the pipeline RPC server in batches and time it.
+
+    Args:
+        thread: first positional argument supplied by the caller; not used
+            inside this function.
+        batch_size: number of lines per predict() call.
+
+    Returns:
+        [[elapsed_seconds]] covering the file read and all predict() calls.
+    """
+    rpc_client = PipelineClient()
+    rpc_client.connect(['127.0.0.1:9998'])
+    with open("data-c.txt", 'r') as fin:
+        start = time.time()
+        lines = fin.readlines()
+        cursor = 0
+        while cursor < len(lines):
+            stop = min(len(lines), cursor + batch_size)
+            # Keys are positions inside the batch ("0", "1", ...).
+            feed = {str(pos - cursor): lines[pos] for pos in range(cursor, stop)}
+            rpc_client.predict(feed_dict=feed, fetch=["res"])
+            cursor += batch_size
+            # Stop once the cursor has moved past line 1000.
+            if cursor > 1000:
+                break
+        end = time.time()
+    return [[end - start]]
+
+
+def multithread_rpc(thread, batch_size):
+    """Hand run_rpc, `thread` and `batch_size` to MultiThreadRunner.run.
+
+    The first parameter was previously misspelled `thraed`, so the body picked
+    up a module-level `thread` instead of its own argument and raised
+    NameError when no such global existed. Positional callers are unaffected
+    by the rename.
+
+    Whatever the runner returns is discarded; this function returns None.
+    """
+    multi_thread_runner = MultiThreadRunner()
+    multi_thread_runner.run(run_rpc, thread, batch_size)
+
+if __name__ == "__main__":
+    # Command line:
+    #   benchmark.py yaml <mode> <thread> <device>         write config2.yml
+    #   benchmark.py run  <http|rpc> <thread> <batch_size> run the load test
+    #   benchmark.py dump <filein> <fileout>               filter a benchmark report
+    command = sys.argv[1]
+    if command == "dump":
+        filein = sys.argv[2]
+        fileout = sys.argv[3]
+        parse_benchmark(filein, fileout)
+    elif command == "run":
+        mode = sys.argv[2]  # http/ rpc
+        thread = int(sys.argv[3])
+        batch_size = int(sys.argv[4])
+        if mode == "rpc":
+            multithread_rpc(thread, batch_size)
+        elif mode == "http":
+            multithread_http(thread, batch_size)
+    elif command == "yaml":
+        # mode and thread are parsed but only device is passed on.
+        mode = sys.argv[2]  # brpc/  local predictor
+        thread = int(sys.argv[3])
+        device = sys.argv[4]
+        gen_yml(device)
+    
--- a/python/examples/pipeline/bert/benchmark.sh
+++ b/python/examples/pipeline/bert/benchmark.sh
+export FLAGS_profile_pipeline=1
+alias python3="python3.7"
+modelname="bert"
+# HTTP benchmark: for every thread count / batch size pair, start the web
+# service, run the load test and append the measurements to profile_log_$modelname.
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9 
+sleep 3
+python3 benchmark.py yaml local_predictor 1 gpu 
+rm -rf profile_log_$modelname
+for thread_num in 1 8 16 
+do
+  for batch_size in 1 10 100
+  do
+    echo "----Bert thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
+    rm -rf PipelineServingLogs
+    rm -rf cpu_utilization.py
+    python3 web_service.py >web.log 2>&1 &
+    sleep 3
+    # Sample GPU 2 every 100 ms. The sampler PIDs are kept so the samplers can
+    # be stopped once the load test is over.
+    nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    gpu_memory_pid=$!
+    nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
+    gpu_utilization_pid=$!
+    # printf expands \n in every shell; echo leaves it as a literal backslash-n
+    # in shells such as bash, which would produce an invalid one-line script.
+    printf "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+    python3 benchmark.py run http $thread_num $batch_size
+    python3 cpu_utilization.py >>profile_log_$modelname
+    # Stop the samplers; otherwise two more pile up per iteration and keep
+    # writing into the log files that the next iteration truncates.
+    kill $gpu_memory_pid $gpu_utilization_pid
+    ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+    python3 benchmark.py dump benchmark.log benchmark.tmp
+    mv benchmark.tmp benchmark.log
+    # Line 1 is the CSV header; the numeric reading is expected in field 1.
+    # "+ 0" forces a numeric comparison.
+    awk 'BEGIN {max = 0} NR > 1 {v = $1 + 0; if (v > max) max = v} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
+    awk 'BEGIN {max = 0} NR > 1 {v = $1 + 0; if (v > max) max = v} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
+    cat benchmark.log >> profile_log_$modelname
+    #rm -rf gpu_use.log gpu_utilization.log
+  done
+done
+# RPC benchmark: same procedure as the HTTP run, driving the service through
+# the RPC client instead.
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+sleep 3
+python3 benchmark.py yaml local_predictor 1 gpu
+
+for thread_num in 1 8 16
+do
+  for batch_size in 1 10 100
+  do
+    echo "----Bert thread num: $thread_num batch size: $batch_size mode:rpc ----" >>profile_log_$modelname
+    rm -rf PipelineServingLogs
+    rm -rf cpu_utilization.py
+    python3 web_service.py >web.log 2>&1 &
+    sleep 3
+    # Sample GPU 2 every 100 ms. The sampler PIDs are kept so the samplers can
+    # be stopped once the load test is over.
+    nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    gpu_memory_pid=$!
+    nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
+    gpu_utilization_pid=$!
+    # printf expands \n in every shell; echo leaves it as a literal backslash-n
+    # in shells such as bash, which would produce an invalid one-line script.
+    printf "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+    python3 benchmark.py run rpc $thread_num $batch_size
+    python3 cpu_utilization.py >>profile_log_$modelname
+    # Stop the samplers; otherwise two more pile up per iteration and keep
+    # writing into the log files that the next iteration truncates.
+    kill $gpu_memory_pid $gpu_utilization_pid
+    ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+    python3 benchmark.py dump benchmark.log benchmark.tmp
+    mv benchmark.tmp benchmark.log
+    # Line 1 is the CSV header; the numeric reading is expected in field 1.
+    # "+ 0" forces a numeric comparison.
+    awk 'BEGIN {max = 0} NR > 1 {v = $1 + 0; if (v > max) max = v} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
+    awk 'BEGIN {max = 0} NR > 1 {v = $1 + 0; if (v > max) max = v} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
+    #rm -rf gpu_use.log gpu_utilization.log
+    cat benchmark.log >> profile_log_$modelname
+  done
+done
--- a/python/examples/pipeline/bert/config.yml
+++ b/python/examples/pipeline/bert/config.yml
+dag:
+  is_thread_op: false
+  tracer:
+    interval_s: 10
+http_port: 18082
+op:
+  bert:
+    local_service_conf:
+      client_type: local_predictor
+      concurrency: 2
+      device_type: 1
+      devices: '2'
+      fetch_list:
+      - pooled_output
+      model_config: bert_seq128_model/
+rpc_port: 9998
+worker_num: 20
--- a/python/examples/pipeline/bert/get_data.sh
+++ b/python/examples/pipeline/bert/get_data.sh
+# Download and unpack the BERT model/client archive.
+wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz
+tar -xzf bert_chinese_L-12_H-768_A-12.tar.gz
+# Rename the extracted directories to the bert_seq128_* names.
+mv bert_chinese_L-12_H-768_A-12_model bert_seq128_model
+mv bert_chinese_L-12_H-768_A-12_client bert_seq128_client
+# Fetch the sample input lines (data-c.txt) and the vocabulary (vocab.txt).
+wget https://paddle-serving.bj.bcebos.com/bert_example/data-c.txt --no-check-certificate
+wget https://paddle-serving.bj.bcebos.com/bert_example/vocab.txt --no-check-certificate
--- a/python/examples/pipeline/bert/pipeline_rpc_client.py
+++ b/python/examples/pipeline/bert/pipeline_rpc_client.py
+import sys
+import os
+import yaml
+import requests
+import time
+import json
+try:
+    from paddle_serving_server_gpu.pipeline import PipelineClient
+except ImportError:
+    from paddle_serving_server.pipeline import PipelineClient
+import numpy as np
+
+
+client = PipelineClient()
+client.connect(['127.0.0.1:9998'])
+batch_size = 101
+# Send data-c.txt to the pipeline server, batch_size lines per request, and
+# print every response. Keys are positions inside the batch ("0", "1", ...).
+with open("data-c.txt", 'r') as fin:
+    lines = fin.readlines()
+    for start_idx in range(0, len(lines), batch_size):
+        end_idx = min(len(lines), start_idx + batch_size)
+        feed = {str(pos - start_idx): lines[pos] for pos in range(start_idx, end_idx)}
+        ret = client.predict(feed_dict=feed, fetch=["res"])
+        print(ret)
--- a/python/examples/pipeline/bert/web_service.py
+++ b/python/examples/pipeline/bert/web_service.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+try:
+    from paddle_serving_server_gpu.web_service import WebService, Op
+except ImportError:
+    from paddle_serving_server.web_service import WebService, Op
+import logging
+import numpy as np
+import sys
+from paddle_serving_app.reader import ChineseBertReader
+_LOGGER = logging.getLogger()
+
+
+class BertOp(Op):
+    """Pipeline op that prepares text for the BERT model and stringifies its pooled output."""
+
+    def init_op(self):
+        """Create the ChineseBertReader (vocab.txt, maximum sequence length 128)."""
+        reader_conf = {"vocab_file": "vocab.txt", "max_seq_len": 128}
+        self.reader = ChineseBertReader(reader_conf)
+
+    def preprocess(self, input_dicts, data_id, log_id):
+        """Convert one request of texts keyed "0".."n-1" into batched feed arrays.
+
+        Each text is run through the reader; every resulting field is reshaped
+        to (1, len(field), 1) and the per-text arrays are concatenated along
+        axis 0. The input dict and every resulting shape are printed.
+
+        Returns:
+            (feed_dict, False, None, "") -- NOTE(review): the meaning of the
+            last three items is defined by the Op base class; confirm there.
+        """
+        # Exactly one upstream input is expected; unpacking fails otherwise.
+        (_, input_dict), = input_dicts.items()
+        print("input dict", input_dict)
+        sample_count = len(input_dict.keys())
+        samples = []
+        for idx in range(sample_count):
+            encoded = self.reader.process(input_dict[str(idx)].encode("utf-8"))
+            for name in encoded.keys():
+                column = encoded[name]
+                encoded[name] = np.array(column).reshape((1, len(column), 1))
+            samples.append(encoded)
+        batched = {}
+        for name in samples[0].keys():
+            batched[name] = np.concatenate([item[name] for item in samples], axis=0)
+            print(name, batched[name].shape)
+        return batched, False, None, ""
+
+    def postprocess(self, input_dicts, fetch_dict, log_id):
+        """Replace fetch_dict["pooled_output"] with its str() form and return it.
+
+        Returns:
+            (fetch_dict, None, "").
+        """
+        pooled = fetch_dict["pooled_output"]
+        fetch_dict["pooled_output"] = str(pooled)
+        return fetch_dict, None, ""
+
+
+class BertService(WebService):
+    """Web service whose pipeline is a single BertOp fed by the read op."""
+
+    def get_pipeline_response(self, read_op):
+        """Return a BertOp named "bert" that takes `read_op` as its only input."""
+        return BertOp(name="bert", input_ops=[read_op])
+
+
+# Start the service from config2.yml. That file is written by gen_yml in this
+# example's benchmark.py, so the "yaml" step has to run first.
+bert_service = BertService(name="bert")
+bert_service.prepare_pipeline_config("config2.yml")
+bert_service.run_service()
--- a/python/examples/pipeline/imagenet/pipeline_rpc_client.py
+++ b/python/examples/pipeline/imagenet/pipeline_rpc_client.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 try:
-    from paddle_serving_server_gpu.pipeline import PipelineClient
+    from paddle_serving_server.pipeline import PipelineClient
 except ImportError:
    from paddle_serving_server.pipeline import PipelineClient
 import numpy as np

--- a/python/examples/pipeline/imagenet/resnet50_web_service.py
+++ b/python/examples/pipeline/imagenet/resnet50_web_service.py
@@ -14,7 +14,7 @@
 import sys
 from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
 try:
-    from paddle_serving_server_gpu.web_service import WebService, Op
+    from paddle_serving_server.web_service import WebService, Op
 except ImportError:
    from paddle_serving_server.web_service import WebService, Op
 import logging

--- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
+++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
@@ -22,7 +22,7 @@ import logging
 try:
    from paddle_serving_server.web_service import WebService
 except ImportError:
-    from paddle_serving_server_gpu.web_service import WebService
+    from paddle_serving_server.web_service import WebService

 _LOGGER = logging.getLogger()
 user_handler = logging.StreamHandler()

--- a/python/examples/pipeline/ocr/benchmark.py
+++ b/python/examples/pipeline/ocr/benchmark.py
+import sys
+import os
+import base64
+import yaml
+import requests
+import time
+import json
+try:
+    from paddle_serving_server_gpu.pipeline import PipelineClient
+except ImportError:
+    from paddle_serving_server.pipeline import PipelineClient
+import numpy as np
+from paddle_serving_client.utils import MultiThreadRunner
+from paddle_serving_client.utils import benchmark_args, show_latency
+def parse_benchmark(filein, fileout):
+    """Copy a YAML report, dropping every "DAG" entry whose key contains "call".
+
+    Args:
+        filein: path of the YAML file to read; it must have a top-level "DAG"
+            mapping.
+        fileout: path the filtered YAML is written to (block style).
+    """
+    with open(filein, "r") as fin:
+        # PyYAML 6 raises on yaml.load() without a Loader and 5.1+ warns.
+        # FullLoader is what the bare call resolved to from 5.1 onwards.
+        res = yaml.load(fin, Loader=yaml.FullLoader)
+    # Keep only the DAG entries whose key does not mention "call".
+    res["DAG"] = {
+        key: value for key, value in res["DAG"].items() if "call" not in key
+    }
+    with open(fileout, "w") as fout:
+        yaml.dump(res, fout, default_flow_style=False)
+
+def gen_yml(device):
+    """Write config2.yml, a copy of config.yml adjusted for a benchmark run.
+
+    dag.tracer is always set to {"interval_s": 10}. When device is "gpu", the
+    local_service_conf of the "det" and "rec" ops additionally gets
+    device_type 1 and devices "2".
+
+    Args:
+        device: "gpu" applies the overrides above; any other value leaves the
+            op settings as read from config.yml.
+    """
+    # The context manager closes config.yml even when parsing raises.
+    with open("config.yml", "r") as fin:
+        # PyYAML 6 raises on yaml.load() without a Loader and 5.1+ warns.
+        config = yaml.load(fin, Loader=yaml.FullLoader)
+    config["dag"]["tracer"] = {"interval_s": 10}
+    if device == "gpu":
+        for op_name in ("det", "rec"):
+            local_conf = config["op"][op_name]["local_service_conf"]
+            local_conf["device_type"] = 1
+            local_conf["devices"] = "2"
+    with open("config2.yml", "w") as fout:
+        yaml.dump(config, fout, default_flow_style=False)
+
+def cv2_to_base64(image):
+    """Base64-encode the bytes in image and return the result as a str."""
+    encoded = base64.b64encode(image)
+    return encoded.decode('utf8')
+
+def run_http(idx, batch_size):
+    """POST every file in imgs/ to the OCR HTTP endpoint, 100 times per file.
+
+    Args:
+        idx: value echoed in the start-up message.
+        batch_size: not used by this function.
+
+    Returns:
+        [[elapsed]] where elapsed is the time.time() difference between the
+        start of the function and the end of the last request.
+    """
+    print("start thread ({})".format(idx))
+    url = "http://127.0.0.1:9999/ocr/prediction"
+    start = time.time()
+
+    test_img_dir = "imgs/"
+    for img_file in os.listdir(test_img_dir):
+        with open(os.path.join(test_img_dir, img_file), 'rb') as file:
+            image_data1 = file.read()
+        image = cv2_to_base64(image_data1)
+        # The payload is identical for all 100 posts, so serialize it once.
+        payload = json.dumps({"key": ["image"], "value": [image]})
+        for i in range(100):
+            requests.post(url=url, data=payload)
+    # Read the clock after the loop so `end` exists even when imgs/ is empty.
+    end = time.time()
+    return [[end - start]]
+
+def multithread_http(thread, batch_size):
+    """Call MultiThreadRunner().run with run_http, thread and batch_size."""
+    MultiThreadRunner().run(run_http, thread, batch_size)
+
+def run_rpc(thread, batch_size):
+    """Send every file in imgs/ to the pipeline server, 100 predictions per file.
+
+    A PipelineClient is connected to 127.0.0.1:18090; each file is read,
+    Base64-encoded and submitted as the "image" feed with fetch ["res"].
+    Neither thread nor batch_size is used by this function.
+
+    Returns:
+        [[elapsed]] where elapsed is the time.time() difference measured
+        around the whole image loop.
+    """
+    rpc_client = PipelineClient()
+    rpc_client.connect(['127.0.0.1:18090'])
+    begin = time.time()
+    img_dir = "imgs/"
+    for name in os.listdir(img_dir):
+        with open(os.path.join(img_dir, name), 'rb') as handle:
+            encoded = cv2_to_base64(handle.read())
+
+        for _ in range(100):
+            rpc_client.predict(feed_dict={"image": encoded}, fetch=["res"])
+    return [[time.time() - begin]]
+
+
+def multithread_rpc(thraed, batch_size):
+    """Call MultiThreadRunner().run with run_rpc and the two arguments.
+
+    Args:
+        thraed: passed to MultiThreadRunner.run as its second argument (the
+            misspelled name is kept so the signature does not change).
+        batch_size: passed to MultiThreadRunner.run as its third argument.
+    """
+    multi_thread_runner = MultiThreadRunner()
+    # Forward the parameter itself; reading a global called `thread` here
+    # only works when this module is executed as a script.
+    multi_thread_runner.run(run_rpc, thraed, batch_size)
+
+if __name__ == "__main__":
+    # Sub-commands, selected by the first argument:
+    #   yaml <mode> <thread> <device>      -> gen_yml(device)
+    #   run  <http|rpc> <thread> <batch>   -> multithread_http / multithread_rpc
+    #   dump <filein> <fileout>            -> parse_benchmark(filein, fileout)
+    command = sys.argv[1]
+    if command == "yaml":
+        mode = sys.argv[2]  # brpc/  local predictor
+        thread = int(sys.argv[3])
+        gen_yml(sys.argv[4])
+    elif command == "run":
+        mode = sys.argv[2]  # http/ rpc
+        thread = int(sys.argv[3])
+        batch_size = int(sys.argv[4])
+        runners = {"http": multithread_http, "rpc": multithread_rpc}
+        if mode in runners:
+            runners[mode](thread, batch_size)
+    elif command == "dump":
+        parse_benchmark(sys.argv[2], sys.argv[3])
+    
--- a/python/examples/pipeline/ocr/benchmark.sh
+++ b/python/examples/pipeline/ocr/benchmark.sh
+# Benchmark driver: for each thread count, start web_service.py, sample GPU 2
+# with nvidia-smi, run benchmark.py in http and then rpc mode, and append the
+# collected figures to profile_log_$modelname.
+export FLAGS_profile_pipeline=1
+alias python3="python3.7"
+modelname="ocr"
+# HTTP
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+sleep 3
+python3 benchmark.py yaml local_predictor 1 gpu
+rm -rf profile_log_$modelname
+for thread_num in 1 8 16
+do
+  for batch_size in 1
+  do
+    echo "----Bert thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
+    rm -rf PipelineServingLogs
+    rm -rf cpu_utilization.py
+    python3 web_service.py >web.log 2>&1 &
+    sleep 3
+    # Background samplers; their PIDs are kept so they can be stopped below.
+    nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    gpu_use_pid=$!
+    nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
+    gpu_utilization_pid=$!
+    # printf expands \n in every POSIX shell; echo does not (e.g. bash without -e).
+    printf "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+    python3 benchmark.py run http $thread_num $batch_size
+    python3 cpu_utilization.py >>profile_log_$modelname
+    ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+    # Stop the samplers so they neither pile up across iterations nor keep
+    # writing into the log files that the next iteration truncates.
+    kill $gpu_use_pid $gpu_utilization_pid 2>/dev/null
+    python3 benchmark.py dump benchmark.log benchmark.tmp
+    mv benchmark.tmp benchmark.log
+    # NR>1 skips the CSV header; $1 is the first whitespace-separated field
+    # (an awk variable named modelname would be unset and select $0 instead).
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
+    cat benchmark.log >> profile_log_$modelname
+    #rm -rf gpu_use.log gpu_utilization.log
+  done
+done
+# RPC
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+sleep 3
+python3 benchmark.py yaml local_predictor 1 gpu
+
+for thread_num in 1 8 16
+do
+  for batch_size in 1
+  do
+    echo "----Bert thread num: $thread_num batch size: $batch_size mode:rpc ----" >>profile_log_$modelname
+    rm -rf PipelineServingLogs
+    rm -rf cpu_utilization.py
+    python3 web_service.py >web.log 2>&1 &
+    sleep 3
+    # Background samplers; their PIDs are kept so they can be stopped below.
+    nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    gpu_use_pid=$!
+    nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
+    gpu_utilization_pid=$!
+    # printf expands \n in every POSIX shell; echo does not (e.g. bash without -e).
+    printf "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+    python3 benchmark.py run rpc $thread_num $batch_size
+    python3 cpu_utilization.py >>profile_log_$modelname
+    ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+    # Stop the samplers so they neither pile up across iterations nor keep
+    # writing into the log files that the next iteration truncates.
+    kill $gpu_use_pid $gpu_utilization_pid 2>/dev/null
+    python3 benchmark.py dump benchmark.log benchmark.tmp
+    mv benchmark.tmp benchmark.log
+    # NR>1 skips the CSV header; $1 is the first whitespace-separated field.
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
+    #rm -rf gpu_use.log gpu_utilization.log
+    cat benchmark.log >> profile_log_$modelname
+  done
+done
--- a/python/examples/pipeline/ocr/config.yml
+++ b/python/examples/pipeline/ocr/config.yml
@@ -6,7 +6,7 @@ http_port: 9999

 #worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程，每个进程内构建grpcSever和DAG
 ##当build_dag_each_worker=False时，框架会设置主线程grpc线程池的max_workers=worker_num
-worker_num: 1
+worker_num: 5

 #build_dag_each_worker, False，框架在进程内创建一条DAG；True，框架会每个进程内创建多个独立的DAG
 build_dag_each_worker: false
@@ -20,6 +20,9 @@ dag:

    #使用性能分析, True，生成Timeline性能数据，对性能有一定影响；False为不使用
    use_profile: false
+    tracer:
+        interval_s: 10
+
 op:
    det:
        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
@@ -37,7 +40,7 @@ op:
            fetch_list: ["concat_1.tmp_0"]

            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
-            devices: "0"
+            devices: "2"
    rec:
        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
        concurrency: 2
@@ -61,4 +64,4 @@ op:
            fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"] 

            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
-            devices: "0"
+            devices: "2"
--- a/python/examples/pipeline/ocr/pipeline_rpc_client.py
+++ b/python/examples/pipeline/ocr/pipeline_rpc_client.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 try:
-    from paddle_serving_server_gpu.pipeline import PipelineClient
+    from paddle_serving_server.pipeline import PipelineClient
 except ImportError:
    from paddle_serving_server.pipeline import PipelineClient
 import numpy as np

--- a/python/examples/pipeline/ocr/web_service.py
+++ b/python/examples/pipeline/ocr/web_service.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 try:
-    from paddle_serving_server.web_service import WebService, Op
-except ImportError:
    from paddle_serving_server_gpu.web_service import WebService, Op
+except ImportError:
+    from paddle_serving_server.web_service import WebService, Op
 import logging
 import numpy as np
 import cv2
@@ -45,16 +45,19 @@ class DetOp(Op):

    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
-        data = base64.b64decode(input_dict["image"].encode('utf8'))
-        data = np.fromstring(data, np.uint8)
-        # Note: class variables(self.var) can only be used in process op mode
-        self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
-        self.ori_h, self.ori_w, _ = self.im.shape
-        det_img = self.det_preprocess(self.im)
-        _, self.new_h, self.new_w = det_img.shape
-        return {"image": det_img[np.newaxis, :].copy()}, False, None, ""
+        imgs = []
+        for key in input_dict.keys():
+            data = base64.b64decode(input_dict[key].encode('utf8'))
+            data = np.fromstring(data, np.uint8)
+            self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+            self.ori_h, self.ori_w, _ = self.im.shape
+            det_img = self.det_preprocess(self.im)
+            _, self.new_h, self.new_w = det_img.shape
+            imgs.append(det_img[np.newaxis, :].copy())
+        return {"image": np.concatenate(imgs, axis=0)}, False, None, ""

    def postprocess(self, input_dicts, fetch_dict, log_id):
+#        print(fetch_dict)
        det_out = fetch_dict["concat_1.tmp_0"]
        ratio_list = [
            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
@@ -62,7 +65,6 @@ class DetOp(Op):
        dt_boxes_list = self.post_func(det_out, [ratio_list])
        dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
        out_dict = {"dt_boxes": dt_boxes, "image": self.im}
-        print("out dict", out_dict)
        return out_dict, None, ""


@@ -112,5 +114,5 @@ class OcrService(WebService):


 uci_service = OcrService(name="ocr")
-uci_service.prepare_pipeline_config("config.yml")
+uci_service.prepare_pipeline_config("config2.yml")
 uci_service.run_service()
--- a/python/examples/pipeline/simple_web_service/benchmark.py
+++ b/python/examples/pipeline/simple_web_service/benchmark.py
+import sys
+import os
+import yaml
+import requests
+import time
+import json
+try:
+    from paddle_serving_server_gpu.pipeline import PipelineClient
+except ImportError:
+    from paddle_serving_server.pipeline import PipelineClient
+import numpy as np
+from paddle_serving_client.utils import MultiThreadRunner
+from paddle_serving_client.utils import benchmark_args, show_latency
+
+def gen_yml():
+    """Write config2.yml: config.yml with dag.tracer set to {"interval_s": 5}."""
+    # The context manager closes config.yml even when parsing raises.
+    with open("config.yml", "r") as fin:
+        # PyYAML 6 raises on yaml.load() without a Loader and 5.1+ warns.
+        # FullLoader is what the bare call resolved to from 5.1 onwards.
+        config = yaml.load(fin, Loader=yaml.FullLoader)
+    config["dag"]["tracer"] = {"interval_s": 5}
+    with open("config2.yml", "w") as fout:
+        yaml.dump(config, fout, default_flow_style=False)
+
+def run_http(idx, batch_size):
+    """POST the same request to the uci HTTP endpoint 1000 times.
+
+    The request value is the 13-number sample row repeated batch_size times
+    and joined with ";". The JSON body of the last response is printed.
+
+    Args:
+        idx: value echoed in the start-up message.
+        batch_size: number of copies of the sample row in each request.
+
+    Returns:
+        [[elapsed]] where elapsed is the time.time() difference from the
+        start of the function to just after the final print.
+    """
+    print("start thread ({})".format(idx))
+    url = "http://127.0.0.1:18082/uci/prediction"
+    began = time.time()
+    value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"
+    joined = ";".join([value] * batch_size)
+    data = {"key": ["x"], "value": [joined]}
+    response = None
+    for _ in range(1000):
+        response = requests.post(url=url, data=json.dumps(data))
+    print(response.json())
+    elapsed = time.time() - began
+    return [[elapsed]]
+
+def multithread_http(thread, batch_size):
+    """Call MultiThreadRunner().run with run_http, thread and batch_size."""
+    MultiThreadRunner().run(run_http, thread, batch_size)
+
+def run_rpc(thread, batch_size):
+    """Send the same prediction request to the pipeline server 1000 times.
+
+    A PipelineClient is connected to 127.0.0.1:9998. The "x" feed is the
+    13-number sample row repeated batch_size times and joined with ";". The
+    last result is printed.
+
+    Args:
+        thread: not used by this function.
+        batch_size: number of copies of the sample row in each request.
+
+    Returns:
+        [[elapsed]], the same shape run_http returns, where elapsed is the
+        time.time() difference measured after connecting. (This function
+        previously returned None.)
+    """
+    client = PipelineClient()
+    client.connect(['127.0.0.1:9998'])
+    start = time.time()
+    value = "0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"
+    all_value = ";".join([value for i in range(batch_size)])
+    data = {"key": "x", "value": all_value}
+    for i in range(1000):
+        ret = client.predict(feed_dict={data["key"]: data["value"]}, fetch=["res"])
+    print(ret)
+    end = time.time()
+    # NOTE(review): assumes the runner treats this like run_http's result; confirm.
+    return [[end - start]]
+
+def multithread_rpc(thraed, batch_size):
+    """Call MultiThreadRunner().run with run_rpc and the two arguments.
+
+    Args:
+        thraed: passed to MultiThreadRunner.run as its second argument (the
+            misspelled name is kept so the signature does not change).
+        batch_size: passed to MultiThreadRunner.run as its third argument.
+    """
+    multi_thread_runner = MultiThreadRunner()
+    # Forward the parameter itself; reading a global called `thread` here
+    # only works when this module is executed as a script.
+    multi_thread_runner.run(run_rpc, thraed, batch_size)
+
+if __name__ == "__main__":
+    # Sub-commands, selected by the first argument:
+    #   yaml <mode> <thread>                -> gen_yml()
+    #   run  <http|rpc> <thread> <batch>    -> multithread_http / multithread_rpc
+    command = sys.argv[1]
+    if command == "yaml":
+        mode = sys.argv[2]  # brpc/  local predictor
+        thread = int(sys.argv[3])
+        gen_yml()
+    elif command == "run":
+        mode = sys.argv[2]  # http/ rpc
+        thread = int(sys.argv[3])
+        batch_size = int(sys.argv[4])
+        runners = {"http": multithread_http, "rpc": multithread_rpc}
+        if mode in runners:
+            runners[mode](thread, batch_size)
+
+    
--- a/python/examples/pipeline/simple_web_service/benchmark.sh
+++ b/python/examples/pipeline/simple_web_service/benchmark.sh
+# Benchmark driver: start web_service.py, run benchmark.py in http and then
+# rpc mode, and print the CPU utilisation plus the tail of the pipeline tracer.
+# HTTP
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+sleep 3
+python3 benchmark.py yaml local_predictor 1
+
+for thread_num in 1
+do
+for batch_size in 1
+do
+rm -rf PipelineServingLogs
+rm -rf cpu_utilization.py
+python3 web_service.py >web.log 2>&1 &
+sleep 3
+# printf expands \n in every POSIX shell; echo does not (e.g. bash without -e).
+printf "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+python3 benchmark.py run http $thread_num $batch_size
+python3 cpu_utilization.py
+echo "------------Fit a line pipeline benchmark (Thread: $thread_num) (BatchSize: $batch_size)"
+tail -n 25 PipelineServingLogs/pipeline.tracer
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+done
+done
+
+# RPC
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+sleep 3
+python3 benchmark.py yaml local_predictor 1
+
+for thread_num in 1
+do
+for batch_size in 1
+do
+rm -rf PipelineServingLogs
+rm -rf cpu_utilization.py
+python3 web_service.py >web.log 2>&1 &
+sleep 3
+# printf expands \n in every POSIX shell; echo does not (e.g. bash without -e).
+printf "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+python3 benchmark.py run rpc $thread_num $batch_size
+python3 cpu_utilization.py
+echo "------------Fit a line pipeline benchmark (Thread: $thread_num) (BatchSize: $batch_size)"
+tail -n 25 PipelineServingLogs/pipeline.tracer
+ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
+done
+done
--- a/python/examples/pipeline/simple_web_service/web_service.py
+++ b/python/examples/pipeline/simple_web_service/web_service.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 try:
-    from paddle_serving_server_gpu.web_service import WebService, Op
+    from paddle_serving_server.web_service import WebService, Op
 except ImportError:
    from paddle_serving_server.web_service import WebService, Op
 import logging
@@ -25,32 +25,24 @@ _LOGGER = logging.getLogger()
 class UciOp(Op):
    def init_op(self):
        self.separator = ","
+        self.batch_separator = ";"

    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        _LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format(
            log_id, input_dict))
-        x_value = input_dict["x"]
+        x_value = input_dict["x"].split(self.batch_separator)
+        x_lst = []
+        for x_val in x_value:
+            x_lst.append(np.array([float(x.strip()) for x in x_val.split(self.separator)]).reshape(1, 13))
+        input_dict["x"] = np.concatenate(x_lst, axis=0) 
        proc_dict = {}
-        if sys.version_info.major == 2:
-            if isinstance(x_value, (str, unicode)):
-                input_dict["x"] = np.array(
-                    [float(x.strip())
-                     for x in x_value.split(self.separator)]).reshape(1, 13)
-                _LOGGER.error("input_dict:{}".format(input_dict))
-        else:
-            if isinstance(x_value, str):
-                input_dict["x"] = np.array(
-                    [float(x.strip())
-                     for x in x_value.split(self.separator)]).reshape(1, 13)
-                _LOGGER.error("input_dict:{}".format(input_dict))
-
        return input_dict, False, None, ""

    def postprocess(self, input_dicts, fetch_dict, log_id):
        _LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format(
            log_id, fetch_dict))
-        fetch_dict["price"] = str(fetch_dict["price"][0][0])
+        fetch_dict["price"] = str(fetch_dict["price"])
        return fetch_dict, None, ""


@@ -61,5 +53,5 @@ class UciService(WebService):


 uci_service = UciService(name="uci")
-uci_service.prepare_pipeline_config("config.yml")
+uci_service.prepare_pipeline_config("config2.yml")
 uci_service.run_service()
--- a/python/examples/resnet_v2_50/README.md
+++ b/python/examples/resnet_v2_50/README.md
@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
 ### Start Service

 ```
-python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
+python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
 ```

 ### Client Prediction

--- a/python/examples/resnet_v2_50/README_CN.md
+++ b/python/examples/resnet_v2_50/README_CN.md
@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
 ### 启动服务端

 ```
-python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
+python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --gpu_ids 0 --port 9393
 ```

 ### 客户端预测

--- a/python/examples/unet_for_image_seg/README.md
+++ b/python/examples/unet_for_image_seg/README.md
@@ -12,7 +12,7 @@ tar -xzvf unet.tar.gz
 ### Start Service

 ```
-python -m paddle_serving_server_gpu.serve --model unet_model --gpu_ids 0 --port 9494
+python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494
 ```

 ### Client Prediction

--- a/python/examples/unet_for_image_seg/README_CN.md
+++ b/python/examples/unet_for_image_seg/README_CN.md
@@ -12,7 +12,7 @@ tar -xzvf unet.tar.gz
 ### 启动服务端

 ```
-python -m paddle_serving_server_gpu.serve --model unet_model --gpu_ids 0 --port 9494
+python -m paddle_serving_server.serve --model unet_model --gpu_ids 0 --port 9494
 ```

 ### 客户端预测

--- a/python/examples/xpu/fit_a_line_xpu/README.md
+++ b/python/examples/xpu/fit_a_line_xpu/README.md
@@ -15,7 +15,7 @@ sh get_data.sh
 ### Start server

 ```shell
-python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim
+python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_lite --use_xpu --ir_optim
 ```

 ### Client prediction

--- a/python/examples/xpu/fit_a_line_xpu/test_server.py
+++ b/python/examples/xpu/fit_a_line_xpu/test_server.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # pylint: disable=doc-string-missing

-from paddle_serving_server_gpu.web_service import WebService
+from paddle_serving_server.web_service import WebService
 import numpy as np



--- a/python/examples/xpu/resnet_v2_50_xpu/README.md
+++ b/python/examples/xpu/resnet_v2_50_xpu/README.md
@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
 ### Start Service

 ```
-python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
+python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
 ```

 ### Client Prediction

--- a/python/examples/xpu/resnet_v2_50_xpu/README_CN.md
+++ b/python/examples/xpu/resnet_v2_50_xpu/README_CN.md
@@ -12,7 +12,7 @@ tar -xzvf resnet_v2_50_imagenet.tar.gz
 ### 启动服务端

 ```
-python -m paddle_serving_server_gpu.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
+python -m paddle_serving_server.serve --model resnet_v2_50_imagenet_model --port 9393 --use_lite --use_xpu --ir_optim
 ```

 ### 客户端预测

--- a/python/examples/yolov4/README.md
+++ b/python/examples/yolov4/README.md
@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
 ## Start RPC Service

 ```
-python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0
+python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0
 ```

 ## Prediction

--- a/python/examples/yolov4/README_CN.md
+++ b/python/examples/yolov4/README_CN.md
@@ -12,7 +12,7 @@ tar -xzvf yolov4.tar.gz
 ## 启动RPC服务

 ```
-python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0
+python -m paddle_serving_server.serve --model yolov4_model --port 9393 --gpu_ids 0
 ```

 ## 预测

--- a/python/gen_version.py
+++ b/python/gen_version.py
@@ -34,10 +34,16 @@ def update_info(file_name, feature, info):
        f.write(new_str)


-if len(sys.argv) > 2:
-    update_info("paddle_serving_server_gpu/version.py", "cuda_version",
+if len(sys.argv) > 2 and len(sys.argv[2]) > 0:
+    update_info("paddle_serving_server/version.py", "version_suffix",
                sys.argv[2])

+package_name = '${SERVER_PACKAGE_NAME}'
+if package_name.endswith('gpu'):
+    update_info("paddle_serving_server/version.py", "device_type", "1")
+elif package_name.endswith('xpu'):
+    update_info("paddle_serving_server/version.py", "device_type", "2")
+
 path = "paddle_serving_" + sys.argv[1]
 commit_id = subprocess.check_output(['git', 'rev-parse', 'HEAD'])
 update_info(path + "/version.py", "commit_id", commit_id)
--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -82,7 +82,10 @@ class LocalPredictor(object):
        f = open(client_config, 'r')
        model_conf = google.protobuf.text_format.Merge(
            str(f.read()), model_conf)
-        config = AnalysisConfig(model_path)
+        if os.path.exists(os.path.join(model_path, "__params__")):
+            config = AnalysisConfig(os.path.join(model_path, "__model__"), os.path.join(model_path, "__params__")) 
+        else:
+            config = AnalysisConfig(model_path) 
        logger.info("load_model_config params: model_path:{}, use_gpu:{},\
            gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
            use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format(

--- a/python/paddle_serving_client/__init__.py
+++ b/python/paddle_serving_client/__init__.py
--- a/python/paddle_serving_client/client.py
+++ b/python/paddle_serving_client/client.py
--- a/python/paddle_serving_server/__init__.py
+++ b/python/paddle_serving_server/__init__.py
--- a/python/paddle_serving_server/dag.py
+++ b/python/paddle_serving_server/dag.py
--- a/python/paddle_serving_server/monitor.py
+++ b/python/paddle_serving_server/monitor.py
@@ -28,7 +28,6 @@ import logging

 _LOGGER = logging.getLogger(__name__)

-
 class Monitor(object):
    '''
    Monitor base class. It is used to monitor the remote model, pull and update the local model.

--- a/python/paddle_serving_server/rpc_service.py
+++ b/python/paddle_serving_server/rpc_service.py
--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
--- a/python/paddle_serving_server/version.py
+++ b/python/paddle_serving_server/version.py
--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
--- a/python/paddle_serving_server_gpu/monitor.py
+++ b/python/paddle_serving_server_gpu/monitor.py
--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
--- a/python/paddle_serving_server_gpu/version.py
+++ b/python/paddle_serving_server_gpu/version.py
--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
--- a/python/pipeline/local_service_handler.py
+++ b/python/pipeline/local_service_handler.py
--- a/python/pipeline/profiler.py
+++ b/python/pipeline/profiler.py
--- a/python/requirements.txt
+++ b/python/requirements.txt
--- a/python/requirements_mac.txt
+++ b/python/requirements_mac.txt
--- a/python/setup.py.app.in
+++ b/python/setup.py.app.in
--- a/python/setup.py.server.in
+++ b/python/setup.py.server.in
--- a/python/setup.py.server_gpu.in
+++ b/python/setup.py.server_gpu.in
--- a/tools/cpp_examples/demo-serving/CMakeLists.txt
+++ b/tools/cpp_examples/demo-serving/CMakeLists.txt
--- a/tools/cpp_examples/elastic-ctr/serving/CMakeLists.txt
+++ b/tools/cpp_examples/elastic-ctr/serving/CMakeLists.txt
--- a/tools/scripts/ipipe_py2.sh
+++ b/tools/scripts/ipipe_py2.sh
--- a/tools/scripts/ipipe_py3.sh
+++ b/tools/scripts/ipipe_py3.sh