Merge branch 'develop' of https://github.com/paddlepaddle/serving into develop

ed938a1a · wangjiawei04 · 9ba7b510 · 62a23aec · ed938a1a · ed938a1a
102 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,16 +18,13 @@ set(PADDLE_SERVING_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_SERVING_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 SET(PADDLE_SERVING_INSTALL_DIR ${CMAKE_BINARY_DIR}/output)
 SET(CMAKE_INSTALL_RPATH "\$ORIGIN" "${CMAKE_INSTALL_RPATH}")
 include(system)
 project(paddle-serving CXX C)
 message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
 find_package(CUDA QUIET)
@@ -40,25 +37,41 @@ if(NOT CMAKE_BUILD_TYPE)
      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
      FORCE)
 endif()
+SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g2 -ggdb")
+SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")
 set(THIRD_PARTY_BUILD_TYPE Release)
-option(WITH_AVX	    "Compile Paddle Serving with AVX intrinsics"    OFF)
+option(WITH_AVX	            "Compile Paddle Serving with AVX intrinsics"        OFF)
-option(WITH_MKL	    "Compile Paddle Serving with MKL support."      OFF)
+option(WITH_MKL	            "Compile Paddle Serving with MKL support."          OFF)
-option(WITH_GPU	    "Compile Paddle Serving with NVIDIA GPU"        OFF)
+option(WITH_GPU	            "Compile Paddle Serving with NVIDIA GPU"            OFF)
-option(WITH_LITE    "Compile Paddle Serving with Paddle Lite Engine"    OFF)
+option(WITH_LITE            "Compile Paddle Serving with Paddle Lite Engine"    OFF)
-option(WITH_XPU	    "Compile Paddle Serving with Baidu Kunlun"        OFF)
+option(WITH_XPU	            "Compile Paddle Serving with Baidu Kunlun"          OFF)
-option(WITH_PYTHON  "Compile Paddle Serving with Python"		    ON)
+option(WITH_PYTHON          "Compile Paddle Serving with Python"                 ON)
-option(CLIENT  	    "Compile Paddle Serving Client"		    OFF)
+option(CLIENT  	            "Compile Paddle Serving Client"		                OFF)
-option(SERVER	    "Compile Paddle Serving Server"		    OFF)
+option(SERVER	            "Compile Paddle Serving Server"		                OFF)
-option(APP          "Compile Paddle Serving App package"	    OFF)
+option(APP                  "Compile Paddle Serving App package"	            OFF)
-option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution"              OFF)
+option(WITH_ELASTIC_CTR     "Compile ELASITC-CTR solution"                      OFF)
-option(PACK         "Compile for whl"                               OFF)
+option(PACK                 "Compile for whl"                                   OFF)
-option(WITH_TRT     "Compile Paddle Serving with TRT"       OFF)
+option(WITH_TRT             "Compile Paddle Serving with TRT"                   OFF)
-option(PADDLE_ON_INFERENCE "Compile for encryption" ON)
+option(PADDLE_ON_INFERENCE  "Compile for encryption"                             ON)
+option(WITH_OPENCV	    "Compile Paddle Serving with OPENCV"                    OFF)
+if (WITH_OPENCV)
+    # OPENCV_DIR is supplied by the user (-DOPENCV_DIR=/path/opencv) and is the
+    # only location searched for the OpenCV package below (NO_DEFAULT_PATH).
+    SET(OPENCV_DIR "" CACHE PATH "Location of libraries")
+    # The cache entry above always defines OPENCV_DIR, so a missing path shows
+    # up as an empty value, never as an undefined variable.
+    if("${OPENCV_DIR}" STREQUAL "")
+        message(FATAL_ERROR "please set OPENCV_DIR with -DOPENCV_DIR=/path/opencv")
+    endif()
+    if (WIN32)
+        find_package(OpenCV REQUIRED PATHS "${OPENCV_DIR}/build/" NO_DEFAULT_PATH)
+    else ()
+        find_package(OpenCV REQUIRED PATHS "${OPENCV_DIR}/share/OpenCV" NO_DEFAULT_PATH)
+    endif ()
+    include_directories(${OpenCV_INCLUDE_DIRS})
+endif()
 if (PADDLE_ON_INFERENCE)
    add_definitions(-DPADDLE_ON_INFERENCE)

--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ We consider deploying deep learning inference service online to be a user-facing
 - Any model trained by [PaddlePaddle](https://github.com/paddlepaddle/paddle) can be used directly, or converted through the [Model Conversion Interface](./doc/SAVE.md), for online deployment with Paddle Serving.
 - Support [Multi-model Pipeline Deployment](./doc/PIPELINE_SERVING.md), and provide both a REST interface and an RPC interface to meet your requirements, [Pipeline example](./python/examples/pipeline).
- Support the model zoos from the Paddle ecosystem, such as [PaddleDetection](./python/examples/detection), [PaddleOCR](./python/examples/ocr), [PaddleRec](https://github.com/PaddlePaddle/PaddleRec/tree/master/tools/recserving/movie_recommender).
+- Support the model zoos from the Paddle ecosystem, such as [PaddleDetection](./python/examples/detection), [PaddleOCR](./python/examples/ocr), [PaddleRec](https://github.com/PaddlePaddle/PaddleRec/tree/master/recserving/movie_recommender).
 - Provide a variety of pre-processing and post-processing utilities so that users can reuse related code across training, deployment and other stages, bridging the gap between AI developers and application developers. For details, please refer to
 [Serving Examples](./python/examples/).

--- a/README_CN.md
+++ b/README_CN.md
@@ -44,7 +44,7 @@ Paddle Serving 旨在帮助深度学习开发者轻易部署在线预测服务
 - 任何经过[PaddlePaddle](https://github.com/paddlepaddle/paddle)训练的模型，都可以经过直接保存或是[模型转换接口](./doc/SAVE_CN.md)，用于Paddle Serving在线部署。
 - 支持[多模型串联服务部署](./doc/PIPELINE_SERVING_CN.md), 同时提供Rest接口和RPC接口以满足您的需求，[Pipeline示例](./python/examples/pipeline)。
- 支持Paddle生态的各大模型库, 例如[PaddleDetection](./python/examples/detection)，[PaddleOCR](./python/examples/ocr)，[PaddleRec](https://github.com/PaddlePaddle/PaddleRec/tree/master/tools/recserving/movie_recommender)。
+- 支持Paddle生态的各大模型库, 例如[PaddleDetection](./python/examples/detection)，[PaddleOCR](./python/examples/ocr)，[PaddleRec](https://github.com/PaddlePaddle/PaddleRec/tree/master/recserving/movie_recommender)。
 - 提供丰富多彩的前后处理，方便用户在训练、部署等各阶段复用相关代码，弥合AI开发者和应用开发者之间的鸿沟，详情参考[模型示例](./python/examples/)。
 <p align="center">

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -26,7 +26,7 @@ ExternalProject_Add(
    extern_zlib
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
-    GIT_TAG         "v1.2.8"
+    GIT_TAG         "v1.2.9"
    PREFIX          ${ZLIB_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
@@ -54,7 +54,10 @@ ELSE(WIN32)
  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)
-ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
+# An OpenCV package found earlier may already provide an imported target named
+# `zlib`; declare ours only when no such target exists, so the property and
+# dependency commands that follow always have a target to act on.
+IF(NOT TARGET zlib)
+  ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
+ENDIF()
 SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 ADD_DEPENDENCIES(zlib extern_zlib)

--- a/core/configure/proto/multi_lang_general_model_service.proto
+++ b/core/configure/proto/multi_lang_general_model_service.proto
@@ -59,7 +59,7 @@ message SimpleResponse { required int32 err_code = 1; }
 message GetClientConfigRequest {}
-message GetClientConfigResponse { required string client_config_str = 1; }
+message GetClientConfigResponse { repeated string client_config_str_list = 1; }
 service MultiLangGeneralModelService {
  rpc Inference(InferenceRequest) returns (InferenceResponse) {}

--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -55,10 +55,10 @@ message ModelToolkitConf { repeated EngineDesc engines = 1; };
 // resource conf
 message ResourceConf {
-  required string model_toolkit_path = 1;
+  repeated string model_toolkit_path = 1;
-  required string model_toolkit_file = 2;
+  repeated string model_toolkit_file = 2;
-  optional string general_model_path = 3;
+  repeated string general_model_path = 3;
-  optional string general_model_file = 4;
+  repeated string general_model_file = 4;
  optional string cube_config_path = 5;
  optional string cube_config_file = 6;
  optional int32 cube_quant_bits = 7; // set 0 if no quant.

--- a/core/general-client/include/general_model.h
+++ b/core/general-client/include/general_model.h
@@ -207,7 +207,7 @@ class PredictorClient {
  void init_gflags(std::vector<std::string> argv);
-  int init(const std::string& client_conf);
+  int init(const std::vector<std::string> &client_conf);
  void set_predictor_conf(const std::string& conf_path,
                          const std::string& conf_file);
@@ -227,6 +227,10 @@ class PredictorClient {
      const std::vector<std::string>& int_feed_name,
      const std::vector<std::vector<int>>& int_shape,
      const std::vector<std::vector<int>>& int_lod_slot_batch,
+      const std::vector<std::vector<std::string>>& string_feed_batch,
+      const std::vector<std::string>& string_feed_name,
+      const std::vector<std::vector<int>>& string_shape,
+      const std::vector<std::vector<int>>& string_lod_slot_batch,
      const std::vector<std::string>& fetch_name,
      PredictorRes& predict_res_batch,  // NOLINT
      const int& pid,

--- a/core/general-client/src/general_model.cpp
+++ b/core/general-client/src/general_model.cpp
@@ -28,7 +28,7 @@ using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::FeedInst;
 using baidu::paddle_serving::predictor::general_model::FetchInst;
+enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
 std::once_flag gflags_init_flag;
 namespace py = pybind11;
@@ -56,12 +56,12 @@ void PredictorClient::init_gflags(std::vector<std::string> argv) {
  });
 }
-int PredictorClient::init(const std::string &conf_file) {
+int PredictorClient::init(const std::vector<std::string> &conf_file) {
  try {
    GeneralModelConfig model_config;
-    if (configure::read_proto_conf(conf_file.c_str(), &model_config) != 0) {
+    if (configure::read_proto_conf(conf_file[0].c_str(), &model_config) != 0) {
      LOG(ERROR) << "Failed to load general model config"
-                 << ", file path: " << conf_file;
+                 << ", file path: " << conf_file[0];
      return -1;
    }
@@ -69,9 +69,7 @@ int PredictorClient::init(const std::string &conf_file) {
    _fetch_name_to_idx.clear();
    _shape.clear();
    int feed_var_num = model_config.feed_var_size();
-    int fetch_var_num = model_config.fetch_var_size();
+    VLOG(2) << "feed var num: " << feed_var_num;
-    VLOG(2) << "feed var num: " << feed_var_num
-            << "fetch_var_num: " << fetch_var_num;
    for (int i = 0; i < feed_var_num; ++i) {
      _feed_name_to_idx[model_config.feed_var(i).alias_name()] = i;
      VLOG(2) << "feed alias name: " << model_config.feed_var(i).alias_name()
@@ -90,6 +88,16 @@ int PredictorClient::init(const std::string &conf_file) {
      _shape.push_back(tmp_feed_shape);
    }
+    if (conf_file.size() > 1) {
+      model_config.Clear();
+      if (configure::read_proto_conf(conf_file[conf_file.size()-1].c_str(), &model_config) != 0) {
+        LOG(ERROR) << "Failed to load general model config"
+                  << ", file path: " << conf_file[conf_file.size()-1];
+        return -1;
+      }
+    }
+    int fetch_var_num = model_config.fetch_var_size();
+    VLOG(2) << "fetch_var_num: " << fetch_var_num;
    for (int i = 0; i < fetch_var_num; ++i) {
      _fetch_name_to_idx[model_config.fetch_var(i).alias_name()] = i;
      VLOG(2) << "fetch [" << i << "]"
@@ -146,11 +154,16 @@ int PredictorClient::numpy_predict(
    const std::vector<std::string> &int_feed_name,
    const std::vector<std::vector<int>> &int_shape,
    const std::vector<std::vector<int>> &int_lod_slot_batch,
+    const std::vector<std::vector<std::string>>& string_feed_batch,
+    const std::vector<std::string>& string_feed_name,
+    const std::vector<std::vector<int>>& string_shape,
+    const std::vector<std::vector<int>>& string_lod_slot_batch,
    const std::vector<std::string> &fetch_name,
    PredictorRes &predict_res_batch,
    const int &pid,
    const uint64_t log_id) {
  int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
+  batch_size = batch_size > string_feed_batch.size() ? batch_size : string_feed_batch.size();
  VLOG(2) << "batch size: " << batch_size;
  predict_res_batch.clear();
  Timer timeline;
@@ -165,6 +178,7 @@ int PredictorClient::numpy_predict(
  VLOG(2) << "fetch general model predictor done.";
  VLOG(2) << "float feed name size: " << float_feed_name.size();
  VLOG(2) << "int feed name size: " << int_feed_name.size();
+  VLOG(2) << "string feed name size: " << string_feed_name.size();
  VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size;
  Request req;
  req.set_log_id(log_id);
@@ -172,12 +186,15 @@ int PredictorClient::numpy_predict(
    req.add_fetch_var_names(name);
  }
+  int vec_idx = 0;
  for (int bi = 0; bi < batch_size; bi++) {
    VLOG(2) << "prepare batch " << bi;
    std::vector<Tensor *> tensor_vec;
    FeedInst *inst = req.add_insts();
    std::vector<py::array_t<float>> float_feed = float_feed_batch[bi];
    std::vector<py::array_t<int64_t>> int_feed = int_feed_batch[bi];
+    std::vector<std::string> string_feed = string_feed_batch[bi];
    for (auto &name : float_feed_name) {
      tensor_vec.push_back(inst->add_tensor_array());
    }
@@ -186,14 +203,19 @@ int PredictorClient::numpy_predict(
      tensor_vec.push_back(inst->add_tensor_array());
    }
-    VLOG(2) << "batch [" << bi << "] int_feed_name and float_feed_name "
+    for (auto &name : string_feed_name) {
-            << "prepared";
+      tensor_vec.push_back(inst->add_tensor_array());
+    }
-    int vec_idx = 0;
+    VLOG(2) << "batch [" << bi << "] " << "prepared";
-    VLOG(2) << "tensor_vec size " << tensor_vec.size() << " float shape "
-            << float_shape.size();
+    vec_idx = 0;
    for (auto &name : float_feed_name) {
      int idx = _feed_name_to_idx[name];
+      if (idx >= tensor_vec.size()) {
+        LOG(ERROR) << "idx > tensor_vec.size()";
+        return -1;
+      }
      Tensor *tensor = tensor_vec[idx];
      VLOG(2) << "prepare float feed " << name << " shape size "
              << float_shape[vec_idx].size();
@@ -203,7 +225,7 @@ int PredictorClient::numpy_predict(
      for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) {
        tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
      }
-      tensor->set_elem_type(1);
+      tensor->set_elem_type(P_FLOAT32);
      const int float_shape_size = float_shape[vec_idx].size();
      switch (float_shape_size) {
        case 4: {
@@ -249,13 +271,17 @@ int PredictorClient::numpy_predict(
      }
      vec_idx++;
    }
    VLOG(2) << "batch [" << bi << "] "
            << "float feed value prepared";
    vec_idx = 0;
    for (auto &name : int_feed_name) {
      int idx = _feed_name_to_idx[name];
+      if (idx >= tensor_vec.size()) {
+        LOG(ERROR) << "idx > tensor_vec.size()";
+        return -1;
+      }
      Tensor *tensor = tensor_vec[idx];
      for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
@@ -266,7 +292,7 @@ int PredictorClient::numpy_predict(
      }
      tensor->set_elem_type(_type[idx]);
-      if (_type[idx] == 0) {
+      if (_type[idx] == P_INT64) {
        VLOG(2) << "prepare int feed " << name << " shape size "
                << int_shape[vec_idx].size();
      } else {
@@ -282,7 +308,7 @@ int PredictorClient::numpy_predict(
            for (ssize_t j = 0; j < int_array.shape(1); j++) {
              for (ssize_t k = 0; k < int_array.shape(2); k++) {
                for (ssize_t l = 0; k < int_array.shape(3); l++) {
-                  if (_type[idx] == 0) {
+                  if (_type[idx] == P_INT64) {
                    tensor->add_int64_data(int_array(i, j, k, l));
                  } else {
                    tensor->add_int_data(int_array(i, j, k, l));
@@ -298,7 +324,7 @@ int PredictorClient::numpy_predict(
          for (ssize_t i = 0; i < int_array.shape(0); i++) {
            for (ssize_t j = 0; j < int_array.shape(1); j++) {
              for (ssize_t k = 0; k < int_array.shape(2); k++) {
-                if (_type[idx] == 0) {
+                if (_type[idx] == P_INT64) {
                  tensor->add_int64_data(int_array(i, j, k));
                } else {
                  tensor->add_int_data(int_array(i, j, k));
@@ -312,7 +338,7 @@ int PredictorClient::numpy_predict(
          auto int_array = int_feed[vec_idx].unchecked<2>();
          for (ssize_t i = 0; i < int_array.shape(0); i++) {
            for (ssize_t j = 0; j < int_array.shape(1); j++) {
-              if (_type[idx] == 0) {
+              if (_type[idx] == P_INT64) {
                tensor->add_int64_data(int_array(i, j));
              } else {
                tensor->add_int_data(int_array(i, j));
@@ -324,7 +350,7 @@ int PredictorClient::numpy_predict(
        case 1: {
          auto int_array = int_feed[vec_idx].unchecked<1>();
          for (ssize_t i = 0; i < int_array.shape(0); i++) {
-            if (_type[idx] == 0) {
+            if (_type[idx] == P_INT64) {
              tensor->add_int64_data(int_array(i));
            } else {
              tensor->add_int_data(int_array(i));
@@ -338,6 +364,42 @@ int PredictorClient::numpy_predict(
    VLOG(2) << "batch [" << bi << "] "
            << "int feed value prepared";
+    vec_idx = 0;
+    for (auto &name : string_feed_name) {
+      int idx = _feed_name_to_idx[name];
+      if (idx >= tensor_vec.size()) {
+        LOG(ERROR) << "idx > tensor_vec.size()";
+        return -1;
+      }
+      Tensor *tensor = tensor_vec[idx];
+      for (uint32_t j = 0; j < string_shape[vec_idx].size(); ++j) {
+        tensor->add_shape(string_shape[vec_idx][j]);
+      }
+      for (uint32_t j = 0; j < string_lod_slot_batch[vec_idx].size(); ++j) {
+        tensor->add_lod(string_lod_slot_batch[vec_idx][j]);
+      }
+      tensor->set_elem_type(P_STRING);
+      const int string_shape_size = string_shape[vec_idx].size();
+      //string_shape[vec_idx] = [1];cause numpy has no datatype of string.
+      //we pass string via vector<vector<string> >.
+      if (string_shape_size != 1) {
+        LOG(ERROR) << "string_shape_size should be 1-D, but received is : " << string_shape_size;
+        return -1;
+      }
+      switch (string_shape_size) {
+        case 1: {
+          tensor->add_data(string_feed[vec_idx]);
+          break;
+        }
+      }
+      vec_idx++;
+    }
+    VLOG(2) << "batch [" << bi << "] "
+            << "string feed value prepared";
  }
  int64_t preprocess_end = timeline.TimeStampUS();
@@ -397,19 +459,19 @@ int PredictorClient::numpy_predict(
      for (auto &name : fetch_name) {
        // int idx = _fetch_name_to_idx[name];
-        if (_fetch_name_to_type[name] == 0) {
+        if (_fetch_name_to_type[name] == P_INT64) {
          VLOG(2) << "ferch var " << name << "type int64";
          int size = output.insts(0).tensor_array(idx).int64_data_size();
          model._int64_value_map[name] = std::vector<int64_t>(
              output.insts(0).tensor_array(idx).int64_data().begin(),
              output.insts(0).tensor_array(idx).int64_data().begin() + size);
-        } else if (_fetch_name_to_type[name] == 1) {
+        } else if (_fetch_name_to_type[name] == P_FLOAT32) {
          VLOG(2) << "fetch var " << name << "type float";
          int size = output.insts(0).tensor_array(idx).float_data_size();
          model._float_value_map[name] = std::vector<float>(
              output.insts(0).tensor_array(idx).float_data().begin(),
              output.insts(0).tensor_array(idx).float_data().begin() + size);
-        } else if (_fetch_name_to_type[name] == 2) {
+        } else if (_fetch_name_to_type[name] == P_INT32) {
          VLOG(2) << "fetch var " << name << "type int32";
          int size = output.insts(0).tensor_array(idx).int_data_size();
          model._int32_value_map[name] = std::vector<int32_t>(

--- a/core/general-client/src/pybind_general_model.cpp
+++ b/core/general-client/src/pybind_general_model.cpp
@@ -78,7 +78,7 @@ PYBIND11_MODULE(serving_client, m) {
             self.init_gflags(argv);
           })
      .def("init",
-           [](PredictorClient &self, const std::string &conf) {
+           [](PredictorClient &self, const std::vector<std::string> &conf) {
             return self.init(conf);
           })
      .def("set_predictor_conf",
@@ -107,6 +107,10 @@ PYBIND11_MODULE(serving_client, m) {
              const std::vector<std::string> &int_feed_name,
              const std::vector<std::vector<int>> &int_shape,
              const std::vector<std::vector<int>> &int_lod_slot_batch,
+              const std::vector<std::vector<std::string>>& string_feed_batch,
+              const std::vector<std::string>& string_feed_name,
+              const std::vector<std::vector<int>>& string_shape,
+              const std::vector<std::vector<int>>& string_lod_slot_batch,
              const std::vector<std::string> &fetch_name,
              PredictorRes &predict_res_batch,
              const int &pid,
@@ -119,6 +123,10 @@ PYBIND11_MODULE(serving_client, m) {
                                       int_feed_name,
                                       int_shape,
                                       int_lod_slot_batch,
+                                       string_feed_batch,
+                                       string_feed_name,
+                                       string_shape,
+                                       string_lod_slot_batch,
                                       fetch_name,
                                       predict_res_batch,
                                       pid,

--- a/core/general-server/CMakeLists.txt
+++ b/core/general-server/CMakeLists.txt
 include_directories(SYSTEM  ${CMAKE_CURRENT_LIST_DIR}/../../)
 include(op/CMakeLists.txt)
 include(proto/CMakeLists.txt)
 add_executable(serving ${serving_srcs})
 add_dependencies(serving pdcodegen paddle_inference_engine pdserving paddle_inference cube-api utils)
@@ -20,6 +21,9 @@ include_directories(${CUDNN_ROOT}/include/)
 target_link_libraries(serving -Wl,--whole-archive paddle_inference_engine
        -Wl,--no-whole-archive)
+if(WITH_OPENCV)
+    target_link_libraries(serving ${OpenCV_LIBS})
+endif()
 target_link_libraries(serving paddle_inference ${paddle_depend_libs})
 target_link_libraries(serving brpc)
 target_link_libraries(serving protobuf)
@@ -27,6 +31,7 @@ target_link_libraries(serving pdserving)
 target_link_libraries(serving cube-api)
 target_link_libraries(serving utils)
 if(WITH_GPU)
    target_link_libraries(serving ${CUDA_LIBRARIES})
 endif()

--- a/core/general-server/op/CMakeLists.txt
+++ b/core/general-server/op/CMakeLists.txt
 FILE(GLOB op_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../predictor/tools/quant.cpp)
+# Without OpenCV, drop general_detection_op.cpp (it uses cv:: types) from the
+# globbed op sources; with OpenCV, add the ocrtools sources instead.
+if(NOT WITH_OPENCV)
+    set(EXCLUDE_DIR "general_detection_op.cpp")
+    foreach(TMP_PATH IN LISTS op_srcs)
+        string(FIND "${TMP_PATH}" "${EXCLUDE_DIR}" EXCLUDE_DIR_FOUND)
+        # string(FIND) yields -1 when the substring is absent.
+        if(EXCLUDE_DIR_FOUND GREATER -1)
+            list(REMOVE_ITEM op_srcs "${TMP_PATH}")
+            break()
+        endif()
+    endforeach()
+else()
+    file(GLOB ocrtools_srcs ${CMAKE_CURRENT_LIST_DIR}/../../predictor/tools/ocrtools/*.cpp)
+    list(APPEND op_srcs ${ocrtools_srcs})
+endif()
 LIST(APPEND serving_srcs ${op_srcs})
--- a/core/general-server/op/general_detection_op.cpp
+++ b/core/general-server/op/general_detection_op.cpp
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "core/general-server/op/general_detection_op.h"
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include "core/predictor/framework/infer.h"
+#include "core/predictor/framework/memory.h"
+#include "core/predictor/framework/resource.h"
+#include "core/util/include/timer.h"
+/*
+#include "opencv2/imgcodecs/legacy/constants_c.h"
+#include "opencv2/imgproc/types_c.h"
+*/
+namespace baidu {
+namespace paddle_serving {
+namespace serving {
+using baidu::paddle_serving::Timer;
+using baidu::paddle_serving::predictor::MempoolWrapper;
+using baidu::paddle_serving::predictor::general_model::Tensor;
+using baidu::paddle_serving::predictor::general_model::Response;
+using baidu::paddle_serving::predictor::general_model::Request;
+using baidu::paddle_serving::predictor::general_model::FetchInst;
+using baidu::paddle_serving::predictor::InferManager;
+using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
+// Runs the detection stage for one request.
+//
+// The first tensor of the single predecessor op is read as a base64-encoded
+// image. The image is decoded, resized, normalized and permuted, then fed to
+// the engine named by engine_name(). Each engine output is thresholded and
+// dilated, boxes are extracted from it, and every box is cropped from the
+// decoded image, pre-processed and appended to the output tensor vector as a
+// float tensor named "image". The raw engine outputs are removed afterwards.
+//
+// Returns 0 on success and -1 on any failure (the reason is logged).
+int GeneralDetectionOp::inference() {
+  VLOG(2) << "Going to run inference";
+  const std::vector<std::string> pre_node_names = pre_names();
+  if (pre_node_names.size() != 1) {
+    LOG(ERROR) << "This op(" << op_name()
+               << ") can only have one predecessor op, but received "
+               << pre_node_names.size();
+    return -1;
+  }
+  const std::string pre_name = pre_node_names[0];
+  const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
+  if (!input_blob) {
+    LOG(ERROR) << "input_blob is nullptr,error";
+    return -1;
+  }
+  uint64_t log_id = input_blob->GetLogId();
+  VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
+  GeneralBlob *output_blob = mutable_data<GeneralBlob>();
+  if (!output_blob) {
+    LOG(ERROR) << "output_blob is nullptr,error";
+    return -1;
+  }
+  output_blob->SetLogId(log_id);
+  const TensorVector *in = &input_blob->tensor_vector;
+  TensorVector* out = &output_blob->tensor_vector;
+  // The image is read from the first input tensor, so there must be one.
+  if (in->size() == 0) {
+    LOG(ERROR) << "(logid=" << log_id << ") input tensor vector is empty";
+    return -1;
+  }
+  int batch_size = input_blob->_batch_size;
+  VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size;
+  output_blob->_batch_size = batch_size;
+  VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
+  std::vector<int> input_shape;
+  int in_num = 0;
+  void* databuf_data = NULL;
+  char* databuf_char = NULL;
+  size_t databuf_size = 0;
+  // The first input tensor's buffer is interpreted as an array of std::string
+  // whose first element is the base64 text of the image.
+  std::string* input_ptr = static_cast<std::string*>(in->at(0).data.data());
+  std::string base64str = input_ptr[0];
+  float ratio_h{};
+  float ratio_w{};
+  cv::Mat img = Base2Mat(base64str);
+  cv::Mat srcimg;
+  cv::Mat resize_img;
+  cv::Mat resize_img_rec;
+  cv::Mat crop_img;
+  img.copyTo(srcimg);
+  this->resize_op_.Run(img, resize_img, this->max_side_len_, ratio_h, ratio_w,
+                       this->use_tensorrt_);
+  this->normalize_op_.Run(&resize_img, this->mean_det, this->scale_det,
+                          this->is_scale_);
+  std::vector<float> input(1 * 3 * resize_img.rows * resize_img.cols, 0.0f);
+  this->permute_op_.Run(&resize_img, input.data());
+  // Owned by this call: released on every return path instead of leaking one
+  // TensorVector per request.
+  std::unique_ptr<TensorVector> real_in(new TensorVector());
+  for (int i = 0; i < in->size(); ++i) {
+    input_shape = {1, 3, resize_img.rows, resize_img.cols};
+    in_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies<int>());
+    databuf_size = in_num * sizeof(float);
+    databuf_data = MempoolWrapper::instance().malloc(databuf_size);
+    if (!databuf_data) {
+      LOG(ERROR) << "Malloc failed, size: " << databuf_size;
+      return -1;
+    }
+    memcpy(databuf_data, input.data(), databuf_size);
+    databuf_char = reinterpret_cast<char*>(databuf_data);
+    paddle::PaddleBuf paddleBuf(databuf_char, databuf_size);
+    paddle::PaddleTensor tensor_in;
+    tensor_in.name = in->at(i).name;
+    tensor_in.dtype = paddle::PaddleDType::FLOAT32;
+    tensor_in.shape = {1, 3, resize_img.rows, resize_img.cols};
+    tensor_in.lod = in->at(i).lod;
+    tensor_in.data = paddleBuf;
+    real_in->push_back(tensor_in);
+  }
+  Timer timeline;
+  int64_t start = timeline.TimeStampUS();
+  timeline.Start();
+  if (InferManager::instance().infer(
+          engine_name().c_str(), real_in.get(), out, batch_size)) {
+    LOG(ERROR) << "(logid=" << log_id
+               << ") Failed do infer in fluid model: " << engine_name().c_str();
+    return -1;
+  }
+  std::vector<int> output_shape;
+  int out_num = 0;
+  void* databuf_data_out = NULL;
+  char* databuf_char_out = NULL;
+  size_t databuf_size_out = 0;
+  // Post-processing added specifically for PaddleOCR.
+  // Only the tensors produced by the engine are walked; the crops appended
+  // below land after index infer_outnum and are not revisited.
+  int infer_outnum = out->size();
+  for (int k = 0; k < infer_outnum; ++k) {
+    // shape[2] and shape[3] are read below, so the output must be at least 4-D.
+    if (out->at(k).shape.size() < 4) {
+      LOG(ERROR) << "(logid=" << log_id << ") detection output [" << k
+                 << "] should be 4-D, but received rank "
+                 << out->at(k).shape.size();
+      return -1;
+    }
+    int n2 = out->at(k).shape[2];
+    int n3 = out->at(k).shape[3];
+    int n = n2 * n3;
+    float* out_data = static_cast<float*>(out->at(k).data.data());
+    std::vector<float> pred(n, 0.0);
+    std::vector<unsigned char> cbuf(n, ' ');
+    for (int i = 0; i < n; i++) {
+      pred[i] = float(out_data[i]);
+      cbuf[i] = (unsigned char)((out_data[i]) * 255);
+    }
+    cv::Mat cbuf_map(n2, n3, CV_8UC1, (unsigned char *)cbuf.data());
+    cv::Mat pred_map(n2, n3, CV_32F, (float *)pred.data());
+    const double threshold = this->det_db_thresh_ * 255;
+    const double maxvalue = 255;
+    cv::Mat bit_map;
+    cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);
+    cv::Mat dilation_map;
+    cv::Mat dila_ele = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
+    cv::dilate(bit_map, dilation_map, dila_ele);
+    boxes = post_processor_.BoxesFromBitmap(pred_map, dilation_map,
+                                            this->det_db_box_thresh_,
+                                            this->det_db_unclip_ratio_);
+    boxes = post_processor_.FilterTagDetRes(boxes, ratio_h, ratio_w, srcimg);
+    // Boxes are emitted in reverse order (last box first).
+    for (int i = boxes.size() - 1; i >= 0; i--) {
+      crop_img = GetRotateCropImage(img, boxes[i]);
+      float wh_ratio = float(crop_img.cols) / float(crop_img.rows);
+      this->resize_op_rec.Run(crop_img, resize_img_rec, wh_ratio, this->use_tensorrt_);
+      this->normalize_op_.Run(&resize_img_rec, this->mean_rec, this->scale_rec,
+                              this->is_scale_);
+      std::vector<float> output_rec(1 * 3 * resize_img_rec.rows * resize_img_rec.cols, 0.0f);
+      this->permute_op_.Run(&resize_img_rec, output_rec.data());
+      // Copy the pre-processed crop into mempool memory and publish it.
+      output_shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols};
+      out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
+      databuf_size_out = out_num * sizeof(float);
+      databuf_data_out = MempoolWrapper::instance().malloc(databuf_size_out);
+      if (!databuf_data_out) {
+        LOG(ERROR) << "Malloc failed, size: " << databuf_size_out;
+        return -1;
+      }
+      memcpy(databuf_data_out, output_rec.data(), databuf_size_out);
+      databuf_char_out = reinterpret_cast<char*>(databuf_data_out);
+      paddle::PaddleBuf paddleBuf(databuf_char_out, databuf_size_out);
+      paddle::PaddleTensor tensor_out;
+      tensor_out.name = "image";
+      tensor_out.dtype = paddle::PaddleDType::FLOAT32;
+      tensor_out.shape = {1, 3, resize_img_rec.rows, resize_img_rec.cols};
+      tensor_out.data = paddleBuf;
+      out->push_back(tensor_out);
+    }
+  }
+  // Drop the raw engine outputs so only the cropped "image" tensors remain.
+  out->erase(out->begin(), out->begin() + infer_outnum);
+  int64_t end = timeline.TimeStampUS();
+  CopyBlobInfo(input_blob, output_blob);
+  AddBlobInfo(output_blob, start);
+  AddBlobInfo(output_blob, end);
+  return 0;
+}
+// Converts base64 text into a colour image: the text is decoded with
+// base64Decode() and the resulting bytes are handed to cv::imdecode().
+cv::Mat GeneralDetectionOp::Base2Mat(std::string &base64_data)
+{
+	const std::string decoded_bytes =
+		base64Decode(base64_data.data(), base64_data.size());
+	// cv::imdecode is given the bytes as a std::vector<char>.
+	std::vector<char> encoded_image(decoded_bytes.begin(), decoded_bytes.end());
+	return cv::imdecode(encoded_image, cv::IMREAD_COLOR);//CV_LOAD_IMAGE_COLOR
+}
+// Decodes `DataByte` characters of standard-alphabet base64 text starting at
+// `Data` and returns the decoded bytes.
+//  - '\r' and '\n' at the start of a quartet are skipped.
+//  - '=' in the 3rd or 4th position of a quartet is treated as padding.
+//  - Characters outside the base64 alphabet decode as 0; no error is reported.
+//  - A truncated final quartet is decoded as far as the input allows; reads
+//    never go past Data[DataByte - 1].
+// Returns an empty string when `Data` is null or `DataByte` is not positive.
+std::string GeneralDetectionOp::base64Decode(const char* Data, int DataByte)
+{
+	// 6-bit value for every ASCII code up to 'z'; non-alphabet entries are 0.
+	static const char DecodeTable[] =
+	{
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		62, // '+'
+		0, 0, 0,
+		63, // '/'
+		52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9'
+		0, 0, 0, 0, 0, 0, 0,
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+		13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z'
+		0, 0, 0, 0, 0, 0,
+		26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+		39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z'
+	};
+	// Maps one input character to its 6-bit value. The character is widened
+	// through unsigned char so bytes >= 0x80 cannot become a negative index,
+	// and codes beyond the end of the table (> 'z') decode as 0.
+	auto sextet = [](char c) -> int {
+		const size_t idx = static_cast<unsigned char>(c);
+		return idx < sizeof(DecodeTable) ? DecodeTable[idx] : 0;
+	};
+	std::string strDecode;
+	if (Data == nullptr || DataByte <= 0)
+	{
+		return strDecode;
+	}
+	strDecode.reserve(static_cast<size_t>(DataByte) / 4 * 3 + 3);
+	int i = 0;
+	while (i < DataByte)
+	{
+		if (Data[i] == '\r' || Data[i] == '\n')
+		{
+			// Line break between quartets: skip it.
+			++i;
+			continue;
+		}
+		if (i + 1 >= DataByte)
+		{
+			// A single leftover character cannot encode a full byte.
+			break;
+		}
+		int nValue = (sextet(Data[i]) << 18) | (sextet(Data[i + 1]) << 12);
+		strDecode += static_cast<char>((nValue & 0x00FF0000) >> 16);
+		if (i + 2 < DataByte && Data[i + 2] != '=')
+		{
+			nValue |= sextet(Data[i + 2]) << 6;
+			strDecode += static_cast<char>((nValue & 0x0000FF00) >> 8);
+			if (i + 3 < DataByte && Data[i + 3] != '=')
+			{
+				nValue |= sextet(Data[i + 3]);
+				strDecode += static_cast<char>(nValue & 0x000000FF);
+			}
+		}
+		// Always advance by a whole quartet, including any '=' padding.
+		i += 4;
+	}
+	return strDecode;
+}
+// Cuts the quadrilateral `box` out of `srcimage` and warps it onto an
+// axis-aligned rectangle.
+//   srcimage: source image; it is only read.
+//   box:      corner points as [x, y] pairs; indices 0..3 are used. The
+//             distance box[0]->box[1] becomes the output width and
+//             box[0]->box[3] the output height.
+// Returns the rectified patch. When its height is at least 1.5x its width the
+// patch is transposed and flipped around the x-axis (a 90-degree
+// counter-clockwise rotation) before being returned.
+cv::Mat GeneralDetectionOp::GetRotateCropImage(const cv::Mat &srcimage,
+                                           std::vector<std::vector<int>> box) {
+  const int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]};
+  const int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]};
+  const int left = *std::min_element(x_collect, x_collect + 4);
+  const int right = *std::max_element(x_collect, x_collect + 4);
+  const int top = *std::min_element(y_collect, y_collect + 4);
+  const int bottom = *std::max_element(y_collect, y_collect + 4);
+
+  // Copy only the bounding rectangle of the box; the source image itself is
+  // never modified, so there is no need to duplicate it first.
+  cv::Mat img_crop;
+  srcimage(cv::Rect(left, top, right - left, bottom - top)).copyTo(img_crop);
+
+  // `box` is already a by-value copy, so it can be shifted in place to make
+  // the corner coordinates relative to the cropped patch.
+  for (size_t i = 0; i < box.size(); ++i) {
+    box[i][0] -= left;
+    box[i][1] -= top;
+  }
+
+  const int img_crop_width = int(sqrt(pow(box[0][0] - box[1][0], 2) +
+                                      pow(box[0][1] - box[1][1], 2)));
+  const int img_crop_height = int(sqrt(pow(box[0][0] - box[3][0], 2) +
+                                       pow(box[0][1] - box[3][1], 2)));
+
+  // Destination corners, in the same order as the source corners.
+  cv::Point2f pts_std[4];
+  pts_std[0] = cv::Point2f(0., 0.);
+  pts_std[1] = cv::Point2f(img_crop_width, 0.);
+  pts_std[2] = cv::Point2f(img_crop_width, img_crop_height);
+  pts_std[3] = cv::Point2f(0.f, img_crop_height);
+
+  cv::Point2f pointsf[4];
+  for (int k = 0; k < 4; ++k) {
+    pointsf[k] = cv::Point2f(box[k][0], box[k][1]);
+  }
+
+  const cv::Mat M = cv::getPerspectiveTransform(pointsf, pts_std);
+  cv::Mat dst_img;
+  // NOTE(review): cv::BORDER_REPLICATE is passed in the 5th position, which
+  // is the `flags` (interpolation) parameter of cv::warpPerspective, not
+  // `borderMode`. Kept as-is to preserve the current output; confirm whether
+  // replicate-border handling was actually intended.
+  cv::warpPerspective(img_crop, dst_img, M,
+                      cv::Size(img_crop_width, img_crop_height),
+                      cv::BORDER_REPLICATE);
+
+  if (float(dst_img.rows) >= float(dst_img.cols) * 1.5) {
+    // Tall patch: rotate so the longer side runs horizontally.
+    // cv::transpose allocates its output, so no pre-sized Mat is needed.
+    cv::Mat rotated;
+    cv::transpose(dst_img, rotated);
+    cv::flip(rotated, rotated, 0);
+    return rotated;
+  }
+  return dst_img;
+}
+DEFINE_OP(GeneralDetectionOp);
+}  // namespace serving
+}  // namespace paddle_serving
+}  // namespace baidu
\ No newline at end of file
--- a/core/general-server/op/general_detection_op.h
+++ b/core/general-server/op/general_detection_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <vector>
+#include <numeric>
+#include "core/general-server/general_model_service.pb.h"
+#include "core/general-server/op/general_infer_helper.h"
+#include "core/predictor/tools/ocrtools/postprocess_op.h"
+#include "core/predictor/tools/ocrtools/preprocess_op.h"
+#include "paddle_inference_api.h"  // NOLINT
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+namespace baidu {
+namespace paddle_serving {
+namespace serving {
+// Serving op (registered through DECLARE_OP) whose entry point is
+// inference(). Its members hold the PaddleOCR pre-/post-processing helpers
+// and the default configuration values for the detection ("det") and
+// recognition ("rec") stages.
+class GeneralDetectionOp
+    : public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
+  public:
+    // List of paddle::PaddleTensor objects.
+    typedef std::vector<paddle::PaddleTensor> TensorVector;
+    DECLARE_OP(GeneralDetectionOp);
+    // Op entry point, defined in general_detection_op.cpp; returns 0 on
+    // success and -1 on failure.
+    int inference();
+  private:
+    //config info
+    // NOTE(review): units of gpu_mem_ are not visible here (presumably MB) —
+    // confirm against the code that consumes it.
+    bool use_gpu_ = false;
+    int gpu_id_ = 0;
+    int gpu_mem_ = 4000;
+    int cpu_math_library_num_threads_ = 4;
+    bool use_mkldnn_ = false;
+    // pre-process
+    PaddleOCR::ResizeImgType0 resize_op_;
+    // Applied to each resized crop together with mean_rec / scale_rec /
+    // is_scale_ in inference().
+    PaddleOCR::Normalize normalize_op_;
+    // Writes the normalized image into a flat float buffer in inference().
+    PaddleOCR::Permute permute_op_;
+    // Resizes each cropped text box using its width/height ratio.
+    PaddleOCR::CrnnResizeImg resize_op_rec;
+    // Forwarded to resize_op_rec.Run() in inference().
+    bool use_tensorrt_ = false;
+    bool use_fp16_ = false;
+    // post-process
+    PaddleOCR::PostProcessor post_processor_;
+    //det config info
+    int max_side_len_ = 960;
+    double det_db_thresh_ = 0.3;
+    double det_db_box_thresh_ = 0.5;
+    double det_db_unclip_ratio_ = 2.0;
+    // Three-element mean / scale vectors for the detection input
+    // (presumably one entry per colour channel — confirm channel order).
+    std::vector<float> mean_det = {0.485f, 0.456f, 0.406f};
+    std::vector<float> scale_det = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
+    // Passed to normalize_op_.Run() as its last argument.
+    bool is_scale_ = true;
+    //rec config info
+    std::vector<std::string> label_list_;
+    // Three-element mean / scale vectors used when normalizing the crops fed
+    // to the recognition stage.
+    std::vector<float> mean_rec = {0.5f, 0.5f, 0.5f};
+    std::vector<float> scale_rec = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};
+    // Crops the quadrilateral `box` (corner points as [x, y] pairs) out of
+    // `srcimage` and warps it to an upright rectangle.
+    cv::Mat GetRotateCropImage(const cv::Mat &srcimage,
+                              std::vector<std::vector<int>> box);
+    // Decodes base64 text into an image via base64Decode() and cv::imdecode.
+    cv::Mat Base2Mat(std::string &base64_data);
+    // Decodes `DataByte` characters of base64 text starting at `Data`.
+    std::string base64Decode(const char* Data, int DataByte);
+    // Box list: boxes[b][p] is an integer [x, y] point of box b.
+    // NOTE(review): presumably filled by the detection post-processing in
+    // inference() — confirm that it refers to this member and not a local.
+    std::vector<std::vector<std::vector<int>>> boxes;
+};
+}  // namespace serving
+}  // namespace paddle_serving
+}  // namespace baidu
--- a/core/general-server/op/general_dist_kv_quant_infer_op.cpp
+++ b/core/general-server/op/general_dist_kv_quant_infer_op.cpp
@@ -117,8 +117,9 @@ int GeneralDistKVQuantInferOp::inference() {
  std::unordered_map<int, int> in_out_map;
  baidu::paddle_serving::predictor::Resource &resource =
      baidu::paddle_serving::predictor::Resource::instance();
+  //TODO:Temporary addition, specific details to be studied by HexToString
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
-      resource.get_general_model_config();
+      resource.get_general_model_config()[0];
  int cube_quant_bits = resource.get_cube_quant_bits();
  size_t EMBEDDING_SIZE = 0;
  if (cube_quant_bits == 0) {

--- a/core/general-server/op/general_infer_op.cpp
+++ b/core/general-server/op/general_infer_op.cpp
@@ -44,9 +44,50 @@ int GeneralInferOp::inference() {
               << pre_node_names.size();
    return -1;
  }
-  if (InferManager::instance().infer(engine_name().c_str())) {
+  const std::string pre_name = pre_node_names[0];
+  const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
+  if (!input_blob) {
+    LOG(ERROR) << "input_blob is nullptr,error";
+      return -1;
+  }
+  uint64_t log_id = input_blob->GetLogId();
+  VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
+  GeneralBlob *output_blob = mutable_data<GeneralBlob>();
+  if (!output_blob) {
+    LOG(ERROR) << "output_blob is nullptr,error";
+      return -1;
+  }
+  output_blob->SetLogId(log_id);
+  if (!input_blob) {
+    LOG(ERROR) << "(logid=" << log_id
+               << ") Failed mutable depended argument, op:" << pre_name;
+    return -1;
+  }
+  const TensorVector *in = &input_blob->tensor_vector;
+  TensorVector *out = &output_blob->tensor_vector;
+  int batch_size = input_blob->_batch_size;
+  output_blob->_batch_size = batch_size;
+  VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
+  Timer timeline;
+  int64_t start = timeline.TimeStampUS();
+  timeline.Start();
+  if (InferManager::instance().infer(
+          engine_name().c_str(), in, out, batch_size)) {
+    LOG(ERROR) << "(logid=" << log_id
+               << ") Failed do infer in fluid model: " << engine_name().c_str();
    return -1;
  }
+  int64_t end = timeline.TimeStampUS();
+  CopyBlobInfo(input_blob, output_blob);
+  AddBlobInfo(output_blob, start);
+  AddBlobInfo(output_blob, end);
  return 0;
 }
 DEFINE_OP(GeneralInferOp);

--- a/core/general-server/op/general_reader_op.cpp
+++ b/core/general-server/op/general_reader_op.cpp
@@ -20,7 +20,6 @@
 #include "core/general-server/op/general_infer_helper.h"
 #include "core/predictor/framework/infer.h"
 #include "core/predictor/framework/memory.h"
-#include "core/predictor/framework/resource.h"
 #include "core/util/include/timer.h"
 namespace baidu {
@@ -33,8 +32,7 @@ using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Request;
 using baidu::paddle_serving::predictor::general_model::FeedInst;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
-using baidu::paddle_serving::predictor::InferManager;
+enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
 int conf_check(const Request *req,
               const std::shared_ptr<PaddleGeneralModelConfig> &model_config) {
  int var_num = req->insts(0).tensor_array_size();
@@ -48,17 +46,18 @@ int conf_check(const Request *req,
  VLOG(2) << "fetch var num in reader op: " << req->fetch_var_names_size();
  for (int i = 0; i < var_num; ++i) {
+    const Tensor &tensor = req->insts(0).tensor_array(i);
    if (model_config->_feed_type[i] !=
-        req->insts(0).tensor_array(i).elem_type()) {
+        tensor.elem_type()) {
      LOG(ERROR) << "feed type not match.";
      return -1;
    }
    if (model_config->_feed_shape[i].size() ==
-        req->insts(0).tensor_array(i).shape_size()) {
+        tensor.shape_size()) {
      for (int j = 0; j < model_config->_feed_shape[i].size(); ++j) {
-        req->insts(0).tensor_array(i).shape(j);
+        tensor.shape(j);
        if (model_config->_feed_shape[i][j] !=
-            req->insts(0).tensor_array(i).shape(j)) {
+            tensor.shape(j)) {
          LOG(ERROR) << "feed shape not match.";
          return -1;
        }
@@ -72,88 +71,178 @@ int conf_check(const Request *req,
 }
 int GeneralReaderOp::inference() {
-  // reade request from client
+  // read request from client
-  // TODO: only support one engine here
-  std::string engine_name = "general_infer_0";
  const Request *req = dynamic_cast<const Request *>(get_request_message());
  uint64_t log_id = req->log_id();
  int input_var_num = 0;
  std::vector<int64_t> elem_type;
  std::vector<int64_t> elem_size;
-  std::vector<int64_t> capacity;
+  std::vector<int64_t> databuf_size;
+  GeneralBlob *res = mutable_data<GeneralBlob>();
+  TensorVector *out = &(res->tensor_vector);
+  res->SetLogId(log_id);
+  if (!res) {
+    LOG(ERROR) << "(logid=" << log_id
+               << ") Failed get op tls reader object output";
+  }
+  Timer timeline;
+  int64_t start = timeline.TimeStampUS();
  int var_num = req->insts(0).tensor_array_size();
+  VLOG(2) << "(logid=" << log_id << ") var num: " << var_num
+          << ") start to call load general model_conf op";
  baidu::paddle_serving::predictor::Resource &resource =
      baidu::paddle_serving::predictor::Resource::instance();
+  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
+  //get the first InferOP's model_config as ReaderOp's model_config by default.
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
-      resource.get_general_model_config();
+      resource.get_general_model_config().front();
+  // TODO(guru4elephant): how to do conditional check?
+  /*
+  int ret = conf_check(req, model_config);
+  if (ret != 0) {
+    LOG(ERROR) << "model conf of server:";
+    resource.print_general_model_config(model_config);
+    return 0;
+  }
+  */
+  // package tensor
  elem_type.resize(var_num);
  elem_size.resize(var_num);
-  capacity.resize(var_num);
+  databuf_size.resize(var_num);
+  // prepare basic information for input
+  // specify the memory needed for output tensor_vector
+  // fill the data into output general_blob
+  int data_len = 0;
  for (int i = 0; i < var_num; ++i) {
-    std::string tensor_name = model_config->_feed_name[i];
+    paddle::PaddleTensor lod_tensor;
-    VLOG(2) << "(logid=" << log_id << ") get tensor name: " << tensor_name;
+    const Tensor &tensor = req->insts(0).tensor_array(i);
-    auto lod_tensor = InferManager::instance().GetInputHandle(
+    data_len = 0;
-        engine_name.c_str(), tensor_name.c_str());
+    elem_type[i] = tensor.elem_type();
-    std::vector<std::vector<size_t>> lod;
+    VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i];
-    std::vector<int> shape;
+    if (elem_type[i] == P_INT64) {  // int64
-    // get lod info here
+      elem_size[i] = sizeof(int64_t);
-    if (req->insts(0).tensor_array(i).lod_size() > 0) {
+      lod_tensor.dtype = paddle::PaddleDType::INT64;
-      lod.resize(1);
+      data_len = tensor.int64_data_size();
-      for (int k = 0; k < req->insts(0).tensor_array(i).lod_size(); ++k) {
+    } else if (elem_type[i] == P_FLOAT32) {
-        lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
+      elem_size[i] = sizeof(float);
-      }
+      lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
-      capacity[i] = 1;
+      data_len = tensor.float_data_size();
-      for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
+    } else if (elem_type[i] == P_INT32) {
-        int dim = req->insts(0).tensor_array(i).shape(k);
+      elem_size[i] = sizeof(int32_t);
-        VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
+      lod_tensor.dtype = paddle::PaddleDType::INT32;
-                << "]: " << dim;
+      data_len = tensor.int_data_size();
-        capacity[i] *= dim;
+    } else if (elem_type[i] == P_STRING) {
-        shape.push_back(dim);
+      //use paddle::PaddleDType::UINT8 as for String.
+      elem_size[i] = sizeof(uint8_t);
+      lod_tensor.dtype = paddle::PaddleDType::UINT8;
+      //this is for vector<String>, cause the databuf_size != vector<String>.size()*sizeof(char);
+      for (int idx = 0; idx < tensor.data_size(); idx++) {
+        data_len += tensor.data()[idx].length();
      }
-      VLOG(2) << "(logid=" << log_id << ") var[" << i
+    }
-              << "] is tensor, capacity: " << capacity[i];
+    // implement lod tensor here
-    } else {
+    // only support 1-D lod
-      capacity[i] = 1;
+    // TODO:support 2-D lod
-      for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
+    if (tensor.lod_size() > 0) {
-        int dim = req->insts(0).tensor_array(i).shape(k);
+      VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
-        VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
+      lod_tensor.lod.resize(1);
-                << "]: " << dim;
+      for (int k = 0; k < tensor.lod_size(); ++k) {
-        capacity[i] *= dim;
+        lod_tensor.lod[0].push_back(tensor.lod(k));
-        shape.push_back(dim);
      }
-      VLOG(2) << "(logid=" << log_id << ") var[" << i
-              << "] is tensor, capacity: " << capacity[i];
    }
-    lod_tensor->SetLoD(lod);
-    lod_tensor->Reshape(shape);
+    for (int k = 0; k < tensor.shape_size(); ++k) {
-    // insert data here
+      int dim = tensor.shape(k);
-    if (req->insts(0).tensor_array(i).elem_type() == 0) {
+      VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
-      // TODO: Copy twice here, can optimize
+              << "]: " << dim;
-      int elem_num = req->insts(0).tensor_array(i).int64_data_size();
+      lod_tensor.shape.push_back(dim);
-      std::vector<int64_t> data(elem_num);
+    }
-      int64_t *dst_ptr = data.data();
+    lod_tensor.name = model_config->_feed_name[i];
+    out->push_back(lod_tensor);
+    VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
+            << "]: " << data_len;
+    databuf_size[i] = data_len * elem_size[i];
+    out->at(i).data.Resize(data_len * elem_size[i]);
+    VLOG(2) << "(logid=" << log_id << ") var[" << i
+            << "] is lod_tensor and len=" << out->at(i).lod[0].back();
+    if (elem_type[i] == P_INT64) {
+      int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
+      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
+              << "] is " << tensor.int64_data(0);
+      if (!dst_ptr) {
+        LOG(ERROR) << "dst_ptr is nullptr";
+            return -1;
+      }
+      memcpy(dst_ptr, tensor.int64_data().data(),databuf_size[i]);
+      /*
+      int elem_num = tensor.int64_data_size();
      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[k] = req->insts(0).tensor_array(i).int64_data(k);
+        dst_ptr[k] = tensor.int64_data(k);
+      }
+      */
+    } else if (elem_type[i] == P_FLOAT32) {
+      float *dst_ptr = static_cast<float *>(out->at(i).data.data());
+      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
+              << "] is " << tensor.float_data(0);
+      if (!dst_ptr) {
+        LOG(ERROR) << "dst_ptr is nullptr";
+            return -1;
      }
-      lod_tensor->CopyFromCpu(dst_ptr);
+      memcpy(dst_ptr, tensor.float_data().data(),databuf_size[i]);
-    } else if (req->insts(0).tensor_array(i).elem_type() == 1) {
+      /*int elem_num = tensor.float_data_size();
-      int elem_num = req->insts(0).tensor_array(i).float_data_size();
-      std::vector<float> data(elem_num);
-      float *dst_ptr = data.data();
      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[k] = req->insts(0).tensor_array(i).float_data(k);
+        dst_ptr[k] = tensor.float_data(k);
+      }*/
+    } else if (elem_type[i] == P_INT32) {
+      int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
+      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
+              << "] is " << tensor.int_data(0);
+      if (!dst_ptr) {
+        LOG(ERROR) << "dst_ptr is nullptr";
+            return -1;
      }
-      lod_tensor->CopyFromCpu(dst_ptr);
+      memcpy(dst_ptr, tensor.int_data().data(),databuf_size[i]);
-    } else if (req->insts(0).tensor_array(i).elem_type() == 2) {
+      /*
-      int elem_num = req->insts(0).tensor_array(i).int_data_size();
+      int elem_num = tensor.int_data_size();
-      std::vector<int32_t> data(elem_num);
-      int32_t *dst_ptr = data.data();
      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[k] = req->insts(0).tensor_array(i).int_data(k);
+        dst_ptr[k] = tensor.int_data(k);
+      }
+      */
+    } else if (elem_type[i] == P_STRING) {
+      std::string *dst_ptr = static_cast<std::string *>(out->at(i).data.data());
+      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
+              << "] is " << tensor.data(0);
+      if (!dst_ptr) {
+        LOG(ERROR) << "dst_ptr is nullptr";
+            return -1;
+      }
+      int elem_num = tensor.data_size();
+      for (int k = 0; k < elem_num; ++k) {
+        dst_ptr[k] = tensor.data(k);
      }
-      lod_tensor->CopyFromCpu(dst_ptr);
    }
  }
+  VLOG(2) << "(logid=" << log_id << ") output size: " << out->size();
+  timeline.Pause();
+  int64_t end = timeline.TimeStampUS();
+  res->p_size = 0;
+  res->_batch_size = 1;
+  AddBlobInfo(res, start);
+  AddBlobInfo(res, end);
+  VLOG(2) << "(logid=" << log_id << ") read data from client success";
  return 0;
 }
 DEFINE_OP(GeneralReaderOp);

--- a/core/general-server/op/general_response_op.cpp
+++ b/core/general-server/op/general_response_op.cpp
@@ -40,59 +40,163 @@ using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
 int GeneralResponseOp::inference() {
+  const std::vector<std::string> pre_node_names = pre_names();
+  VLOG(2) << "pre node names size: " << pre_node_names.size();
+  const GeneralBlob *input_blob = nullptr;
+  int var_idx = 0;
+  int cap = 1;
+  uint64_t log_id =
+      get_depend_argument<GeneralBlob>(pre_node_names[0])->GetLogId();
  const Request *req = dynamic_cast<const Request *>(get_request_message());
  // response inst with only fetch_var_names
  Response *res = mutable_data<Response>();
+  Timer timeline;
+  // double response_time = 0.0;
+  // timeline.Start();
+  int64_t start = timeline.TimeStampUS();
+  VLOG(2) << "(logid=" << log_id
+          << ") start to call load general model_conf op";
  baidu::paddle_serving::predictor::Resource &resource =
      baidu::paddle_serving::predictor::Resource::instance();
+  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
+  //get the last InferOP's model_config as ResponseOp's model_config by default.
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
-      resource.get_general_model_config();
+      resource.get_general_model_config().back();
-  std::vector<int> capacity(req->fetch_var_names_size(), 1);
-  std::string engine_name = "general_infer_0";
+  VLOG(2) << "(logid=" << log_id
-  ModelOutput *output = res->add_outputs();
+          << ") max body size : " << brpc::fLU64::FLAGS_max_body_size;
-  FetchInst *fetch_inst = output->add_insts();
-  FetchInst *fetch_p = output->mutable_insts(0);
+  std::vector<int> fetch_index;
-  std::vector<std::string> outs =
+  fetch_index.resize(req->fetch_var_names_size());
-      InferManager::instance().GetOutputNames(engine_name.c_str());
  for (int i = 0; i < req->fetch_var_names_size(); ++i) {
-    Tensor *tensor = fetch_inst->add_tensor_array();
+    fetch_index[i] =
-    std::string tensor_name = outs[i];
+        model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)];
-    auto lod_tensor = InferManager::instance().GetOutputHandle(
+  }
-        engine_name.c_str(), tensor_name.c_str());
-    std::vector<int> shape = lod_tensor->shape();
+  for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
-    for (int k = 0; k < shape.size(); ++k) {
+    const std::string &pre_name = pre_node_names[pi];
-      capacity[i] *= shape[k];
+    VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name
-      tensor->add_shape(shape[k]);
+            << " (" << pre_node_names.size() << ")";
+    input_blob = get_depend_argument<GeneralBlob>(pre_name);
+    // fprintf(stderr, "input(%s) blob address %x\n", pre_names.c_str(),
+    // input_blob);
+    if (!input_blob) {
+      LOG(ERROR) << "(logid=" << log_id
+                 << ") Failed mutable depended argument, op: " << pre_name;
+      return -1;
+    }
+    const TensorVector *in = &input_blob->tensor_vector;
+    ModelOutput *output = res->add_outputs();
+    // To get the order of model return values
+    output->set_engine_name(pre_name);
+    FetchInst *fetch_inst = output->add_insts();
+    for (auto &idx : fetch_index) {
+      Tensor *tensor = fetch_inst->add_tensor_array();
+      //tensor->set_elem_type(1);
+      if (model_config->_is_lod_fetch[idx]) {
+        VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
+                << model_config->_fetch_name[idx] << " is lod_tensor";
+        for (int k = 0; k < in->at(idx).shape.size(); ++k) {
+          VLOG(2) << "(logid=" << log_id << ") shape[" << k
+                  << "]: " << in->at(idx).shape[k];
+          tensor->add_shape(in->at(idx).shape[k]);
+        }
+      } else {
+        VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] "
+                << model_config->_fetch_name[idx] << " is tensor";
+        for (int k = 0; k < in->at(idx).shape.size(); ++k) {
+          VLOG(2) << "(logid=" << log_id << ") shape[" << k
+                  << "]: " << in->at(idx).shape[k];
+          tensor->add_shape(in->at(idx).shape[k]);
+        }
+      }
    }
-    auto dtype = lod_tensor->type();
-    if (dtype == paddle::PaddleDType::INT64) {
+    var_idx = 0;
-      std::vector<int64_t> datas(capacity[i]);
+    for (auto &idx : fetch_index) {
-      int64_t *data_ptr = datas.data();
+      cap = 1;
-      lod_tensor->CopyToCpu(data_ptr);
+      for (int j = 0; j < in->at(idx).shape.size(); ++j) {
-      google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
+        cap *= in->at(idx).shape[j];
-                                                        data_ptr + capacity[i]);
+      }
-      tensor->mutable_int64_data()->Swap(&tmp_data);
-    } else if (dtype == paddle::PaddleDType::FLOAT32) {
+      FetchInst *fetch_p = output->mutable_insts(0);
-      std::vector<float> datas(capacity[i]);
+      auto dtype = in->at(idx).dtype;
-      float *data_ptr = datas.data();
+      if (dtype == paddle::PaddleDType::INT64) {
-      lod_tensor->CopyToCpu(data_ptr);
+        VLOG(2) << "(logid=" << log_id << ") Prepare int64 var ["
-      google::protobuf::RepeatedField<float> tmp_data(data_ptr,
+                << model_config->_fetch_name[idx] << "].";
-                                                      data_ptr + capacity[i]);
+        int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
-      tensor->mutable_float_data()->Swap(&tmp_data);
+        // from
-    } else if (dtype == paddle::PaddleDType::INT32) {
+        // https://stackoverflow.com/questions/15499641/copy-a-stdvector-to-a-repeated-field-from-protobuf-with-memcpy
-      std::vector<int32_t> datas(capacity[i]);
+        // `Swap` method is faster than `{}` method.
-      int32_t *data_ptr = datas.data();
+        google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
-      lod_tensor->CopyToCpu(data_ptr);
+                                                          data_ptr + cap);
-      google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
+        fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap(
-                                                        data_ptr + capacity[i]);
+            &tmp_data);
-      tensor->mutable_int_data()->Swap(&tmp_data);
+      } else if (dtype == paddle::PaddleDType::FLOAT32) {
+        VLOG(2) << "(logid=" << log_id << ") Prepare float var ["
+                << model_config->_fetch_name[idx] << "].";
+        float *data_ptr = static_cast<float *>(in->at(idx).data.data());
+        google::protobuf::RepeatedField<float> tmp_data(data_ptr,
+                                                        data_ptr + cap);
+        fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap(
+            &tmp_data);
+      } else if (dtype == paddle::PaddleDType::INT32) {
+        VLOG(2) << "(logid=" << log_id << ")Prepare int32 var ["
+                << model_config->_fetch_name[idx] << "].";
+        int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data());
+        google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
+                                                          data_ptr + cap);
+        fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap(
+            &tmp_data);
+      }
+      if (model_config->_is_lod_fetch[idx]) {
+        if (in->at(idx).lod.size() > 0) {
+          for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
+            fetch_p->mutable_tensor_array(var_idx)->add_lod(
+                in->at(idx).lod[0][j]);
+          }
+        }
+      }
+      VLOG(2) << "(logid=" << log_id << ") fetch var ["
+              << model_config->_fetch_name[idx] << "] ready";
+      var_idx++;
    }
-    std::vector<std::vector<size_t>> lod = lod_tensor->lod();
+  }
-    if (lod.size() > 0) {
-      for (int j = 0; j < lod[0].size(); ++j) {
+  if (req->profile_server()) {
-        tensor->add_lod(lod[0][j]);
+    int64_t end = timeline.TimeStampUS();
+    // TODO(barriery): multi-model profile_time.
+    // At present, only the response_op is multi-input, so here we get
+    // the profile_time by hard coding. It needs to be replaced with
+    // a more elegant way.
+    for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) {
+      input_blob = get_depend_argument<GeneralBlob>(pre_node_names[pi]);
+      VLOG(2) << "(logid=" << log_id
+              << ") p size for input blob: " << input_blob->p_size;
+      int profile_time_idx = -1;
+      if (pi == 0) {
+        profile_time_idx = 0;
+      } else {
+        profile_time_idx = input_blob->p_size - 2;
+      }
+      for (; profile_time_idx < input_blob->p_size; ++profile_time_idx) {
+        res->add_profile_time(input_blob->time_stamp[profile_time_idx]);
      }
    }
+    // TODO(guru4elephant): find more elegant way to do this
+    res->add_profile_time(start);
+    res->add_profile_time(end);
  }
  return 0;
 }
@@ -101,4 +205,4 @@ DEFINE_OP(GeneralResponseOp);
 }  // namespace serving
 }  // namespace paddle_serving
 }  // namespace baidu
\ No newline at end of file
--- a/core/general-server/op/general_text_reader_op.cpp
+++ b/core/general-server/op/general_text_reader_op.cpp
@@ -73,7 +73,7 @@ int GeneralTextReaderOp::inference() {
  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
-      resource.get_general_model_config();
+      resource.get_general_model_config()[0];
  VLOG(2) << "(logid=" << log_id << ") print general model config done.";

--- a/core/general-server/op/general_text_response_op.cpp
+++ b/core/general-server/op/general_text_response_op.cpp
@@ -58,7 +58,7 @@ int GeneralTextResponseOp::inference() {
  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
-      resource.get_general_model_config();
+      resource.get_general_model_config().back();
  std::vector<int> fetch_index;
  fetch_index.resize(req->fetch_var_names_size());

--- a/core/predictor/common/utils.h
+++ b/core/predictor/common/utils.h
@@ -13,8 +13,10 @@
 // limitations under the License.
 #pragma once
-#include <string>
+#include <algorithm>
+#include <cctype>
 #include <fstream>
+#include <string>
 #include "core/predictor/common/inner_common.h"
 #include "core/predictor/common/macros.h"
@@ -26,6 +28,38 @@ namespace predictor {
 namespace butil = base;
 #endif
+// Numeric precision tags. The trailing comment on each enumerator gives the
+// short name of the numeric type it stands for.
+enum class Precision {
+  kUnk = -1,     // unknown type
+  kFloat32 = 0,  // fp32
+  kInt8,         // int8
+  kHalf,         // fp16
+  kBfloat16,     // bf16
+};
+// Returns the enumerator name of `data_type`, e.g. "kInt8".
+// Any value without a dedicated case, including Precision::kUnk,
+// yields "kUnk".
+static std::string PrecisionTypeString(const Precision data_type) {
+  switch (data_type) {
+    case Precision::kFloat32: return "kFloat32";
+    case Precision::kInt8: return "kInt8";
+    case Precision::kHalf: return "kHalf";
+    case Precision::kBfloat16: return "kBfloat16";
+    default: return "kUnk";
+  }
+}
+// Returns a copy of `data` with every character passed through tolower().
+static std::string ToLower(const std::string& data) {
+  std::string lowered(data);
+  for (char& ch : lowered) {
+    ch = tolower(static_cast<unsigned char>(ch));
+  }
+  return lowered;
+}
 class TimerFlow {
 public:
  static const int MAX_SIZE = 1024;

--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
--- a/core/predictor/framework/infer_data.h
+++ b/core/predictor/framework/infer_data.h
@@ -21,7 +21,7 @@ namespace baidu {
 namespace paddle_serving {
 namespace predictor {
-enum DataType { FLOAT32, INT64 };
+enum DataType { FLOAT32, INT64, INT32 };
 class DataBuf {
 public:
@@ -80,8 +80,10 @@ struct Tensor {
  size_t ele_byte() const {
    if (type == INT64) {
      return sizeof(int64_t);
-    } else {
+    } else if (type == FLOAT32) {
      return sizeof(float);
+    } else {
+      return sizeof(int32_t);
    }
  }

--- a/core/predictor/framework/resource.cpp
+++ b/core/predictor/framework/resource.cpp
@@ -42,8 +42,8 @@ DynamicResource::~DynamicResource() {}
 int DynamicResource::initialize() { return 0; }
-std::shared_ptr<PaddleGeneralModelConfig> Resource::get_general_model_config() {
+std::vector<std::shared_ptr<PaddleGeneralModelConfig> > Resource::get_general_model_config() {
-  return _config;
+  return _configs;
 }
 void Resource::print_general_model_config(
@@ -149,30 +149,23 @@ int Resource::initialize(const std::string& path, const std::string& file) {
 #endif
  if (FLAGS_enable_model_toolkit) {
-    int err = 0;
+    size_t model_toolkit_num = resource_conf.model_toolkit_path_size();
-    std::string model_toolkit_path = resource_conf.model_toolkit_path();
+    for (size_t mi=0; mi < model_toolkit_num; ++mi) {
-    if (err != 0) {
+      std::string model_toolkit_path = resource_conf.model_toolkit_path(mi);
-      LOG(ERROR) << "read model_toolkit_path failed, path[" << path
+      std::string model_toolkit_file = resource_conf.model_toolkit_file(mi);
-                 << "], file[" << file << "]";
-      return -1;
+      if (InferManager::instance().proc_initialize(
-    }
+              model_toolkit_path.c_str(), model_toolkit_file.c_str()) != 0) {
-    std::string model_toolkit_file = resource_conf.model_toolkit_file();
+        LOG(ERROR) << "failed proc initialize modeltoolkit, config: "
-    if (err != 0) {
+                  << model_toolkit_path << "/" << model_toolkit_file;
-      LOG(ERROR) << "read model_toolkit_file failed, path[" << path
+        return -1;
-                 << "], file[" << file << "]";
+      }
-      return -1;
-    }
-    if (InferManager::instance().proc_initialize(
-            model_toolkit_path.c_str(), model_toolkit_file.c_str()) != 0) {
-      LOG(ERROR) << "failed proc initialize modeltoolkit, config: "
-                 << model_toolkit_path << "/" << model_toolkit_file;
-      return -1;
-    }
-    if (KVManager::instance().proc_initialize(
+      if (KVManager::instance().proc_initialize(
-            model_toolkit_path.c_str(), model_toolkit_file.c_str()) != 0) {
+              model_toolkit_path.c_str(), model_toolkit_file.c_str()) != 0) {
-      LOG(ERROR) << "Failed proc initialize kvmanager, config: "
+        LOG(ERROR) << "Failed proc initialize kvmanager, config: "
-                 << model_toolkit_path << "/" << model_toolkit_file;
+                  << model_toolkit_path << "/" << model_toolkit_file;
+      }
    }
  }
@@ -231,80 +224,79 @@ int Resource::general_model_initialize(const std::string& path,
    LOG(ERROR) << "Failed initialize resource from: " << path << "/" << file;
    return -1;
  }
-  int err = 0;
+  size_t general_model_num = resource_conf.general_model_path_size();
-  std::string general_model_path = resource_conf.general_model_path();
+  for (size_t gi=0; gi < general_model_num; ++gi) {
-  std::string general_model_file = resource_conf.general_model_file();
-  if (err != 0) {
-    LOG(ERROR) << "read general_model_path failed, path[" << path << "], file["
-               << file << "]";
-    return -1;
-  }
-  GeneralModelConfig model_config;
-  if (configure::read_proto_conf(general_model_path.c_str(),
-                                 general_model_file.c_str(),
-                                 &model_config) != 0) {
-    LOG(ERROR) << "Failed initialize model config from: " << general_model_path
-               << "/" << general_model_file;
-    return -1;
-  }
-  _config.reset(new PaddleGeneralModelConfig());
+    std::string general_model_path = resource_conf.general_model_path(gi);
-  int feed_var_num = model_config.feed_var_size();
+    std::string general_model_file = resource_conf.general_model_file(gi);
-  VLOG(2) << "load general model config";
-  VLOG(2) << "feed var num: " << feed_var_num;
+    GeneralModelConfig model_config;
-  _config->_feed_name.resize(feed_var_num);
+    if (configure::read_proto_conf(general_model_path.c_str(),
-  _config->_feed_alias_name.resize(feed_var_num);
+                                  general_model_file.c_str(),
-  _config->_feed_type.resize(feed_var_num);
+                                  &model_config) != 0) {
-  _config->_is_lod_feed.resize(feed_var_num);
+      LOG(ERROR) << "Failed initialize model config from: " << general_model_path
-  _config->_capacity.resize(feed_var_num);
+                << "/" << general_model_file;
-  _config->_feed_shape.resize(feed_var_num);
+      return -1;
-  for (int i = 0; i < feed_var_num; ++i) {
+    }
-    _config->_feed_name[i] = model_config.feed_var(i).name();
+    auto _config = std::make_shared<PaddleGeneralModelConfig>();
-    _config->_feed_alias_name[i] = model_config.feed_var(i).alias_name();
+    int feed_var_num = model_config.feed_var_size();
-    VLOG(2) << "feed var[" << i << "]: " << _config->_feed_name[i];
+    VLOG(2) << "load general model config";
-    VLOG(2) << "feed var[" << i << "]: " << _config->_feed_alias_name[i];
+    VLOG(2) << "feed var num: " << feed_var_num;
-    _config->_feed_type[i] = model_config.feed_var(i).feed_type();
+    _config->_feed_name.resize(feed_var_num);
-    VLOG(2) << "feed type[" << i << "]: " << _config->_feed_type[i];
+    _config->_feed_alias_name.resize(feed_var_num);
+    _config->_feed_type.resize(feed_var_num);
-    if (model_config.feed_var(i).is_lod_tensor()) {
+    _config->_is_lod_feed.resize(feed_var_num);
-      VLOG(2) << "var[" << i << "] is lod tensor";
+    _config->_capacity.resize(feed_var_num);
-      _config->_feed_shape[i] = {-1};
+    _config->_feed_shape.resize(feed_var_num);
-      _config->_is_lod_feed[i] = true;
+    for (int i=0; i < feed_var_num; ++i) {
-    } else {
+      _config->_feed_name[i] = model_config.feed_var(i).name();
-      VLOG(2) << "var[" << i << "] is tensor";
+      _config->_feed_alias_name[i] = model_config.feed_var(i).alias_name();
-      _config->_capacity[i] = 1;
+      VLOG(2) << "feed var[" << i << "]: " << _config->_feed_name[i];
-      _config->_is_lod_feed[i] = false;
+      VLOG(2) << "feed var[" << i << "]: " << _config->_feed_alias_name[i];
-      for (int j = 0; j < model_config.feed_var(i).shape_size(); ++j) {
+      _config->_feed_type[i] = model_config.feed_var(i).feed_type();
-        int32_t dim = model_config.feed_var(i).shape(j);
+      VLOG(2) << "feed type[" << i << "]: " << _config->_feed_type[i];
-        VLOG(2) << "var[" << i << "].shape[" << i << "]: " << dim;
-        _config->_feed_shape[i].push_back(dim);
+      if (model_config.feed_var(i).is_lod_tensor()) {
-        _config->_capacity[i] *= dim;
+        VLOG(2) << "var[" << i << "] is lod tensor";
+        _config->_feed_shape[i] = {-1};
+        _config->_is_lod_feed[i] = true;
+      } else {
+        VLOG(2) << "var[" << i << "] is tensor";
+        _config->_capacity[i] = 1;
+        _config->_is_lod_feed[i] = false;
+        // Record every dimension of this non-LoD feed var; the capacity is the
+        // running product of the dimensions.
+        for (int j=0; j < model_config.feed_var(i).shape_size(); ++j) {
+          int32_t dim = model_config.feed_var(i).shape(j);
+          // Log the dimension index j (previously i was printed twice).
+          VLOG(2) << "var[" << i << "].shape[" << j << "]: " << dim;
+          _config->_feed_shape[i].push_back(dim);
+          _config->_capacity[i] *= dim;
+        }
      }
    }
-  }
-  int fetch_var_num = model_config.fetch_var_size();
+    int fetch_var_num = model_config.fetch_var_size();
-  _config->_is_lod_fetch.resize(fetch_var_num);
+    _config->_is_lod_fetch.resize(fetch_var_num);
-  _config->_fetch_name.resize(fetch_var_num);
+    _config->_fetch_name.resize(fetch_var_num);
-  _config->_fetch_alias_name.resize(fetch_var_num);
+    _config->_fetch_alias_name.resize(fetch_var_num);
-  _config->_fetch_shape.resize(fetch_var_num);
+    _config->_fetch_shape.resize(fetch_var_num);
-  for (int i = 0; i < fetch_var_num; ++i) {
+    for (int i=0; i < fetch_var_num; ++i) {
-    _config->_fetch_name[i] = model_config.fetch_var(i).name();
+      _config->_fetch_name[i] = model_config.fetch_var(i).name();
-    _config->_fetch_alias_name[i] = model_config.fetch_var(i).alias_name();
+      _config->_fetch_alias_name[i] = model_config.fetch_var(i).alias_name();
-    _config->_fetch_name_to_index[_config->_fetch_name[i]] = i;
+      _config->_fetch_name_to_index[_config->_fetch_name[i]] = i;
-    _config->_fetch_alias_name_to_index[_config->_fetch_alias_name[i]] = i;
+      _config->_fetch_alias_name_to_index[_config->_fetch_alias_name[i]] = i;
-    if (model_config.fetch_var(i).is_lod_tensor()) {
+      if (model_config.fetch_var(i).is_lod_tensor()) {
-      VLOG(2) << "fetch var[" << i << "] is lod tensor";
+        VLOG(2) << "fetch var[" << i << "] is lod tensor";
-      _config->_fetch_shape[i] = {-1};
+        _config->_fetch_shape[i] = {-1};
-      _config->_is_lod_fetch[i] = true;
+        _config->_is_lod_fetch[i] = true;
-    } else {
+      } else {
-      _config->_is_lod_fetch[i] = false;
+        _config->_is_lod_fetch[i] = false;
-      for (int j = 0; j < model_config.fetch_var(i).shape_size(); ++j) {
+        for (int j=0; j < model_config.fetch_var(i).shape_size(); ++j) {
-        int dim = model_config.fetch_var(i).shape(j);
+          int dim = model_config.fetch_var(i).shape(j);
-        _config->_fetch_shape[i].push_back(dim);
+          _config->_fetch_shape[i].push_back(dim);
+        }
      }
    }
+    _configs.push_back(std::move(_config));
  }
  return 0;
 }

--- a/core/predictor/framework/resource.h
+++ b/core/predictor/framework/resource.h
@@ -94,7 +94,7 @@ class Resource {
  int finalize();
-  std::shared_ptr<PaddleGeneralModelConfig> get_general_model_config();
+  std::vector<std::shared_ptr<PaddleGeneralModelConfig> > get_general_model_config();
  void print_general_model_config(
      const std::shared_ptr<PaddleGeneralModelConfig>& config);
@@ -107,7 +107,7 @@ class Resource {
 private:
  int thread_finalize() { return 0; }
-  std::shared_ptr<PaddleGeneralModelConfig> _config;
+  std::vector<std::shared_ptr<PaddleGeneralModelConfig> > _configs;
  std::string cube_config_fullpath;
  int cube_quant_bits;  // 0 if no empty

--- a/core/predictor/framework/service.cpp
+++ b/core/predictor/framework/service.cpp
--- a/core/predictor/op/op.cpp
+++ b/core/predictor/op/op.cpp
--- a/core/predictor/src/pdserving.cpp
+++ b/core/predictor/src/pdserving.cpp
@@ -126,7 +126,7 @@ int main(int argc, char** argv) {
    return 0;
  }
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  //google::ParseCommandLineFlags(&argc, &argv, true);
  g_change_server_port();
@@ -202,6 +202,7 @@ int main(int argc, char** argv) {
  }
  VLOG(2) << "Succ call pthread worker start function";
+  // This is not used by any code segment, so it could be removed.
  if (Resource::instance().general_model_initialize(FLAGS_resource_path,
                                                    FLAGS_resource_file) != 0) {
    LOG(ERROR) << "Failed to initialize general model conf: "

--- a/core/predictor/tools/ocrtools/clipper.cpp
+++ b/core/predictor/tools/ocrtools/clipper.cpp
--- a/core/predictor/tools/ocrtools/clipper.h
+++ b/core/predictor/tools/ocrtools/clipper.h
+/*******************************************************************************
+*                                                                              *
+* Author    :  Angus Johnson                                                   *
+* Version   :  6.4.2                                                           *
+* Date      :  27 February 2017                                                *
+* Website   :  http://www.angusj.com                                           *
+* Copyright :  Angus Johnson 2010-2017                                         *
+*                                                                              *
+* License:                                                                     *
+* Use, modification & distribution is subject to Boost Software License Ver 1. *
+* http://www.boost.org/LICENSE_1_0.txt                                         *
+*                                                                              *
+* Attributions:                                                                *
+* The code in this library is an extension of Bala Vatti's clipping algorithm: *
+* "A generic solution to polygon clipping"                                     *
+* Communications of the ACM, Vol 35, Issue 7 (July 1992) pp 56-63.             *
+* http://portal.acm.org/citation.cfm?id=129906                                 *
+*                                                                              *
+* Computer graphics and geometric modeling: implementation and algorithms      *
+* By Max K. Agoston                                                            *
+* Springer; 1 edition (January 4, 2005)                                        *
+* http://books.google.com/books?q=vatti+clipping+agoston                       *
+*                                                                              *
+* See also:                                                                    *
+* "Polygon Offsetting by Computing Winding Numbers"                            *
+* Paper no. DETC2005-85513 pp. 565-575                                         *
+* ASME 2005 International Design Engineering Technical Conferences             *
+* and Computers and Information in Engineering Conference (IDETC/CIE2005)      *
+* September 24-28, 2005 , Long Beach, California, USA                          *
+* http://www.me.berkeley.edu/~mcmains/pubs/DAC05OffsetPolygon.pdf              *
+*                                                                              *
+*******************************************************************************/
+#ifndef clipper_hpp
+#define clipper_hpp
+#define CLIPPER_VERSION "6.4.2"
+// use_int32: When enabled, 32-bit ints are used instead of 64-bit ints. This
+// improves performance, but coordinate values are limited to the range +/- 46340
+//#define use_int32
+// use_xyz: adds a Z member to IntPoint. Adds a minor cost to performance.
+//#define use_xyz
+// use_lines: Enables line clipping. Adds a very minor cost to performance.
+#define use_lines
+// use_deprecated: Enables temporary support for the obsolete functions
+//#define use_deprecated
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <list>
+#include <ostream>
+#include <queue>
+#include <set>
+#include <stdexcept>
+#include <vector>
+namespace ClipperLib {
+enum ClipType { ctIntersection, ctUnion, ctDifference, ctXor };
+enum PolyType { ptSubject, ptClip };
+// By far the most widely used winding rules for polygon filling are
+// EvenOdd & NonZero (GDI, GDI+, XLib, OpenGL, Cairo, AGG, Quartz, SVG, Gr32)
+// Others rules include Positive, Negative and ABS_GTR_EQ_TWO (only in OpenGL)
+// see http://glprogramming.com/red/chapter11.html
+enum PolyFillType { pftEvenOdd, pftNonZero, pftPositive, pftNegative };
+#ifdef use_int32
+typedef int cInt;
+static cInt const loRange = 0x7FFF;
+static cInt const hiRange = 0x7FFF;
+#else
+typedef signed long long cInt;
+static cInt const loRange = 0x3FFFFFFF;
+static cInt const hiRange = 0x3FFFFFFFFFFFFFFFLL;
+typedef signed long long long64; // used by Int128 class
+typedef unsigned long long ulong64;
+#endif
+// Integer point with X and Y coordinates of type cInt (plus a Z member when
+// use_xyz is defined). All constructor arguments default to 0.
+struct IntPoint {
+  cInt X;
+  cInt Y;
+#ifdef use_xyz
+  cInt Z;
+  IntPoint(cInt x = 0, cInt y = 0, cInt z = 0) : X(x), Y(y), Z(z) {}
+#else
+  IntPoint(cInt x = 0, cInt y = 0) : X(x), Y(y) {}
+#endif
+  // Equality considers X and Y only.
+  friend inline bool operator==(const IntPoint &lhs, const IntPoint &rhs) {
+    return lhs.X == rhs.X && lhs.Y == rhs.Y;
+  }
+  // Exact negation of operator==.
+  friend inline bool operator!=(const IntPoint &lhs, const IntPoint &rhs) {
+    return !(lhs == rhs);
+  }
+};
+//------------------------------------------------------------------------------
+typedef std::vector<IntPoint> Path;
+typedef std::vector<Path> Paths;
+// Appends point `p` to `poly` and returns `poly`, so insertions can be chained.
+inline Path &operator<<(Path &poly, const IntPoint &p) {
+  poly.push_back(p);
+  return poly;
+}
+// Appends a copy of path `p` to `polys` and returns `polys`, so insertions can
+// be chained.
+inline Paths &operator<<(Paths &polys, const Path &p) {
+  polys.push_back(p);
+  return polys;
+}
+std::ostream &operator<<(std::ostream &s, const IntPoint &p);
+std::ostream &operator<<(std::ostream &s, const Path &p);
+std::ostream &operator<<(std::ostream &s, const Paths &p);
+// Point with double-precision coordinates. It can be built from two doubles
+// (both default to 0) or from an IntPoint, whose X and Y are cast to double.
+struct DoublePoint {
+  double X;
+  double Y;
+  DoublePoint(double x = 0, double y = 0) : X(x), Y(y) {}
+  // Non-explicit, so an IntPoint converts implicitly to a DoublePoint.
+  DoublePoint(IntPoint ip) : X((double)ip.X), Y((double)ip.Y) {}
+};
+//------------------------------------------------------------------------------
+#ifdef use_xyz
+typedef void (*ZFillCallback)(IntPoint &e1bot, IntPoint &e1top, IntPoint &e2bot,
+                              IntPoint &e2top, IntPoint &pt);
+#endif
+// Option flags whose values are distinct powers of two.
+// NOTE(review): presumably OR-ed together and passed as the `initOptions`
+// argument of Clipper's constructor -- confirm in clipper.cpp.
+enum InitOptions {
+  ioReverseSolution = 1,
+  ioStrictlySimple = 2,
+  ioPreserveCollinear = 4
+};
+enum JoinType { jtSquare, jtRound, jtMiter };
+enum EndType {
+  etClosedPolygon,
+  etClosedLine,
+  etOpenButt,
+  etOpenSquare,
+  etOpenRound
+};
+class PolyNode;
+typedef std::vector<PolyNode *> PolyNodes;
+// Node of a polygon tree: stores one contour together with pointers to its
+// parent and child nodes. Clipper and ClipperOffset are friends and can access
+// the private members directly.
+class PolyNode {
+public:
+  PolyNode();
+  virtual ~PolyNode(){};
+  // Contour held by this node.
+  Path Contour;
+  // Child nodes, stored as raw pointers (ownership is not visible in this
+  // header).
+  PolyNodes Childs;
+  // Pointer to the parent node.
+  PolyNode *Parent;
+  PolyNode *GetNext() const;
+  bool IsHole() const;
+  bool IsOpen() const;
+  int ChildCount() const;
+private:
+  // PolyNode& operator =(PolyNode& other);
+  unsigned Index; // node index in Parent.Childs
+  bool m_IsOpen;
+  JoinType m_jointype;
+  EndType m_endtype;
+  PolyNode *GetNextSiblingUp() const;
+  void AddChild(PolyNode &child);
+  friend class Clipper; // to access Index
+  friend class ClipperOffset;
+};
+// PolyNode subclass that additionally keeps an AllNodes list. Its destructor
+// calls Clear().
+class PolyTree : public PolyNode {
+public:
+  ~PolyTree() { Clear(); };
+  PolyNode *GetFirst() const;
+  void Clear();
+  int Total() const;
+private:
+  // PolyTree& operator =(PolyTree& other);
+  // NOTE(review): presumably tracks every node of the tree so that Clear() can
+  // release them -- confirm in clipper.cpp.
+  PolyNodes AllNodes;
+  friend class Clipper; // to access AllNodes
+};
+bool Orientation(const Path &poly);
+double Area(const Path &poly);
+int PointInPolygon(const IntPoint &pt, const Path &path);
+void SimplifyPolygon(const Path &in_poly, Paths &out_polys,
+                     PolyFillType fillType = pftEvenOdd);
+void SimplifyPolygons(const Paths &in_polys, Paths &out_polys,
+                      PolyFillType fillType = pftEvenOdd);
+void SimplifyPolygons(Paths &polys, PolyFillType fillType = pftEvenOdd);
+void CleanPolygon(const Path &in_poly, Path &out_poly, double distance = 1.415);
+void CleanPolygon(Path &poly, double distance = 1.415);
+void CleanPolygons(const Paths &in_polys, Paths &out_polys,
+                   double distance = 1.415);
+void CleanPolygons(Paths &polys, double distance = 1.415);
+void MinkowskiSum(const Path &pattern, const Path &path, Paths &solution,
+                  bool pathIsClosed);
+void MinkowskiSum(const Path &pattern, const Paths &paths, Paths &solution,
+                  bool pathIsClosed);
+void MinkowskiDiff(const Path &poly1, const Path &poly2, Paths &solution);
+void PolyTreeToPaths(const PolyTree &polytree, Paths &paths);
+void ClosedPathsFromPolyTree(const PolyTree &polytree, Paths &paths);
+void OpenPathsFromPolyTree(PolyTree &polytree, Paths &paths);
+void ReversePath(Path &p);
+void ReversePaths(Paths &p);
+// Rectangle described by four cInt edge coordinates. This is the return type
+// of ClipperBase::GetBounds().
+struct IntRect {
+  cInt left;
+  cInt top;
+  cInt right;
+  cInt bottom;
+};
+// enums that are used internally ...
+enum EdgeSide { esLeft = 1, esRight = 2 };
+// forward declarations (for stuff used internally) ...
+struct TEdge;
+struct IntersectNode;
+struct LocalMinimum;
+struct OutPt;
+struct OutRec;
+struct Join;
+typedef std::vector<OutRec *> PolyOutList;
+typedef std::vector<TEdge *> EdgeList;
+typedef std::vector<Join *> JoinList;
+typedef std::vector<IntersectNode *> IntersectList;
+//------------------------------------------------------------------------------
+// ClipperBase is the ancestor to the Clipper class. It should not be
+// instantiated directly. This class simply abstracts the conversion of sets of
+// polygon coordinates into edge objects that are stored in a LocalMinima list.
+// Base class of Clipper. Public API: add paths (AddPath/AddPaths), Clear(),
+// GetBounds() and the PreserveCollinear flag accessors. Everything else is
+// protected state and helpers for derived classes.
+class ClipperBase {
+public:
+  ClipperBase();
+  virtual ~ClipperBase();
+  virtual bool AddPath(const Path &pg, PolyType PolyTyp, bool Closed);
+  bool AddPaths(const Paths &ppg, PolyType PolyTyp, bool Closed);
+  virtual void Clear();
+  IntRect GetBounds();
+  // Getter / setter pair for m_PreserveCollinear.
+  bool PreserveCollinear() { return m_PreserveCollinear; };
+  void PreserveCollinear(bool value) { m_PreserveCollinear = value; };
+protected:
+  void DisposeLocalMinimaList();
+  TEdge *AddBoundsToLML(TEdge *e, bool IsClosed);
+  virtual void Reset();
+  TEdge *ProcessBound(TEdge *E, bool IsClockwise);
+  void InsertScanbeam(const cInt Y);
+  bool PopScanbeam(cInt &Y);
+  bool LocalMinimaPending();
+  bool PopLocalMinima(cInt Y, const LocalMinimum *&locMin);
+  OutRec *CreateOutRec();
+  void DisposeAllOutRecs();
+  void DisposeOutRec(PolyOutList::size_type index);
+  void SwapPositionsInAEL(TEdge *edge1, TEdge *edge2);
+  void DeleteFromAEL(TEdge *e);
+  void UpdateEdgeIntoAEL(TEdge *&e);
+  typedef std::vector<LocalMinimum> MinimaList;
+  MinimaList::iterator m_CurrentLM;
+  MinimaList m_MinimaList;
+  bool m_UseFullRange;
+  EdgeList m_edges;
+  bool m_PreserveCollinear;
+  bool m_HasOpenPaths;
+  PolyOutList m_PolyOuts;
+  TEdge *m_ActiveEdges;
+  // std::priority_queue with the default comparator: the largest value is on
+  // top.
+  typedef std::priority_queue<cInt> ScanbeamList;
+  ScanbeamList m_Scanbeam;
+};
+//------------------------------------------------------------------------------
+// Clipping class; inherits virtually from ClipperBase. The four Execute()
+// overloads take a ClipType and write their result either into a Paths list or
+// into a PolyTree, returning a bool. The two-argument forms default the fill
+// type to pftEvenOdd; the others take separate subject and clip fill types.
+class Clipper : public virtual ClipperBase {
+public:
+  Clipper(int initOptions = 0);
+  bool Execute(ClipType clipType, Paths &solution,
+               PolyFillType fillType = pftEvenOdd);
+  bool Execute(ClipType clipType, Paths &solution, PolyFillType subjFillType,
+               PolyFillType clipFillType);
+  bool Execute(ClipType clipType, PolyTree &polytree,
+               PolyFillType fillType = pftEvenOdd);
+  bool Execute(ClipType clipType, PolyTree &polytree, PolyFillType subjFillType,
+               PolyFillType clipFillType);
+  // Getter / setter pairs for m_ReverseOutput and m_StrictSimple.
+  bool ReverseSolution() { return m_ReverseOutput; };
+  void ReverseSolution(bool value) { m_ReverseOutput = value; };
+  bool StrictlySimple() { return m_StrictSimple; };
+  void StrictlySimple(bool value) { m_StrictSimple = value; };
+// set the callback function for z value filling on intersections (otherwise Z
+// is 0)
+#ifdef use_xyz
+  void ZFillFunction(ZFillCallback zFillFunc);
+#endif
+protected:
+  virtual bool ExecuteInternal();
+private:
+  // Internal state; the implementation lives outside this header.
+  JoinList m_Joins;
+  JoinList m_GhostJoins;
+  IntersectList m_IntersectList;
+  ClipType m_ClipType;
+  typedef std::list<cInt> MaximaList;
+  MaximaList m_Maxima;
+  TEdge *m_SortedEdges;
+  bool m_ExecuteLocked;
+  PolyFillType m_ClipFillType;
+  PolyFillType m_SubjFillType;
+  bool m_ReverseOutput;
+  bool m_UsingPolyTree;
+  bool m_StrictSimple;
+#ifdef use_xyz
+  ZFillCallback m_ZFill; // custom callback
+#endif
+  // Private helpers (declarations only; bodies are not part of this header).
+  void SetWindingCount(TEdge &edge);
+  bool IsEvenOddFillType(const TEdge &edge) const;
+  bool IsEvenOddAltFillType(const TEdge &edge) const;
+  void InsertLocalMinimaIntoAEL(const cInt botY);
+  void InsertEdgeIntoAEL(TEdge *edge, TEdge *startEdge);
+  void AddEdgeToSEL(TEdge *edge);
+  bool PopEdgeFromSEL(TEdge *&edge);
+  void CopyAELToSEL();
+  void DeleteFromSEL(TEdge *e);
+  void SwapPositionsInSEL(TEdge *edge1, TEdge *edge2);
+  bool IsContributing(const TEdge &edge) const;
+  bool IsTopHorz(const cInt XPos);
+  void DoMaxima(TEdge *e);
+  void ProcessHorizontals();
+  void ProcessHorizontal(TEdge *horzEdge);
+  void AddLocalMaxPoly(TEdge *e1, TEdge *e2, const IntPoint &pt);
+  OutPt *AddLocalMinPoly(TEdge *e1, TEdge *e2, const IntPoint &pt);
+  OutRec *GetOutRec(int idx);
+  void AppendPolygon(TEdge *e1, TEdge *e2);
+  void IntersectEdges(TEdge *e1, TEdge *e2, IntPoint &pt);
+  OutPt *AddOutPt(TEdge *e, const IntPoint &pt);
+  OutPt *GetLastOutPt(TEdge *e);
+  bool ProcessIntersections(const cInt topY);
+  void BuildIntersectList(const cInt topY);
+  void ProcessIntersectList();
+  void ProcessEdgesAtTopOfScanbeam(const cInt topY);
+  void BuildResult(Paths &polys);
+  void BuildResult2(PolyTree &polytree);
+  void SetHoleState(TEdge *e, OutRec *outrec);
+  void DisposeIntersectNodes();
+  bool FixupIntersectionOrder();
+  void FixupOutPolygon(OutRec &outrec);
+  void FixupOutPolyline(OutRec &outrec);
+  bool IsHole(TEdge *e);
+  bool FindOwnerFromSplitRecs(OutRec &outRec, OutRec *&currOrfl);
+  void FixHoleLinkage(OutRec &outrec);
+  void AddJoin(OutPt *op1, OutPt *op2, const IntPoint offPt);
+  void ClearJoins();
+  void ClearGhostJoins();
+  void AddGhostJoin(OutPt *op, const IntPoint offPt);
+  bool JoinPoints(Join *j, OutRec *outRec1, OutRec *outRec2);
+  void JoinCommonEdges();
+  void DoSimplePolygons();
+  void FixupFirstLefts1(OutRec *OldOutRec, OutRec *NewOutRec);
+  void FixupFirstLefts2(OutRec *InnerOutRec, OutRec *OuterOutRec);
+  void FixupFirstLefts3(OutRec *OldOutRec, OutRec *NewOutRec);
+#ifdef use_xyz
+  void SetZ(IntPoint &pt, TEdge &e1, TEdge &e2);
+#endif
+};
+//------------------------------------------------------------------------------
+// Path offsetting class. Paths are registered with a JoinType and an EndType
+// through AddPath()/AddPaths(); Execute() takes a `delta` and writes the
+// result into a Paths list or a PolyTree. Clear() takes no arguments.
+class ClipperOffset {
+public:
+  ClipperOffset(double miterLimit = 2.0, double roundPrecision = 0.25);
+  ~ClipperOffset();
+  void AddPath(const Path &path, JoinType joinType, EndType endType);
+  void AddPaths(const Paths &paths, JoinType joinType, EndType endType);
+  void Execute(Paths &solution, double delta);
+  void Execute(PolyTree &solution, double delta);
+  void Clear();
+  // Public tuning members.
+  // NOTE(review): presumably initialised from the constructor's miterLimit /
+  // roundPrecision arguments -- confirm in clipper.cpp.
+  double MiterLimit;
+  double ArcTolerance;
+private:
+  Paths m_destPolys;
+  Path m_srcPoly;
+  Path m_destPoly;
+  std::vector<DoublePoint> m_normals;
+  double m_delta, m_sinA, m_sin, m_cos;
+  double m_miterLim, m_StepsPerRad;
+  IntPoint m_lowest;
+  PolyNode m_polyNodes;
+  void FixOrientations();
+  void DoOffset(double delta);
+  void OffsetPoint(int j, int &k, JoinType jointype);
+  void DoSquare(int j, int k);
+  void DoMiter(int j, int k, double r);
+  void DoRound(int j, int k);
+};
+//------------------------------------------------------------------------------
+// Exception type that stores a textual description; what() returns that
+// description as a C string.
+class clipperException : public std::exception {
+public:
+  // Copies `description` into an std::string member.
+  clipperException(const char *description) : m_descr(description) {}
+  virtual ~clipperException() throw() {}
+  virtual const char *what() const throw() { return m_descr.c_str(); }
+private:
+  std::string m_descr;
+};
+//------------------------------------------------------------------------------
+} // ClipperLib namespace
+#endif // clipper_hpp
--- a/core/predictor/tools/ocrtools/postprocess_op.cpp
+++ b/core/predictor/tools/ocrtools/postprocess_op.cpp
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "postprocess_op.h"
+namespace PaddleOCR {
+// Derives the offset distance for a four-point box and stores it in
+// `distance`:
+//   distance = |shoelace area| * unclip_ratio / perimeter
+// Only box[0..3] are read, each as (x at [0], y at [1]). A perimeter of zero
+// is not guarded against.
+void PostProcessor::GetContourArea(const std::vector<std::vector<float>> &box,
+                                   float unclip_ratio, float &distance) {
+  const int corner_count = 4;
+  float doubled_area = 0.0f;
+  float perimeter = 0.0f;
+  for (int cur = 0; cur < corner_count; cur++) {
+    // Each corner is paired with its successor, wrapping back to corner 0.
+    const int nxt = (cur + 1) % corner_count;
+    doubled_area += box[cur][0] * box[nxt][1] - box[cur][1] * box[nxt][0];
+    const float dx = box[cur][0] - box[nxt][0];
+    const float dy = box[cur][1] - box[nxt][1];
+    perimeter += sqrtf(dx * dx + dy * dy);
+  }
+  const float area = fabs(float(doubled_area / 2.0));
+  distance = area * unclip_ratio / perimeter;
+}
+// Offsets a four-point box with ClipperLib by the distance GetContourArea()
+// computes, then returns the minimum-area rotated rectangle around all points
+// of the offset result.
+//
+// box          : box[0..3] are read, each as (x at [0], y at [1]); the
+//                coordinates are truncated to int before offsetting.
+// unclip_ratio : forwarded to GetContourArea().
+//
+// Returns a 1x1 rectangle at the origin when the offset yields no points.
+cv::RotatedRect PostProcessor::UnClip(std::vector<std::vector<float>> box,
+                                      const float &unclip_ratio) {
+  float distance = 1.0;
+  GetContourArea(box, unclip_ratio, distance);
+  ClipperLib::ClipperOffset offset;
+  ClipperLib::Path p;
+  p << ClipperLib::IntPoint(int(box[0][0]), int(box[0][1]))
+    << ClipperLib::IntPoint(int(box[1][0]), int(box[1][1]))
+    << ClipperLib::IntPoint(int(box[2][0]), int(box[2][1]))
+    << ClipperLib::IntPoint(int(box[3][0]), int(box[3][1]));
+  offset.AddPath(p, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
+  ClipperLib::Paths soln;
+  offset.Execute(soln, distance);
+  std::vector<cv::Point2f> points;
+  for (const ClipperLib::Path &path : soln) {
+    // Walk each path by its own length. The previous loop used the length of
+    // the last path for every path, which could index out of bounds or drop
+    // points when the paths differ in size.
+    for (const ClipperLib::IntPoint &pt : path) {
+      points.emplace_back(pt.X, pt.Y);
+    }
+  }
+  if (points.empty()) {
+    return cv::RotatedRect(cv::Point2f(0, 0), cv::Size2f(1, 1), 0);
+  }
+  return cv::minAreaRect(points);
+}
+// Copies `mat` (read as float elements) into a freshly allocated array of
+// mat.rows row arrays, each mat.cols long. Everything is allocated with
+// new[] and nothing is released here, so the caller has to free the rows and
+// the outer array.
+float **PostProcessor::Mat2Vec(cv::Mat mat) {
+  const int row_count = mat.rows;
+  const int col_count = mat.cols;
+  float **rows = new float *[row_count];
+  for (int r = 0; r < row_count; ++r) {
+    rows[r] = new float[col_count];
+    for (int c = 0; c < col_count; ++c) {
+      rows[r][c] = mat.at<float>(r, c);
+    }
+  }
+  return rows;
+}
+// Reorders four points: after sorting with XsortInt, the first two and the
+// last two points are each ordered by their second coordinate, and the result
+// is {left pair lower-y, right pair lower-y, right pair higher-y,
+// left pair higher-y}. Indices 0..3 and element [1] are read unchecked, so
+// `pts` must hold at least four points of at least two values each.
+std::vector<std::vector<int>>
+PostProcessor::OrderPointsClockwise(std::vector<std::vector<int>> pts) {
+  // `pts` is already a private copy, so it can be sorted in place.
+  std::sort(pts.begin(), pts.end(), XsortInt);
+  const bool flip_left = pts[0][1] > pts[1][1];
+  const bool flip_right = pts[2][1] > pts[3][1];
+  const std::vector<int> &left_first = flip_left ? pts[1] : pts[0];
+  const std::vector<int> &left_second = flip_left ? pts[0] : pts[1];
+  const std::vector<int> &right_first = flip_right ? pts[3] : pts[2];
+  const std::vector<int> &right_second = flip_right ? pts[2] : pts[3];
+  std::vector<std::vector<int>> ordered = {left_first, right_first,
+                                           right_second, left_second};
+  return ordered;
+}
+// Converts `mat` (read as float elements) into a vector of rows: one inner
+// vector of mat.cols values for each of the mat.rows rows.
+std::vector<std::vector<float>> PostProcessor::Mat2Vector(cv::Mat mat) {
+  std::vector<std::vector<float>> rows;
+  for (int r = 0; r < mat.rows; ++r) {
+    std::vector<float> row;
+    for (int c = 0; c < mat.cols; ++c) {
+      row.push_back(mat.at<float>(r, c));
+    }
+    rows.push_back(row);
+  }
+  return rows;
+}
+// Sort predicate: true when the first element of `a` is strictly smaller than
+// the first element of `b`; equal first elements compare as not-less.
+bool PostProcessor::XsortFp32(std::vector<float> a, std::vector<float> b) {
+  return a[0] < b[0];
+}
+// Sort predicate: true when the first element of `a` is strictly smaller than
+// the first element of `b`; equal first elements compare as not-less.
+bool PostProcessor::XsortInt(std::vector<int> a, std::vector<int> b) {
+  return a[0] < b[0];
+}
+// Returns the corner points of `box` (from cv::boxPoints) in a fixed order
+// and stores the longer side of the rectangle in `ssid`.
+//
+// The corners are sorted with XsortFp32. The first two and the last two are
+// then each ordered by their second coordinate, with ties choosing the later
+// element first. Output order: {left pair first, right pair first,
+// right pair second, left pair second}.
+std::vector<std::vector<float>> PostProcessor::GetMiniBoxes(cv::RotatedRect box,
+                                                            float &ssid) {
+  ssid = std::max(box.size.width, box.size.height);
+  cv::Mat corner_mat;
+  cv::boxPoints(box, corner_mat);
+  auto corners = Mat2Vector(corner_mat);
+  std::sort(corners.begin(), corners.end(), XsortFp32);
+  const bool flip_left = corners[1][1] <= corners[0][1];
+  const bool flip_right = corners[3][1] <= corners[2][1];
+  // Take copies first: the slots are overwritten below.
+  const std::vector<float> first = flip_left ? corners[1] : corners[0];
+  const std::vector<float> second = flip_right ? corners[3] : corners[2];
+  const std::vector<float> third = flip_right ? corners[2] : corners[3];
+  const std::vector<float> fourth = flip_left ? corners[0] : corners[1];
+  corners[0] = first;
+  corners[1] = second;
+  corners[2] = third;
+  corners[3] = fourth;
+  return corners;
+}
+// Scores a four-point box as the mean of `pred` over the filled polygon.
+//
+// The box's bounding rectangle (floor of the minima, ceil of the maxima,
+// clamped to the bounds of `pred`) is cropped out of `pred`. The polygon is
+// rasterised into an 8-bit mask of the same size, and the mean of the cropped
+// values under the mask is returned. Only box_array[0..3] are read, each as
+// (x at [0], y at [1]).
+float PostProcessor::BoxScoreFast(std::vector<std::vector<float>> box_array,
+                                  cv::Mat pred) {
+  const int pred_w = pred.cols;
+  const int pred_h = pred.rows;
+  float xs[4];
+  float ys[4];
+  for (int k = 0; k < 4; ++k) {
+    xs[k] = box_array[k][0];
+    ys[k] = box_array[k][1];
+  }
+  int xmin = clamp(int(std::floor(*(std::min_element(xs, xs + 4)))), 0,
+                   pred_w - 1);
+  int xmax = clamp(int(std::ceil(*(std::max_element(xs, xs + 4)))), 0,
+                   pred_w - 1);
+  int ymin = clamp(int(std::floor(*(std::min_element(ys, ys + 4)))), 0,
+                   pred_h - 1);
+  int ymax = clamp(int(std::ceil(*(std::max_element(ys, ys + 4)))), 0,
+                   pred_h - 1);
+  const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
+  cv::Mat mask = cv::Mat::zeros(roi.height, roi.width, CV_8UC1);
+  // Polygon corners relative to the top-left corner of the crop.
+  cv::Point corners[4];
+  for (int k = 0; k < 4; ++k) {
+    corners[k] =
+        cv::Point(int(box_array[k][0]) - xmin, int(box_array[k][1]) - ymin);
+  }
+  const cv::Point *polys[1] = {corners};
+  int corner_counts[] = {4};
+  cv::fillPoly(mask, polys, corner_counts, 1, cv::Scalar(1));
+  cv::Mat cropped;
+  pred(roi).copyTo(cropped);
+  return cv::mean(cropped, mask)[0];
+}
+// Extracts candidate boxes from `bitmap` and scores them against `pred`.
+//
+// For each contour found in `bitmap` (at most `max_candidates` of them):
+//   1. contours with two points or fewer are skipped;
+//   2. the minimum-area rectangle is computed and skipped when its longer
+//      side (as reported by GetMiniBoxes) is below `min_size`;
+//   3. the box is skipped when BoxScoreFast(pred) is below `box_thresh`;
+//   4. the box is expanded with UnClip(det_db_unclip_ratio) and skipped when
+//      the result is degenerate or its longer side is below `min_size + 2`;
+//   5. the corner coordinates are rescaled from bitmap size to `pred` size,
+//      rounded, clamped and converted to int.
+// Returns one entry per surviving box, each holding four [x, y] int points.
+std::vector<std::vector<std::vector<int>>>
+PostProcessor::BoxesFromBitmap(const cv::Mat pred, const cv::Mat bitmap,
+                               const float &box_thresh,
+                               const float &det_db_unclip_ratio) {
+  const int min_size = 3;
+  const int max_candidates = 1000;
+  int width = bitmap.cols;
+  int height = bitmap.rows;
+  std::vector<std::vector<cv::Point>> contours;
+  std::vector<cv::Vec4i> hierarchy;
+  cv::findContours(bitmap, contours, hierarchy, cv::RETR_LIST,
+                   cv::CHAIN_APPROX_SIMPLE);
+  // Only the first `max_candidates` contours are examined.
+  int num_contours =
+      contours.size() >= max_candidates ? max_candidates : contours.size();
+  std::vector<std::vector<std::vector<int>>> boxes;
+  for (int _i = 0; _i < num_contours; _i++) {
+    if (contours[_i].size() <= 2) {
+      continue;
+    }
+    // `ssid` receives the longer side of the rectangle from GetMiniBoxes.
+    float ssid;
+    cv::RotatedRect box = cv::minAreaRect(contours[_i]);
+    auto array = GetMiniBoxes(box, ssid);
+    auto box_for_unclip = array;
+    // end get_mini_box
+    if (ssid < min_size) {
+      continue;
+    }
+    float score;
+    score = BoxScoreFast(array, pred);
+    if (score < box_thresh)
+      continue;
+    // start for unclip
+    cv::RotatedRect points = UnClip(box_for_unclip, det_db_unclip_ratio);
+    // Skip rectangles whose height and width are both (almost) at most 1.
+    if (points.size.height < 1.001 && points.size.width < 1.001) {
+      continue;
+    }
+    // end for unclip
+    cv::RotatedRect clipbox = points;
+    // `ssid` is overwritten with the longer side of the expanded rectangle.
+    auto cliparray = GetMiniBoxes(clipbox, ssid);
+    if (ssid < min_size + 2)
+      continue;
+    int dest_width = pred.cols;
+    int dest_height = pred.rows;
+    std::vector<std::vector<int>> intcliparray;
+    // Rescale each corner from bitmap coordinates to `pred` coordinates,
+    // round, and clamp to [0, dest_width] / [0, dest_height].
+    for (int num_pt = 0; num_pt < 4; num_pt++) {
+      std::vector<int> a{int(clampf(roundf(cliparray[num_pt][0] / float(width) *
+                                           float(dest_width)),
+                                    0, float(dest_width))),
+                         int(clampf(roundf(cliparray[num_pt][1] /
+                                           float(height) * float(dest_height)),
+                                    0, float(dest_height)))};
+      intcliparray.push_back(a);
+    }
+    boxes.push_back(intcliparray);
+  } // end for
+  return boxes;
+}
+// Maps detected boxes back to source-image coordinates and drops tiny ones.
+//
+// Each box is reordered with OrderPointsClockwise, its coordinates are
+// divided by `ratio_w` / `ratio_h` and clamped to the bounds of `srcimg`.
+// Boxes whose first two edges (point 0 -> 1 and point 0 -> 3) are both longer
+// than 4 pixels are returned; all others are discarded.
+//
+// boxes:   detected boxes, each a list of [x, y] points (four are expected).
+// ratio_h: vertical scale factor that was applied to the source image.
+// ratio_w: horizontal scale factor that was applied to the source image.
+// srcimg:  source image; only its size is used.
+std::vector<std::vector<std::vector<int>>>
+PostProcessor::FilterTagDetRes(std::vector<std::vector<std::vector<int>>> boxes,
+                               float ratio_h, float ratio_w, cv::Mat srcimg) {
+  int oriimg_h = srcimg.rows;
+  int oriimg_w = srcimg.cols;
+  std::vector<std::vector<std::vector<int>>> root_points;
+  for (size_t n = 0; n < boxes.size(); n++) {
+    boxes[n] = OrderPointsClockwise(boxes[n]);
+    // Bound the loop by this box's own point count. The previous code used
+    // boxes[0].size(), which reads out of range when boxes differ in size.
+    for (size_t m = 0; m < boxes[n].size(); m++) {
+      boxes[n][m][0] /= ratio_w;
+      boxes[n][m][1] /= ratio_h;
+      boxes[n][m][0] = int(_min(_max(boxes[n][m][0], 0), oriimg_w - 1));
+      boxes[n][m][1] = int(_min(_max(boxes[n][m][1], 0), oriimg_h - 1));
+    }
+  }
+  for (size_t n = 0; n < boxes.size(); n++) {
+    // The edge-length test below indexes points 0, 1 and 3.
+    if (boxes[n].size() < 4)
+      continue;
+    int rect_width, rect_height;
+    rect_width = int(sqrt(pow(boxes[n][0][0] - boxes[n][1][0], 2) +
+                          pow(boxes[n][0][1] - boxes[n][1][1], 2)));
+    rect_height = int(sqrt(pow(boxes[n][0][0] - boxes[n][3][0], 2) +
+                           pow(boxes[n][0][1] - boxes[n][3][1], 2)));
+    if (rect_width <= 4 || rect_height <= 4)
+      continue;
+    root_points.push_back(boxes[n]);
+  }
+  return root_points;
+}
+} // namespace PaddleOCR
--- a/core/predictor/tools/ocrtools/postprocess_op.h
+++ b/core/predictor/tools/ocrtools/postprocess_op.h
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <vector>
+#include <cstring>
+#include <fstream>
+#include <numeric>
+#include "clipper.h"
+#include "utility.h"
+using namespace std;
+namespace PaddleOCR {
+// Post-processing helpers that turn a detection probability map / bitmap into
+// a list of quadrilateral text boxes.
+class PostProcessor {
+public:
+  // NOTE(review): defined outside this header; presumably derives the unclip
+  // `distance` for `box` from `unclip_ratio` -- confirm against the .cpp.
+  void GetContourArea(const std::vector<std::vector<float>> &box,
+                      float unclip_ratio, float &distance);
+  // Expands `box` by `unclip_ratio` and returns the resulting rotated
+  // rectangle (used by BoxesFromBitmap). Defined in the .cpp.
+  cv::RotatedRect UnClip(std::vector<std::vector<float>> box,
+                         const float &unclip_ratio);
+  // NOTE(review): returns a raw float** -- ownership is not visible from this
+  // header; confirm who frees it.
+  float **Mat2Vec(cv::Mat mat);
+  // Reorders the points of one box; used by FilterTagDetRes before rescaling.
+  std::vector<std::vector<int>>
+  OrderPointsClockwise(std::vector<std::vector<int>> pts);
+  // Returns the four corners of `box` in a fixed order and writes the longer
+  // side of the rectangle to `ssid`.
+  std::vector<std::vector<float>> GetMiniBoxes(cv::RotatedRect box,
+                                               float &ssid);
+  // Mean value of `pred` inside the quadrilateral `box_array`.
+  float BoxScoreFast(std::vector<std::vector<float>> box_array, cv::Mat pred);
+  // Extracts boxes from `bitmap`, keeping those whose score on `pred` is at
+  // least `box_thresh`; boxes are expanded with `det_db_unclip_ratio`.
+  std::vector<std::vector<std::vector<int>>>
+  BoxesFromBitmap(const cv::Mat pred, const cv::Mat bitmap,
+                  const float &box_thresh, const float &det_db_unclip_ratio);
+  // Rescales `boxes` by 1/ratio_w and 1/ratio_h, clamps them to `srcimg` and
+  // drops boxes whose width or height is 4 pixels or less.
+  std::vector<std::vector<std::vector<int>>>
+  FilterTagDetRes(std::vector<std::vector<std::vector<int>>> boxes,
+                  float ratio_h, float ratio_w, cv::Mat srcimg);
+private:
+  // std::sort comparators: order vectors by their first element only.
+  static bool XsortInt(std::vector<int> a, std::vector<int> b);
+  static bool XsortFp32(std::vector<float> a, std::vector<float> b);
+  // Converts a float matrix into a vector of rows.
+  std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
+  // Integer max / min helpers.
+  inline int _max(int a, int b) { return a >= b ? a : b; }
+  inline int _min(int a, int b) { return a >= b ? b : a; }
+  // Clamps `x` to [min, max]; the upper bound is checked first.
+  template <class T> inline T clamp(T x, T min, T max) {
+    if (x > max)
+      return max;
+    if (x < min)
+      return min;
+    return x;
+  }
+  // Float-only variant of clamp().
+  inline float clampf(float x, float min, float max) {
+    if (x > max)
+      return max;
+    if (x < min)
+      return min;
+    return x;
+  }
+};
+} // namespace PaddleOCR
--- a/core/predictor/tools/ocrtools/preprocess_op.cpp
+++ b/core/predictor/tools/ocrtools/preprocess_op.cpp
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+//#include "paddle_api.h"
+//#include "paddle_inference_api.h"
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <vector>
+#include <cstring>
+#include <fstream>
+#include <numeric>
+#include "preprocess_op.h"
+namespace PaddleOCR {
+// Copies every channel of `im` into `data` as consecutive planes of
+// rows * cols floats each, i.e. channel c is written starting at
+// data + c * rows * cols.
+void Permute::Run(const cv::Mat *im, float *data) {
+  const int rows = im->rows;
+  const int cols = im->cols;
+  const int channels = im->channels();
+  int ch = 0;
+  while (ch < channels) {
+    float *plane_begin = data + ch * rows * cols;
+    // The destination is passed as a temporary header over `data`, so
+    // extractChannel writes straight into the caller's buffer.
+    cv::extractChannel(*im, cv::Mat(rows, cols, CV_32FC1, plane_begin), ch);
+    ++ch;
+  }
+}
+// Normalises a 3-channel image in place.
+//
+// The image is converted to CV_32FC3 (multiplied by 1/255 when `is_scale` is
+// true), then every pixel channel c becomes (value - mean[c]) * scale[c].
+// `mean` and `scale` must each provide at least three entries.
+void Normalize::Run(cv::Mat *im, const std::vector<float> &mean,
+                    const std::vector<float> &scale, const bool is_scale) {
+  const double alpha = is_scale ? 1.0 / 255.0 : 1.0;
+  im->convertTo(*im, CV_32FC3, alpha);
+
+  for (int row = 0; row < im->rows; ++row) {
+    for (int col = 0; col < im->cols; ++col) {
+      cv::Vec3f &pixel = im->at<cv::Vec3f>(row, col);
+      for (int c = 0; c < 3; ++c) {
+        pixel[c] = (pixel[c] - mean[c]) * scale[c];
+      }
+    }
+  }
+}
+// Resizes `img` for the detection model and reports the applied scale.
+//
+// When the longer side exceeds `max_size_len` the image is scaled down so
+// that side equals `max_size_len`. Each target side is then rounded down to a
+// multiple of 32, with a minimum of 32. With `use_tensorrt` the image is
+// instead resized to a fixed 640x640.
+//
+// img:          input image.
+// resize_img:   receives the resized image.
+// max_size_len: upper limit for the longer side before rounding.
+// ratio_h/w:    receive resized_height / original_height and
+//               resized_width / original_width.
+void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img,
+                         int max_size_len, float &ratio_h, float &ratio_w,
+                         bool use_tensorrt) {
+  int w = img.cols;
+  int h = img.rows;
+  float ratio = 1.f;
+  int max_wh = w >= h ? w : h;
+  if (max_wh > max_size_len) {
+    ratio = float(max_size_len) / float(max_wh);
+  }
+
+  // Round down to a multiple of 32 but never below 32. The previous code let
+  // a scaled side of 0 through (0 % 32 == 0), which handed cv::resize a
+  // zero-sized target for very thin images.
+  auto snap_to_32 = [](int len) {
+    const int snapped = (len / 32) * 32;
+    return snapped < 32 ? 32 : snapped;
+  };
+  int resize_h = snap_to_32(int(float(h) * ratio));
+  int resize_w = snap_to_32(int(float(w) * ratio));
+
+  if (!use_tensorrt) {
+    cv::resize(img, resize_img, cv::Size(resize_w, resize_h));
+    ratio_h = float(resize_h) / float(h);
+    ratio_w = float(resize_w) / float(w);
+  } else {
+    cv::resize(img, resize_img, cv::Size(640, 640));
+    ratio_h = float(640) / float(h);
+    ratio_w = float(640) / float(w);
+  }
+}
+// Resizes `img` for the recognition model.
+//
+// Without TensorRT: the target height is rec_image_shape[1] and the target
+// width is int(32 * wh_ratio). The image is resized keeping its aspect ratio
+// (width capped at the target width) and right-padded with 127 up to the
+// target width.
+// With TensorRT: the image is resized to height 32 and width
+// cols * 32 / rows, capped at 100, and right-padded with 127 up to 100.
+void CrnnResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, float wh_ratio,
+                        bool use_tensorrt,
+                        const std::vector<int> &rec_image_shape) {
+  if (use_tensorrt) {
+    const int scaled_w = int(img.cols * 32 / img.rows);
+    if (scaled_w >= 100) {
+      cv::resize(img, resize_img, cv::Size(100, 32), 0.f, 0.f,
+                 cv::INTER_LINEAR);
+      return;
+    }
+    cv::resize(img, resize_img, cv::Size(scaled_w, 32), 0.f, 0.f,
+               cv::INTER_LINEAR);
+    cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, int(100 - scaled_w),
+                       cv::BORDER_CONSTANT, {127, 127, 127});
+    return;
+  }
+
+  const int target_h = rec_image_shape[1];
+  // The width comes from wh_ratio, not from rec_image_shape[2].
+  const int target_w = int(32 * wh_ratio);
+  const float aspect = float(img.cols) / float(img.rows);
+  const float scaled_w = ceilf(target_h * aspect);
+  const int resize_w = scaled_w > target_w ? target_w : int(scaled_w);
+
+  cv::resize(img, resize_img, cv::Size(resize_w, target_h), 0.f, 0.f,
+             cv::INTER_LINEAR);
+  cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0,
+                     int(target_w - resize_img.cols), cv::BORDER_CONSTANT,
+                     {127, 127, 127});
+}
+// Resizes `img` for the classifier model.
+//
+// Without TensorRT: the image is resized to height rec_image_shape[1],
+// keeping its aspect ratio with the width capped at rec_image_shape[2], and
+// right-padded with zeros when it is narrower than that width.
+// With TensorRT: the image is resized to a fixed 100x32.
+void ClsResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img,
+                       bool use_tensorrt,
+                       const std::vector<int> &rec_image_shape) {
+  if (use_tensorrt) {
+    cv::resize(img, resize_img, cv::Size(100, 32), 0.f, 0.f, cv::INTER_LINEAR);
+    return;
+  }
+
+  const int target_h = rec_image_shape[1];
+  const int target_w = rec_image_shape[2];
+  const float aspect = float(img.cols) / float(img.rows);
+  const float scaled_w = ceilf(target_h * aspect);
+  const int resize_w = scaled_w > target_w ? target_w : int(scaled_w);
+
+  cv::resize(img, resize_img, cv::Size(resize_w, target_h), 0.f, 0.f,
+             cv::INTER_LINEAR);
+  const int right_pad = target_w - resize_w;
+  if (right_pad > 0) {
+    cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, right_pad,
+                       cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
+  }
+}
+} // namespace PaddleOCR
--- a/core/predictor/tools/ocrtools/preprocess_op.h
+++ b/core/predictor/tools/ocrtools/preprocess_op.h
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <vector>
+#include <cstring>
+#include <fstream>
+#include <numeric>
+using namespace std;
+//using namespace paddle;
+namespace PaddleOCR {
+// In-place per-channel normalisation of a 3-channel image.
+class Normalize {
+public:
+  // Converts `im` to float and applies (value - mean[c]) * scale[c] to each
+  // channel c; when `is_scale` is true the values are first multiplied by
+  // 1/255.
+  virtual void Run(cv::Mat *im, const std::vector<float> &mean,
+                   const std::vector<float> &scale, const bool is_scale = true);
+};
+// RGB -> CHW
+// Rearranges an interleaved image into planar (channel-major) layout.
+class Permute {
+public:
+  // Writes each channel of `im` to `data` as a contiguous rows * cols plane of
+  // floats; `data` must have room for channels * rows * cols floats.
+  virtual void Run(const cv::Mat *im, float *data);
+};
+// Resize step for the detection model.
+class ResizeImgType0 {
+public:
+  // Resizes `img` into `resize_img`, limiting the longer side to
+  // `max_size_len`, and writes the applied vertical / horizontal scale to
+  // `ratio_h` / `ratio_w`. `use_tensorrt` selects a fixed output size.
+  virtual void Run(const cv::Mat &img, cv::Mat &resize_img, int max_size_len,
+                   float &ratio_h, float &ratio_w, bool use_tensorrt);
+};
+// Resize step for the recognition (CRNN) model.
+class CrnnResizeImg {
+public:
+  // Resizes `img` into `resize_img` and pads it on the right.
+  // `rec_image_shape` defaults to {3, 32, 320}; `wh_ratio` determines the
+  // target width. `use_tensorrt` selects a fixed-size path.
+  virtual void Run(const cv::Mat &img, cv::Mat &resize_img, float wh_ratio,
+                   bool use_tensorrt = false,
+                   const std::vector<int> &rec_image_shape = {3, 32, 320});
+};
+// Resize step for the classifier model.
+class ClsResizeImg {
+public:
+  // Resizes `img` into `resize_img`. `rec_image_shape` defaults to
+  // {3, 48, 192}; `use_tensorrt` selects a fixed-size path.
+  virtual void Run(const cv::Mat &img, cv::Mat &resize_img,
+                   bool use_tensorrt = false,
+                   const std::vector<int> &rec_image_shape = {3, 48, 192});
+};
+} // namespace PaddleOCR
\ No newline at end of file
--- a/core/predictor/tools/ocrtools/utility.cpp
+++ b/core/predictor/tools/ocrtools/utility.cpp
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <iostream>
+#include <ostream>
+#include <vector>
+#include "utility.h"
+namespace PaddleOCR {
+// Reads the text file at `path` and returns its lines in order.
+// If the file cannot be opened, a message is printed to stdout and the
+// process exits with status 1.
+std::vector<std::string> Utility::ReadDict(const std::string &path) {
+  std::ifstream dict_file(path);
+  if (!dict_file) {
+    std::cout << "no such label file: " << path << ", exit the program..."
+              << std::endl;
+    exit(1);
+  }
+
+  std::vector<std::string> entries;
+  for (std::string entry; getline(dict_file, entry);) {
+    entries.push_back(entry);
+  }
+  return entries;
+}
+// Draws every box in `boxes` as a closed green polygon on a copy of `srcimg`
+// and writes the result to ./ocr_vis.png.
+//
+// srcimg: image to draw on; it is not modified.
+// boxes:  one entry per box, each a list of [x, y] integer points.
+//
+// The vertex buffer is sized from each box. The previous fixed 4-element
+// array was overrun by any box with more than four points. Boxes with no
+// points are skipped.
+void Utility::VisualizeBboxes(
+    const cv::Mat &srcimg,
+    const std::vector<std::vector<std::vector<int>>> &boxes) {
+  cv::Mat img_vis;
+  srcimg.copyTo(img_vis);
+  for (size_t n = 0; n < boxes.size(); n++) {
+    std::vector<cv::Point> polygon;
+    polygon.reserve(boxes[n].size());
+    for (size_t m = 0; m < boxes[n].size(); m++) {
+      polygon.push_back(cv::Point(int(boxes[n][m][0]), int(boxes[n][m][1])));
+    }
+    if (polygon.empty()) {
+      continue;
+    }
+    const cv::Point *ppt[1] = {polygon.data()};
+    int npt[] = {static_cast<int>(polygon.size())};
+    cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
+  }
+  cv::imwrite("./ocr_vis.png", img_vis);
+  std::cout << "The detection visualized image saved in ./ocr_vis.png"
+            << std::endl;
+}
+} // namespace PaddleOCR
\ No newline at end of file
--- a/core/predictor/tools/ocrtools/utility.h
+++ b/core/predictor/tools/ocrtools/utility.h
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <stdlib.h>
+#include <vector>
+#include <algorithm>
+#include <cstring>
+#include <fstream>
+#include <numeric>
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+namespace PaddleOCR {
+// Stateless helper functions shared by the OCR tools.
+class Utility {
+public:
+  // Returns the lines of the text file at `path`.
+  static std::vector<std::string> ReadDict(const std::string &path);
+  // Draws `boxes` on a copy of `srcimg` and saves the image to disk.
+  static void
+  VisualizeBboxes(const cv::Mat &srcimg,
+                  const std::vector<std::vector<std::vector<int>>> &boxes);
+  // Returns the index (distance from `first`) of the largest element in
+  // [first, last), as found by std::max_element.
+  template <class ForwardIterator>
+  inline static size_t argmax(ForwardIterator first, ForwardIterator last) {
+    return std::distance(first, std::max_element(first, last));
+  }
+};
+} // namespace PaddleOCR
\ No newline at end of file
--- a/doc/ABTEST_IN_PADDLE_SERVING.md
+++ b/doc/ABTEST_IN_PADDLE_SERVING.md
@@ -4,7 +4,7 @@
 This document will use an example of text classification task based on IMDB dataset to show how to build a A/B Test framework using Paddle Serving. The structure relationship between the client and servers in the example is shown in the figure below.
-<img src="abtest.png" style="zoom:33%;" />
+<img src="abtest.png" style="zoom:25%;" />
 Note that:  A/B Test is only applicable to RPC mode, not web mode.
@@ -88,7 +88,7 @@ with open('processed.data') as f:
        cnt[tag]['total'] += 1
    for tag, data in cnt.items():
-        print('[{}](total: {}) acc: {}'.format(tag, data['total'], float(data['acc']) / float(data['total'])))
+        print('[{}]<total: {}> acc: {}'.format(tag, data['total'], float(data['acc']) / float(data['total'])))
 ```
 In the code, the function `client.add_variant(tag, clusters, variant_weight)` is to add a variant with label `tag` and flow weight `variant_weight`. In this example, a BOW variant with label of `bow` and flow weight of `10`, and an LSTM variant with label of `lstm` and a flow weight of `90` are added. The flow on the client side will be distributed to two variants according to the ratio of `10:90`.
@@ -98,8 +98,8 @@ When making prediction on the client side, if the parameter `need_variant_tag=Tr
 ### Expected Results
 Due to different network conditions, the results of each prediction may be slightly different.
 ``` python
-[lstm](total: 1867) acc: 0.490091055169
+[lstm]<total: 1867> acc: 0.490091055169
-[bow](total: 217) acc: 0.73732718894
+[bow]<total: 217> acc: 0.73732718894
 ```
 <!--

--- a/doc/ABTEST_IN_PADDLE_SERVING_CN.md
+++ b/doc/ABTEST_IN_PADDLE_SERVING_CN.md
@@ -92,7 +92,7 @@ with open('processed.data') as f:
        cnt[tag]['total'] += 1
    for tag, data in cnt.items():
-        print('[{}](total: {}) acc: {}'.format(tag, data['total'], float(data['acc'])/float(data['total']) ))
+        print('[{}]<total: {}> acc: {}'.format(tag, data['total'], float(data['acc'])/float(data['total']) ))
 ```
 代码中，`client.add_variant(tag, clusters, variant_weight)`是为了添加一个标签为`tag`、流量权重为`variant_weight`的variant。在这个样例中，添加了一个标签为`bow`、流量权重为`10`的BOW variant，以及一个标签为`lstm`、流量权重为`90`的LSTM variant。Client端的流量会根据`10:90`的比例分发到两个variant。
@@ -101,6 +101,6 @@ Client端做预测时，若指定参数`need_variant_tag=True`，返回值则包
 ### 预期结果
 由于网络情况的不同，可能每次预测的结果略有差异。
 ``` bash
-[lstm](total: 1867) acc: 0.490091055169
+[lstm]<total: 1867> acc: 0.490091055169
-[bow](total: 217) acc: 0.73732718894
+[bow]<total: 217> acc: 0.73732718894
 ```
--- a/doc/deprecated/CLIENT_CONFIGURE.md
+++ b/doc/deprecated/CLIENT_CONFIGURE.md
--- a/doc/COMPILE.md
+++ b/doc/COMPILE.md
@@ -152,6 +152,24 @@ make -j10
 Execute `make install` to put the target output in the `./output` directory.
+### Compile C++ Server under the condition of WITH_OPENCV=ON
+**Note:** You only need to do this when you are doing secondary development on the C++ part of Paddle Serving and the new code depends on the OpenCV library.
+First of all, the OpenCV library must be installed. If it is not, please refer to the `Compile and install OpenCV` section later in this article.
+In the compile command, add `-DOPENCV_DIR=${OPENCV_DIR}` and `-DWITH_OPENCV=ON`, for example:
+``` shell
+OPENCV_DIR=your_opencv_dir # `your_opencv_dir` is the installation path of the OpenCV library.
+mkdir server-build-cpu && cd server-build-cpu
+cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \
+    -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
+    -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+    -DOPENCV_DIR=${OPENCV_DIR} \
+    -DWITH_OPENCV=ON \
+    -DSERVER=ON ..
+make -j10
+```
 **Note:** After the compilation is successful, you need to set the `SERVING_BIN` path, see the following [Notes](COMPILE.md#Notes) ).
 ## Compile Client
@@ -209,6 +227,7 @@ Please use the example under `python/examples` to verify.
 |     WITH_AVX     | Compile Paddle Serving with AVX intrinsics | OFF  |
 |     WITH_MKL     |  Compile Paddle Serving with MKL support   | OFF  |
 |     WITH_GPU     |   Compile Paddle Serving with NVIDIA GPU   | OFF  |
+|     WITH_OPENCV  |    Compile Paddle Serving with OPENCV      | OFF  |
 |  CUDNN_LIBRARY   |    Define CuDNN library and header path    |      |
 | CUDA_TOOLKIT_ROOT_DIR |       Define CUDA PATH                |      |
 |   TENSORRT_ROOT  |           Define TensorRT PATH             |      |
@@ -247,3 +266,63 @@ The following is the base library version matching relationship used by the Padd
 ### How to make the compiler detect the CuDNN library
 Download the corresponding CUDNN version from NVIDIA developer official website and decompressing it, add `-DCUDNN_ROOT` to cmake command, to specify the path of CUDNN.
+## Compile and install OpenCV
+**Note:** You need to do this only if you need to use the OpenCV library in your C++ code.
+* First of all, download the OpenCV source package to be compiled in the Linux environment from the OpenCV official website. Taking OpenCV 3.4.7 as an example, the download command is as follows.
+```
+wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz
+tar -xf 3.4.7.tar.gz
+```
+Finally, you can see the folder of `opencv-3.4.7/` in the current directory.
+* Compile OpenCV, the OpenCV source path (`root_path`) and installation path (`install_path`) should be set by yourself. Enter the OpenCV source code path and compile it in the following way.
+```shell
+root_path=your_opencv_root_path
+install_path=${root_path}/opencv3
+rm -rf build
+mkdir build
+cd build
+cmake .. \
+    -DCMAKE_INSTALL_PREFIX=${install_path} \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBUILD_SHARED_LIBS=OFF \
+    -DWITH_IPP=OFF \
+    -DBUILD_IPP_IW=OFF \
+    -DWITH_LAPACK=OFF \
+    -DWITH_EIGEN=OFF \
+    -DCMAKE_INSTALL_LIBDIR=lib64 \
+    -DWITH_ZLIB=ON \
+    -DBUILD_ZLIB=ON \
+    -DWITH_JPEG=ON \
+    -DBUILD_JPEG=ON \
+    -DWITH_PNG=ON \
+    -DBUILD_PNG=ON \
+    -DWITH_TIFF=ON \
+    -DBUILD_TIFF=ON
+make -j
+make install
+```
+Among them, `root_path` is the downloaded OpenCV source code path, and `install_path` is the installation path of OpenCV. After `make install` is completed, the OpenCV header file and library file will be generated in this folder for later source code compilation.
+The final file structure under the OpenCV installation path is as follows.
+```
+opencv3/
+|-- bin
+|-- include
+|-- lib
+|-- lib64
+|-- share
+```
--- a/doc/COMPILE_CN.md
+++ b/doc/COMPILE_CN.md
@@ -151,8 +151,27 @@ make -j10
 执行`make install`可以把目标产出放在`./output`目录下。
+### 开启WITH_OPENCV选项编译C++ Server
+**注意：** 只有当您需要对Paddle Serving C++部分进行二次开发，且新增的代码依赖于OpenCV库时，您才需要这样做。
+编译Serving C++ Server部分，开启WITH_OPENCV选项时，需要已安装的OpenCV库，若尚未安装，可参考本文档后面的说明编译安装OpenCV库。
+以开启WITH_OPENCV选项，编译CPU版本Paddle Inference Library为例，在上述编译命令基础上，加入`-DOPENCV_DIR=${OPENCV_DIR}` 和 `-DWITH_OPENCV=ON`选项。
+``` shell
+OPENCV_DIR=your_opencv_dir #`your_opencv_dir`为opencv库的安装路径。
+mkdir server-build-cpu && cd server-build-cpu
+cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR/ \
+    -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
+    -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
+    -DOPENCV_DIR=${OPENCV_DIR} \
+    -DWITH_OPENCV=ON \
+    -DSERVER=ON ..
+make -j10
+```
 **注意：** 编译成功后，需要设置`SERVING_BIN`路径，详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。
 ## 编译Client部分
 ``` shell
@@ -174,7 +193,7 @@ make -j10
 mkdir app-build && cd app-build
 cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
    -DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-    -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \    
+    -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
    -DAPP=ON ..
 make
 ```
@@ -211,6 +230,7 @@ make
 |     WITH_MKL     |  Compile Paddle Serving with MKL support   | OFF  |
 |     WITH_GPU     |   Compile Paddle Serving with NVIDIA GPU   | OFF  |
 |     WITH_TRT     |    Compile Paddle Serving with TensorRT    | OFF  |
+|     WITH_OPENCV  |    Compile Paddle Serving with OPENCV      | OFF  |
 |  CUDNN_LIBRARY   |    Define CuDNN library and header path    |      |
 | CUDA_TOOLKIT_ROOT_DIR |       Define CUDA PATH                |      |
 |   TENSORRT_ROOT  |           Define TensorRT PATH             |      |
@@ -248,3 +268,61 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选
 ### 如何让Paddle Serving编译系统探测到CuDNN库
 从NVIDIA developer官网下载对应版本CuDNN并在本地解压后，在cmake编译命令中增加`-DCUDNN_LIBRARY`参数，指定CuDNN库所在路径。
+## 编译安装OpenCV库
+**注意：** 只有当您需要在C++代码中引入OpenCV库时，您才需要这样做。
+* 首先需要从OpenCV官网上下载在Linux环境下源码编译的包，以OpenCV3.4.7为例，下载命令如下。
+```
+wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz
+tar -xf 3.4.7.tar.gz
+```
+最终可以在当前目录下看到`opencv-3.4.7/`的文件夹。
+* 编译OpenCV，设置OpenCV源码路径(`root_path`)以及安装路径(`install_path`)。进入OpenCV源码路径下，按照下面的方式进行编译。
+```shell
+root_path=your_opencv_root_path
+install_path=${root_path}/opencv3
+rm -rf build
+mkdir build
+cd build
+cmake .. \
+    -DCMAKE_INSTALL_PREFIX=${install_path} \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBUILD_SHARED_LIBS=OFF \
+    -DWITH_IPP=OFF \
+    -DBUILD_IPP_IW=OFF \
+    -DWITH_LAPACK=OFF \
+    -DWITH_EIGEN=OFF \
+    -DCMAKE_INSTALL_LIBDIR=lib64 \
+    -DWITH_ZLIB=ON \
+    -DBUILD_ZLIB=ON \
+    -DWITH_JPEG=ON \
+    -DBUILD_JPEG=ON \
+    -DWITH_PNG=ON \
+    -DBUILD_PNG=ON \
+    -DWITH_TIFF=ON \
+    -DBUILD_TIFF=ON
+make -j
+make install
+```
+其中`root_path`为下载的OpenCV源码路径，`install_path`为OpenCV的安装路径，`make install`完成之后，会在该文件夹下生成OpenCV头文件和库文件，用于引用OpenCV库的代码的编译。
+最终在安装路径下的文件结构如下所示。
+```
+opencv3/
+|-- bin
+|-- include
+|-- lib
+|-- lib64
+|-- share
+```
--- a/doc/CONTRIBUTE.md
+++ b/doc/CONTRIBUTE.md
@@ -132,7 +132,7 @@ Please install pre-commit, which automatically reformat the changes to C/C++ and
 Please remember to add related unit tests.
- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/primer.md) .
+- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/docs/primer.md) .
 - For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/).

--- a/doc/CUBE_LOCAL.md
+++ b/doc/CUBE_LOCAL.md
@@ -7,11 +7,10 @@
 There are two examples on CTR under python / examples, they are criteo_ctr, criteo_ctr_with_cube. The former is to save the entire model during training, including sparse parameters. The latter is to cut out the sparse parameters and save them into two parts, one is the sparse parameter and the other is the dense parameter. Because the scale of sparse parameters is very large in industrial cases, reaching the order of 10 ^ 9. Therefore, it is not practical to start large-scale sparse parameter prediction on one machine. Therefore, we introduced Baidu's industrial-grade product Cube to provide the sparse parameter service for many years to provide distributed sparse parameter services.
 The local mode of Cube is different from distributed Cube, which is designed to be convenient for developers to use in experiments and demos. 
-<!--If there is a demand for distributed sparse parameter service, please continue reading [Distributed Cube User Guide](./Distributed_Cube) after reading this document (still developing).-->
+<!--If there is a demand for distributed sparse parameter service, please continue reading [Quantization Storage on Cube Sparse Parameter Indexing](./CUBE_QUANT.md) after reading this document (still developing).-->
 This document uses the original model without any compression algorithm. If there is a need for a quantitative model to go online, please read the [Quantization Storage on Cube Sparse Parameter Indexing](./CUBE_QUANT.md)
 ## Example
 in directory python/example/criteo_ctr_with_cube, run

--- a/doc/CUBE_LOCAL_CN.md
+++ b/doc/CUBE_LOCAL_CN.md
@@ -6,7 +6,7 @@
 在python/examples下有两个关于CTR的示例，他们分别是criteo_ctr, criteo_ctr_with_cube。前者是在训练时保存整个模型，包括稀疏参数。后者是将稀疏参数裁剪出来，保存成两个部分，一个是稀疏参数，另一个是稠密参数。由于在工业级的场景中，稀疏参数的规模非常大，达到10^9数量级。因此在一台机器上启动大规模稀疏参数预测是不实际的，因此我们引入百度多年来在稀疏参数索引领域的工业级产品Cube，提供分布式的稀疏参数服务。
-<!--单机版Cube是分布式Cube的弱化版本，旨在方便开发者做实验和Demo时使用。如果有分布式稀疏参数服务的需求，请在读完此文档之后，继续阅读  [稀疏参数索引服务Cube使用指南](分布式Cube)（正在建设中）。-->
+<!--单机版Cube是分布式Cube的弱化版本，旨在方便开发者做实验和Demo时使用。如果有分布式稀疏参数服务的需求，请在读完此文档之后，继续阅读  [稀疏参数索引服务Cube使用指南](CUBE_LOCAL_CN.md)（正在建设中）。-->
 本文档使用的都是未经过任何压缩算法处理的原始模型，如果有量化模型上线需求，请阅读[Cube稀疏参数索引量化存储使用指南](./CUBE_QUANT_CN.md)

--- a/doc/DESIGN_DOC.md
+++ b/doc/DESIGN_DOC.md
@@ -70,7 +70,7 @@ The inference framework of the well-known deep learning platform only supports C
 > Model conversion across deep learning platforms
-Models trained on other deep learning platforms can be passed《[PaddlePaddle/X2Paddle工具](https://github.com/PaddlePaddle/X2Paddle)》.We convert multiple mainstream CV models to Paddle models. TensorFlow, Caffe, ONNX, PyTorch model conversion is tested.《[An End-to-end Tutorial from Training to Inference Service Deployment](TRAIN_TO_SERVICE.md)》
+Models trained on other deep learning platforms can be converted with 《[PaddlePaddle/X2Paddle工具](https://github.com/PaddlePaddle/X2Paddle)》. We have converted multiple mainstream CV models to Paddle models; TensorFlow, Caffe, ONNX and PyTorch model conversion has been tested. See also 《[AIStudio教程-Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/1555945)》.
 Because it is impossible to directly view the feed and fetch parameter information in the model file, it is not convenient for users to assemble the parameters. Therefore, Paddle Serving developed a tool to convert the Paddle model into Serving format and generate a prototxt file containing feed and fetch parameter information. The following figure is the generated prototxt file of the uci_housing example. For more conversion methods, refer to the document《[How to save a servable model of Paddle Serving?](SAVE.md)》.
 ```

--- a/doc/DESIGN_DOC_CN.md
+++ b/doc/DESIGN_DOC_CN.md
@@ -74,7 +74,7 @@ Paddle Serving提供了4种开发语言SDK，包括Python、C++、Java、Golang
 其他深度学习平台训练的模型，可以通过《[PaddlePaddle/X2Paddle工具](https://github.com/PaddlePaddle/X2Paddle)》将多个主流的CV模型转为Paddle模型，测试过TensorFlow、Caffe、ONNX、PyTorch模型转换。
-以IMDB评论情感分析任务为例通过9步展示，Paddle Serving从模型的训练到部署预测服务的全流程《[端到端完成从训练到部署全流程](TRAIN_TO_SERVICE_CN.md)》
+以IMDB评论情感分析任务为例通过9步展示，Paddle Serving从模型的训练到部署预测服务的全流程《[AIStudio教程-Paddle Serving服务化部署框架](https://www.paddlepaddle.org.cn/tutorials/projectdetail/1555945)》
 由于无法直接查看模型文件中feed和fetch参数信息，不方便用户拼装参数。因此，Paddle Serving开发一个工具将Paddle模型转成Serving的格式，生成包含feed和fetch参数信息的prototxt文件。下图是uci_housing示例的生成的prototxt文件，更多转换方法参考文档《[怎样保存用于Paddle Serving的模型](SAVE_CN.md)》。
 ```

--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -14,9 +14,9 @@
     0-int64
-      1-float32
+     1-float32
-      2-int32
+     2-int32
 #### Q: paddle-serving是否支持windows和Linux环境下的多线程调用 
@@ -222,9 +222,7 @@ InvalidArgumentError: Device id must be less than GPU count, but received id is:
 #### Q: python编译的GCC版本与serving的版本不匹配
-**A:**:1)使用[GPU docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md#gpunvidia-docker)解决环境问题
+**A:**:1)使用[GPU docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md#gpunvidia-docker)解决环境问题；2)修改anaconda的虚拟环境下安装的python的gcc版本[改变python的GCC编译环境](https://www.jianshu.com/p/c498b3d86f77) 
-       2)修改anaconda的虚拟环境下安装的python的gcc版本[参考](https://www.jianshu.com/p/c498b3d86f77) 
 #### Q: paddle-serving是否支持本地离线安装 

--- a/doc/LATEST_PACKAGES.md
+++ b/doc/LATEST_PACKAGES.md
@@ -78,7 +78,7 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.0.0-py2-none-any.w
 ```
 ## ARM user
-for ARM user who uses [PaddleLite](https://github.com/PaddlePaddle/PaddleLite) can download the wheel packages as follows. And ARM user should use the xpu-beta docker [DOCKER IMAGES](./DOCKER_IMAGES.md) 
+ARM users who use [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) can download the wheel packages listed below. ARM users should use the xpu-beta docker image; see [DOCKER IMAGES](./DOCKER_IMAGES.md).
 **We only support Python 3.6 for Arm Users.**
 ### Wheel Package Links

--- a/doc/deprecated/NEW_OPERATOR.md
+++ b/doc/deprecated/NEW_OPERATOR.md
--- a/doc/deprecated/NEW_OPERATOR_CN.md
+++ b/doc/deprecated/NEW_OPERATOR_CN.md
--- a/doc/SERVER_DAG.md
+++ b/doc/SERVER_DAG.md
@@ -48,7 +48,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 ### Nodes with multiple inputs
-An example containing multiple input nodes is given in the [MODEL_ENSEMBLE_IN_PADDLE_SERVING](MODEL_ENSEMBLE_IN_PADDLE_SERVING.md). A example graph and the corresponding DAG definition code is as follows.
+An example containing multiple input nodes is given in the [MODEL_ENSEMBLE_IN_PADDLE_SERVING](./deprecated/MODEL_ENSEMBLE_IN_PADDLE_SERVING.md). An example graph and the corresponding DAG definition code are as follows.
 <center>
 <img src='complex_dag.png' width = "480" height = "400" align="middle"/>

--- a/doc/SERVER_DAG_CN.md
+++ b/doc/SERVER_DAG_CN.md
@@ -47,7 +47,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 ### 包含多个输入的节点
-在[Paddle Serving中的集成预测](MODEL_ENSEMBLE_IN_PADDLE_SERVING_CN.md)文档中给出了一个包含多个输入节点的样例，示意图和代码如下。
+在[Paddle Serving中的集成预测](./deprecated/MODEL_ENSEMBLE_IN_PADDLE_SERVING_CN.md)文档中给出了一个包含多个输入节点的样例，示意图和代码如下。
 <center>
 <img src='complex_dag.png' width = "480" height = "400" align="middle"/>

--- a/doc/deprecated/SERVING_CONFIGURE.md
+++ b/doc/deprecated/SERVING_CONFIGURE.md
--- a/doc/TENSOR_RT.md
+++ b/doc/TENSOR_RT.md
 ## Paddle Serving uses TensorRT
-(English|[简体中文]((./TENSOR_RT_CN.md)))
+(English|[简体中文](./TENSOR_RT_CN.md))
 ### Background

--- a/doc/WINDOWS_TUTORIAL_CN.md
+++ b/doc/WINDOWS_TUTORIAL_CN.md
@@ -14,7 +14,7 @@
 **安装Git工具**： 详情参见[Git官网](https://git-scm.com/downloads)
-**安装必要的C++库（可选）**：部分用户可能会在`import paddle`阶段遇见dll无法链接的问题，建议可以[安装Visual Studio社区版本](`https://visualstudio.microsoft.com/`) ，并且安装C++的相关组件。
+**安装必要的C++库（可选）**：部分用户可能会在`import paddle`阶段遇见dll无法链接的问题，建议[安装Visual Studio社区版本](https://visualstudio.microsoft.com/) ，并且安装C++的相关组件。
 **安装Paddle和Serving**：在Powershell，执行

--- a/doc/deprecated/CREATING.md
+++ b/doc/deprecated/CREATING.md
@@ -77,7 +77,7 @@ service ImageClassifyService {
 关于Serving端的配置的详细信息，可以参考[Serving端配置](SERVING_CONFIGURE.md)
-以下配置文件将ReaderOP, ClassifyOP和WriteJsonOP串联成一个workflow (关于OP/workflow等概念，可参考[设计文档](../DESIGN.md))
+以下配置文件将ReaderOP, ClassifyOP和WriteJsonOP串联成一个workflow (关于OP/workflow等概念，可参考[设计文档](DESIGN.md))
 - 配置文件示例：

--- a/doc/deprecated/DESIGN.md
+++ b/doc/deprecated/DESIGN.md
@@ -45,11 +45,11 @@ Models that can be predicted using the Paddle Inference Library, models saved du
 ### 3.4 Server Inferface
-![Server Interface](server_interface.png)
+![Server Interface](../server_interface.png)
 ### 3.5 Client Interface
-<img src='client_inferface.png' width = "600" height = "200">
+<img src='../client_inferface.png' width = "600" height = "200">
 ### 3.6 Client io used during Training
@@ -66,7 +66,7 @@ def save_model(server_model_folder,
 ## 4. Paddle Serving Underlying Framework
-![Paddle-Serging Overall Architecture](framework.png)
+![Paddle-Serving Overall Architecture](../framework.png)
 **Model Management Framework**: Connects model files of multiple machine learning platforms and provides a unified inference interface
 **Business Scheduling Framework**: Abstracts the calculation logic of various different inference models, provides a general DAG scheduling framework, and connects different operators through DAG diagrams to complete a prediction service together. This abstract model allows users to conveniently implement their own calculation logic, and at the same time facilitates operator sharing. (Users build their own forecasting services. A large part of their work is to build DAGs and provide operators.)
@@ -102,31 +102,31 @@ class FluidFamilyCore {
 With reference to the abstract idea of model calculation of the TensorFlow framework, the business logic is abstracted into a DAG diagram, driven by configuration, generating a workflow, and skipping C ++ code compilation. Each specific step of the service corresponds to a specific OP. The OP can configure the upstream OP that it depends on. Unified message passing between OPs is achieved by the thread-level bus and channel mechanisms. For example, the service process of a simple prediction service can be abstracted into 3 steps including reading request data-> calling the prediction interface-> writing back the prediction result, and correspondingly implemented to 3 OP: ReaderOp-> ClassifyOp-> WriteOp
-![Infer Service](predict-service.png)
+![Infer Service](../predict-service.png)
-Regarding the dependencies between OPs, and the establishment of workflows through OPs, you can refer to [从零开始写一个预测服务](./deprecated/CREATING.md) (simplified Chinese Version)
+Regarding the dependencies between OPs, and the establishment of workflows through OPs, you can refer to [从零开始写一个预测服务](CREATING.md) (simplified Chinese Version)
 Server instance perspective
-![Server instance perspective](server-side.png)
+![Server instance perspective](../server-side.png)
 #### 4.2.2 Paddle Serving Multi-Service Mechanism
-![Paddle Serving multi-service](multi-service.png)
+![Paddle Serving multi-service](../multi-service.png)
-Paddle Serving instances can load multiple models at the same time, and each model uses a Service (and its configured workflow) to undertake services. You can refer to [service configuration file in Demo example](../tools/cpp_examples/demo-serving/conf/service.prototxt) to learn how to configure multiple services for the serving instance
+Paddle Serving instances can load multiple models at the same time, and each model uses a Service (and its configured workflow) to undertake services. You can refer to [service configuration file in Demo example](../../tools/cpp_examples/demo-serving/conf/service.prototxt) to learn how to configure multiple services for the serving instance
 #### 4.2.3 Hierarchical relationship of business scheduling
 From the client's perspective, a Paddle Serving service can be divided into three levels: Service, Endpoint, and Variant from top to bottom.
-![Call hierarchy relationship](multi-variants.png)
+![Call hierarchy relationship](../multi-variants.png)
 One Service corresponds to one inference model, and there is one endpoint under the model. Different versions of the model are implemented through multiple variant concepts under endpoint:
-The same model prediction service can configure multiple variants, and each variant has its own downstream IP list. The client code can configure relative weights for each variant to achieve the relationship of adjusting the traffic ratio (refer to the description of variant_weight_list in [Client Configuration](./deprecated/CLIENT_CONFIGURE.md) section 3.2).
+The same model prediction service can configure multiple variants, and each variant has its own downstream IP list. The client code can configure relative weights for each variant to achieve the relationship of adjusting the traffic ratio (refer to the description of variant_weight_list in [Client Configuration](../CLIENT_CONFIGURE.md) section 3.2).
-![Client-side proxy function](client-side-proxy.png)
+![Client-side proxy function](../client-side-proxy.png)
 ## 5. User Interface
@@ -141,7 +141,7 @@ No matter how the communication protocol changes, the framework only needs to en
 ### 5.1 Data Compression Method
-Baidu-rpc has built-in data compression methods such as snappy, gzip, zlib, which can be configured in the configuration file (refer to [Client Configuration](./deprecated/CLIENT_CONFIGURE.md) Section 3.1 for an introduction to compress_type)
+Baidu-rpc has built-in data compression methods such as snappy, gzip, zlib, which can be configured in the configuration file (refer to [Client Configuration](../CLIENT_CONFIGURE.md) Section 3.1 for an introduction to compress_type)
 ### 5.2 C ++ SDK API Interface

--- a/doc/deprecated/DESIGN_CN.md
+++ b/doc/deprecated/DESIGN_CN.md
@@ -47,11 +47,11 @@ PaddlePaddle是百度开源的机器学习框架，广泛支持各种深度学
 ### 3.4 Server Inferface
-![Server Interface](server_interface.png)
+![Server Interface](../server_interface.png)
 ### 3.5 Client Interface
-<img src='client_inferface.png' width = "600" height = "200">
+<img src='../client_inferface.png' width = "600" height = "200">
 ### 3.6 训练过程中使用的Client io
@@ -68,7 +68,7 @@ def save_model(server_model_folder,
 ## 4. Paddle Serving底层框架
-![Paddle-Serging总体框图](framework.png)
+![Paddle-Serving总体框图](../framework.png)
 **模型管理框架**：对接多种机器学习平台的模型文件，向上提供统一的inference接口
 **业务调度框架**：对各种不同预测模型的计算逻辑进行抽象，提供通用的DAG调度框架，通过DAG图串联不同的算子，共同完成一次预测服务。该抽象模型使用户可以方便的实现自己的计算逻辑，同时便于算子共用。（用户搭建自己的预测服务，很大一部分工作是搭建DAG和提供算子的实现）
@@ -104,31 +104,31 @@ class FluidFamilyCore {
 参考TF框架的模型计算的抽象思想，将业务逻辑抽象成DAG图，由配置驱动，生成workflow，跳过C++代码编译。业务的每个具体步骤，对应一个具体的OP，OP可配置自己依赖的上游OP。OP之间消息传递统一由线程级Bus和channel机制实现。例如，一个简单的预测服务的服务过程，可以抽象成读请求数据->调用预测接口->写回预测结果等3个步骤，相应的实现到3个OP: ReaderOp->ClassifyOp->WriteOp
-![预测服务Service](predict-service.png)
+![预测服务Service](../predict-service.png)
 关于OP之间的依赖关系，以及通过OP组建workflow，可以参考[从零开始写一个预测服务](https://github.com/PaddlePaddle/Serving/blob/develop/doc/deprecated/CREATING.md)的相关章节
 服务端实例透视图
-![服务端实例透视图](server-side.png)
+![服务端实例透视图](../server-side.png)
 #### 4.2.2 Paddle Serving的多服务机制
-![Paddle Serving的多服务机制](multi-service.png)
+![Paddle Serving的多服务机制](../multi-service.png)
-Paddle Serving实例可以同时加载多个模型，每个模型用一个Service（以及其所配置的workflow）承接服务。可以参考[Demo例子中的service配置文件](../tools/cpp_examples/demo-serving/conf/service.prototxt)了解如何为serving实例配置多个service
+Paddle Serving实例可以同时加载多个模型，每个模型用一个Service（以及其所配置的workflow）承接服务。可以参考[Demo例子中的service配置文件](../../tools/cpp_examples/demo-serving/conf/service.prototxt)了解如何为serving实例配置多个service
 #### 4.2.3 业务调度层级关系
 从客户端看，一个Paddle Serving service从顶向下可分为Service, Endpoint, Variant等3个层级
-![调用层级关系](multi-variants.png)
+![调用层级关系](../multi-variants.png)
 一个Service对应一个预测模型，模型下有1个endpoint。模型的不同版本，通过endpoint下多个variant概念实现：
-同一个模型预测服务，可以配置多个variant，每个variant有自己的下游IP列表。客户端代码可以对各个variant配置相对权重，以达到调节流量比例的关系（参考[客户端配置](./deprecated/CLIENT_CONFIGURE.md)第3.2节中关于variant_weight_list的说明）。
+同一个模型预测服务，可以配置多个variant，每个variant有自己的下游IP列表。客户端代码可以对各个variant配置相对权重，以达到调节流量比例的关系（参考[客户端配置](CLIENT_CONFIGURE.md)第3.2节中关于variant_weight_list的说明）。
-![Client端proxy功能](client-side-proxy.png)
+![Client端proxy功能](../client-side-proxy.png)
 ## 5. 用户接口
@@ -143,7 +143,7 @@ Paddle Serving实例可以同时加载多个模型，每个模型用一个Servic
 ### 5.1 数据压缩方法
-Baidu-rpc内置了snappy, gzip, zlib等数据压缩方法，可在配置文件中配置（参考[客户端配置](./deprecated/CLIENT_CONFIGURE.md)第3.1节关于compress_type的介绍）
+Baidu-rpc内置了snappy, gzip, zlib等数据压缩方法，可在配置文件中配置（参考[客户端配置](CLIENT_CONFIGURE.md)第3.1节关于compress_type的介绍）
 ### 5.2 C++ SDK API接口

--- a/doc/deprecated/MODEL_ENSEMBLE_IN_PADDLE_SERVING.md
+++ b/doc/deprecated/MODEL_ENSEMBLE_IN_PADDLE_SERVING.md
@@ -10,7 +10,7 @@ Next, we will take the text classification task as an example to show model ense
 In this example (see the figure below), the server side predict the bow and CNN models with the same input in a service in parallel, The client side fetchs the prediction results of the two models, and processes the prediction results to get the final predict results.
-![simple example](model_ensemble_example.png)
+![simple example](../model_ensemble_example.png)
 It should be noted that at present, only multiple models with the same format input and output in the same service are supported. In this example, the input and output formats of CNN and BOW model are the same.

--- a/doc/deprecated/MODEL_ENSEMBLE_IN_PADDLE_SERVING_CN.md
+++ b/doc/deprecated/MODEL_ENSEMBLE_IN_PADDLE_SERVING_CN.md
@@ -10,7 +10,7 @@
 该样例中（见下图），Server端在一项服务中并行预测相同输入的BOW和CNN模型，Client端获取两个模型的预测结果并进行后处理，得到最终的预测结果。
-![simple example](model_ensemble_example.png)
+![simple example](../model_ensemble_example.png)
 需要注意的是，目前只支持在同一个服务中使用多个相同格式输入输出的模型。在该例子中，CNN模型和BOW模型的输入输出格式是相同的。

--- a/doc/deprecated/NEW_WEB_SERVICE.md
+++ b/doc/deprecated/NEW_WEB_SERVICE.md
@@ -2,7 +2,7 @@
 ([简体中文](NEW_WEB_SERVICE_CN.md)|English)
-This document will take the image classification service based on the Imagenet data set as an example to introduce how to develop a new web service. The complete code can be visited at [here](../python/examples/imagenet/resnet50_web_service.py).
+This document will take the image classification service based on the Imagenet data set as an example to introduce how to develop a new web service. The complete code can be visited at [here](../../python/examples/imagenet/resnet50_web_service.py).
 ## WebService base class

--- a/doc/deprecated/NEW_WEB_SERVICE_CN.md
+++ b/doc/deprecated/NEW_WEB_SERVICE_CN.md
@@ -2,7 +2,7 @@
 (简体中文|[English](NEW_WEB_SERVICE.md))
-本文档将以Imagenet图像分类服务为例，来介绍如何开发一个新的Web Service。您可以在[这里](../python/examples/imagenet/resnet50_web_service.py)查阅完整的代码。
+本文档将以Imagenet图像分类服务为例，来介绍如何开发一个新的Web Service。您可以在[这里](../../python/examples/imagenet/resnet50_web_service.py)查阅完整的代码。
 ## WebService基类

--- a/java/examples/src/main/java/PipelineClientExample.java
+++ b/java/examples/src/main/java/PipelineClientExample.java
@@ -32,7 +32,7 @@ public class PipelineClientExample {
        System.out.println(fetch);
        if (StaticPipelineClient.succ != true) {
-            if(!StaticPipelineClient.initClient("127.0.0.1","18070")){
+            if (!StaticPipelineClient.initClient("127.0.0.1","18070")) {
                System.out.println("connect failed.");
                return false;
            }
@@ -57,7 +57,7 @@ public class PipelineClientExample {
        List<String> fetch = Arrays.asList("prediction");
        System.out.println(fetch);
        if (StaticPipelineClient.succ != true) {
-            if(!StaticPipelineClient.initClient("127.0.0.1","18070")){
+            if (!StaticPipelineClient.initClient("127.0.0.1","18070")) {
                System.out.println("connect failed.");
                return false;
            }
@@ -86,7 +86,7 @@ public class PipelineClientExample {
            }};
        List<String> fetch = Arrays.asList("prediction");
        if (StaticPipelineClient.succ != true) {
-            if(!StaticPipelineClient.initClient("127.0.0.1","9998")){
+            if (!StaticPipelineClient.initClient("127.0.0.1","9998")) {
                System.out.println("connect failed.");
                return false;
            }
@@ -105,7 +105,7 @@ public class PipelineClientExample {
   * @param npdata INDArray type(The input data).
   * @return String (specified String type for python Numpy eval method).
   */
-    String convertINDArrayToString(INDArray npdata){
+    String convertINDArrayToString(INDArray npdata) {
        return "array("+npdata.toString()+")";
    }

--- a/java/examples/src/main/java/StaticPipelineClient.java
+++ b/java/examples/src/main/java/StaticPipelineClient.java
@@ -30,10 +30,10 @@ public class StaticPipelineClient {
   * @param strPort String type(The server port) such as "8891".
   * @return boolean (the sign of connect status).
   */
-    public static boolean initClient(String strIp,String strPort){
+    public static boolean initClient(String strIp,String strPort) {
        String target = strIp+ ":"+ strPort;//"172.17.0.2:18070";
        System.out.println("initial connect.");
-        if(succ){
+        if (succ) {
            System.out.println("already connect.");
            return true;
        }

--- a/java/src/main/java/io/paddle/serving/client/PipelineClient.java
+++ b/java/src/main/java/io/paddle/serving/client/PipelineClient.java
@@ -88,7 +88,7 @@ public class PipelineClient {
            keys.add(entry.getKey());
            values.add(entry.getValue());
        }
-        if(profile){
+        if (profile) {
            keys.add(_profile_key);
            values.add(_profile_value);
        }

--- a/paddle_inference/paddle/include/paddle_engine.h
+++ b/paddle_inference/paddle/include/paddle_engine.h
@@ -37,9 +37,24 @@ using paddle_infer::Tensor;
 using paddle_infer::CreatePredictor;
 DECLARE_int32(gpuid);
+DECLARE_string(precision);
+DECLARE_bool(use_calib);
 static const int max_batch = 32;
 static const int min_subgraph_size = 3;
+static PrecisionType precision_type;
+PrecisionType GetPrecision(const std::string& precision_data) {
+  std::string precision_type = predictor::ToLower(precision_data);
+  if (precision_type == "fp32") {
+    return PrecisionType::kFloat32;
+  } else if (precision_type == "int8") {
+    return PrecisionType::kInt8;
+  } else if (precision_type == "fp16") {
+    return PrecisionType::kHalf;
+  }
+  return PrecisionType::kFloat32;
+}
 // Engine Base
 class PaddleEngineBase {
@@ -107,9 +122,9 @@ class PaddleInferenceEngine : public PaddleEngineBase {
    if (engine_conf.has_encrypted_model() && engine_conf.encrypted_model()) {
      // decrypt model
      std::string model_buffer, params_buffer, key_buffer;
-      predictor::ReadBinaryFile(model_path + "encrypt_model", &model_buffer);
+      predictor::ReadBinaryFile(model_path + "/encrypt_model", &model_buffer);
-      predictor::ReadBinaryFile(model_path + "encrypt_params", &params_buffer);
+      predictor::ReadBinaryFile(model_path + "/encrypt_params", &params_buffer);
-      predictor::ReadBinaryFile(model_path + "key", &key_buffer);
+      predictor::ReadBinaryFile(model_path + "/key", &key_buffer);
      auto cipher = paddle::MakeCipher("");
      std::string real_model_buffer = cipher->Decrypt(model_buffer, key_buffer);
@@ -137,6 +152,7 @@ class PaddleInferenceEngine : public PaddleEngineBase {
      // 2000MB GPU memory
      config.EnableUseGpu(2000, FLAGS_gpuid);
    }
+    precision_type = GetPrecision(FLAGS_precision);
    if (engine_conf.has_use_trt() && engine_conf.use_trt()) {
      if (!engine_conf.has_use_gpu() || !engine_conf.use_gpu()) {
@@ -145,14 +161,24 @@ class PaddleInferenceEngine : public PaddleEngineBase {
      config.EnableTensorRtEngine(1 << 20,
                                  max_batch,
                                  min_subgraph_size,
-                                  Config::Precision::kFloat32,
+                                  precision_type,
                                  false,
-                                  false);
+                                  FLAGS_use_calib);
      LOG(INFO) << "create TensorRT predictor";
    }
    if (engine_conf.has_use_lite() && engine_conf.use_lite()) {
-      config.EnableLiteEngine(PrecisionType::kFloat32, true);
+      config.EnableLiteEngine(precision_type, true);
+    }
+    if ((!engine_conf.has_use_lite() && !engine_conf.has_use_gpu()) ||
+        (engine_conf.has_use_lite() && !engine_conf.use_lite() &&
+         engine_conf.has_use_gpu() && !engine_conf.use_gpu())) {
+      if (precision_type == PrecisionType::kInt8) {
+        config.EnableMkldnnQuantizer();
+      } else if (precision_type == PrecisionType::kHalf) {
+        config.EnableMkldnnBfloat16();
+      }
    }
    if (engine_conf.has_use_xpu() && engine_conf.use_xpu()) {
@@ -171,7 +197,6 @@ class PaddleInferenceEngine : public PaddleEngineBase {
      config.EnableMemoryOptim();
    }
    predictor::AutoLock lock(predictor::GlobalCreateMutex::instance());
    _predictor = CreatePredictor(config);
    if (NULL == _predictor.get()) {

--- a/paddle_inference/paddle/src/paddle_engine.cpp
+++ b/paddle_inference/paddle/src/paddle_engine.cpp
@@ -20,6 +20,8 @@ namespace paddle_serving {
 namespace inference {
 DEFINE_int32(gpuid, 0, "GPU device id to use");
+DEFINE_string(precision, "fp32", "precision to deploy, default is fp32");
+DEFINE_bool(use_calib, false, "calibration mode, default is false");
 REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::FluidInferEngine<PaddleInferenceEngine>,

--- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
+++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### Start the service
 ```
 tar xf faster_rcnn_hrnetv2p_w18_1x.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 This model support TensorRT, if you want a faster inference, please use `--use_trt`. 

--- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
+++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### 启动服务
 ```
 tar xf faster_rcnn_hrnetv2p_w18_1x.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 该模型支持TensorRT，如果想要更快的预测速度，可以开启`--use_trt`选项。

--- a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md
+++ b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README.md
@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### Start the service
 ```
 tar xf fcos_dcn_r50_fpn_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 This model support TensorRT, if you want a faster inference, please use `--use_trt`.
@@ -18,4 +18,3 @@ This model support TensorRT, if you want a faster inference, please use `--use_t
 ```
 python test_client.py 000000570688.jpg
 ```
--- a/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md
+++ b/python/examples/detection/fcos_dcn_r50_fpn_1x_coco/README_CN.md
@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### 启动服务
 ```
 tar xf fcos_dcn_r50_fpn_1x_coco.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 该模型支持TensorRT，如果想要更快的预测速度，可以开启`--use_trt`选项。
@@ -20,4 +20,3 @@ python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --g
 ```
 python test_client.py 000000570688.jpg
 ```
--- a/python/examples/detection/ssd_vgg16_300_240e_voc/README.md
+++ b/python/examples/detection/ssd_vgg16_300_240e_voc/README.md
@@ -10,7 +10,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### Start the service
 ```
 tar xf ssd_vgg16_300_240e_voc.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 This model support TensorRT, if you want a faster inference, please use `--use_trt`.
@@ -18,4 +18,3 @@ This model support TensorRT, if you want a faster inference, please use `--use_t
 ```
 python test_client.py 000000570688.jpg
 ```
--- a/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md
+++ b/python/examples/detection/ssd_vgg16_300_240e_voc/README_CN.md
@@ -11,7 +11,7 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 ### 启动服务
 ```
 tar xf ssd_vgg16_300_240e_voc.tar
-python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --gpu_ids 0
+python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
 该模型支持TensorRT，如果想要更快的预测速度，可以开启`--use_trt`选项。
@@ -20,4 +20,3 @@ python -m paddle_serving_server_gpu.serve --model serving_server --port 9494 --g
 ```
 python test_client.py 000000570688.jpg
 ```
--- a/python/examples/encryption/README.md
+++ b/python/examples/encryption/README.md
@@ -13,12 +13,25 @@ sh get_data.sh
 ## Encrypt Model
 The `paddlepaddle` package is used in this example, you may need to download the corresponding package(`pip install paddlepaddle`).
+[python encrypt.py](./encrypt.py)
+[//file]:#encrypt.py
+``` python
+def serving_encryption():
+    inference_model_to_serving(
+        dirname="./uci_housing_model",
+        params_filename=None,
+        serving_server="encrypt_server",
+        serving_client="encrypt_client",
+        encryption=True)
 ```
-python encrypt.py
+`dirname` is the folder path where the model is located. If the parameters are discrete (not saved as `__params__`), `params_filename` does not need to be specified; otherwise, set `params_filename="__params__"`.
-```
 The key is stored in the `key` file, and the encrypted model file and server-side configuration file are stored in the `encrypt_server` directory.
 client-side configuration file are stored in the `encrypt_client` directory.
+**Notice:** When encrypted prediction is used, the model configuration and parameter folders loaded by the server and the client must be `encrypt_server/` and `encrypt_client/`, respectively.
 ## Start Encryption Service
 CPU Service
 ```
@@ -31,5 +44,5 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_
 ## Prediction
 ```
-python test_client.py uci_housing_client/serving_client_conf.prototxt
+python test_client.py encrypt_client/serving_client_conf.prototxt
 ```
--- a/python/examples/encryption/README_CN.md
+++ b/python/examples/encryption/README_CN.md
@@ -12,11 +12,27 @@ sh get_data.sh
 ## 模型加密
 本示例中使用了`paddlepaddle`包中的模块，需要进行下载（`pip install paddlepaddle`）。
-```
-python encrypt.py
+运行[python encrypt.py](./encrypt.py)进行模型加密
-```
+[//file]:#encrypt.py
+``` python
+def serving_encryption():
+    inference_model_to_serving(
+        dirname="./uci_housing_model",
+        params_filename=None,
+        serving_server="encrypt_server",
+        serving_client="encrypt_client",
+        encryption=True)
+```
+其中dirname为模型所在的文件夹路径
+当参数为离散参数时，无须指定params_filename，当参数为__params__时，需指定`params_filename="__params__"`
 密钥保存在`key`文件中，加密模型文件以及server端配置文件保存在`encrypt_server`目录下，client端配置文件保存在`encrypt_client`目录下。
+**注意：** 当使用加密预测时，服务端和客户端启动加载的模型配置和参数文件夹是encrypt_server/和encrypt_client/
 ## 启动加密预测服务
 CPU预测服务
 ```
@@ -29,5 +45,5 @@ python -m paddle_serving_server.serve --model encrypt_server/ --port 9300 --use_
 ## 预测
 ```
-python test_client.py uci_housing_client/serving_client_conf.prototxt
+python test_client.py encrypt_client/
 ```
--- a/python/examples/encryption/encrypt.py
+++ b/python/examples/encryption/encrypt.py
@@ -18,6 +18,7 @@ from paddle_serving_client.io import inference_model_to_serving
 def serving_encryption():
    inference_model_to_serving(
        dirname="./uci_housing_model",
+        params_filename=None,
        serving_server="encrypt_server",
        serving_client="encrypt_client",
        encryption=True)

--- a/python/examples/fit_a_line/test_client.py
+++ b/python/examples/fit_a_line/test_client.py
@@ -28,7 +28,7 @@ test_reader = paddle.batch(
    batch_size=1)
 for data in test_reader():
-    new_data = np.zeros((1, 1, 13)).astype("float32")
+    new_data = np.zeros((1, 13)).astype("float32")
    new_data[0] = data[0][0]
    fetch_map = client.predict(
        feed={"x": new_data}, fetch=["price"], batch=True)

--- a/python/examples/ocr/README.md
+++ b/python/examples/ocr/README.md
@@ -98,3 +98,30 @@ python rec_debugger_server.py gpu #for gpu user
 ```
 python rec_web_client.py
 ```
+## C++ OCR Service
+**Notice:** If you need to concatenate the det model and the rec model and do pre-processing and post-processing in the Paddle Serving C++ framework, you need to use a C++ server compiled with the WITH_OPENCV option; see [COMPILE.md](../../../doc/COMPILE.md)
+### Start Service
+Choose the startup command that matches your device (CPU or GPU).
+Pass the folder paths of multiple models after the `--model` parameter to start a prediction service that chains those models together.
+```
+#for cpu user
+python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293
+#for gpu user
+python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0
+```
+### Client Prediction
+Pre-processing and post-processing are done in the C++ server, and only the Base64-encoded string of the image is passed to the C++ server,
+so the `feed_var` field in the file `ocr_det_client/serving_client_conf.prototxt` must be changed.
+For this example, `feed_type` should be 3 (which means the data type is string) and `shape` should be 1.
+Pass multiple client config folder paths to the client to start multi-model prediction.
+```
+python ocr_cpp_client.py ocr_det_client ocr_rec_client
+```
--- a/python/examples/ocr/README_CN.md
+++ b/python/examples/ocr/README_CN.md
@@ -98,3 +98,29 @@ python rec_debugger_server.py gpu #for gpu user
 ```
 python rec_web_client.py
 ```
+## C++ OCR Service服务
+**注意：** 若您需要使用Paddle Serving C++框架串联det模型和rec模型，并进行前后处理，您需要使用开启WITH_OPENCV选项编译的C++ Server，详见[COMPILE.md](../../../doc/COMPILE.md)
+### 启动服务
+根据CPU/GPU设备选择一种启动方式
+通过--model后，指定多个模型文件的文件夹路径来启动多模型串联的预测服务。
+```
+#for cpu user
+python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293
+#for gpu user
+python -m paddle_serving_server.serve --model ocr_det_model ocr_rec_model --port 9293 --gpu_id 0
+```
+### 启动客户端
+由于需要在C++Server部分进行前后处理，传入C++Server的仅仅是图片的base64编码的字符串，故第一个模型的Client配置需要修改
+即`ocr_det_client/serving_client_conf.prototxt`中`feed_var`字段
+对于本示例而言，`feed_type`应修改为3(数据类型为string),`shape`为1.
+通过在客户端启动后加入多个client模型的client配置文件夹路径，启动client进行预测。
+```
+python ocr_cpp_client.py ocr_det_client ocr_rec_client
+```
--- a/python/examples/ocr/ocr_cpp_client.py
+++ b/python/examples/ocr/ocr_cpp_client.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+from paddle_serving_client import Client
+import sys
+import numpy as np
+import base64
+import os
+import cv2
+from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
+from paddle_serving_app.reader import Div, Normalize, Transpose
+client = Client()
+# TODO:load_client need to load more than one client model.
+# this need to figure out some details.
+client.load_client_config(sys.argv[1:])
+client.connect(["127.0.0.1:9293"])
+import paddle
+test_img_dir = "imgs/"
+def cv2_to_base64(image):
+    return base64.b64encode(image) #data.tostring()).decode('utf8')
+for img_file in os.listdir(test_img_dir):
+    with open(os.path.join(test_img_dir, img_file), 'rb') as file:
+        image_data = file.read()
+    image = cv2_to_base64(image_data)
+    fetch_map = client.predict(
+        feed={"image": image}, fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"], batch=True)
+    #print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
+    print(fetch_map)
--- a/python/examples/ocr/ocr_debugger_server.py
+++ b/python/examples/ocr/ocr_debugger_server.py
@@ -106,8 +106,8 @@ ocr_service.load_model_config("ocr_rec_model")
 ocr_service.prepare_server(workdir="workdir", port=9292)
 ocr_service.init_det_debugger(det_model_config="ocr_det_model")
 if sys.argv[1] == 'gpu':
-    ocr_service.set_gpus("2")
+    ocr_service.set_gpus("0")
-    ocr_service.run_debugger_service(gpu = True)
+    ocr_service.run_debugger_service(gpu=True)
 elif sys.argv[1] == 'cpu':
    ocr_service.run_debugger_service()
 ocr_service.run_web_service()
--- a/python/examples/ocr/rec_debugger_server.py
+++ b/python/examples/ocr/rec_debugger_server.py
@@ -71,7 +71,8 @@ ocr_service.load_model_config("ocr_rec_model")
 if sys.argv[1] == 'gpu':
    ocr_service.set_gpus("0")
    ocr_service.init_rec()
-    ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0)
+    ocr_service.prepare_server(
+        workdir="workdir", port=9292, device="gpu", gpuid=0)
 elif sys.argv[1] == 'cpu':
    ocr_service.init_rec()
    ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu")

--- a/python/examples/pipeline/bert/pipeline_rpc_client.py
+++ b/python/examples/pipeline/bert/pipeline_rpc_client.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import sys
 import os
 import yaml
 import requests
 import time
 import json
-try:
+from paddle_serving_server.pipeline import PipelineClient
-    from paddle_serving_server_gpu.pipeline import PipelineClient
-except ImportError:
-    from paddle_serving_server.pipeline import PipelineClient
 import numpy as np
 client = PipelineClient()
 client.connect(['127.0.0.1:9998'])
 batch_size = 101
 with open("data-c.txt", 'r') as fin:
-     lines = fin.readlines()
+    lines = fin.readlines()
-     start_idx = 0
+    start_idx = 0
-     while start_idx < len(lines):
+    while start_idx < len(lines):
-         end_idx = min(len(lines), start_idx + batch_size)
+        end_idx = min(len(lines), start_idx + batch_size)
-         feed = {}
+        feed = {}
-         for i in range(start_idx, end_idx):
+        for i in range(start_idx, end_idx):
-             feed[str(i - start_idx)] = lines[i]
+            feed[str(i - start_idx)] = lines[i]
-         ret = client.predict(feed_dict=feed, fetch=["res"])
+        ret = client.predict(feed_dict=feed, fetch=["res"])
-         print(ret)
+        print(ret)
-         start_idx += batch_size
+        start_idx += batch_size
--- a/python/examples/pipeline/bert/web_service.py
+++ b/python/examples/pipeline/bert/web_service.py
@@ -11,10 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-try:
+from paddle_serving_server.web_service import WebService, Op
-    from paddle_serving_server_gpu.web_service import WebService, Op
-except ImportError:
-    from paddle_serving_server.web_service import WebService, Op
 import logging
 import numpy as np
 import sys
@@ -37,7 +34,8 @@ class BertOp(Op):
        for i in range(batch_size):
            feed_dict = self.reader.process(input_dict[str(i)].encode("utf-8"))
            for key in feed_dict.keys():
-                feed_dict[key] = np.array(feed_dict[key]).reshape((1, len(feed_dict[key]), 1))
+                feed_dict[key] = np.array(feed_dict[key]).reshape(
+                    (1, len(feed_dict[key]), 1))
            feed_res.append(feed_dict)
        feed_dict = {}
        for key in feed_res[0].keys():
@@ -57,5 +55,5 @@ class BertService(WebService):
 bert_service = BertService(name="bert")
-bert_service.prepare_pipeline_config("config2.yml")
+bert_service.prepare_pipeline_config("config.yml")
 bert_service.run_service()
--- a/python/examples/pipeline/imagenet/resnet50_web_service.py
+++ b/python/examples/pipeline/imagenet/resnet50_web_service.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 import sys
 from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
-try:
+from paddle_serving_server.web_service import WebService, Op
-    from paddle_serving_server.web_service import WebService, Op
-except ImportError:
-    from paddle_serving_server.web_service import WebService, Op
 import logging
 import numpy as np
 import base64, cv2

--- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
+++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
@@ -12,17 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=doc-string-missing
+import numpy as np
+from paddle_serving_app.reader.imdb_reader import IMDBDataset
+import logging
+from paddle_serving_server.web_service import WebService
 from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
 from paddle_serving_server.pipeline import PipelineServer
 from paddle_serving_server.pipeline.proto import pipeline_service_pb2
 from paddle_serving_server.pipeline.channel import ChannelDataErrcode
-import numpy as np
-from paddle_serving_app.reader.imdb_reader import IMDBDataset
-import logging
-try:
-    from paddle_serving_server.web_service import WebService
-except ImportError:
-    from paddle_serving_server.web_service import WebService
 _LOGGER = logging.getLogger()
 user_handler = logging.StreamHandler()

--- a/python/examples/pipeline/ocr/config.yml
+++ b/python/examples/pipeline/ocr/config.yml
@@ -40,7 +40,7 @@ op:
            fetch_list: ["concat_1.tmp_0"]
            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
-            devices: "2"
+            devices: "0"
    rec:
        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
        concurrency: 2
@@ -64,4 +64,4 @@ op:
            fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"] 
            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
-            devices: "2"
+            devices: "0"
--- a/python/examples/pipeline/ocr/web_service.py
+++ b/python/examples/pipeline/ocr/web_service.py
@@ -11,10 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-try:
+from paddle_serving_server.web_service import WebService, Op
-    from paddle_serving_server_gpu.web_service import WebService, Op
-except ImportError:
-    from paddle_serving_server.web_service import WebService, Op
 import logging
 import numpy as np
 import cv2
@@ -48,7 +45,7 @@ class DetOp(Op):
        imgs = []
        for key in input_dict.keys():
            data = base64.b64decode(input_dict[key].encode('utf8'))
-            data = np.fromstring(data, np.uint8)
+            data = np.frombuffer(data, np.uint8)
            self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
            self.ori_h, self.ori_w, _ = self.im.shape
            det_img = self.det_preprocess(self.im)
@@ -57,7 +54,7 @@ class DetOp(Op):
        return {"image": np.concatenate(imgs, axis=0)}, False, None, ""
    def postprocess(self, input_dicts, fetch_dict, log_id):
-#        print(fetch_dict)
+        #        print(fetch_dict)
        det_out = fetch_dict["concat_1.tmp_0"]
        ratio_list = [
            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
@@ -114,5 +111,5 @@ class OcrService(WebService):
 uci_service = OcrService(name="ocr")
-uci_service.prepare_pipeline_config("config2.yml")
+uci_service.prepare_pipeline_config("config.yml")
 uci_service.run_service()
--- a/python/examples/pipeline/simple_web_service/web_service.py
+++ b/python/examples/pipeline/simple_web_service/web_service.py
@@ -11,10 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-try:
-    from paddle_serving_server.web_service import WebService, Op
+from paddle_serving_server.web_service import WebService, Op
-except ImportError:
-    from paddle_serving_server.web_service import WebService, Op
 import logging
 import numpy as np
 import sys
@@ -34,8 +32,11 @@ class UciOp(Op):
        x_value = input_dict["x"].split(self.batch_separator)
        x_lst = []
        for x_val in x_value:
-            x_lst.append(np.array([float(x.strip()) for x in x_val.split(self.separator)]).reshape(1, 13))
+            x_lst.append(
-        input_dict["x"] = np.concatenate(x_lst, axis=0) 
+                np.array([
+                    float(x.strip()) for x in x_val.split(self.separator)
+                ]).reshape(1, 13))
+        input_dict["x"] = np.concatenate(x_lst, axis=0)
        proc_dict = {}
        return input_dict, False, None, ""
@@ -53,5 +54,5 @@ class UciService(WebService):
 uci_service = UciService(name="uci")
-uci_service.prepare_pipeline_config("config2.yml")
+uci_service.prepare_pipeline_config("config.yml")
 uci_service.run_service()
--- a/python/examples/pipeline/simple_web_service/web_service_java.py
+++ b/python/examples/pipeline/simple_web_service/web_service_java.py
@@ -11,10 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-try:
+from paddle_serving_server.web_service import WebService, Op
-    from paddle_serving_server.web_service import WebService, Op
-except ImportError:
-    from paddle_serving_server.web_service import WebService, Op
 import logging
 import numpy as np
 from numpy import array

--- a/python/examples/senta/senta_web_service.py
+++ b/python/examples/senta/senta_web_service.py
@@ -13,13 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle_serving_server.web_service import WebService
-from paddle_serving_client import Client
-from paddle_serving_app.reader import LACReader, SentaReader
 import os
 import sys
 import numpy as np
-#senta_web_service.py
 from paddle_serving_server.web_service import WebService
 from paddle_serving_client import Client
 from paddle_serving_app.reader import LACReader, SentaReader

--- a/python/examples/xpu/fit_a_line_xpu/test_server.py
+++ b/python/examples/xpu/fit_a_line_xpu/test_server.py
@@ -31,6 +31,7 @@ class UciService(WebService):
 uci_service = UciService(name="uci")
 uci_service.load_model_config("uci_housing_model")
-uci_service.prepare_server(workdir="workdir", port=9393, use_lite=True, use_xpu=True, ir_optim=True)
+uci_service.prepare_server(
+    workdir="workdir", port=9393, use_lite=True, use_xpu=True, ir_optim=True)
 uci_service.run_rpc_service()
 uci_service.run_web_service()
--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -19,18 +19,20 @@ import os
 import google.protobuf.text_format
 import numpy as np
 import argparse
-import paddle.fluid as fluid
-import paddle.inference as inference
 from .proto import general_model_config_pb2 as m_config
-from paddle.fluid.core import PaddleTensor
+import paddle.inference as paddle_infer
-from paddle.fluid.core import AnalysisConfig
-from paddle.fluid.core import create_paddle_predictor
 import logging
 logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
-logger = logging.getLogger("fluid")
+logger = logging.getLogger("LocalPredictor")
 logger.setLevel(logging.INFO)
+precision_map = {
+    'int8': paddle_infer.PrecisionType.Int8,
+    'fp32': paddle_infer.PrecisionType.Float32,
+    'fp16': paddle_infer.PrecisionType.Half,
+}
 class LocalPredictor(object):
    """
@@ -60,9 +62,11 @@ class LocalPredictor(object):
                          use_trt=False,
                          use_lite=False,
                          use_xpu=False,
+                          precision="fp32",
+                          use_calib=False,
                          use_feed_fetch_ops=False):
        """
-        Load model config and set the engine config for the paddle predictor
+        Load model configs and create the paddle predictor by Paddle Inference API.
        Args:
            model_path: model config path.
@@ -75,6 +79,8 @@ class LocalPredictor(object):
            use_trt: use nvidia TensorRT optimization, False default
            use_lite: use Paddle-Lite Engint, False default
            use_xpu: run predict on Baidu Kunlun, False default
+            precision: precision mode, "fp32" default
+            use_calib: use TensorRT calibration, False default
            use_feed_fetch_ops: use feed/fetch ops, False default.
        """
        client_config = "{}/serving_server_conf.prototxt".format(model_path)
@@ -83,14 +89,20 @@ class LocalPredictor(object):
        model_conf = google.protobuf.text_format.Merge(
            str(f.read()), model_conf)
        if os.path.exists(os.path.join(model_path, "__params__")):
-            config = AnalysisConfig(os.path.join(model_path, "__model__"), os.path.join(model_path, "__params__")) 
+            config = paddle_infer.Config(
+                os.path.join(model_path, "__model__"),
+                os.path.join(model_path, "__params__"))
        else:
-            config = AnalysisConfig(model_path) 
+            config = paddle_infer.Config(model_path)
-        logger.info("load_model_config params: model_path:{}, use_gpu:{},\
+        logger.info(
+            "LocalPredictor load_model_config params: model_path:{}, use_gpu:{},\
            gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
-            use_trt:{}, use_lite:{}, use_xpu: {}, use_feed_fetch_ops:{}".format(
+            use_trt:{}, use_lite:{}, use_xpu: {}, precision: {}, use_calib: {},\
-            model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
+            use_feed_fetch_ops:{}"
-            ir_optim, use_trt, use_lite, use_xpu, use_feed_fetch_ops))
+            .format(model_path, use_gpu, gpu_id, use_profile, thread_num,
+                    mem_optim, ir_optim, use_trt, use_lite, use_xpu, precision,
+                    use_calib, use_feed_fetch_ops))
        self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -106,6 +118,9 @@ class LocalPredictor(object):
            self.fetch_names_to_idx_[var.alias_name] = i
            self.fetch_names_to_type_[var.alias_name] = var.fetch_type
+        precision_type = paddle_infer.PrecisionType.Float32
+        if precision.lower() in precision_map:
+            precision_type = precision_map[precision.lower()]
        if use_profile:
            config.enable_profile()
        if mem_optim:
@@ -121,6 +136,7 @@ class LocalPredictor(object):
            config.enable_use_gpu(100, gpu_id)
            if use_trt:
                config.enable_tensorrt_engine(
+                    precision_mode=precision_type,
                    workspace_size=1 << 20,
                    max_batch_size=32,
                    min_subgraph_size=3,
@@ -129,7 +145,7 @@ class LocalPredictor(object):
        if use_lite:
            config.enable_lite_engine(
-                precision_mode=inference.PrecisionType.Float32,
+                precision_mode=precision_type,
                zero_copy=True,
                passes_filter=[],
                ops_filter=[])
@@ -138,11 +154,16 @@ class LocalPredictor(object):
            # 2MB l3 cache
            config.enable_xpu(8 * 1024 * 1024)
-        self.predictor = create_paddle_predictor(config)
+        if not use_gpu and not use_lite:
+            if precision_type == paddle_infer.PrecisionType.Int8:
+                config.enable_quantizer()
+            if precision.lower() == "bf16":
+                config.enable_mkldnn_bfloat16()
+        self.predictor = paddle_infer.create_predictor(config)
    def predict(self, feed=None, fetch=None, batch=False, log_id=0):
        """
-        Predict locally
+        Run model inference by Paddle Inference API.
        Args:
            feed: feed var
@@ -155,14 +176,16 @@ class LocalPredictor(object):
            fetch_map: dict 
        """
        if feed is None or fetch is None:
-            raise ValueError("You should specify feed and fetch for prediction")
+            raise ValueError("You should specify feed and fetch for prediction.\
+                log_id:{}".format(log_id))
        fetch_list = []
        if isinstance(fetch, str):
            fetch_list = [fetch]
        elif isinstance(fetch, list):
            fetch_list = fetch
        else:
-            raise ValueError("Fetch only accepts string and list of string")
+            raise ValueError("Fetch only accepts string and list of string.\
+                log_id:{}".format(log_id))
        feed_batch = []
        if isinstance(feed, dict):
@@ -170,27 +193,21 @@ class LocalPredictor(object):
        elif isinstance(feed, list):
            feed_batch = feed
        else:
-            raise ValueError("Feed only accepts dict and list of dict")
+            raise ValueError("Feed only accepts dict and list of dict.\
+                log_id:{}".format(log_id))
-        int_slot_batch = []
-        float_slot_batch = []
-        int_feed_names = []
-        float_feed_names = []
-        int_shape = []
-        float_shape = []
-        fetch_names = []
-        counter = 0
-        batch_size = len(feed_batch)
+        fetch_names = []
+        # Filter invalid fetch names
        for key in fetch_list:
            if key in self.fetch_names_:
                fetch_names.append(key)
        if len(fetch_names) == 0:
            raise ValueError(
-                "Fetch names should not be empty or out of saved fetch list.")
+                "Fetch names should not be empty or out of saved fetch list.\
-            return {}
+                    log_id:{}".format(log_id))
+        # Assemble the input data of paddle predictor 
        input_names = self.predictor.get_input_names()
        for name in input_names:
            if isinstance(feed[name], list):
@@ -204,27 +221,31 @@ class LocalPredictor(object):
                feed[name] = feed[name].astype("int32")
            else:
                raise ValueError("local predictor receives wrong data type")
-            input_tensor = self.predictor.get_input_tensor(name)
+            input_tensor_handle = self.predictor.get_input_handle(name)
            if "{}.lod".format(name) in feed:
-                input_tensor.set_lod([feed["{}.lod".format(name)]])
+                input_tensor_handle.set_lod([feed["{}.lod".format(name)]])
            if batch == False:
-                input_tensor.copy_from_cpu(feed[name][np.newaxis, :])
+                input_tensor_handle.copy_from_cpu(feed[name][np.newaxis, :])
            else:
-                input_tensor.copy_from_cpu(feed[name])
+                input_tensor_handle.copy_from_cpu(feed[name])
-        output_tensors = []
+        output_tensor_handles = []
        output_names = self.predictor.get_output_names()
        for output_name in output_names:
-            output_tensor = self.predictor.get_output_tensor(output_name)
+            output_tensor_handle = self.predictor.get_output_handle(output_name)
-            output_tensors.append(output_tensor)
+            output_tensor_handles.append(output_tensor_handle)
+        # Run inference 
+        self.predictor.run()
+        # Assemble output data of predict results
        outputs = []
-        self.predictor.zero_copy_run()
+        for output_tensor_handle in output_tensor_handles:
-        for output_tensor in output_tensors:
+            output = output_tensor_handle.copy_to_cpu()
-            output = output_tensor.copy_to_cpu()
            outputs.append(output)
        fetch_map = {}
        for i, name in enumerate(fetch):
            fetch_map[name] = outputs[i]
-            if len(output_tensors[i].lod()) > 0:
+            if len(output_tensor_handles[i].lod()) > 0:
-                fetch_map[name + ".lod"] = np.array(output_tensors[i].lod()[
+                fetch_map[name + ".lod"] = np.array(output_tensor_handles[i]
-                    0]).astype('int32')
+                                                    .lod()[0]).astype('int32')
        return fetch_map
--- a/python/paddle_serving_client/__init__.py
+++ b/python/paddle_serving_client/__init__.py
@@ -14,7 +14,6 @@
 # pylint: disable=doc-string-missing
 from . import version
 from . import client
 from .client import *

--- a/python/paddle_serving_client/client.py
+++ b/python/paddle_serving_client/client.py
@@ -31,11 +31,18 @@ sys.path.append(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
 from .proto import multi_lang_general_model_service_pb2_grpc
+# Meaning of the 'type' field of a feed_var or fetch_var:
+#   0 -> int64
+#   1 -> float32
+#   2 -> int32
+#   3 -> string (called `bytes` in the proto)
 int64_type = 0
 float32_type = 1
 int32_type = 2
+bytes_type = 3
+# int_type, float_type and string_type group the type codes above into
+# integer, floating-point and string categories.
 int_type = set([int64_type, int32_type])
 float_type = set([float32_type])
+string_type = set([bytes_type])
 class _NOPProfiler(object):
@@ -139,10 +146,22 @@ class Client(object):
        from .serving_client import PredictorRes
        self.predictorres_constructor = PredictorRes
-    def load_client_config(self, path):
+    def load_client_config(self, model_config_path_list):
+        """
+        Load one or more client model configs.
+
+        Feed variables are read from the first config and fetch variables
+        from the last one; the full list of config files is handed to the
+        underlying predictor client.
+
+        Args:
+            model_config_path_list: a single path (str) or a list of paths.
+                Each path is either a client config directory, in which case
+                `serving_client_conf.prototxt` inside it is used, or a config
+                file, which is used as-is.
+
+        Raises:
+            ValueError: if a path is neither a directory nor a file, or if
+                no config path is given at all.
+        """
+        if isinstance(model_config_path_list, str):
+            model_config_path_list = [model_config_path_list]
+        file_path_list = []
+        for single_model_config in model_config_path_list:
+            if os.path.isdir(single_model_config):
+                # This is the client-side loader, so a directory must resolve
+                # to the client config, not serving_server_conf.prototxt.
+                file_path_list.append("{}/serving_client_conf.prototxt".format(
+                    single_model_config))
+            elif os.path.isfile(single_model_config):
+                file_path_list.append(single_model_config)
+            else:
+                # Fail here with the offending path instead of silently
+                # dropping it and hitting an IndexError further down.
+                raise ValueError(
+                    "Client config path does not exist: {}".format(
+                        single_model_config))
+        if len(file_path_list) == 0:
+            raise ValueError(
+                "load_client_config needs at least one client config path.")
        from .serving_client import PredictorClient
        model_conf = m_config.GeneralModelConfig()
-        f = open(path, 'r')
+        f = open(file_path_list[0], 'r')
        model_conf = google.protobuf.text_format.Merge(
            str(f.read()), model_conf)
@@ -151,19 +170,16 @@ class Client(object):
        # get feed shapes, feed types
        # map feed names to index
        self.client_handle_ = PredictorClient()
-        self.client_handle_.init(path)
+        self.client_handle_.init(file_path_list)
        if "FLAGS_max_body_size" not in os.environ:
            os.environ["FLAGS_max_body_size"] = str(512 * 1024 * 1024)
        read_env_flags = ["profile_client", "profile_server", "max_body_size"]
        self.client_handle_.init_gflags([sys.argv[
            0]] + ["--tryfromenv=" + ",".join(read_env_flags)])
        self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
-        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
+        self.feed_names_to_idx_ = {}  #this is not useful
-        self.feed_names_to_idx_ = {}
-        self.fetch_names_to_type_ = {}
-        self.fetch_names_to_idx_ = {}
        self.lod_tensor_set = set()
-        self.feed_tensor_len = {}
+        self.feed_tensor_len = {}  #this is only used for shape check
        self.key = None
        for i, var in enumerate(model_conf.feed_var):
@@ -178,6 +194,14 @@ class Client(object):
                for dim in self.feed_shapes_[var.alias_name]:
                    counter *= dim
                self.feed_tensor_len[var.alias_name] = counter
+        if len(file_path_list) > 1:
+            model_conf = m_config.GeneralModelConfig()
+            f = open(file_path_list[-1], 'r')
+            model_conf = google.protobuf.text_format.Merge(
+                str(f.read()), model_conf)
+        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
+        self.fetch_names_to_type_ = {}
+        self.fetch_names_to_idx_ = {}
        for i, var in enumerate(model_conf.fetch_var):
            self.fetch_names_to_idx_[var.alias_name] = i
            self.fetch_names_to_type_[var.alias_name] = var.fetch_type
@@ -288,13 +312,17 @@ class Client(object):
            raise ValueError("Feed only accepts dict and list of dict")
        int_slot_batch = []
-        float_slot_batch = []
        int_feed_names = []
-        float_feed_names = []
        int_shape = []
        int_lod_slot_batch = []
+        float_slot_batch = []
+        float_feed_names = []
        float_lod_slot_batch = []
        float_shape = []
+        string_slot_batch = []
+        string_feed_names = []
+        string_lod_slot_batch = []
+        string_shape = []
        fetch_names = []
        counter = 0
@@ -311,9 +339,11 @@ class Client(object):
        for i, feed_i in enumerate(feed_batch):
            int_slot = []
-            float_slot = []
            int_lod_slot = []
+            float_slot = []
            float_lod_slot = []
+            string_slot = []
+            string_lod_slot = []
            for key in feed_i:
                if ".lod" not in key and key not in self.feed_names_:
                    raise ValueError("Wrong feed name: {}.".format(key))
@@ -368,10 +398,24 @@ class Client(object):
                    else:
                        float_slot.append(feed_i[key])
                        self.all_numpy_input = False
+                #if input is string, feed is not numpy.
+                elif self.feed_types_[key] in string_type:
+                    if i == 0:
+                        string_feed_names.append(key)
+                        string_shape.append(self.feed_shapes_[key])
+                        if "{}.lod".format(key) in feed_i:
+                            string_lod_slot_batch.append(feed_i["{}.lod".format(
+                                key)])
+                        else:
+                            string_lod_slot_batch.append([])
+                    string_slot.append(feed_i[key])
+                    self.has_numpy_input = True
            int_slot_batch.append(int_slot)
-            float_slot_batch.append(float_slot)
            int_lod_slot_batch.append(int_lod_slot)
+            float_slot_batch.append(float_slot)
            float_lod_slot_batch.append(float_lod_slot)
+            string_slot_batch.append(string_slot)
+            string_lod_slot_batch.append(string_lod_slot)
        self.profile_.record('py_prepro_1')
        self.profile_.record('py_client_infer_0')
@@ -381,8 +425,9 @@ class Client(object):
            res = self.client_handle_.numpy_predict(
                float_slot_batch, float_feed_names, float_shape,
                float_lod_slot_batch, int_slot_batch, int_feed_names, int_shape,
-                int_lod_slot_batch, fetch_names, result_batch_handle, self.pid,
+                int_lod_slot_batch, string_slot_batch, string_feed_names,
-                log_id)
+                string_shape, string_lod_slot_batch, fetch_names,
+                result_batch_handle, self.pid, log_id)
        elif self.has_numpy_input == False:
            raise ValueError(
                "Please make sure all of your inputs are numpy array")
@@ -509,8 +554,8 @@ class MultiLangClient(object):
        get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest(
        )
        resp = self.stub_.GetClientConfig(get_client_config_req)
-        model_config_str = resp.client_config_str
+        model_config_path_list = resp.client_config_str_list
-        self._parse_model_config(model_config_str)
+        self._parse_model_config(model_config_path_list)
    def _flatten_list(self, nested_list):
        for item in nested_list:
@@ -520,25 +565,39 @@ class MultiLangClient(object):
            else:
                yield item
-    def _parse_model_config(self, model_config_str):
+    def _parse_model_config(self, model_config_path_list):
+        if isinstance(model_config_path_list, str):
+            model_config_path_list = [model_config_path_list]
+        elif isinstance(model_config_path_list, list):
+            pass
+        file_path_list = []
+        for single_model_config in model_config_path_list:
+            if os.path.isdir(single_model_config):
+                file_path_list.append("{}/serving_server_conf.prototxt".format(
+                    single_model_config))
+            elif os.path.isfile(single_model_config):
+                file_path_list.append(single_model_config)
        model_conf = m_config.GeneralModelConfig()
-        model_conf = google.protobuf.text_format.Merge(model_config_str,
+        f = open(file_path_list[0], 'r')
-                                                       model_conf)
+        model_conf = google.protobuf.text_format.Merge(
+            str(f.read()), model_conf)
        self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
        self.feed_types_ = {}
        self.feed_shapes_ = {}
-        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
-        self.fetch_types_ = {}
        self.lod_tensor_set_ = set()
        for i, var in enumerate(model_conf.feed_var):
            self.feed_types_[var.alias_name] = var.feed_type
            self.feed_shapes_[var.alias_name] = var.shape
            if var.is_lod_tensor:
                self.lod_tensor_set_.add(var.alias_name)
-            else:
+        if len(file_path_list) > 1:
-                counter = 1
+            model_conf = m_config.GeneralModelConfig()
-                for dim in self.feed_shapes_[var.alias_name]:
+            f = open(file_path_list[-1], 'r')
-                    counter *= dim
+            model_conf = google.protobuf.text_format.Merge(
+                str(f.read()), model_conf)
+        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
+        self.fetch_types_ = {}
        for i, var in enumerate(model_conf.fetch_var):
            self.fetch_types_[var.alias_name] = var.fetch_type
            if var.is_lod_tensor:

--- a/python/paddle_serving_server/__init__.py
+++ b/python/paddle_serving_server/__init__.py
--- a/python/paddle_serving_server/dag.py
+++ b/python/paddle_serving_server/dag.py
--- a/python/paddle_serving_server/rpc_service.py
+++ b/python/paddle_serving_server/rpc_service.py
--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
--- a/python/paddle_serving_server/server.py
+++ b/python/paddle_serving_server/server.py
--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
--- a/python/pipeline/pipeline_server.py
+++ b/python/pipeline/pipeline_server.py
--- a/tools/scripts/ipipe_py3.sh
+++ b/tools/scripts/ipipe_py3.sh