Merge branch 'develop' into wangjiawei04-patch-2

fdc7e2b3 · TeslaZhao · GitHub · 8391764f · 90d838a3 · fdc7e2b3
99 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ option(SERVER	    "Compile Paddle Serving Server"		    OFF)
 option(APP          "Compile Paddle Serving App package"	    OFF)
 option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution"              OFF)
 option(PACK         "Compile for whl"                               OFF)
+option(WITH_TRT     "Compile Paddle Serving with TRT"       OFF)

 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)

--- a/README.md
+++ b/README.md
@@ -128,6 +128,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
+| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT  |

 Here, we use `curl` to send a HTTP POST request to the service we just started. Users can use any python library to send HTTP POST as well, e.g, [requests](https://requests.readthedocs.io/en/master/).
 </center>

--- a/README_CN.md
+++ b/README_CN.md
@@ -124,6 +124,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `mem_optim_off` | - | - | Disable memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
+| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT  |

 我们使用 `curl` 命令来发送HTTP POST请求给刚刚启动的服务。用户也可以调用python库来发送HTTP POST请求，请参考英文文档 [requests](https://requests.readthedocs.io/en/master/)。
 </center>

--- a/cmake/paddlepaddle.cmake
+++ b/cmake/paddlepaddle.cmake
@@ -34,7 +34,11 @@ message( "WITH_GPU = ${WITH_GPU}")
 SET(PADDLE_VERSION "1.8.4")

 if (WITH_GPU)
-    SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda${CUDA_VERSION_MAJOR}-cudnn7-avx-mkl")
+    if (WITH_TRT)
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
+    else()
+        SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
+    endif()
 else()
    if (WITH_AVX)
        if (WITH_MKLML)
@@ -50,21 +54,38 @@ endif()
 SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz")
 MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}")
 if (WITH_GPU OR WITH_MKLML)
-ExternalProject_Add(
-    "extern_paddle"
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                 "${PADDLE_LIB_PATH}"
-    PREFIX              "${PADDLE_SOURCES_DIR}"
-    DOWNLOAD_DIR        "${PADDLE_DOWNLOAD_DIR}"
-    CONFIGURE_COMMAND   ""
-    BUILD_COMMAND       ""
-    UPDATE_COMMAND      ""
-    INSTALL_COMMAND
-        ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include &&
-        ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
-        ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party &&
-        ${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so 
-)
+    if (WITH_TRT)
+        ExternalProject_Add(
+            "extern_paddle"
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            URL                 "${PADDLE_LIB_PATH}"
+            PREFIX              "${PADDLE_SOURCES_DIR}"
+            DOWNLOAD_DIR        "${PADDLE_DOWNLOAD_DIR}"
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            UPDATE_COMMAND      ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include &&
+                ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
+                ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party
+        )
+    else()
+        ExternalProject_Add(
+            "extern_paddle"
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            URL                 "${PADDLE_LIB_PATH}"
+            PREFIX              "${PADDLE_SOURCES_DIR}"
+            DOWNLOAD_DIR        "${PADDLE_DOWNLOAD_DIR}"
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            UPDATE_COMMAND      ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include &&
+                ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
+                ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party &&
+                ${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so 
+        )
+    endif()
 else()
 ExternalProject_Add(
    "extern_paddle"
@@ -92,8 +113,16 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
 ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)

-ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a)
+ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so)
+
+if (WITH_TRT)
+ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
+
+ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
+endif()

 ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xxhash/lib/libxxhash.a)
@@ -101,4 +130,9 @@ SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/thir
 LIST(APPEND external_project_dependencies paddle)

 LIST(APPEND paddle_depend_libs
-        xxhash)
+    xxhash)
+
+if(WITH_TRT)
+LIST(APPEND paddle_depend_libs
+    nvinfer nvinfer_plugin)
+endif()
--- a/core/configure/proto/server_configure.proto
+++ b/core/configure/proto/server_configure.proto
@@ -44,6 +44,7 @@ message EngineDesc {
  optional bool static_optimization = 14;
  optional bool force_update_static_cache = 15;
  optional bool enable_ir_optimization = 16;
+  optional bool use_trt = 17;
 };

 // model_toolkit conf

--- a/core/general-client/include/general_model.h
+++ b/core/general-client/include/general_model.h
@@ -218,25 +218,15 @@ class PredictorClient {

  int destroy_predictor();

-  int batch_predict(
-      const std::vector<std::vector<std::vector<float>>>& float_feed_batch,
-      const std::vector<std::string>& float_feed_name,
-      const std::vector<std::vector<int>>& float_shape,
-      const std::vector<std::vector<std::vector<int64_t>>>& int_feed_batch,
-      const std::vector<std::string>& int_feed_name,
-      const std::vector<std::vector<int>>& int_shape,
-      const std::vector<std::string>& fetch_name,
-      PredictorRes& predict_res_batch,  // NOLINT
-      const int& pid,
-      const uint64_t log_id);
-
  int numpy_predict(
      const std::vector<std::vector<py::array_t<float>>>& float_feed_batch,
      const std::vector<std::string>& float_feed_name,
      const std::vector<std::vector<int>>& float_shape,
+      const std::vector<std::vector<int>>& float_lod_slot_batch,
      const std::vector<std::vector<py::array_t<int64_t>>>& int_feed_batch,
      const std::vector<std::string>& int_feed_name,
      const std::vector<std::vector<int>>& int_shape,
+      const std::vector<std::vector<int>>& int_lod_slot_batch,
      const std::vector<std::string>& fetch_name,
      PredictorRes& predict_res_batch,  // NOLINT
      const int& pid,

--- a/core/general-client/src/general_model.cpp
+++ b/core/general-client/src/general_model.cpp
@@ -137,227 +137,15 @@ int PredictorClient::create_predictor() {
  return 0;
 }

-int PredictorClient::batch_predict(
-    const std::vector<std::vector<std::vector<float>>> &float_feed_batch,
-    const std::vector<std::string> &float_feed_name,
-    const std::vector<std::vector<int>> &float_shape,
-    const std::vector<std::vector<std::vector<int64_t>>> &int_feed_batch,
-    const std::vector<std::string> &int_feed_name,
-    const std::vector<std::vector<int>> &int_shape,
-    const std::vector<std::string> &fetch_name,
-    PredictorRes &predict_res_batch,
-    const int &pid,
-    const uint64_t log_id) {
-  int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
-
-  predict_res_batch.clear();
-  Timer timeline;
-  int64_t preprocess_start = timeline.TimeStampUS();
-
-  int fetch_name_num = fetch_name.size();
-
-  _api.thrd_initialize();
-  std::string variant_tag;
-  _predictor = _api.fetch_predictor("general_model", &variant_tag);
-  predict_res_batch.set_variant_tag(variant_tag);
-  VLOG(2) << "fetch general model predictor done.";
-  VLOG(2) << "float feed name size: " << float_feed_name.size();
-  VLOG(2) << "int feed name size: " << int_feed_name.size();
-  VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size;
-  Request req;
-  req.set_log_id(log_id);
-  for (auto &name : fetch_name) {
-    req.add_fetch_var_names(name);
-  }
-
-  for (int bi = 0; bi < batch_size; bi++) {
-    VLOG(2) << "prepare batch " << bi;
-    std::vector<Tensor *> tensor_vec;
-    FeedInst *inst = req.add_insts();
-    std::vector<std::vector<float>> float_feed = float_feed_batch[bi];
-    std::vector<std::vector<int64_t>> int_feed = int_feed_batch[bi];
-    for (auto &name : float_feed_name) {
-      tensor_vec.push_back(inst->add_tensor_array());
-    }
-
-    for (auto &name : int_feed_name) {
-      tensor_vec.push_back(inst->add_tensor_array());
-    }
-
-    VLOG(2) << "batch [" << bi << "] int_feed_name and float_feed_name "
-            << "prepared";
-    int vec_idx = 0;
-    VLOG(2) << "tensor_vec size " << tensor_vec.size() << " float shape "
-            << float_shape.size();
-    for (auto &name : float_feed_name) {
-      int idx = _feed_name_to_idx[name];
-      Tensor *tensor = tensor_vec[idx];
-      VLOG(2) << "prepare float feed " << name << " shape size "
-              << float_shape[vec_idx].size();
-      for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
-        tensor->add_shape(float_shape[vec_idx][j]);
-      }
-      tensor->set_elem_type(1);
-      for (uint32_t j = 0; j < float_feed[vec_idx].size(); ++j) {
-        tensor->add_float_data(float_feed[vec_idx][j]);
-      }
-      vec_idx++;
-    }
-
-    VLOG(2) << "batch [" << bi << "] "
-            << "float feed value prepared";
-
-    vec_idx = 0;
-    for (auto &name : int_feed_name) {
-      int idx = _feed_name_to_idx[name];
-      Tensor *tensor = tensor_vec[idx];
-      if (_type[idx] == 0) {
-        VLOG(2) << "prepare int64 feed " << name << " shape size "
-                << int_shape[vec_idx].size();
-        VLOG(3) << "feed var name " << name << " index " << vec_idx
-                << "first data " << int_feed[vec_idx][0];
-        for (uint32_t j = 0; j < int_feed[vec_idx].size(); ++j) {
-          tensor->add_int64_data(int_feed[vec_idx][j]);
-        }
-      } else if (_type[idx] == 2) {
-        VLOG(2) << "prepare int32 feed " << name << " shape size "
-                << int_shape[vec_idx].size();
-        VLOG(3) << "feed var name " << name << " index " << vec_idx
-                << "first data " << int32_t(int_feed[vec_idx][0]);
-        for (uint32_t j = 0; j < int_feed[vec_idx].size(); ++j) {
-          tensor->add_int_data(int32_t(int_feed[vec_idx][j]));
-        }
-      }
-
-      for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
-        tensor->add_shape(int_shape[vec_idx][j]);
-      }
-      tensor->set_elem_type(_type[idx]);
-      vec_idx++;
-    }
-
-    VLOG(2) << "batch [" << bi << "] "
-            << "int feed value prepared";
-  }
-
-  int64_t preprocess_end = timeline.TimeStampUS();
-
-  int64_t client_infer_start = timeline.TimeStampUS();
-
-  Response res;
-
-  int64_t client_infer_end = 0;
-  int64_t postprocess_start = 0;
-  int64_t postprocess_end = 0;
-
-  if (FLAGS_profile_client) {
-    if (FLAGS_profile_server) {
-      req.set_profile_server(true);
-    }
-  }
-
-  res.Clear();
-  if (_predictor->inference(&req, &res) != 0) {
-    LOG(ERROR) << "failed call predictor with req: " << req.ShortDebugString();
-    _api.thrd_clear();
-    return -1;
-  } else {
-    client_infer_end = timeline.TimeStampUS();
-    postprocess_start = client_infer_end;
-    VLOG(2) << "get model output num";
-    uint32_t model_num = res.outputs_size();
-    VLOG(2) << "model num: " << model_num;
-    for (uint32_t m_idx = 0; m_idx < model_num; ++m_idx) {
-      VLOG(2) << "process model output index: " << m_idx;
-      auto output = res.outputs(m_idx);
-      ModelRes model;
-      model.set_engine_name(output.engine_name());
-
-      int idx = 0;
-
-      for (auto &name : fetch_name) {
-        // int idx = _fetch_name_to_idx[name];
-        int shape_size = output.insts(0).tensor_array(idx).shape_size();
-        VLOG(2) << "fetch var " << name << " index " << idx << " shape size "
-                << shape_size;
-        model._shape_map[name].resize(shape_size);
-        for (int i = 0; i < shape_size; ++i) {
-          model._shape_map[name][i] =
-              output.insts(0).tensor_array(idx).shape(i);
-        }
-        int lod_size = output.insts(0).tensor_array(idx).lod_size();
-        if (lod_size > 0) {
-          model._lod_map[name].resize(lod_size);
-          for (int i = 0; i < lod_size; ++i) {
-            model._lod_map[name][i] = output.insts(0).tensor_array(idx).lod(i);
-          }
-        }
-        idx += 1;
-      }
-
-      idx = 0;
-      for (auto &name : fetch_name) {
-        // int idx = _fetch_name_to_idx[name];
-        if (_fetch_name_to_type[name] == 0) {
-          VLOG(2) << "ferch var " << name << "type int64";
-          int size = output.insts(0).tensor_array(idx).int64_data_size();
-          model._int64_value_map[name] = std::vector<int64_t>(
-              output.insts(0).tensor_array(idx).int64_data().begin(),
-              output.insts(0).tensor_array(idx).int64_data().begin() + size);
-        } else if (_fetch_name_to_type[name] == 1) {
-          VLOG(2) << "fetch var " << name << "type float";
-          int size = output.insts(0).tensor_array(idx).float_data_size();
-          model._float_value_map[name] = std::vector<float>(
-              output.insts(0).tensor_array(idx).float_data().begin(),
-              output.insts(0).tensor_array(idx).float_data().begin() + size);
-        } else if (_fetch_name_to_type[name] == 2) {
-          VLOG(2) << "fetch var " << name << "type int32";
-          int size = output.insts(0).tensor_array(idx).int_data_size();
-          model._int32_value_map[name] = std::vector<int32_t>(
-              output.insts(0).tensor_array(idx).int_data().begin(),
-              output.insts(0).tensor_array(idx).int_data().begin() + size);
-        }
-
-        idx += 1;
-      }
-      predict_res_batch.add_model_res(std::move(model));
-    }
-    postprocess_end = timeline.TimeStampUS();
-  }
-
-  if (FLAGS_profile_client) {
-    std::ostringstream oss;
-    oss << "PROFILE\t"
-        << "pid:" << pid << "\t"
-        << "prepro_0:" << preprocess_start << " "
-        << "prepro_1:" << preprocess_end << " "
-        << "client_infer_0:" << client_infer_start << " "
-        << "client_infer_1:" << client_infer_end << " ";
-    if (FLAGS_profile_server) {
-      int op_num = res.profile_time_size() / 2;
-      for (int i = 0; i < op_num; ++i) {
-        oss << "op" << i << "_0:" << res.profile_time(i * 2) << " ";
-        oss << "op" << i << "_1:" << res.profile_time(i * 2 + 1) << " ";
-      }
-    }
-
-    oss << "postpro_0:" << postprocess_start << " ";
-    oss << "postpro_1:" << postprocess_end;
-
-    fprintf(stderr, "%s\n", oss.str().c_str());
-  }
-
-  _api.thrd_clear();
-  return 0;
-}
-
 int PredictorClient::numpy_predict(
    const std::vector<std::vector<py::array_t<float>>> &float_feed_batch,
    const std::vector<std::string> &float_feed_name,
    const std::vector<std::vector<int>> &float_shape,
+    const std::vector<std::vector<int>> &float_lod_slot_batch,
    const std::vector<std::vector<py::array_t<int64_t>>> &int_feed_batch,
    const std::vector<std::string> &int_feed_name,
    const std::vector<std::vector<int>> &int_shape,
+    const std::vector<std::vector<int>> &int_lod_slot_batch,
    const std::vector<std::string> &fetch_name,
    PredictorRes &predict_res_batch,
    const int &pid,
@@ -412,6 +200,9 @@ int PredictorClient::numpy_predict(
      for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
        tensor->add_shape(float_shape[vec_idx][j]);
      }
+      for (uint32_t j = 0; j < float_lod_slot_batch[vec_idx].size(); ++j) {
+        tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
+      }
      tensor->set_elem_type(1);
      const int float_shape_size = float_shape[vec_idx].size();
      switch (float_shape_size) {
@@ -470,6 +261,9 @@ int PredictorClient::numpy_predict(
      for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
        tensor->add_shape(int_shape[vec_idx][j]);
      }
+      for (uint32_t j = 0; j < int_lod_slot_batch[vec_idx].size(); ++j) {
+        tensor->add_lod(int_lod_slot_batch[vec_idx][j]);
+      }
      tensor->set_elem_type(_type[idx]);

      if (_type[idx] == 0) {

--- a/core/general-client/src/pybind_general_model.cpp
+++ b/core/general-client/src/pybind_general_model.cpp
@@ -95,42 +95,18 @@ PYBIND11_MODULE(serving_client, m) {
           [](PredictorClient &self) { self.create_predictor(); })
      .def("destroy_predictor",
           [](PredictorClient &self) { self.destroy_predictor(); })
-      .def("batch_predict",
-           [](PredictorClient &self,
-              const std::vector<std::vector<std::vector<float>>>
-                  &float_feed_batch,
-              const std::vector<std::string> &float_feed_name,
-              const std::vector<std::vector<int>> &float_shape,
-              const std::vector<std::vector<std::vector<int64_t>>>
-                  &int_feed_batch,
-              const std::vector<std::string> &int_feed_name,
-              const std::vector<std::vector<int>> &int_shape,
-              const std::vector<std::string> &fetch_name,
-              PredictorRes &predict_res_batch,
-              const int &pid,
-              const uint64_t log_id) {
-             return self.batch_predict(float_feed_batch,
-                                       float_feed_name,
-                                       float_shape,
-                                       int_feed_batch,
-                                       int_feed_name,
-                                       int_shape,
-                                       fetch_name,
-                                       predict_res_batch,
-                                       pid,
-                                       log_id);
-           },
-           py::call_guard<py::gil_scoped_release>())
      .def("numpy_predict",
           [](PredictorClient &self,
              const std::vector<std::vector<py::array_t<float>>>
                  &float_feed_batch,
              const std::vector<std::string> &float_feed_name,
              const std::vector<std::vector<int>> &float_shape,
+              const std::vector<std::vector<int>> &float_lod_slot_batch,
              const std::vector<std::vector<py::array_t<int64_t>>>
                  &int_feed_batch,
              const std::vector<std::string> &int_feed_name,
              const std::vector<std::vector<int>> &int_shape,
+              const std::vector<std::vector<int>> &int_lod_slot_batch,
              const std::vector<std::string> &fetch_name,
              PredictorRes &predict_res_batch,
              const int &pid,
@@ -138,9 +114,11 @@ PYBIND11_MODULE(serving_client, m) {
             return self.numpy_predict(float_feed_batch,
                                       float_feed_name,
                                       float_shape,
+                                       float_lod_slot_batch,
                                       int_feed_batch,
                                       int_feed_name,
                                       int_shape,
+                                       int_lod_slot_batch,
                                       fetch_name,
                                       predict_res_batch,
                                       pid,

--- a/core/general-server/CMakeLists.txt
+++ b/core/general-server/CMakeLists.txt
@@ -9,7 +9,7 @@ endif()
 target_include_directories(serving PUBLIC
        ${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
        )
-
+    include_directories(${CUDNN_ROOT}/include/)
 if(WITH_GPU)
    target_link_libraries(serving -Wl,--whole-archive fluid_gpu_engine
            -Wl,--no-whole-archive)
@@ -29,7 +29,11 @@ if(WITH_GPU)
 endif()

 if(WITH_MKL OR WITH_GPU)
+    if (WITH_TRT)
+    target_link_libraries(serving -liomp5 -lmklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
+    else()
    target_link_libraries(serving -liomp5 -lmklml_intel -lmkldnn -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
+endif()
 else()
    target_link_libraries(serving openblas -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
 endif()

--- a/core/general-server/op/general_reader_op.cpp
+++ b/core/general-server/op/general_reader_op.cpp
@@ -73,8 +73,6 @@ int GeneralReaderOp::inference() {
  // reade request from client
  const Request *req = dynamic_cast<const Request *>(get_request_message());
  uint64_t log_id = req->log_id();
-
-  int batch_size = req->insts_size();
  int input_var_num = 0;
  std::vector<int64_t> elem_type;
  std::vector<int64_t> elem_size;
@@ -83,7 +81,6 @@ int GeneralReaderOp::inference() {
  GeneralBlob *res = mutable_data<GeneralBlob>();
  TensorVector *out = &res->tensor_vector;

-  res->SetBatchSize(batch_size);
  res->SetLogId(log_id);

  if (!res) {
@@ -98,11 +95,11 @@ int GeneralReaderOp::inference() {

  VLOG(2) << "(logid=" << log_id
          << ") start to call load general model_conf op";
+
  baidu::paddle_serving::predictor::Resource &resource =
      baidu::paddle_serving::predictor::Resource::instance();

  VLOG(2) << "(logid=" << log_id << ") get resource pointer done.";
-
  std::shared_ptr<PaddleGeneralModelConfig> model_config =
      resource.get_general_model_config();

@@ -122,13 +119,11 @@ int GeneralReaderOp::inference() {
  elem_type.resize(var_num);
  elem_size.resize(var_num);
  capacity.resize(var_num);
-
  // prepare basic information for input
  for (int i = 0; i < var_num; ++i) {
    paddle::PaddleTensor lod_tensor;
    elem_type[i] = req->insts(0).tensor_array(i).elem_type();
-    VLOG(2) << "(logid=" << log_id << ") var[" << i
-            << "] has elem type: " << elem_type[i];
+    VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i];
    if (elem_type[i] == 0) {  // int64
      elem_size[i] = sizeof(int64_t);
      lod_tensor.dtype = paddle::PaddleDType::INT64;
@@ -139,13 +134,24 @@ int GeneralReaderOp::inference() {
      elem_size[i] = sizeof(int32_t);
      lod_tensor.dtype = paddle::PaddleDType::INT32;
    }
-
-    if (model_config->_is_lod_feed[i]) {
-      lod_tensor.lod.resize(1);
-      lod_tensor.lod[0].push_back(0);
+    // implement lod tensor here
+    if (req->insts(0).tensor_array(i).lod_size() > 0) {
      VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor";
+      lod_tensor.lod.resize(1);
+      for (int k = 0; k < req->insts(0).tensor_array(i).lod_size(); ++k) {
+        lod_tensor.lod[0].push_back(req->insts(0).tensor_array(i).lod(k));
+      }
+      capacity[i] = 1;
+      for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
+        int dim = req->insts(0).tensor_array(i).shape(k);
+        VLOG(2) << "(logid=" << log_id << ") shape for var[" << i
+                << "]: " << dim;
+        capacity[i] *= dim;
+        lod_tensor.shape.push_back(dim);
+      }
+      VLOG(2) << "(logid=" << log_id << ") var[" << i
+              << "] is tensor, capacity: " << capacity[i];
    } else {
-      lod_tensor.shape.push_back(batch_size);
      capacity[i] = 1;
      for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) {
        int dim = req->insts(0).tensor_array(i).shape(k);
@@ -160,51 +166,40 @@ int GeneralReaderOp::inference() {
    lod_tensor.name = model_config->_feed_name[i];
    out->push_back(lod_tensor);
  }
-
  // specify the memory needed for output tensor_vector
  for (int i = 0; i < var_num; ++i) {
    if (out->at(i).lod.size() == 1) {
      int tensor_size = 0;
-      for (int j = 0; j < batch_size; ++j) {
-        const Tensor &tensor = req->insts(j).tensor_array(i);
-        int data_len = 0;
-        if (tensor.int64_data_size() > 0) {
-          data_len = tensor.int64_data_size();
-        } else if (tensor.float_data_size() > 0) {
-          data_len = tensor.float_data_size();
-        } else if (tensor.int_data_size() > 0) {
-          data_len = tensor.int_data_size();
-        }
-        VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
-                << "]: " << data_len;
-        tensor_size += data_len;
-
-        int cur_len = out->at(i).lod[0].back();
-        VLOG(2) << "(logid=" << log_id << ") current len: " << cur_len;
-
-        int sample_len = 0;
-        if (tensor.shape_size() == 1) {
-          sample_len = data_len;
-        } else {
-          sample_len = tensor.shape(0);
-        }
-        out->at(i).lod[0].push_back(cur_len + sample_len);
-        VLOG(2) << "(logid=" << log_id << ") new len: " << cur_len + sample_len;
-      }
-      out->at(i).data.Resize(tensor_size * elem_size[i]);
-      out->at(i).shape = {out->at(i).lod[0].back()};
-      for (int j = 1; j < req->insts(0).tensor_array(i).shape_size(); ++j) {
-        out->at(i).shape.push_back(req->insts(0).tensor_array(i).shape(j));
+      const Tensor &tensor = req->insts(0).tensor_array(i);
+      int data_len = 0;
+      if (tensor.int64_data_size() > 0) {
+        data_len = tensor.int64_data_size();
+      } else if (tensor.float_data_size() > 0) {
+        data_len = tensor.float_data_size();
+      } else if (tensor.int_data_size() > 0) {
+        data_len = tensor.int_data_size();
      }
-      if (out->at(i).shape.size() == 1) {
-        out->at(i).shape.push_back(1);
+      VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
+              << "]: " << data_len;
+      tensor_size += data_len;
+
+      int cur_len = out->at(i).lod[0].back();
+      VLOG(2) << "(logid=" << log_id << ") current len: " << cur_len;
+
+      int sample_len = 0;
+      if (tensor.shape_size() == 1) {
+        sample_len = data_len;
+      } else {
+        sample_len = tensor.shape(0);
      }
+      VLOG(2) << "(logid=" << log_id << ") new len: " << cur_len + sample_len;
+      out->at(i).data.Resize(tensor_size * elem_size[i]);
      VLOG(2) << "(logid=" << log_id << ") var[" << i
              << "] is lod_tensor and len=" << out->at(i).lod[0].back();
    } else {
-      out->at(i).data.Resize(batch_size * capacity[i] * elem_size[i]);
+      out->at(i).data.Resize(capacity[i] * elem_size[i]);
      VLOG(2) << "(logid=" << log_id << ") var[" << i
-              << "] is tensor and capacity=" << batch_size * capacity[i];
+              << "] is tensor and capacity=" << capacity[i];
    }
  }

@@ -215,58 +210,36 @@ int GeneralReaderOp::inference() {
      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
              << "] is " << req->insts(0).tensor_array(i).int64_data(0);
      int offset = 0;
-      for (int j = 0; j < batch_size; ++j) {
-        int elem_num = req->insts(j).tensor_array(i).int64_data_size();
-        for (int k = 0; k < elem_num; ++k) {
-          dst_ptr[offset + k] = req->insts(j).tensor_array(i).int64_data(k);
-        }
-        if (out->at(i).lod.size() == 1) {
-          offset = out->at(i).lod[0][j + 1];
-        } else {
-          offset += capacity[i];
-        }
+      int elem_num = req->insts(0).tensor_array(i).int64_data_size();
+      for (int k = 0; k < elem_num; ++k) {
+        dst_ptr[offset + k] = req->insts(0).tensor_array(i).int64_data(k);
      }
    } else if (elem_type[i] == 1) {
      float *dst_ptr = static_cast<float *>(out->at(i).data.data());
      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
              << "] is " << req->insts(0).tensor_array(i).float_data(0);
      int offset = 0;
-      for (int j = 0; j < batch_size; ++j) {
-        int elem_num = req->insts(j).tensor_array(i).float_data_size();
-        for (int k = 0; k < elem_num; ++k) {
-          dst_ptr[offset + k] = req->insts(j).tensor_array(i).float_data(k);
-        }
-        if (out->at(i).lod.size() == 1) {
-          offset = out->at(i).lod[0][j + 1];
-        } else {
-          offset += capacity[i];
-        }
+      int elem_num = req->insts(0).tensor_array(i).float_data_size();
+      for (int k = 0; k < elem_num; ++k) {
+        dst_ptr[offset + k] = req->insts(0).tensor_array(i).float_data(k);
      }
    } else if (elem_type[i] == 2) {
      int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
              << "] is " << req->insts(0).tensor_array(i).int_data(0);
      int offset = 0;
-      for (int j = 0; j < batch_size; ++j) {
-        int elem_num = req->insts(j).tensor_array(i).int_data_size();
-        for (int k = 0; k < elem_num; ++k) {
-          dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
-        }
-        if (out->at(i).lod.size() == 1) {
-          offset = out->at(i).lod[0][j + 1];
-        } else {
-          offset += capacity[i];
-        }
+      int elem_num = req->insts(0).tensor_array(i).int_data_size();
+      for (int k = 0; k < elem_num; ++k) {
+        dst_ptr[offset + k] = req->insts(0).tensor_array(i).int_data(k);
      }
    }
  }

  VLOG(2) << "(logid=" << log_id << ") output size: " << out->size();
-
  timeline.Pause();
  int64_t end = timeline.TimeStampUS();
  res->p_size = 0;
-  res->_batch_size = batch_size;
+  res->_batch_size = 1;
  AddBlobInfo(res, start);
  AddBlobInfo(res, end);


--- a/core/predictor/CMakeLists.txt
+++ b/core/predictor/CMakeLists.txt
@@ -13,7 +13,9 @@ set_source_files_properties(
        PROPERTIES
        COMPILE_FLAGS  "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
-
+if (WITH_TRT)
+    add_definitions(-DWITH_TRT)
+endif()
 target_link_libraries(pdserving
        brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)


--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
@@ -38,6 +38,7 @@ class InferEngineCreationParams {
    _enable_ir_optimization = false;
    _static_optimization = false;
    _force_update_static_cache = false;
+    _use_trt = false;
  }

  void set_path(const std::string& path) { _path = path; }
@@ -50,12 +51,16 @@ class InferEngineCreationParams {
    _enable_ir_optimization = enable_ir_optimization;
  }

+  void set_use_trt(bool use_trt) { _use_trt = use_trt; }
+
  bool enable_memory_optimization() const {
    return _enable_memory_optimization;
  }

  bool enable_ir_optimization() const { return _enable_ir_optimization; }

+  bool use_trt() const { return _use_trt; }
+
  void set_static_optimization(bool static_optimization = false) {
    _static_optimization = static_optimization;
  }
@@ -86,6 +91,7 @@ class InferEngineCreationParams {
  bool _enable_ir_optimization;
  bool _static_optimization;
  bool _force_update_static_cache;
+  bool _use_trt;
 };

 class InferEngine {
@@ -172,6 +178,10 @@ class ReloadableInferEngine : public InferEngine {
          force_update_static_cache);
    }

+    if (conf.has_use_trt()) {
+      _infer_engine_params.set_use_trt(conf.use_trt());
+    }
+
    if (!check_need_reload() || load(_infer_engine_params) != 0) {
      LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
      return -1;
@@ -553,8 +563,12 @@ class CloneDBReloadableInferEngine
 };

 template <typename FluidFamilyCore>
+#ifdef WITH_TRT
+class FluidInferEngine : public DBReloadableInferEngine<FluidFamilyCore> {
+#else
 class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
- public:
+#endif
+ public:  // NOLINT
  FluidInferEngine() {}
  ~FluidInferEngine() {}


--- a/core/sdk-cpp/include/abtest.h
+++ b/core/sdk-cpp/include/abtest.h
@@ -51,8 +51,8 @@ class WeightedRandomRender : public EndpointRouterBase {
        new (std::nothrow) Factory<WeightedRandomRender, EndpointRouterBase>();
    if (factory == NULL) {
      RAW_LOG(ERROR,
-              "Failed regist factory: WeightedRandomRender->EndpointRouterBase \
-          in macro!");
+              "Failed regist factory: WeightedRandomRender->EndpointRouterBase "
+              "in macro!");
      return -1;
    }

@@ -63,8 +63,8 @@ class WeightedRandomRender : public EndpointRouterBase {
    if (FactoryPool<EndpointRouterBase>::instance().register_factory(
            "WeightedRandomRender", factory) != 0) {
      RAW_LOG(INFO,
-              "Factory has been registed: \
-              WeightedRandomRender->EndpointRouterBase.");
+              "Factory has been registed: "
+              "WeightedRandomRender->EndpointRouterBase.");
    }

    return 0;

--- a/doc/COMPILE.md
+++ b/doc/COMPILE.md
@@ -77,10 +77,10 @@ export PATH=$PATH:$GOPATH/bin
 ```shell
 go env -w GO111MODULE=on
 go env -w GOPROXY=https://goproxy.cn,direct
-go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway
-go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger
-go get -u github.com/golang/protobuf/protoc-gen-go
-go get -u google.golang.org/grpc
+go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
+go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
+go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
+go get -u google.golang.org/grpc@v1.33.0
 ```


@@ -91,9 +91,9 @@ go get -u google.golang.org/grpc
 ``` shell
 mkdir server-build-cpu && cd server-build-cpu
 cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-      -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-      -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-      -DSERVER=ON ..
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DSERVER=ON ..
 make -j10
 ```

@@ -104,10 +104,28 @@ you can execute `make install` to put targets under directory `./output`, you ne
 ``` shell
 mkdir server-build-gpu && cd server-build-gpu
 cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-      -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-      -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-      -DSERVER=ON \
-      -DWITH_GPU=ON ..
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
+    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \  
+    -DSERVER=ON \
+    -DWITH_GPU=ON ..
+make -j10
+```
+
+### Integrated TRT version paddle inference library
+
+```
+mkdir server-build-trt && cd server-build-trt
+cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
+    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
+    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
+    -DSERVER=ON \
+    -DWITH_GPU=ON \
+    -DWITH_TRT=ON ..
 make -j10
 ```

@@ -136,7 +154,10 @@ execute `make install` to put targets under directory `./output`

 ```bash
 mkdir app-build && cd app-build
-cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DAPP=ON ..
+cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DAPP=ON ..
 make
 ```

@@ -167,7 +188,9 @@ Please use the example under `python/examples` to verify.
 |     WITH_AVX     | Compile Paddle Serving with AVX intrinsics | OFF  |
 |     WITH_MKL     |  Compile Paddle Serving with MKL support   | OFF  |
 |     WITH_GPU     |   Compile Paddle Serving with NVIDIA GPU   | OFF  |
-|    CUDNN_ROOT    |    Define CuDNN library and header path    |      |
+|  CUDNN_LIBRARY   |    Define CuDNN library and header path    |      |
+| CUDA_TOOLKIT_ROOT_DIR |       Define CUDA PATH                |      |
+|   TENSORRT_ROOT  |           Define TensorRT PATH             |      |
 |      CLIENT      |       Compile Paddle Serving Client        | OFF  |
 |      SERVER      |       Compile Paddle Serving Server        | OFF  |
 |       APP        |     Compile Paddle Serving App package     | OFF  |
@@ -182,7 +205,8 @@ To compile the Paddle Serving GPU version on bare metal, you need to install the

 - CUDA
 - CuDNN
- NCCL2
+
+To compile the TensorRT version, you need to install the TensorRT library.

 Note here:

@@ -192,21 +216,12 @@ Note here:

 The following is the base library version matching relationship used by the PaddlePaddle release version for reference:

-|        |  CUDA   |          CuDNN           | NCCL2  |
-| :----: | :-----: | :----------------------: | :----: |
-| CUDA 8 | 8.0.61  | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4  |
-| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 |
+|          |  CUDA   |          CuDNN           | TensorRT |
+| :----:   | :-----: | :----------------------: | :----:   |
+| post9    |  9.0    | CuDNN 7.3.1 for CUDA 9.0 |          |
+| post10   |  10.0   | CuDNN 7.5.1 for CUDA 10.0|          |
+| trt      |  10.1   | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5  |

 ### How to make the compiler detect the CuDNN library

 Download the corresponding CUDNN version from NVIDIA developer official website and decompressing it, add `-DCUDNN_ROOT` to cmake command, to specify the path of CUDNN.
-
-### How to make the compiler detect the nccl library
-
-After downloading the corresponding version of the nccl2 library from the NVIDIA developer official website and decompressing it, add the following environment variables (take nccl2.1.4 as an example):
-
-```shell
-export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
-export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
-export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
-```
--- a/doc/COMPILE_CN.md
+++ b/doc/COMPILE_CN.md
@@ -74,10 +74,10 @@ export PATH=$PATH:$GOPATH/bin
 ```shell
 go env -w GO111MODULE=on
 go env -w GOPROXY=https://goproxy.cn,direct
-go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway
-go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger
-go get -u github.com/golang/protobuf/protoc-gen-go
-go get -u google.golang.org/grpc
+go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
+go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
+go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
+go get -u google.golang.org/grpc@v1.33.0
 ```


@@ -87,7 +87,10 @@ go get -u google.golang.org/grpc

 ``` shell
 mkdir server-build-cpu && cd server-build-cpu
-cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON ..
+cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DSERVER=ON ..
 make -j10
 ```

@@ -97,21 +100,44 @@ make -j10

 ``` shell
 mkdir server-build-gpu && cd server-build-gpu
-cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON -DWITH_GPU=ON ..
+cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
+    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
+    -DSERVER=ON \
+    -DWITH_GPU=ON ..
 make -j10
 ```

-执行`make install`可以把目标产出放在`./output`目录下。
+### 集成TensorRT版本Paddle Inference Library

-**注意：** 编译成功后，需要设置`SERVING_BIN`路径，详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。
+```
+mkdir server-build-trt && cd server-build-trt
+cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
+    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
+    -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
+    -DSERVER=ON \
+    -DWITH_GPU=ON \
+    -DWITH_TRT=ON ..
+make -j10
+```

+执行`make install`可以把目标产出放在`./output`目录下。

+**注意：** 编译成功后，需要设置`SERVING_BIN`路径，详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。

 ## 编译Client部分

 ``` shell
 mkdir client-build && cd client-build
-cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT=ON ..
+cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DCLIENT=ON ..
 make -j10
 ```

@@ -123,7 +149,11 @@ make -j10

 ```bash
 mkdir app-build && cd app-build
-cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCMAKE_INSTALL_PREFIX=./output -DAPP=ON ..
+cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
+    -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
+    -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
+    -DCMAKE_INSTALL_PREFIX=./output \
+    -DAPP=ON ..
 make
 ```

@@ -154,7 +184,10 @@ make
 |     WITH_AVX     | Compile Paddle Serving with AVX intrinsics | OFF  |
 |     WITH_MKL     |  Compile Paddle Serving with MKL support   | OFF  |
 |     WITH_GPU     |   Compile Paddle Serving with NVIDIA GPU   | OFF  |
-|    CUDNN_ROOT    |    Define CuDNN library and header path    |      |
+|     WITH_TRT     |    Compile Paddle Serving with TensorRT    | OFF  |
+|  CUDNN_LIBRARY   |    Define CuDNN library and header path    |      |
+| CUDA_TOOLKIT_ROOT_DIR |       Define CUDA PATH                |      |
+|   TENSORRT_ROOT  |           Define TensorRT PATH             |      |
 |      CLIENT      |       Compile Paddle Serving Client        | OFF  |
 |      SERVER      |       Compile Paddle Serving Server        | OFF  |
 |       APP        |     Compile Paddle Serving App package     | OFF  |
@@ -169,7 +202,8 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选

 - CUDA
 - CuDNN
- NCCL2
+
+编译TensorRT版本，需要安装TensorRT库。

 这里要注意的是：

@@ -178,21 +212,12 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选

 以下是PaddlePaddle发布版本所使用的基础库版本匹配关系，供参考：

-|        |  CUDA   |          CuDNN           | NCCL2  |
-| :----: | :-----: | :----------------------: | :----: |
-| CUDA 8 | 8.0.61  | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4  |
-| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 |
+|          |  CUDA   |          CuDNN           | TensorRT |
+| :----:   | :-----: | :----------------------: | :----:   |
+| post9    |  9.0    | CuDNN 7.3.1 for CUDA 9.0 |          |
+| post10   |  10.0   | CuDNN 7.5.1 for CUDA 10.0|          |
+| trt      |  10.1   | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5  |

 ### 如何让Paddle Serving编译系统探测到CuDNN库

-从NVIDIA developer官网下载对应版本CuDNN并在本地解压后，在cmake编译命令中增加`-DCUDNN_ROOT`参数，指定CuDNN库所在路径。
-
-### 如何让Paddle Serving编译系统探测到nccl库
-
-从NVIDIA developer官网下载对应版本nccl2库并解压后，增加如下环境变量 (以nccl2.1.4为例)：
-
-```shell
-export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
-export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
-export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
-```
+从NVIDIA developer官网下载对应版本CuDNN并在本地解压后，在cmake编译命令中增加`-DCUDNN_LIBRARY`参数，指定CuDNN库所在路径。
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
 # FAQ

- Q: 如何调整RPC服务的等待时间，避免超时？ 

-  A: 使用set_rpc_timeout_ms设置更长的等待时间，单位为毫秒，默认时间为20秒。
-  
-  示例：
-  ```
-  from paddle_serving_client import Client

-  client = Client()
-  client.load_client_config(sys.argv[1])
-  client.set_rpc_timeout_ms(100000)
-  client.connect(["127.0.0.1:9393"])
-   ```
+## 基础知识

- Q: 如何使用自己编译的Paddle Serving进行预测？
+#### Q: Paddle Serving 、Paddle Inference、PaddleHub Serving三者的区别及联系？

-  A: 通过pip命令安装自己编译出的whl包，并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。
+**A:** paddle serving是远程服务，即发起预测的设备（手机、浏览器、客户端等）与实际预测的硬件不在一起。	paddle inference是一个library，适合嵌入到一个大系统中保证预测效率，paddle serving调用了paddle       inference做远程服务。paddlehub serving可以认为是一个示例，都会使用paddle serving作为统一预测服务入口。如果在web端交互，一般是调用远程服务的形式，可以使用paddle serving的web service搭建。

- Q: 执行GPU预测时遇到InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0.
+#### Q: paddle-serving是否支持Int32支持

-  A: 将显卡驱动对应的libcuda.so的目录添加到LD_LIBRARY_PATH环境变量中
+**A:** 在protobuf定feed_type和fetch_type编号与数据类型对应如下

- Q: 执行GPU预测时遇到ExternalError: Cudnn error, CUDNN_STATUS_BAD_PARAM at (/home/scmbuild/workspaces_cluster.dev/baidu.lib.paddlepaddle/baidu/lib/paddlepaddle/Paddle/paddle/fluid/operators/batch_norm_op.cu:198)
+     0-int64

-  A: 将cudnn的lib64路径添加到LD_LIBRARY_PATH，安装自pypi的Paddle Serving中post9版使用的是cudnn 7.3,post10使用的是cudnn 7.5。如果是使用自己编译的Paddle Serving，可以在log/serving.INFO日志文件中查看对应的cudnn版本。
+	  1-float32

- Q: 执行GPU预测时遇到Error: Failed to find dynamic library: libcublas.so
+	  2-int32

-  A: 将cuda的lib64路径添加到LD_LIBRARY_PATH, post9版本的Paddle Serving使用的是cuda 9.0，post10版本使用的cuda 10.0。
+#### Q: paddle-serving是否支持windows和Linux环境下的多线程调用 

- Q: 部署和预测中的日志信息在哪里查看？
+**A:** 客户端可以发起多线程访问调用服务端 

- A: server端的日志分为两部分，一部分打印到标准输出，一部分打印到启动服务时的目录下的log/serving.INFO文件中。
+#### Q: paddle-serving如何修改消息大小限制

-    client端的日志直接打印到标准输出。
+**A:** 在server端和client但通过FLAGS_max_body_size来扩大数据量限制，单位为字节，默认为64MB

-    通过在部署服务之前 'export  GLOG_v=3'可以输出更为详细的日志信息。
-    
- Q: GPU环境运行Serving报错，GPU count is: 0。
-  ```
-  terminate called after throwing an instance of 'paddle::platform::EnforceNotMet'
-  what():
-  --------------------------------------------
-  C++ Call Stacks (More useful to developers):
-  --------------------------------------------
-  0   std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&, char const*, int)
-  1   paddle::platform::SetDeviceId(int)
-  2   paddle::AnalysisConfig::fraction_of_gpu_memory_for_pool() const
-  3   std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig, (paddle::PaddleEngineKind)2>(paddle::AnalysisConfig const&)
-  4   std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(paddle::AnalysisConfig const&)
-  ----------------------
-  Error Message Summary:
-  ----------------------
-  InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0.
-  [Hint: Expected id < GetCUDADeviceCount(), but received id:0 >= GetCUDADeviceCount():0.] at (/home/scmbuild/workspaces_cluster.dev/baidu.lib.paddlepaddle/baidu/lib/paddlepaddle/Paddle/paddle/fluid/platform/gpu_info.cc:211)
-  ```
-  A: libcuda.so没有链接成功。首先在机器上找到libcuda.so，ldd检查libnvidia版本与nvidia-smi中版本一致（libnvidia-fatbinaryloader.so.418.39，与NVIDIA-SMI 418.39 Driver Version: 418.39）,然后用export导出libcuda.so的路径即可（例如libcuda.so在/usr/lib64/，export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/）
-  
- 
+#### Q: paddle-serving客户端目前支持哪些语言
+
+**A:** java c++ python 
+
+#### Q: paddle-serving目前支持哪些协议
+
+**A:** http rpc 
+
+
+## 编译问题
+
+#### Q: 如何使用自己编译的Paddle Serving进行预测？
+
+**A:** 通过pip命令安装自己编译出的whl包，并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。
+
+#### Q: 使用Java客户端，mvn compile过程出现"No compiler is provided in this environment. Perhaps you are running on a JRE rather than a JDK?"错误
+
+**A:** 没有安装JDK，或者JAVA_HOME路径配置错误（正确配置是JDK路径，常见错误配置成JRE路径，例如正确路径参考JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.262.b10-0.el7_8.x86_64/"）。Java JDK安装参考https://segmentfault.com/a/1190000015389941
+
+
+
+## 部署问题
+
+#### Q: GPU环境运行Serving报错，GPU count is: 0。
+
+```
+terminate called after throwing an instance of 'paddle::platform::EnforceNotMet'
+what():
+--------------------------------------------
+C++ Call Stacks (More useful to developers):
+--------------------------------------------
+0   std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&, char const*, int)
+1   paddle::platform::SetDeviceId(int)
+2   paddle::AnalysisConfig::fraction_of_gpu_memory_for_pool() const
+3   std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig, (paddle::PaddleEngineKind)2>(paddle::AnalysisConfig const&)
+4   std::unique_ptr<paddle::PaddlePredictor, std::default_delete<paddle::PaddlePredictor> > paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(paddle::AnalysisConfig const&)
+----------------------
+Error Message Summary:
+----------------------
+InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0.
+[Hint: Expected id < GetCUDADeviceCount(), but received id:0 >= GetCUDADeviceCount():0.] at (/home/scmbuild/workspaces_cluster.dev/baidu.lib.paddlepaddle/baidu/lib/paddlepaddle/Paddle/paddle/fluid/platform/gpu_info.cc:211)
+```
+
+**A:** libcuda.so没有链接成功。首先在机器上找到libcuda.so，ldd检查libnvidia版本与nvidia-smi中版本一致（libnvidia-fatbinaryloader.so.418.39，与NVIDIA-SMI 418.39 Driver Version: 418.39）,然后用export导出libcuda.so的路径即可（例如libcuda.so在/usr/lib64/，export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/）
+
+#### Q: 遇到 GPU not found, please check your environment or use cpu version by "pip install paddle_serving_server"
+
+**A:** 检查环境中是否有N卡：ls /dev/ | grep nvidia
+
+#### Q: 目前Paddle Serving支持哪些镜像环境？
+
+**A:** 目前（0.4.0）仅支持CentOS，具体列表查阅[这里](https://github.com/PaddlePaddle/Serving/blob/develop/doc/DOCKER_IMAGES.md)
+
+#### Q: python编译的GCC版本与serving的版本不匹配
+
+**A:**:1)使用[GPU docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md#gpunvidia-docker)解决环境问题
+
+	   2)修改anaconda的虚拟环境下安装的python的gcc版本[参考](https://www.jianshu.com/p/c498b3d86f77) 
+
+#### Q: paddle-serving是否支持本地离线安装 
+
+**A:** 支持离线部署，需要把一些相关的[依赖包](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md)提前准备安装好
+
+## 预测问题
+
+#### Q: 使用GPU第一次预测时特别慢，如何调整RPC服务的等待时间避免超时？ 
+
+**A:** GPU第一次预测需要初始化。使用set_rpc_timeout_ms设置更长的等待时间，单位为毫秒，默认时间为20秒。
+
+示例：
+
+```
+from paddle_serving_client import Client
+
+client = Client()
+client.load_client_config(sys.argv[1])
+client.set_rpc_timeout_ms(100000)
+client.connect(["127.0.0.1:9393"])
+```
+
+#### Q: 执行GPU预测时遇到InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0.
+
+**A:** 将显卡驱动对应的libcuda.so的目录添加到LD_LIBRARY_PATH环境变量中
+
+#### Q: 执行GPU预测时遇到ExternalError: Cudnn error, CUDNN_STATUS_BAD_PARAM at (../batch_norm_op.cu:198)
+
+**A:** 将cudnn的lib64路径添加到LD_LIBRARY_PATH，安装自pypi的Paddle Serving中post9版使用的是cudnn 7.3,post10使用的是cudnn 7.5。如果是使用自己编译的Paddle Serving，可以在log/serving.INFO日志文件中查看对应的cudnn版本。
+
+#### Q: 执行GPU预测时遇到Error: Failed to find dynamic library: libcublas.so
+
+**A:** 将cuda的lib64路径添加到LD_LIBRARY_PATH, post9版本的Paddle Serving使用的是cuda 9.0，post10版本使用的cuda 10.0。
+
+#### Q: Client端fetch的变量名如何设置
+
+**A:** 可以查看配置文件serving_server_conf.prototxt，获取需要的变量名
+
+#### Q: 如何使用多语言客户端
+
+**A:** 多语言客户端要与多语言服务端配套使用。当前版本下（0.4.0），服务端需要将Server改为MultiLangServer（如果是以命令行启动的话只需要添加--use_multilang参数），Python客户端需要将Client改为MultiLangClient，同时去除load_client_config的过程。[Java客户端参考文档](https://github.com/PaddlePaddle/Serving/blob/develop/doc/JAVA_SDK_CN.md)
+
+#### Q: 如何在Windows下使用Paddle Serving
+
+**A:** 当前版本（0.4.0）在Windows上可以运行多语言RPC客户端，或使用HTTP方式访问。如果使用多语言RPC客户端，需要在Linux环境（比如本机容器，或远程Linux机器）中运行多语言服务端；如果使用HTTP方式，需要在Linux环境中运行普通服务端
+
+#### Q: libnvinfer.so: cannot open shared object file: No such file or directory)
+
+ **A:** 参考该文档安装TensorRT: https://blog.csdn.net/hesongzefairy/article/details/105343525
+
+
+
+## 日志排查
+
+#### Q: 部署和预测中的日志信息在哪里查看？
+
+**A:** server端的日志分为两部分，一部分打印到标准输出，一部分打印到启动服务时的目录下的log/serving.INFO文件中。
+
+client端的日志直接打印到标准输出。
+
+通过在部署服务之前 'export  GLOG_v=3'可以输出更为详细的日志信息。
+
+#### Q: paddle-serving启动成功后，相关的日志在哪里设置
+
+**A:** 1)警告是glog组件打印的，告知glog初始化之前日志打印在STDERR
+
+	   2)一般采用GLOG_v方式启动服务同时设置日志级别。
+
+例如：
+```
+GLOG_v=2 python -m paddle_serving_server.serve --model xxx_conf/ --port 9999 
+```
+
+
+
+#### Q: （GLOG_v=2下）Server端日志一切正常，但Client端始终得不到正确的预测结果
+
+**A:** 可能是配置文件有问题，检查下配置文件（is_load_tensor，fetch_type等有没有问题）
+
+#### Q: 如何给Server传递Logid
+
+**A:** Logid默认为0（后续应该有自动生成Logid的计划，当前版本0.4.0），Client端通过在predict函数中指定log_id参数传递
+
+
+
+## 性能优化
--- a/doc/GRPC_IMPL_CN.md
+++ b/doc/GRPC_IMPL_CN.md
-# gRPC接口
+# gRPC接口使用介绍
+
+  - [1.与bRPC接口对比](#1与brpc接口对比)
+      - [1.1 服务端对比](#11-服务端对比)
+      - [1.2 客服端对比](#12-客服端对比)
+      - [1.3 其他](#13-其他)
+  - [2.示例：线性回归预测服务](#2示例线性回归预测服务)
+      - [获取数据](#获取数据)
+      - [开启 gRPC 服务端](#开启-grpc-服务端)
+    - [客户端预测](#客户端预测)
+      - [同步预测](#同步预测)
+      - [异步预测](#异步预测)
+      - [Batch 预测](#batch-预测)
+      - [通用 pb 预测](#通用-pb-预测)
+      - [预测超时](#预测超时)
+      - [List 输入](#list-输入)
+  - [3.更多示例](#3更多示例)
+  
+使用gRPC接口，Client端可以在Win/Linux/MacOS平台上调用不同语言。gRPC 接口实现结构如下：
+
+![](https://github.com/PaddlePaddle/Serving/blob/develop/doc/grpc_impl.png)
+
+## 1.与bRPC接口对比
+
+#### 1.1 服务端对比
+
+* gRPC Server 端 `load_model_config` 函数添加 `client_config_path` 参数：

-gRPC 接口实现形式类似 Web Service：
-
-![](grpc_impl.png)
-
-## 与bRPC接口对比
-
-1. gRPC Server 端 `load_model_config` 函数添加 `client_config_path` 参数：
-
-   ```python
+   ```
   def load_model_config(self, server_config_paths, client_config_path=None)
   ```
+    在一些例子中 bRPC Server 端与 bRPC Client 端的配置文件可能不同（如 在cube local 中，Client 端的数据先交给 cube，经过 cube 处理后再交给预测库），此时 gRPC Server 端需要手动设置 gRPC Client 端的配置`client_config_path`。
+    **`client_config_path` 默认为 `<server_config_path>/serving_server_conf.prototxt`。**

-   在一些例子中 bRPC Server 端与 bRPC Client 端的配置文件可能是不同的（如 cube local 例子中，Client 端的数据先交给 cube，经过 cube 处理后再交给预测库），所以 gRPC Server 端需要获取 gRPC Client 端的配置；同时为了取消 gRPC Client 端手动加载配置文件的过程，所以设计 gRPC Server 端同时加载两个配置文件。`client_config_path` 默认为 `<server_config_path>/serving_server_conf.prototxt`。
+#### 1.2 客服端对比

-2. gRPC Client 端取消 `load_client_config` 步骤：
+* gRPC Client 端取消 `load_client_config` 步骤：

   在 `connect` 步骤通过 RPC 获取相应的 prototxt（从任意一个 endpoint 获取即可）。

-3. gRPC Client 需要通过 RPC 方式设置 timeout 时间（调用形式与 bRPC Client保持一致）
+* gRPC Client 需要通过 RPC 方式设置 timeout 时间（调用形式与 bRPC Client保持一致）

   因为 bRPC Client 在 `connect` 后无法更改 timeout 时间，所以当 gRPC Server 收到变更 timeout 的调用请求时会重新创建 bRPC Client 实例以变更 bRPC Client timeout时间，同时 gRPC Client 会设置 gRPC 的 deadline 时间。

   **注意，设置 timeout 接口和 Inference 接口不能同时调用（非线程安全），出于性能考虑暂时不加锁。**

-4. gRPC Client 端 `predict` 函数添加 `asyn` 和 `is_python` 参数：
+* gRPC Client 端 `predict` 函数添加 `asyn` 和 `is_python` 参数：

-   ```python
+   ```
   def predict(self, feed, fetch, need_variant_tag=False, asyn=False, is_python=True)
   ```

-   其中，`asyn` 为异步调用选项。当 `asyn=True` 时为异步调用，返回 `MultiLangPredictFuture` 对象，通过 `MultiLangPredictFuture.result()` 阻塞获取预测值；当 `asyn=Fasle` 为同步调用。
+1.    `asyn` 为异步调用选项。当 `asyn=True` 时为异步调用，返回 `MultiLangPredictFuture` 对象，通过 `MultiLangPredictFuture.result()` 阻塞获取预测值；当 `asyn=Fasle` 为同步调用。
+
+2.    `is_python` 为 proto 格式选项。当 `is_python=True` 时，基于 numpy bytes 格式进行数据传输，目前只适用于 Python；当 `is_python=False` 时，以普通数据格式传输，更加通用。使用 numpy bytes 格式传输耗时比普通数据格式小很多（详见 [#654](https://github.com/PaddlePaddle/Serving/pull/654)）。
+
+#### 1.3 其他
+
+* 异常处理：当 gRPC Server 端的 bRPC Client 预测失败（返回 `None`）时，gRPC Client 端同样返回None。其他 gRPC 异常会在 Client 内部捕获，并在返回的 fetch_map 中添加一个 "status_code" 字段来区分是否预测正常（参考 timeout 样例）。
+
+* 由于 gRPC 只支持 pick_first 和 round_robin 负载均衡策略，ABTEST 特性还未打齐。
+
+* 系统兼容性：
+    * [x]  CentOS
+    * [x]  macOS
+    * [x]  Windows
+
+* 已经支持的客户端语言：
+
+   -  Python
+   -  Java
+   -  Go
+   
+   
+## 2.示例：线性回归预测服务
+
+以下是采用gRPC实现的关于线性回归预测的一个示例，具体代码详见此[链接](https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/grpc_impl_example/fit_a_line)
+#### 获取数据
+
+```shell
+sh get_data.sh
+```
+
+#### 开启 gRPC 服务端
+
+``` shell
+python test_server.py uci_housing_model/
+```
+
+也可以通过下面的一行代码开启默认 gRPC 服务：
+
+```shell
+python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang
+```
+注：--use_multilang参数用来启用多语言客户端
+
+### 客户端预测
+
+#### 同步预测
+
+``` shell
+python test_sync_client.py
+```
+
+#### 异步预测
+
+``` shell
+python test_asyn_client.py
+```
+
+#### Batch 预测
+
+``` shell
+python test_batch_client.py
+```

-   `is_python` 为 proto 格式选项。当 `is_python=True` 时，基于 numpy bytes 格式进行数据传输，目前只适用于 Python；当 `is_python=False` 时，以普通数据格式传输，更加通用。使用 numpy bytes 格式传输耗时比普通数据格式小很多（详见 [#654](https://github.com/PaddlePaddle/Serving/pull/654)）。
+#### 通用 pb 预测

-5. 异常处理：当 gRPC Server 端的 bRPC Client 预测失败（返回 `None`）时，gRPC Client 端同样返回None。其他 gRPC 异常会在 Client 内部捕获，并在返回的 fetch_map 中添加一个 "status_code" 字段来区分是否预测正常（参考 timeout 样例）。
+``` shell
+python test_general_pb_client.py
+```

-6. 由于 gRPC 只支持 pick_first 和 round_robin 负载均衡策略，ABTEST 特性还未打齐。
+#### 预测超时

-7. 经测试，gRPC 版本可以在 Windows、macOS 平台使用。
+``` shell
+python test_timeout_client.py
+```

-8. 计划支持的客户端语言：
+#### List 输入

-   - [x] Python
-   - [ ] Java
-   - [ ] Go
-   - [ ] JavaScript
+``` shell
+python test_list_input_client.py
+```

-## Python 端的一些例子 
+## 3.更多示例

-详见 `python/examples/grpc_impl_example` 下的示例文件。
+详见[`python/examples/grpc_impl_example`](https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/grpc_impl_example)下的示例文件。
--- a/doc/LATEST_PACKAGES.md
+++ b/doc/LATEST_PACKAGES.md
@@ -18,6 +18,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py2-none-an
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl
 #cuda 10.0
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
+#cuda10.1 with TensorRT 6
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
 ```
 ### Python 2
 ```
@@ -25,6 +27,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl
 #cuda 10.0
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl
+##cuda10.1 with TensorRT 6
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl
 ```

 ## Client

--- a/java/README.md
+++ b/java/README.md
+## Java Demo
+
+### Install package
+```
+mvn compile
+mvn install
+cd examples
+mvn compile
+mvn install
+```
+
+### Start Server
+
+take the fit_a_line demo as example
+
+```
+ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #CPU
+python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #GPU
+```
+
+### Client Predict
+```
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
+```
+
+The Java example also contains the prediction client of Bert, Model_enaemble, asyn_predict, batch_predict, Cube_local, Cube_quant, and Yolov4 models.
--- a/java/README_CN.md
+++ b/java/README_CN.md
+## Java 示例
+
+### 安装客户端依赖
+```
+mvn compile
+mvn install
+cd examples
+mvn compile
+mvn install
+```
+
+### 启动服务端
+
+以fit_a_line模型为例
+
+```
+ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #CPU
+python -m paddle_serving_server_gpu.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang #GPU
+```
+
+### 客户端预测
+```
+java -cp paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
+```
+
+java示例中还包含了bert、model_enaemble、asyn_predict、batch_predict、cube_local、cube_quant、yolov4模型的预测客户端。
--- a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
+++ b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h
@@ -23,7 +23,6 @@
 #include "core/configure/inferencer_configure.pb.h"
 #include "core/predictor/framework/infer.h"
 #include "paddle_inference_api.h"  // NOLINT
-//#include "predictor/framework/infer.h"

 namespace baidu {
 namespace paddle_serving {

--- a/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt
+++ b/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt
@@ -2,6 +2,7 @@ FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
 add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs})
 target_include_directories(fluid_gpu_engine PUBLIC
        ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
+
 add_dependencies(fluid_gpu_engine pdserving extern_paddle configure)
 target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)


--- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h
@@ -190,7 +190,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {

    paddle::AnalysisConfig analysis_config;
    analysis_config.SetModel(data_path);
-    analysis_config.EnableUseGpu(100, FLAGS_gpuid);
+    analysis_config.EnableUseGpu(1500, FLAGS_gpuid);
    analysis_config.SwitchSpecifyInputNames(true);
    analysis_config.SetCpuMathLibraryNumThreads(1);

@@ -198,12 +198,68 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
      analysis_config.EnableMemoryOptim();
    }

-    if (params.enable_ir_optimization()) {
-      analysis_config.SwitchIrOptim(true);
+#if 0  // todo: support flexible shape
+
+    int min_seq_len = 1;
+    int max_seq_len = 512;
+    int opt_seq_len = 128;
+    int head_number = 12;
+    int batch = 50;
+
+    std::vector<int> min_in_shape = {batch, min_seq_len, 1};
+    std::vector<int> max_in_shape = {batch, max_seq_len, 1};
+    std::vector<int> opt_in_shape = {batch, opt_seq_len, 1};
+
+    std::string input1_name = "src_text_a_ids";
+    std::string input2_name = "pos_text_a_ids";
+    std::string input3_name = "sent_text_a_ids";
+    std::string input4_name = "stack_0.tmp_0";
+
+    std::map<std::string, std::vector<int>> min_input_shape = {
+        {input1_name, min_in_shape},
+        {input2_name, min_in_shape},
+        {input3_name, min_in_shape},
+        {input4_name, {batch, head_number, min_seq_len, min_seq_len}},
+    };
+
+    std::map<std::string, std::vector<int>> max_input_shape = {
+        {input1_name, max_in_shape},
+        {input2_name, max_in_shape},
+        {input3_name, max_in_shape},
+        {input4_name, {batch, head_number, max_seq_len, max_seq_len}},
+    };
+    std::map<std::string, std::vector<int>> opt_input_shape = {
+        {input1_name, opt_in_shape},
+        {input2_name, opt_in_shape},
+        {input3_name, opt_in_shape},
+        {input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
+    };
+
+    analysis_config.SetTRTDynamicShapeInfo(
+        min_input_shape, max_input_shape, opt_input_shape);
+#endif
+    int max_batch = 32;
+    int min_subgraph_size = 3;
+    if (params.use_trt()) {
+      analysis_config.EnableTensorRtEngine(
+          1 << 20,
+          max_batch,
+          min_subgraph_size,
+          paddle::AnalysisConfig::Precision::kFloat32,
+          false,
+          false);
+      LOG(INFO) << "create TensorRT predictor";
    } else {
-      analysis_config.SwitchIrOptim(false);
-    }
+      if (params.enable_memory_optimization()) {
+        analysis_config.EnableMemoryOptim();
+      }

+      if (params.enable_ir_optimization()) {
+        analysis_config.SwitchIrOptim(true);
+      } else {
+        analysis_config.SwitchIrOptim(false);
+      }
+    }
    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core =
        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -80,6 +80,16 @@ if (SERVER)
            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
+    elseif(WITH_TRT)
+        add_custom_command(
+            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
+            COMMAND cp -r
+            ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
+            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
+            "server_gpu" trt
+            COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+            DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
+        add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
    else()
        add_custom_command(
            OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp

--- a/python/examples/bert/bert_client.py
+++ b/python/examples/bert/bert_client.py
@@ -18,16 +18,20 @@ import sys
 from paddle_serving_client import Client
 from paddle_serving_client.utils import benchmark_args
 from paddle_serving_app.reader import ChineseBertReader
-
+import numpy as np
 args = benchmark_args()

 reader = ChineseBertReader({"max_seq_len": 128})
 fetch = ["pooled_output"]
-endpoint_list = ["127.0.0.1:9292"]
+endpoint_list = ['127.0.0.1:9292']
 client = Client()
 client.load_client_config(args.model)
 client.connect(endpoint_list)

 for line in sys.stdin:
    feed_dict = reader.process(line)
-    result = client.predict(feed=feed_dict, fetch=fetch)
+    for key in feed_dict.keys():
+        feed_dict[key] = np.array(feed_dict[key]).reshape((128, 1))
+    #print(feed_dict)
+    result = client.predict(feed=feed_dict, fetch=fetch, batch=True)
+print(result)
--- a/python/examples/bert/bert_web_service.py
+++ b/python/examples/bert/bert_web_service.py
@@ -13,10 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=doc-string-missing
-from paddle_serving_server_gpu.web_service import WebService
+from paddle_serving_server.web_service import WebService
 from paddle_serving_app.reader import ChineseBertReader
 import sys
 import os
+import numpy as np


 class BertService(WebService):
@@ -27,18 +28,21 @@ class BertService(WebService):
        })

    def preprocess(self, feed=[], fetch=[]):
-        feed_res = [
-            self.reader.process(ins["words"].encode("utf-8")) for ins in feed
-        ]
-        return feed_res, fetch
+        feed_res = []
+        is_batch = True
+        for ins in feed:
+            feed_dict = self.reader.process(ins["words"].encode("utf-8"))
+            for key in feed_dict.keys():
+                feed_dict[key] = np.array(feed_dict[key]).reshape(
+                    (len(feed_dict[key]), 1))
+            feed_res.append(feed_dict)
+        return feed_res, fetch, is_batch


 bert_service = BertService(name="bert")
 bert_service.load()
 bert_service.load_model_config(sys.argv[1])
-gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"]
-bert_service.set_gpus(gpu_ids)
 bert_service.prepare_server(
-    workdir="workdir", port=int(sys.argv[2]), device="gpu")
+    workdir="workdir", port=int(sys.argv[2]), device="cpu")
 bert_service.run_rpc_service()
 bert_service.run_web_service()
--- a/python/examples/bert/test_multi_fetch_client.py
+++ b/python/examples/bert/test_multi_fetch_client.py
@@ -15,6 +15,7 @@
 from paddle_serving_client import Client
 from paddle_serving_app.reader import ChineseBertReader
 import sys
+import numpy as np

 client = Client()
 client.load_client_config("./bert_seq32_client/serving_client_conf.prototxt")
@@ -28,12 +29,21 @@ expected_shape = {
    "pooled_output": (4, 768)
 }
 batch_size = 4
-feed_batch = []
+feed_batch = {}

+batch_len = 0
 for line in sys.stdin:
    feed = reader.process(line)
+    if batch_len == 0:
+        for key in feed.keys():
+            val_len = len(feed[key])
+            feed_batch[key] = np.array(feed[key]).reshape((1, val_len, 1))
+        continue
    if len(feed_batch) < batch_size:
-        feed_batch.append(feed)
+        for key in feed.keys():
+            np.concatenate([
+                feed_batch[key], np.array(feed[key]).reshape((1, val_len, 1))
+            ])
    else:
        fetch_map = client.predict(feed=feed_batch, fetch=fetch)
        feed_batch = []

--- a/python/examples/criteo_ctr/test_client.py
+++ b/python/examples/criteo_ctr/test_client.py
@@ -20,7 +20,7 @@ import os
 import time
 import criteo_reader as criteo
 from paddle_serving_client.metric import auc
-
+import numpy as np
 import sys

 py_version = sys.version_info[0]
@@ -49,7 +49,8 @@ for ei in range(1000):
        data = reader().__next__()
    feed_dict = {}
    for i in range(1, 27):
-        feed_dict["sparse_{}".format(i - 1)] = data[0][i]
+        feed_dict["sparse_{}".format(i - 1)] = np.array(data[0][i]).reshape(-1)
+        feed_dict["sparse_{}.lod".format(i - 1)] = [0, len(data[0][i])]
    fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
 end = time.time()
 print(end - start)
--- a/python/examples/criteo_ctr_with_cube/test_client.py
+++ b/python/examples/criteo_ctr_with_cube/test_client.py
@@ -19,6 +19,7 @@ import os
 import criteo as criteo
 import time
 from paddle_serving_client.metric import auc
+import numpy as np

 py_version = sys.version_info[0]

@@ -41,10 +42,15 @@ for ei in range(10000):
    else:
        data = reader().__next__()
    feed_dict = {}
-    feed_dict['dense_input'] = data[0][0]
+    feed_dict['dense_input'] = np.array(data[0][0]).astype("float32").reshape(
+        1, 13)
+    feed_dict['dense_input.lod'] = [0, 1]
    for i in range(1, 27):
-        feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][i]
-    fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
+        tmp_data = np.array(data[0][i]).astype(np.int64)
+        feed_dict["embedding_{}.tmp_0".format(i - 1)] = tmp_data.reshape(
+            (1, len(data[0][i])))
+        feed_dict["embedding_{}.tmp_0.lod".format(i - 1)] = [0, 1]
+    fetch_map = client.predict(feed=feed_dict, fetch=["prob"], batch=True)
    prob_list.append(fetch_map['prob'][0][1])
    label_list.append(data[0][-1][0])


--- a/python/examples/faster_rcnn_model/test_client.py
+++ b/python/examples/faster_rcnn_model/test_client.py
@@ -36,6 +36,7 @@ fetch_map = client.predict(
        "im_info": np.array(list(im.shape[1:]) + [1.0]),
        "im_shape": np.array(list(im.shape[1:]) + [1.0])
    },
-    fetch=["multiclass_nms"])
+    fetch=["multiclass_nms"],
+    batch=False)
 fetch_map["image"] = sys.argv[3]
 postprocess(fetch_map)
--- a/python/examples/fit_a_line/test_client.py
+++ b/python/examples/fit_a_line/test_client.py
@@ -27,5 +27,10 @@ test_reader = paddle.batch(
    batch_size=1)

 for data in test_reader():
-    fetch_map = client.predict(feed={"x": data[0][0]}, fetch=["price"])
+    import numpy as np
+    new_data = np.zeros((1, 1, 13)).astype("float32")
+    new_data[0] = data[0][0]
+    fetch_map = client.predict(
+        feed={"x": new_data}, fetch=["price"], batch=True)
    print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
+    print(fetch_map)
--- a/python/examples/fit_a_line/test_multi_process_client.py
+++ b/python/examples/fit_a_line/test_multi_process_client.py
@@ -15,6 +15,7 @@
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
 import paddle
+import numpy as np


 def single_func(idx, resource):
@@ -26,6 +27,7 @@ def single_func(idx, resource):
        0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584,
        0.6283, 0.4919, 0.1856, 0.0795, -0.0332
    ]
+    x = np.array(x)
    for i in range(1000):
        fetch_map = client.predict(feed={"x": x}, fetch=["price"])
        if fetch_map is None:

--- a/python/examples/imagenet/resnet50_rpc_client.py
+++ b/python/examples/imagenet/resnet50_rpc_client.py
@@ -38,7 +38,8 @@ start = time.time()
 image_file = "https://paddle-serving.bj.bcebos.com/imagenet-example/daisy.jpg"
 for i in range(10):
    img = seq(image_file)
-    fetch_map = client.predict(feed={"image": img}, fetch=["score"])
+    fetch_map = client.predict(
+        feed={"image": img}, fetch=["score"], batch=False)
    prob = max(fetch_map["score"][0])
    label = label_dict[fetch_map["score"][0].tolist().index(prob)].strip(
    ).replace(",", "")

--- a/python/examples/imagenet/resnet50_web_service.py
+++ b/python/examples/imagenet/resnet50_web_service.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import sys
 from paddle_serving_client import Client
+import numpy as np
 from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage

 if len(sys.argv) != 4:
@@ -43,12 +44,13 @@ class ImageService(WebService):

    def preprocess(self, feed=[], fetch=[]):
        feed_batch = []
+        is_batch = True
        for ins in feed:
            if "image" not in ins:
                raise ("feed data error!")
            img = self.seq(ins["image"])
-            feed_batch.append({"image": img})
-        return feed_batch, fetch
+            feed_batch.append({"image": img[np.newaxis, :]})
+        return feed_batch, fetch, is_batch

    def postprocess(self, feed=[], fetch=[], fetch_map={}):
        score_list = fetch_map["score"]

--- a/python/examples/imdb/benchmark.py
+++ b/python/examples/imdb/benchmark.py
@@ -17,7 +17,8 @@ import os
 import sys
 import time
 import requests
-from paddle_serving_app.reader import IMDBDataset
+import numpy as np
+from paddle_serving_app.reader.imdb_reader import IMDBDataset
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
 from paddle_serving_client.utils import MultiThreadRunner, benchmark_args, show_latency
@@ -47,11 +48,17 @@ def single_func(idx, resource):
        for i in range(1000):
            if args.batch_size >= 1:
                feed_batch = []
+                feed = {"words": [], "words.lod": [0]}
                for bi in range(args.batch_size):
                    word_ids, label = imdb_dataset.get_words_and_label(dataset[
                        bi])
-                    feed_batch.append({"words": word_ids})
-                result = client.predict(feed=feed_batch, fetch=["prediction"])
+                    feed["words.lod"].append(feed["words.lod"][-1] + len(
+                        word_ids))
+                    feed["words"].extend(word_ids)
+                feed["words"] = np.array(feed["words"]).reshape(
+                    len(feed["words"]), 1)
+                result = client.predict(
+                    feed=feed, fetch=["prediction"], batch=True)
                if result is None:
                    raise ("predict failed.")
            else:

--- a/python/examples/imdb/test_client.py
+++ b/python/examples/imdb/test_client.py
@@ -13,8 +13,9 @@
 # limitations under the License.
 # pylint: disable=doc-string-missing
 from paddle_serving_client import Client
-from paddle_serving_app.reader import IMDBDataset
+from paddle_serving_app.reader.imdb_reader import IMDBDataset
 import sys
+import numpy as np

 client = Client()
 client.load_client_config(sys.argv[1])
@@ -28,7 +29,12 @@ imdb_dataset.load_resource(sys.argv[2])

 for line in sys.stdin:
    word_ids, label = imdb_dataset.get_words_and_label(line)
-    feed = {"words": word_ids}
+    word_len = len(word_ids)
+    feed = {
+        "words": np.array(word_ids).reshape(word_len, 1),
+        "words.lod": [0, word_len]
+    }
+    #print(feed)
    fetch = ["prediction"]
-    fetch_map = client.predict(feed=feed, fetch=fetch)
+    fetch_map = client.predict(feed=feed, fetch=fetch, batch=True)
    print("{} {}".format(fetch_map["prediction"][0], label[0]))
--- a/python/examples/imdb/text_classify_service.py
+++ b/python/examples/imdb/text_classify_service.py
@@ -14,8 +14,9 @@
 # pylint: disable=doc-string-missing

 from paddle_serving_server.web_service import WebService
-from paddle_serving_app.reader import IMDBDataset
+from paddle_serving_app.reader.imdb_reader import IMDBDataset
 import sys
+import numpy as np


 class IMDBService(WebService):
@@ -26,10 +27,15 @@ class IMDBService(WebService):
        self.dataset.load_resource(args["dict_file_path"])

    def preprocess(self, feed={}, fetch=[]):
-        res_feed = [{
-            "words": self.dataset.get_words_only(ins["words"])
-        } for ins in feed]
-        return res_feed, fetch
+        feed_batch = []
+        words_lod = [0]
+        for ins in feed:
+            words = self.dataset.get_words_only(ins["words"])
+            words = np.array(words).reshape(len(words), 1)
+            words_lod.append(words_lod[-1] + len(words))
+            feed_batch.append(words)
+        feed = {"words": np.concatenate(feed_batch), "words.lod": words_lod}
+        return feed, fetch


 imdb_service = IMDBService(name="imdb")

--- a/python/examples/lac/lac_client.py
+++ b/python/examples/lac/lac_client.py
@@ -19,6 +19,7 @@ from paddle_serving_app.reader import LACReader
 import sys
 import os
 import io
+import numpy as np

 client = Client()
 client.load_client_config(sys.argv[1])
@@ -31,7 +32,17 @@ for line in sys.stdin:
    feed_data = reader.process(line)
    if len(feed_data) <= 0:
        continue
-    fetch_map = client.predict(feed={"words": feed_data}, fetch=["crf_decode"])
+    print(feed_data)
+    #fetch_map = client.predict(feed={"words": np.array(feed_data).reshape(len(feed_data), 1), "words.lod": [0, len(feed_data)]}, fetch=["crf_decode"], batch=True)
+    fetch_map = client.predict(
+        feed={
+            "words": np.array(feed_data + feed_data).reshape(
+                len(feed_data) * 2, 1),
+            "words.lod": [0, len(feed_data), 2 * len(feed_data)]
+        },
+        fetch=["crf_decode"],
+        batch=True)
+    print(fetch_map)
    begin = fetch_map['crf_decode.lod'][0]
    end = fetch_map['crf_decode.lod'][1]
    segs = reader.parse_result(line, fetch_map["crf_decode"][begin:end])

--- a/python/examples/lac/lac_web_service.py
+++ b/python/examples/lac/lac_web_service.py
@@ -15,6 +15,7 @@
 from paddle_serving_server.web_service import WebService
 import sys
 from paddle_serving_app.reader import LACReader
+import numpy as np


 class LACService(WebService):
@@ -23,13 +24,21 @@ class LACService(WebService):

    def preprocess(self, feed={}, fetch=[]):
        feed_batch = []
+        fetch = ["crf_decode"]
+        lod_info = [0]
+        is_batch = True
        for ins in feed:
            if "words" not in ins:
                raise ("feed data error!")
            feed_data = self.reader.process(ins["words"])
-            feed_batch.append({"words": feed_data})
-        fetch = ["crf_decode"]
-        return feed_batch, fetch
+            feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1))
+            lod_info.append(lod_info[-1] + len(feed_data))
+        feed_dict = {
+            "words": np.concatenate(
+                feed_batch, axis=0),
+            "words.lod": lod_info
+        }
+        return feed_dict, fetch, is_batch

    def postprocess(self, feed={}, fetch=[], fetch_map={}):
        batch_ret = []

--- a/python/examples/ocr/README.md
+++ b/python/examples/ocr/README.md
@@ -34,9 +34,9 @@ python ocr_web_server.py gpu
 ```
 python ocr_web_client.py
 ```
-If you want a faster web service, please try Web Debugger Service
+If you want a faster web service, please try Web LocalPredictor Service

-## Web Debugger Service
+## Web LocalPredictor Service
 ```
 #choose one of cpu/gpu commands as following
 #for cpu user
@@ -45,7 +45,7 @@ python ocr_debugger_server.py cpu
 python ocr_debugger_server.py gpu 
 ```

-## Web Debugger Client Prediction
+## Web LocalPredictor Client Prediction
 ```
 python ocr_web_client.py
 ```
@@ -61,7 +61,7 @@ Dataset: RCTW 500 sample images
 | engine                       | client read image(ms) | client-server tras time(ms) | server read image（ms） | det pre(ms) | det infer(ms) | det post(ms) | rec pre(ms) | rec infer(ms) | rec post(ms) | server-client trans time(ms) | server side time consumption(ms) | server side overhead(ms) | total time（ms) |
 |------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------|
 | Serving web service          | 8.69         | 13.41                      | 109.97           | 2.82               | 87.76            | 4.29               | 3.98               | 78.51            | 3.66               | 4.12                     | 181.02             | 136.49       | 317.51        |
-| Serving Debugger web service |  8.73        | 16.42                      | 115.27           | 2.93               | 20.63            | 3.97               | 4.48               | 13.84            | 3.60               | 6.91                     | 49.45              | 147.33       | 196.78        |
+| Serving LocalPredictor web service |  8.73        | 16.42                      | 115.27           | 2.93               | 20.63            | 3.97               | 4.48               | 13.84            | 3.60               | 6.91                     | 49.45              | 147.33       | 196.78        |

 ## Appendix: For Users who want to launch Det or Rec only
 if you are going to detect images not recognize it or directly recognize the words from images. We also provide Det and Rec server for you.

--- a/python/examples/ocr/README_CN.md
+++ b/python/examples/ocr/README_CN.md
@@ -34,8 +34,8 @@ python ocr_web_server.py gpu
 python ocr_web_client.py
 ```

-如果用户需要更快的执行速度，请尝试Debugger版Web服务
-## 启动Debugger版Web服务
+如果用户需要更快的执行速度，请尝试LocalPredictor版Web服务
+## 启动LocalPredictor版Web服务
 ```
 #根据CPU/GPU设备选择一种启动方式
 #for cpu user
@@ -60,7 +60,7 @@ GPU: Nvidia Tesla V100单卡
 | engine                       | 客户端读图(ms) | 客户端发送请求到服务端(ms) | 服务端读图（ms） | 检测预处理耗时(ms) | 检测模型耗时(ms) | 检测后处理耗时(ms) | 识别预处理耗时(ms) | 识别模型耗时(ms) | 识别后处理耗时(ms) | 服务端回传客户端时间(ms) | 服务端整体耗时(ms) | 空跑耗时(ms) | 整体耗时（ms) |
 |------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------|
 | Serving web service          | 8.69         | 13.41                      | 109.97           | 2.82               | 87.76            | 4.29               | 3.98               | 78.51            | 3.66               | 4.12                     | 181.02             | 136.49      | 317.51        |
-| Serving Debugger web service | 8.73         | 16.42                      | 115.27           | 2.93               | 20.63            | 3.97               | 4.48               | 13.84            | 3.60               | 6.91                     | 49.45              | 147.33      | 196.78        |
+| Serving LocalPredictor web service | 8.73         | 16.42                      | 115.27           | 2.93               | 20.63            | 3.97               | 4.48               | 13.84            | 3.60               | 6.91                     | 49.45              | 147.33      | 196.78        |


 ## 附录： 检测/识别单服务启动

--- a/python/examples/ocr/ocr_debugger_server.py
+++ b/python/examples/ocr/ocr_debugger_server.py
@@ -26,7 +26,7 @@ if sys.argv[1] == 'gpu':
    from paddle_serving_server_gpu.web_service import WebService
 elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
-from paddle_serving_app.local_predict import Debugger
+from paddle_serving_app.local_predict import LocalPredictor
 import time
 import re
 import base64
@@ -39,7 +39,7 @@ class OCRService(WebService):
            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
                (2, 0, 1))
        ])
-        self.det_client = Debugger()
+        self.det_client = LocalPredictor()
        if sys.argv[1] == 'gpu':
            self.det_client.load_model_config(
                det_model_config, gpu=True, profile=False)

--- a/python/examples/pipeline/imagenet/README.md
+++ b/python/examples/pipeline/imagenet/README.md
+# Imagenet Pipeline WebService
+
+This document will takes Imagenet service as an example to introduce how to use Pipeline WebService.
+
+## Get model
+```
+sh get_model.sh
+```
+
+## Start server
+
+```
+python resnet50_web_service.py &>log.txt &
+```
+
+## RPC test
+```
+python pipeline_rpc_client.py
+```
--- a/python/examples/pipeline/imagenet/README_CN.md
+++ b/python/examples/pipeline/imagenet/README_CN.md
+# Imagenet Pipeline WebService
+
+这里以 Uci 服务为例来介绍 Pipeline WebService 的使用。
+
+## 获取模型
+```
+sh get_data.sh
+```
+
+## 启动服务
+
+```
+python web_service.py &>log.txt &
+```
+
+## 测试
+```
+curl -X POST -k http://localhost:18082/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
+```
--- a/python/examples/pipeline/imagenet/config.yml
+++ b/python/examples/pipeline/imagenet/config.yml
+#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程，每个进程内构建grpcSever和DAG
+##当build_dag_each_worker=False时，框架会设置主线程grpc线程池的max_workers=worker_num
+worker_num: 1
+
+#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时，不自动生成http_port
+http_port: 18082
+rpc_port: 9999
+
+dag:
+    #op资源类型, True, 为线程模型；False，为进程模型
+    is_thread_op: False
+op:
+    imagenet:
+        #当op配置没有server_endpoints时，从local_service_conf读取本地服务配置
+        local_service_conf:
+
+            #并发数，is_thread_op=True时，为线程并发；否则为进程并发
+            concurrency: 2
+
+            #uci模型路径
+            model_config: ResNet50_vd_model
+
+            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
+            devices: "0" # "0,1"
+
+            #client类型，包括brpc, grpc和local_predictor.local_predictor不启动Serving服务，进程内预测
+            client_type: local_predictor
+
+            #Fetch结果列表，以client_config中fetch_var的alias_name为准
+            fetch_list: ["score"] 
--- a/python/examples/pipeline/imagenet/daisy.jpg
+++ b/python/examples/pipeline/imagenet/daisy.jpg
--- a/python/examples/pipeline/imagenet/get_model.sh
+++ b/python/examples/pipeline/imagenet/get_model.sh
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imagenet-example/ResNet50_vd.tar.gz
+tar -xzvf ResNet50_vd.tar.gz
+
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imagenet-example/image_data.tar.gz
+tar -xzvf image_data.tar.gz
--- a/python/examples/pipeline/imagenet/imagenet.label
+++ b/python/examples/pipeline/imagenet/imagenet.label
--- a/python/examples/pipeline/imagenet/pipeline_rpc_client.py
+++ b/python/examples/pipeline/imagenet/pipeline_rpc_client.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_server_gpu.pipeline import PipelineClient
+import numpy as np
+import requests
+import json
+import cv2
+import base64
+import os
+
+client = PipelineClient()
+client.connect(['127.0.0.1:9999'])
+
+
+def cv2_to_base64(image):
+    return base64.b64encode(image).decode('utf8')
+
+
+with open("daisy.jpg", 'rb') as file:
+    image_data = file.read()
+image = cv2_to_base64(image_data)
+
+for i in range(1):
+    ret = client.predict(feed_dict={"image": image}, fetch=["label", "prob"])
+    print(ret)
--- a/python/examples/pipeline/imagenet/resnet50_web_service.py
+++ b/python/examples/pipeline/imagenet/resnet50_web_service.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage
+try:
+    from paddle_serving_server_gpu.web_service import WebService, Op
+except ImportError:
+    from paddle_serving_server.web_service import WebService, Op
+import logging
+import numpy as np
+import base64, cv2
+
+
+class ImagenetOp(Op):
+    def init_op(self):
+        self.seq = Sequential([
+            Resize(256), CenterCrop(224), RGB2BGR(), Transpose((2, 0, 1)),
+            Div(255), Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225],
+                                True)
+        ])
+        self.label_dict = {}
+        label_idx = 0
+        with open("imagenet.label") as fin:
+            for line in fin:
+                self.label_dict[label_idx] = line.strip()
+                label_idx += 1
+
+    def preprocess(self, input_dicts, data_id, log_id):
+        (_, input_dict), = input_dicts.items()
+        data = base64.b64decode(input_dict["image"].encode('utf8'))
+        data = np.fromstring(data, np.uint8)
+        # Note: class variables(self.var) can only be used in process op mode
+        im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+        img = self.seq(im)
+        return {"image": img[np.newaxis, :].copy()}, False, None, ""
+
+    def postprocess(self, input_dicts, fetch_dict, log_id):
+        print(fetch_dict)
+        score_list = fetch_dict["score"]
+        result = {"label": [], "prob": []}
+        for score in score_list:
+            score = score.tolist()
+            max_score = max(score)
+            result["label"].append(self.label_dict[score.index(max_score)]
+                                   .strip().replace(",", ""))
+            result["prob"].append(max_score)
+        result["label"] = str(result["label"])
+        result["prob"] = str(result["prob"])
+        return result, None, ""
+
+
+class ImageService(WebService):
+    def get_pipeline_response(self, read_op):
+        image_op = ImagenetOp(name="imagenet", input_ops=[read_op])
+        return image_op
+
+
+uci_service = ImageService(name="imagenet")
+uci_service.prepare_pipeline_config("config.yml")
+uci_service.run_service()
--- a/python/examples/pipeline/imdb_model_ensemble/README_CN.md
+++ b/python/examples/pipeline/imdb_model_ensemble/README_CN.md
@@ -8,8 +8,8 @@ sh get_data.sh
 ## 启动服务

 ```
-python -m paddle_serving_server_gpu.serve --model imdb_cnn_model --port 9292 &> cnn.log &
-python -m paddle_serving_server_gpu.serve --model imdb_bow_model --port 9393 &> bow.log &
+python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
+python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
 python test_pipeline_server.py &>pipeline.log &
 ```

@@ -17,8 +17,3 @@ python test_pipeline_server.py &>pipeline.log &
 ```
 python test_pipeline_client.py
 ```
-
-## HTTP 测试
-```
-curl -X POST -k http://localhost:9999/prediction -d '{"key": ["words"], "value": ["i am very sad | 0"]}'
-```
--- a/python/examples/pipeline/imdb_model_ensemble/config.yml
+++ b/python/examples/pipeline/imdb_model_ensemble/config.yml
-rpc_port: 18085
+#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时，会自动将rpc_port设置为http_port+1
+rpc_port: 18070
+
+#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时，不自动生成http_port
+http_port: 18071
+
+#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程，每个进程内构建grpcSever和DAG
+#当build_dag_each_worker=False时，框架会设置主线程grpc线程池的max_workers=worker_num
 worker_num: 4
-build_dag_each_worker: false
-http_port: 9999
+
+#build_dag_each_worker, False，框架在进程内创建一条DAG；True，框架会每个进程内创建多个独立的DAG
+build_dag_each_worker: False
+
 dag:
-    is_thread_op: false
-    client_type: brpc
+    #op资源类型, True, 为线程模型；False，为进程模型
+    is_thread_op: True
+
+    #重试次数
    retry: 1
-    use_profile: false
+
+    #使用性能分析, True，生成Timeline性能数据，对性能有一定影响；False为不使用
+    use_profile: False
+
+    #channel的最大长度，默认为0
+    channel_size: 0
+
+    #tracer, 跟踪框架吞吐，每个OP和channel的工作情况。无tracer时不生成数据
    tracer:
+        #每次trace的时间间隔，单位秒/s
        interval_s: 10
+op:
+    bow:
+        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
+        concurrency: 1
+    
+        #client连接类型，brpc
+        client_type: brpc
+
+        #Serving交互重试次数，默认不重试
+        retry: 1
+
+        #Serving交互超时时间, 单位ms
+        timeout: 3000
+
+        #Serving IPs
+        server_endpoints: ["127.0.0.1:9393"]
+
+        #bow模型client端配置
+        client_config: "imdb_bow_client_conf/serving_client_conf.prototxt"
+
+        #Fetch结果列表，以client_config中fetch_var的alias_name为准
+        fetch_list: ["prediction"]
+
+        #批量查询Serving的数量, 默认1。batch_size>1要设置auto_batching_timeout，否则不足batch_size时会阻塞
+        batch_size: 1
+
+        #批量查询超时，与batch_size配合使用
+        auto_batching_timeout: 2000
+    cnn:
+        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
+        concurrency: 1
+
+        #client连接类型，brpc
+        client_type: brpc
+
+        #Serving交互重试次数，默认不重试
+        retry: 1
+
+        #超时时间, 单位ms
+        timeout: 3000
+
+        #Serving IPs
+        server_endpoints: ["127.0.0.1:9292"]
+
+        #cnn模型client端配置
+        client_config: "imdb_cnn_client_conf/serving_client_conf.prototxt"
+
+        #Fetch结果列表，以client_config中fetch_var的alias_name为准
+        fetch_list: ["prediction"]
+        
+        #批量查询Serving的数量, 默认1。batch_size>1要设置auto_batching_timeout，否则不足batch_size时会阻塞
+        batch_size: 1
+
+        #批量查询超时，与batch_size配合使用
+        auto_batching_timeout: 2000
+    combine:
+        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
+        concurrency: 1
+
+        #Serving交互重试次数，默认不重试
+        retry: 1
+
+        #超时时间, 单位ms
+        timeout: 3000
+        
+        #批量查询Serving的数量, 默认1。batch_size>1要设置auto_batching_timeout，否则不足batch_size时会阻塞
+        batch_size: 1
+
+        #批量查询超时，与batch_size配合使用
+        auto_batching_timeout: 2000
--- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py
+++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py
@@ -15,21 +15,22 @@ from paddle_serving_server.pipeline import PipelineClient
 import numpy as np

 client = PipelineClient()
-client.connect(['127.0.0.1:18080'])
+client.connect(['127.0.0.1:18070'])

 words = 'i am very sad | 0'

 futures = []
-for i in range(4):
+for i in range(100):
    futures.append(
        client.predict(
-            feed_dict={"words": words},
+            feed_dict={"words": words,
+                       "logid": 10000 + i},
            fetch=["prediction"],
            asyn=True,
            profile=False))

 for f in futures:
    res = f.result()
-    if res["ecode"] != 0:
+    if res.err_no != 0:
        print("predict failed: {}".format(res))
    print(res)
--- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
+++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
@@ -15,10 +15,14 @@
 from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
 from paddle_serving_server.pipeline import PipelineServer
 from paddle_serving_server.pipeline.proto import pipeline_service_pb2
-from paddle_serving_server.pipeline.channel import ChannelDataEcode
+from paddle_serving_server.pipeline.channel import ChannelDataErrcode
 import numpy as np
-from paddle_serving_app.reader import IMDBDataset
+from paddle_serving_app.reader.imdb_reader import IMDBDataset
 import logging
+try:
+    from paddle_serving_server.web_service import WebService
+except ImportError:
+    from paddle_serving_server_gpu.web_service import WebService

 _LOGGER = logging.getLogger()
 user_handler = logging.StreamHandler()
@@ -41,74 +45,68 @@ class ImdbRequestOp(RequestOp):
                continue
            words = request.value[idx]
            word_ids, _ = self.imdb_dataset.get_words_and_label(words)
-            dictdata[key] = np.array(word_ids)
-        return dictdata
+            word_len = len(word_ids)
+            dictdata[key] = np.array(word_ids).reshape(word_len, 1)
+            dictdata["{}.lod".format(key)] = np.array([0, word_len])
+
+        log_id = None
+        if request.logid is not None:
+            log_id = request.logid
+        return dictdata, log_id, None, ""


 class CombineOp(Op):
-    def preprocess(self, input_data):
+    def preprocess(self, input_data, data_id, log_id):
+        #_LOGGER.info("Enter CombineOp::preprocess")
        combined_prediction = 0
        for op_name, data in input_data.items():
            _LOGGER.info("{}: {}".format(op_name, data["prediction"]))
            combined_prediction += data["prediction"]
        data = {"prediction": combined_prediction / 2}
-        return data
+        return data, False, None, ""


 class ImdbResponseOp(ResponseOp):
    # Here ImdbResponseOp is consistent with the default ResponseOp implementation
    def pack_response_package(self, channeldata):
        resp = pipeline_service_pb2.Response()
-        resp.ecode = channeldata.ecode
-        if resp.ecode == ChannelDataEcode.OK.value:
+        resp.err_no = channeldata.error_code
+        if resp.err_no == ChannelDataErrcode.OK.value:
            feed = channeldata.parse()
            # ndarray to string
            for name, var in feed.items():
                resp.value.append(var.__repr__())
                resp.key.append(name)
        else:
-            resp.error_info = channeldata.error_info
+            resp.err_msg = channeldata.error_info
        return resp


 read_op = ImdbRequestOp()
-bow_op = Op(name="bow",
-            input_ops=[read_op],
-            server_endpoints=["127.0.0.1:9393"],
-            fetch_list=["prediction"],
-            client_config="imdb_bow_client_conf/serving_client_conf.prototxt",
-            concurrency=1,
-            timeout=-1,
-            retry=1,
-            batch_size=3,
-            auto_batching_timeout=1000)
-cnn_op = Op(name="cnn",
-            input_ops=[read_op],
-            server_endpoints=["127.0.0.1:9292"],
-            fetch_list=["prediction"],
-            client_config="imdb_cnn_client_conf/serving_client_conf.prototxt",
-            concurrency=1,
-            timeout=-1,
-            retry=1,
-            batch_size=1,
-            auto_batching_timeout=None)
-combine_op = CombineOp(
-    name="combine",
-    input_ops=[bow_op, cnn_op],
-    concurrency=1,
-    timeout=-1,
-    retry=1,
-    batch_size=2,
-    auto_batching_timeout=None)
+
+
+class BowOp(Op):
+    def init_op(self):
+        pass
+
+
+class CnnOp(Op):
+    def init_op(self):
+        pass
+
+
+bow_op = BowOp("bow", input_ops=[read_op])
+cnn_op = CnnOp("cnn", input_ops=[read_op])
+combine_op = CombineOp("combine", input_ops=[bow_op, cnn_op])

 # fetch output of bow_op
-# response_op = ImdbResponseOp(input_ops=[bow_op])
+#response_op = ImdbResponseOp(input_ops=[bow_op])

 # fetch output of combine_op
 response_op = ImdbResponseOp(input_ops=[combine_op])

 # use default ResponseOp implementation
-# response_op = ResponseOp(input_ops=[combine_op])
+#response_op = ResponseOp(input_ops=[combine_op])

 server = PipelineServer()
 server.set_response_op(response_op)

--- a/python/examples/pipeline/ocr/README.md
+++ b/python/examples/pipeline/ocr/README.md
@@ -28,31 +28,9 @@ python web_service.py &>log.txt &
 python pipeline_http_client.py
 ```

-
-
 <!--
 ## More (PipelineServing)

-You can choose one of the following versions to start Service.
-
-### Remote Service Version
-```
-python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 12000 --gpu_id 0 &> det.log &
-python -m paddle_serving_server_gpu.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python remote_service_pipeline_server.py &>pipeline.log &
-```
-
-### Local Service Version
-```
-python local_service_pipeline_server.py &>pipeline.log &
-```
-
-### Hybrid Service Version
-```
-python -m paddle_serving_server_gpu.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python hybrid_service_pipeline_server.py &>pipeline.log &
-```
-
 ## Client Prediction

 ### RPC

--- a/python/examples/pipeline/ocr/README_CN.md
+++ b/python/examples/pipeline/ocr/README_CN.md
@@ -31,26 +31,6 @@ python pipeline_http_client.py
 <!--
 ## 其他 (PipelineServing)

-你可以选择下面任意一种版本启动服务。
-
-### 远程服务版本
-```
-python -m paddle_serving_server.serve --model ocr_det_model --port 12000 --gpu_id 0 &> det.log &
-python -m paddle_serving_server.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python remote_service_pipeline_server.py &>pipeline.log &
-```
-
-### 本地服务版本
-```
-python local_service_pipeline_server.py &>pipeline.log &
-```
-
-### 混合服务版本
-```
-python -m paddle_serving_server_gpu.serve --model ocr_rec_model --port 12001 --gpu_id 0 &> rec.log &
-python hybrid_service_pipeline_server.py &>pipeline.log &
-```
-
 ## 启动客户端

 ### RPC

--- a/python/examples/pipeline/ocr/config.yml
+++ b/python/examples/pipeline/ocr/config.yml
-rpc_port: 18080
-worker_num: 4
-build_dag_each_worker: false
+#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时，会自动将rpc_port设置为http_port+1
+rpc_port: 18090
+
+#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时，不自动生成http_port
 http_port: 9999
+
+#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程，每个进程内构建grpcSever和DAG
+##当build_dag_each_worker=False时，框架会设置主线程grpc线程池的max_workers=worker_num
+worker_num: 1
+
+#build_dag_each_worker, False，框架在进程内创建一条DAG；True，框架会每个进程内创建多个独立的DAG
+build_dag_each_worker: false
+
 dag:
-    is_thread_op: false
-    client_type: brpc
+    #op资源类型, True, 为线程模型；False，为进程模型
+    is_thread_op: False
+
+    #重试次数
    retry: 1
+
+    #使用性能分析, True，生成Timeline性能数据，对性能有一定影响；False为不使用
    use_profile: false
 op:
    det:
+        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
        concurrency: 2
+
+        #当op配置没有server_endpoints时，从local_service_conf读取本地服务配置
        local_service_conf:
+            #client类型，包括brpc, grpc和local_predictor.local_predictor不启动Serving服务，进程内预测
+            client_type: local_predictor
+
+            #det模型路径
            model_config: ocr_det_model
+
+            #Fetch结果列表，以client_config中fetch_var的alias_name为准
+            fetch_list: ["concat_1.tmp_0"]
+
+            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
            devices: "0"
    rec:
-        concurrency: 1
+        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
+        concurrency: 2
+
+        #超时时间, 单位ms
        timeout: -1
+ 
+        #Serving交互重试次数，默认不重试
        retry: 1
+
+        #当op配置没有server_endpoints时，从local_service_conf读取本地服务配置
        local_service_conf:
+
+            #client类型，包括brpc, grpc和local_predictor。local_predictor不启动Serving服务，进程内预测
+            client_type: local_predictor
+
+            #rec模型路径
            model_config: ocr_rec_model
+
+            #Fetch结果列表，以client_config中fetch_var的alias_name为准
+            fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"] 
+
+            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
            devices: "0"
--- a/python/examples/pipeline/ocr/hybrid_service_pipeline_server.py
+++ b/python/examples/pipeline/ocr/hybrid_service_pipeline_server.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp
-from paddle_serving_server_gpu.pipeline import PipelineServer
-from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2
-from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode
-from paddle_serving_server_gpu.pipeline import LocalRpcServiceHandler
-import numpy as np
-import cv2
-import time
-import base64
-import json
-from paddle_serving_app.reader import OCRReader
-from paddle_serving_app.reader import Sequential, ResizeByFactor
-from paddle_serving_app.reader import Div, Normalize, Transpose
-from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
-import time
-import re
-import base64
-import logging
-
-_LOGGER = logging.getLogger()
-
-
-class DetOp(Op):
-    def init_op(self):
-        self.det_preprocess = Sequential([
-            ResizeByFactor(32, 960), Div(255),
-            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
-                (2, 0, 1))
-        ])
-        self.filter_func = FilterBoxes(10, 10)
-        self.post_func = DBPostProcess({
-            "thresh": 0.3,
-            "box_thresh": 0.5,
-            "max_candidates": 1000,
-            "unclip_ratio": 1.5,
-            "min_size": 3
-        })
-
-    def preprocess(self, input_dicts):
-        (_, input_dict), = input_dicts.items()
-        data = base64.b64decode(input_dict["image"].encode('utf8'))
-        data = np.fromstring(data, np.uint8)
-        # Note: class variables(self.var) can only be used in process op mode
-        self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
-        self.ori_h, self.ori_w, _ = self.im.shape
-        det_img = self.det_preprocess(self.im)
-        _, self.new_h, self.new_w = det_img.shape
-        return {"image": det_img}
-
-    def postprocess(self, input_dicts, fetch_dict):
-        det_out = fetch_dict["concat_1.tmp_0"]
-        ratio_list = [
-            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
-        ]
-        dt_boxes_list = self.post_func(det_out, [ratio_list])
-        dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
-        out_dict = {"dt_boxes": dt_boxes, "image": self.im}
-        return out_dict
-
-
-class RecOp(Op):
-    def init_op(self):
-        self.ocr_reader = OCRReader()
-        self.get_rotate_crop_image = GetRotateCropImage()
-        self.sorted_boxes = SortedBoxes()
-
-    def preprocess(self, input_dicts):
-        (_, input_dict), = input_dicts.items()
-        im = input_dict["image"]
-        dt_boxes = input_dict["dt_boxes"]
-        dt_boxes = self.sorted_boxes(dt_boxes)
-        feed_list = []
-        img_list = []
-        max_wh_ratio = 0
-        for i, dtbox in enumerate(dt_boxes):
-            boximg = self.get_rotate_crop_image(im, dt_boxes[i])
-            img_list.append(boximg)
-            h, w = boximg.shape[0:2]
-            wh_ratio = w * 1.0 / h
-            max_wh_ratio = max(max_wh_ratio, wh_ratio)
-        for img in img_list:
-            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
-            feed = {"image": norm_img}
-            feed_list.append(feed)
-        return feed_list
-
-    def postprocess(self, input_dicts, fetch_dict):
-        rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
-        res_lst = []
-        for res in rec_res:
-            res_lst.append(res[0])
-        res = {"res": str(res_lst)}
-        return res
-
-
-read_op = RequestOp()
-det_op = DetOp(
-    name="det",
-    input_ops=[read_op],
-    local_rpc_service_handler=LocalRpcServiceHandler(
-        model_config="ocr_det_model",
-        workdir="det_workdir",  # defalut: "workdir"
-        thread_num=2,  # defalut: 2
-        devices="0",  # gpu0. defalut: "" (cpu)
-        mem_optim=True,  # defalut: True
-        ir_optim=False,  # defalut: False
-        available_port_generator=None),  # defalut: None
-    concurrency=1)
-rec_op = RecOp(
-    name="rec",
-    input_ops=[det_op],
-    server_endpoints=["127.0.0.1:12001"],
-    fetch_list=["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"],
-    client_config="ocr_rec_client/serving_client_conf.prototxt",
-    concurrency=1)
-response_op = ResponseOp(input_ops=[rec_op])
-
-server = PipelineServer("ocr")
-server.set_response_op(response_op)
-server.prepare_server('config.yml')
-server.run_server()
--- a/python/examples/pipeline/ocr/local_service_pipeline_server.py
+++ b/python/examples/pipeline/ocr/local_service_pipeline_server.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp
-from paddle_serving_server_gpu.pipeline import PipelineServer
-from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2
-from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode
-from paddle_serving_server_gpu.pipeline import LocalRpcServiceHandler
-import numpy as np
-import cv2
-import time
-import base64
-import json
-from paddle_serving_app.reader import OCRReader
-from paddle_serving_app.reader import Sequential, ResizeByFactor
-from paddle_serving_app.reader import Div, Normalize, Transpose
-from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
-import time
-import re
-import base64
-import logging
-
-_LOGGER = logging.getLogger()
-
-
-class DetOp(Op):
-    def init_op(self):
-        self.det_preprocess = Sequential([
-            ResizeByFactor(32, 960), Div(255),
-            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
-                (2, 0, 1))
-        ])
-        self.filter_func = FilterBoxes(10, 10)
-        self.post_func = DBPostProcess({
-            "thresh": 0.3,
-            "box_thresh": 0.5,
-            "max_candidates": 1000,
-            "unclip_ratio": 1.5,
-            "min_size": 3
-        })
-
-    def preprocess(self, input_dicts):
-        (_, input_dict), = input_dicts.items()
-        data = base64.b64decode(input_dict["image"].encode('utf8'))
-        data = np.fromstring(data, np.uint8)
-        # Note: class variables(self.var) can only be used in process op mode
-        self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
-        self.ori_h, self.ori_w, _ = self.im.shape
-        det_img = self.det_preprocess(self.im)
-        _, self.new_h, self.new_w = det_img.shape
-        return {"image": det_img}
-
-    def postprocess(self, input_dicts, fetch_dict):
-        det_out = fetch_dict["concat_1.tmp_0"]
-        ratio_list = [
-            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
-        ]
-        dt_boxes_list = self.post_func(det_out, [ratio_list])
-        dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
-        out_dict = {"dt_boxes": dt_boxes, "image": self.im}
-        return out_dict
-
-
-class RecOp(Op):
-    def init_op(self):
-        self.ocr_reader = OCRReader()
-        self.get_rotate_crop_image = GetRotateCropImage()
-        self.sorted_boxes = SortedBoxes()
-
-    def preprocess(self, input_dicts):
-        (_, input_dict), = input_dicts.items()
-        im = input_dict["image"]
-        dt_boxes = input_dict["dt_boxes"]
-        dt_boxes = self.sorted_boxes(dt_boxes)
-        feed_list = []
-        img_list = []
-        max_wh_ratio = 0
-        for i, dtbox in enumerate(dt_boxes):
-            boximg = self.get_rotate_crop_image(im, dt_boxes[i])
-            img_list.append(boximg)
-            h, w = boximg.shape[0:2]
-            wh_ratio = w * 1.0 / h
-            max_wh_ratio = max(max_wh_ratio, wh_ratio)
-        for img in img_list:
-            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
-            feed = {"image": norm_img}
-            feed_list.append(feed)
-        return feed_list
-
-    def postprocess(self, input_dicts, fetch_dict):
-        rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
-        res_lst = []
-        for res in rec_res:
-            res_lst.append(res[0])
-        res = {"res": str(res_lst)}
-        return res
-
-
-read_op = RequestOp()
-det_op = DetOp(
-    name="det",
-    input_ops=[read_op],
-    local_rpc_service_handler=LocalRpcServiceHandler(
-        model_config="ocr_det_model",
-        workdir="det_workdir",  # defalut: "workdir"
-        thread_num=2,  # defalut: 2
-        devices="0",  # gpu0. defalut: "" (cpu)
-        mem_optim=True,  # defalut: True
-        ir_optim=False,  # defalut: False
-        available_port_generator=None),  # defalut: None
-    concurrency=1)
-rec_op = RecOp(
-    name="rec",
-    input_ops=[det_op],
-    local_rpc_service_handler=LocalRpcServiceHandler(
-        model_config="ocr_rec_model"),
-    concurrency=1)
-response_op = ResponseOp(input_ops=[rec_op])
-
-server = PipelineServer("ocr")
-server.set_response_op(response_op)
-server.prepare_server('config.yml')
-server.run_server()
--- a/python/examples/pipeline/ocr/pipeline_http_client.py
+++ b/python/examples/pipeline/ocr/pipeline_http_client.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle_serving_server_gpu.pipeline import PipelineClient
+from paddle_serving_server.pipeline import PipelineClient
 import numpy as np
 import requests
 import json

--- a/python/examples/pipeline/ocr/pipeline_rpc_client.py
+++ b/python/examples/pipeline/ocr/pipeline_rpc_client.py
@@ -20,7 +20,7 @@ import base64
 import os

 client = PipelineClient()
-client.connect(['127.0.0.1:18080'])
+client.connect(['127.0.0.1:18090'])


 def cv2_to_base64(image):
@@ -33,6 +33,6 @@ for img_file in os.listdir(test_img_dir):
        image_data = file.read()
    image = cv2_to_base64(image_data)

-for i in range(4):
+for i in range(1):
    ret = client.predict(feed_dict={"image": image}, fetch=["res"])
    print(ret)
--- a/python/examples/pipeline/ocr/remote_service_pipeline_server.py
+++ b/python/examples/pipeline/ocr/remote_service_pipeline_server.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# pylint: disable=doc-string-missing
-from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp
-from paddle_serving_server_gpu.pipeline import PipelineServer
-from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2
-from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode
-import numpy as np
-import cv2
-import time
-import base64
-import json
-from paddle_serving_app.reader import OCRReader
-from paddle_serving_app.reader import Sequential, ResizeByFactor
-from paddle_serving_app.reader import Div, Normalize, Transpose
-from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
-import time
-import re
-import base64
-import logging
-
-_LOGGER = logging.getLogger()
-
-
-class DetOp(Op):
-    def init_op(self):
-        self.det_preprocess = Sequential([
-            ResizeByFactor(32, 960), Div(255),
-            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
-                (2, 0, 1))
-        ])
-        self.filter_func = FilterBoxes(10, 10)
-        self.post_func = DBPostProcess({
-            "thresh": 0.3,
-            "box_thresh": 0.5,
-            "max_candidates": 1000,
-            "unclip_ratio": 1.5,
-            "min_size": 3
-        })
-
-    def preprocess(self, input_dicts):
-        (_, input_dict), = input_dicts.items()
-        data = base64.b64decode(input_dict["image"].encode('utf8'))
-        data = np.fromstring(data, np.uint8)
-        # Note: class variables(self.var) can only be used in process op mode
-        self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
-        self.ori_h, self.ori_w, _ = self.im.shape
-        det_img = self.det_preprocess(self.im)
-        _, self.new_h, self.new_w = det_img.shape
-        return {"image": det_img}
-
-    def postprocess(self, input_dicts, fetch_dict):
-        det_out = fetch_dict["concat_1.tmp_0"]
-        ratio_list = [
-            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
-        ]
-        dt_boxes_list = self.post_func(det_out, [ratio_list])
-        dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
-        out_dict = {"dt_boxes": dt_boxes, "image": self.im}
-        return out_dict
-
-
-class RecOp(Op):
-    def init_op(self):
-        self.ocr_reader = OCRReader()
-        self.get_rotate_crop_image = GetRotateCropImage()
-        self.sorted_boxes = SortedBoxes()
-
-    def preprocess(self, input_dicts):
-        (_, input_dict), = input_dicts.items()
-        im = input_dict["image"]
-        dt_boxes = input_dict["dt_boxes"]
-        dt_boxes = self.sorted_boxes(dt_boxes)
-        feed_list = []
-        img_list = []
-        max_wh_ratio = 0
-        for i, dtbox in enumerate(dt_boxes):
-            boximg = self.get_rotate_crop_image(im, dt_boxes[i])
-            img_list.append(boximg)
-            h, w = boximg.shape[0:2]
-            wh_ratio = w * 1.0 / h
-            max_wh_ratio = max(max_wh_ratio, wh_ratio)
-        for img in img_list:
-            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
-            feed = {"image": norm_img}
-            feed_list.append(feed)
-        return feed_list
-
-    def postprocess(self, input_dicts, fetch_dict):
-        rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
-        res_lst = []
-        for res in rec_res:
-            res_lst.append(res[0])
-        res = {"res": str(res_lst)}
-        return res
-
-
-read_op = RequestOp()
-det_op = DetOp(
-    name="det",
-    input_ops=[read_op],
-    server_endpoints=["127.0.0.1:12000"],
-    fetch_list=["concat_1.tmp_0"],
-    client_config="ocr_det_client/serving_client_conf.prototxt",
-    concurrency=1)
-rec_op = RecOp(
-    name="rec",
-    input_ops=[det_op],
-    server_endpoints=["127.0.0.1:12001"],
-    fetch_list=["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"],
-    client_config="ocr_rec_client/serving_client_conf.prototxt",
-    concurrency=1)
-response_op = ResponseOp(input_ops=[rec_op])
-
-server = PipelineServer("ocr")
-server.set_response_op(response_op)
-server.prepare_server('config.yml')
-server.run_server()
--- a/python/examples/pipeline/ocr/web_service.py
+++ b/python/examples/pipeline/ocr/web_service.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 try:
-    from paddle_serving_server_gpu.web_service import WebService, Op
-except ImportError:
    from paddle_serving_server.web_service import WebService, Op
+except ImportError:
+    from paddle_serving_server_gpu.web_service import WebService, Op
 import logging
 import numpy as np
 import cv2
@@ -43,7 +43,7 @@ class DetOp(Op):
            "min_size": 3
        })

-    def preprocess(self, input_dicts):
+    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        data = base64.b64decode(input_dict["image"].encode('utf8'))
        data = np.fromstring(data, np.uint8)
@@ -52,9 +52,9 @@ class DetOp(Op):
        self.ori_h, self.ori_w, _ = self.im.shape
        det_img = self.det_preprocess(self.im)
        _, self.new_h, self.new_w = det_img.shape
-        return {"image": det_img}
+        return {"image": det_img[np.newaxis, :].copy()}, False, None, ""

-    def postprocess(self, input_dicts, fetch_dict):
+    def postprocess(self, input_dicts, fetch_dict, log_id):
        det_out = fetch_dict["concat_1.tmp_0"]
        ratio_list = [
            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
@@ -62,7 +62,8 @@ class DetOp(Op):
        dt_boxes_list = self.post_func(det_out, [ratio_list])
        dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
        out_dict = {"dt_boxes": dt_boxes, "image": self.im}
-        return out_dict
+        print("out dict", out_dict)
+        return out_dict, None, ""


 class RecOp(Op):
@@ -71,7 +72,7 @@ class RecOp(Op):
        self.get_rotate_crop_image = GetRotateCropImage()
        self.sorted_boxes = SortedBoxes()

-    def preprocess(self, input_dicts):
+    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
        im = input_dict["image"]
        dt_boxes = input_dict["dt_boxes"]
@@ -85,19 +86,22 @@ class RecOp(Op):
            h, w = boximg.shape[0:2]
            wh_ratio = w * 1.0 / h
            max_wh_ratio = max(max_wh_ratio, wh_ratio)
-        for img in img_list:
+        _, w, h = self.ocr_reader.resize_norm_img(img_list[0],
+                                                  max_wh_ratio).shape
+        imgs = np.zeros((len(img_list), 3, w, h)).astype('float32')
+        for id, img in enumerate(img_list):
            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
-            feed = {"image": norm_img}
-            feed_list.append(feed)
-        return feed_list
+            imgs[id] = norm_img
+        feed = {"image": imgs.copy()}
+        return feed, False, None, ""

-    def postprocess(self, input_dicts, fetch_dict):
+    def postprocess(self, input_dicts, fetch_dict, log_id):
        rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True)
        res_lst = []
        for res in rec_res:
            res_lst.append(res[0])
        res = {"res": str(res_lst)}
-        return res
+        return res, None, ""


 class OcrService(WebService):

--- a/python/examples/pipeline/simple_web_service/README.md
+++ b/python/examples/pipeline/simple_web_service/README.md
@@ -15,5 +15,5 @@ python web_service.py &>log.txt &

 ## Http test
 ```
-curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
+curl -X POST -k http://localhost:18082/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
 ```
--- a/python/examples/pipeline/simple_web_service/README_CN.md
+++ b/python/examples/pipeline/simple_web_service/README_CN.md
@@ -15,5 +15,5 @@ python web_service.py &>log.txt &

 ## 测试
 ```
-curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
+curl -X POST -k http://localhost:18082/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}'
 ```
--- a/python/examples/pipeline/simple_web_service/config.yml
+++ b/python/examples/pipeline/simple_web_service/config.yml
-worker_num: 4
-http_port: 18080
+#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程，每个进程内构建grpcSever和DAG
+##当build_dag_each_worker=False时，框架会设置主线程grpc线程池的max_workers=worker_num
+worker_num: 1
+
+#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时，不自动生成http_port
+http_port: 18082
+
 dag:
-    is_thread_op: false
+    #op资源类型, True, 为线程模型；False，为进程模型
+    is_thread_op: False
 op:
    uci:
+        #当op配置没有server_endpoints时，从local_service_conf读取本地服务配置
        local_service_conf:
+
+            #并发数，is_thread_op=True时，为线程并发；否则为进程并发
+            concurrency: 2
+
+            #uci模型路径
            model_config: uci_housing_model
-            devices: "" # "0,1"
+
+            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
+            devices: "0" # "0,1"
+
+            #client类型，包括brpc, grpc和local_predictor.local_predictor不启动Serving服务，进程内预测
+            client_type: local_predictor
+
+            #Fetch结果列表，以client_config中fetch_var的alias_name为准
+            fetch_list: ["price"] 
--- a/python/examples/pipeline/simple_web_service/web_service.py
+++ b/python/examples/pipeline/simple_web_service/web_service.py
@@ -25,19 +25,25 @@ class UciOp(Op):
    def init_op(self):
        self.separator = ","

-    def preprocess(self, input_dicts):
+    def preprocess(self, input_dicts, data_id, log_id):
        (_, input_dict), = input_dicts.items()
-        _LOGGER.info(input_dict)
+        _LOGGER.error("UciOp::preprocess >>> log_id:{}, input:{}".format(
+            log_id, input_dict))
        x_value = input_dict["x"]
+        proc_dict = {}
        if isinstance(x_value, (str, unicode)):
            input_dict["x"] = np.array(
-                [float(x.strip()) for x in x_value.split(self.separator)])
-        return input_dict
+                [float(x.strip())
+                 for x in x_value.split(self.separator)]).reshape(1, 13)
+            _LOGGER.error("input_dict:{}".format(input_dict))

-    def postprocess(self, input_dicts, fetch_dict):
-        # _LOGGER.info(fetch_dict)
+        return input_dict, False, None, ""
+
+    def postprocess(self, input_dicts, fetch_dict, log_id):
+        _LOGGER.info("UciOp::postprocess >>> log_id:{}, fetch_dict:{}".format(
+            log_id, fetch_dict))
        fetch_dict["price"] = str(fetch_dict["price"][0][0])
-        return fetch_dict
+        return fetch_dict, None, ""


 class UciService(WebService):

--- a/python/examples/resnet_v2_50/resnet50_debug.py
+++ b/python/examples/resnet_v2_50/resnet50_debug.py
@@ -14,10 +14,10 @@

 from paddle_serving_app.reader import Sequential, File2Image, Resize, CenterCrop
 from paddle_serving_app.reader import RGB2BGR, Transpose, Div, Normalize
-from paddle_serving_app.local_predict import Debugger
+from paddle_serving_app.local_predict import LocalPredictor
 import sys

-debugger = Debugger()
+debugger = LocalPredictor()
 debugger.load_model_config(sys.argv[1], gpu=True)

 seq = Sequential([

--- a/python/examples/senta/senta_web_service.py
+++ b/python/examples/senta/senta_web_service.py
@@ -18,7 +18,7 @@ from paddle_serving_client import Client
 from paddle_serving_app.reader import LACReader, SentaReader
 import os
 import sys
-
+import numpy as np
 #senta_web_service.py
 from paddle_serving_server.web_service import WebService
 from paddle_serving_client import Client
@@ -36,20 +36,36 @@ class SentaService(WebService):

    #定义senta模型预测服务的预处理，调用顺序：lac reader->lac模型预测->预测结果后处理->senta reader
    def preprocess(self, feed=[], fetch=[]):
-        feed_data = [{
-            "words": self.lac_reader.process(x["words"])
-        } for x in feed]
-        lac_result = self.lac_client.predict(
-            feed=feed_data, fetch=["crf_decode"])
        feed_batch = []
+        is_batch = True
+        words_lod = [0]
+        for ins in feed:
+            if "words" not in ins:
+                raise ("feed data error!")
+            feed_data = self.lac_reader.process(ins["words"])
+            words_lod.append(words_lod[-1] + len(feed_data))
+            feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1))
+        words = np.concatenate(feed_batch, axis=0)
+
+        lac_result = self.lac_client.predict(
+            feed={"words": words,
+                  "words.lod": words_lod},
+            fetch=["crf_decode"],
+            batch=True)
        result_lod = lac_result["crf_decode.lod"]
+        feed_batch = []
+        words_lod = [0]
        for i in range(len(feed)):
            segs = self.lac_reader.parse_result(
                feed[i]["words"],
                lac_result["crf_decode"][result_lod[i]:result_lod[i + 1]])
            feed_data = self.senta_reader.process(segs)
-            feed_batch.append({"words": feed_data})
-        return feed_batch, fetch
+            feed_batch.append(np.array(feed_data).reshape(len(feed_data), 1))
+            words_lod.append(words_lod[-1] + len(feed_data))
+        return {
+            "words": np.concatenate(feed_batch),
+            "words.lod": words_lod
+        }, fetch, is_batch


 senta_service = SentaService(name="senta")

--- a/python/examples/unet_for_image_seg/unet_benchmark/README.md
+++ b/python/examples/unet_for_image_seg/unet_benchmark/README.md
+#UNET_BENCHMARK 使用说明
+## 功能
+* benchmark测试
+## 注意事项
+* 示例图片（可以有多张）请放置于与img_data路径中，支持jpg，jpeg
+* 图片张数应该大于等于并发数量
+## TODO
+* http benchmark
--- a/python/examples/unet_for_image_seg/unet_benchmark/img_data/N0060.jpg
+++ b/python/examples/unet_for_image_seg/unet_benchmark/img_data/N0060.jpg
--- a/python/examples/unet_for_image_seg/unet_benchmark/launch_benckmark.sh
+++ b/python/examples/unet_for_image_seg/unet_benchmark/launch_benckmark.sh
+#!/bin/bash
+python unet_benchmark.py --thread 1 --batch_size 1 --model ../unet_client/serving_client_conf.prototxt
+# thread/batch can be modified as you wish 
--- a/python/examples/unet_for_image_seg/unet_benchmark/unet_benchmark.py
+++ b/python/examples/unet_for_image_seg/unet_benchmark/unet_benchmark.py
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+  unet bench mark script
+  20201130 first edition by cg82616424
+"""
+from __future__ import unicode_literals, absolute_import
+import os
+import time
+import json
+import requests
+from paddle_serving_client import Client
+from paddle_serving_client.utils import MultiThreadRunner
+from paddle_serving_client.utils import benchmark_args, show_latency
+from paddle_serving_app.reader import Sequential, File2Image, Resize, Transpose, BGR2RGB, SegPostprocess
+args = benchmark_args()
+
+
+def get_img_names(path):
+    """
+    Brief:
+        get img files(jpg) under this path
+        if any exception happened return None
+    Args:
+        path (string): image file path
+    Returns:
+        list: images names under this folder
+    """
+    if not os.path.exists(path):
+        return None
+    if not os.path.isdir(path):
+        return None
+    list_name = []
+    for f_handler in os.listdir(path):
+        file_path = os.path.join(path, f_handler)
+        if os.path.isdir(file_path):
+            continue
+        else:
+            if not file_path.endswith(".jpeg") and not file_path.endswith(
+                    ".jpg"):
+                continue
+            list_name.append(file_path)
+    return list_name
+
+
+def preprocess_img(img_list):
+    """
+    Brief:
+        prepare img data for benchmark
+    Args:
+        img_list(list): list for img file path
+    Returns:
+        image content binary list after preprocess
+    """
+    preprocess = Sequential([File2Image(), Resize((512, 512))])
+    result_list = []
+    for img in img_list:
+        img_tmp = preprocess(img)
+        result_list.append(img_tmp)
+    return result_list
+
+
+def benckmark_worker(idx, resource):
+    """
+    Brief:
+        benchmark single worker for unet
+    Args:
+        idx(int): worker idx ,use idx to select backend unet service
+        resource(dict): unet serving endpoint dict 
+    Returns:
+        latency
+    TODO:
+        http benckmarks
+    """
+    profile_flags = False
+    latency_flags = False
+    postprocess = SegPostprocess(2)
+    if os.getenv("FLAGS_profile_client"):
+        profile_flags = True
+    if os.getenv("FLAGS_serving_latency"):
+        latency_flags = True
+        latency_list = []
+    client_handler = Client()
+    client_handler.load_client_config(args.model)
+    client_handler.connect(
+        [resource["endpoint"][idx % len(resource["endpoint"])]])
+    start = time.time()
+    turns = resource["turns"]
+    img_list = resource["img_list"]
+    for i in range(turns):
+        if args.batch_size >= 1:
+            l_start = time.time()
+            feed_batch = []
+            b_start = time.time()
+            for bi in range(args.batch_size):
+                feed_batch.append({"image": img_list[bi]})
+            b_end = time.time()
+            if profile_flags:
+                sys.stderr.write(
+                    "PROFILE\tpid:{}\tunt_pre_0:{} unet_pre_1:{}\n".format(
+                        os.getpid(),
+                        int(round(b_start * 1000000)),
+                        int(round(b_end * 1000000))))
+            result = client_handler.predict(
+                feed={"image": img_list[bi]}, fetch=["output"])
+            #result["filename"] = "./img_data/N0060.jpg" % (os.getpid(), idx, time.time())
+            #postprocess(result) # if you  want to measure post process time, you have to uncomment this line
+            l_end = time.time()
+            if latency_flags:
+                latency_list.append(l_end * 1000 - l_start * 1000)
+        else:
+            print("unsupport batch size {}".format(args.batch_size))
+    end = time.time()
+    if latency_flags:
+        return [[end - start], latency_list]
+    else:
+        return [[end - start]]
+
+
+if __name__ == '__main__':
+    """
+    usage: 
+    """
+    img_file_list = get_img_names("./img_data")
+    img_content_list = preprocess_img(img_file_list)
+    multi_thread_runner = MultiThreadRunner()
+    endpoint_list = ["127.0.0.1:9494"]
+    turns = 1
+    start = time.time()
+    result = multi_thread_runner.run(benckmark_worker, args.thread, {
+        "endpoint": endpoint_list,
+        "turns": turns,
+        "img_list": img_content_list
+    })
+    end = time.time()
+    total_cost = end - start
+    avg_cost = 0
+    for i in range(args.thread):
+        avg_cost += result[0][i]
+    avg_cost = avg_cost / args.thread
+    print("total cost: {}s".format(total_cost))
+    print("each thread cost: {}s. ".format(avg_cost))
+    print("qps: {}samples/s".format(args.batch_size * args.thread * turns /
+                                    total_cost))
+    if os.getenv("FLAGS_serving_latency"):
+        show_latency(result[1])
--- a/python/examples/yolov4/test_client.py
+++ b/python/examples/yolov4/test_client.py
@@ -35,6 +35,7 @@ fetch_map = client.predict(
        "image": im,
        "im_size": np.array(list(im.shape[1:])),
    },
-    fetch=["save_infer_model/scale_0.tmp_0"])
+    fetch=["save_infer_model/scale_0.tmp_0"],
+    batch=False)
 fetch_map["image"] = sys.argv[1]
 postprocess(fetch_map)
--- a/python/paddle_serving_app/README.md
+++ b/python/paddle_serving_app/README.md
@@ -160,10 +160,10 @@ Therefore, a local prediction tool is built into the paddle_serving_app, which i
 Taking [fit_a_line prediction service](../examples/fit_a_line) as an example, the following code can be used to run local prediction.

 ```python
-from paddle_serving_app.local_predict import Debugger
+from paddle_serving_app.local_predict import LocalPredictor
 import numpy as np

-debugger = Debugger()
+debugger = LocalPredictor()
 debugger.load_model_config("./uci_housing_model", gpu=False)
 data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
        -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]

--- a/python/paddle_serving_app/README_CN.md
+++ b/python/paddle_serving_app/README_CN.md
@@ -147,10 +147,10 @@ Paddle Serving框架的server预测op使用了Paddle 的预测框架，在部署
 以[fit_a_line预测服务](../examples/fit_a_line)为例，使用以下代码即可执行本地预测。

 ```python
-from paddle_serving_app.local_predict import Debugger
+from paddle_serving_app.local_predict import LocalPredictor
 import numpy as np

-debugger = Debugger()
+debugger = LocalPredictor()
 debugger.load_model_config("./uci_housing_model", gpu=False)
 data = [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
        -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]

--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -31,7 +31,13 @@ logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)


-class Debugger(object):
+class LocalPredictor(object):
+    """
+    Prediction in the current process of the local environment, in process
+    call, Compared with RPC/HTTP, LocalPredictor has better performance, 
+    because of no network and packaging load.
+    """
+
    def __init__(self):
        self.feed_names_ = []
        self.fetch_names_ = []
@@ -42,13 +48,41 @@ class Debugger(object):
        self.fetch_names_to_idx_ = {}
        self.fetch_names_to_type_ = {}

-    def load_model_config(self, model_path, gpu=False, profile=True, cpu_num=1):
+    def load_model_config(self,
+                          model_path,
+                          use_gpu=False,
+                          gpu_id=0,
+                          use_profile=False,
+                          thread_num=1,
+                          mem_optim=True,
+                          ir_optim=False,
+                          use_trt=False,
+                          use_feed_fetch_ops=False):
+        """
+        Load model config and set the engine config for the paddle predictor
+   
+        Args:
+            model_path: model config path.
+            use_gpu: calculating with gpu, False default.
+            gpu_id: gpu id, 0 default.
+            use_profile: use predictor profiles, False default.
+            thread_num: thread nums, default 1. 
+            mem_optim: memory optimization, True default.
+            ir_optim: open calculation chart optimization, False default.
+            use_trt: use nvidia TensorRT optimization, False default
+            use_feed_fetch_ops: use feed/fetch ops, False default.
+        """
        client_config = "{}/serving_server_conf.prototxt".format(model_path)
        model_conf = m_config.GeneralModelConfig()
        f = open(client_config, 'r')
        model_conf = google.protobuf.text_format.Merge(
            str(f.read()), model_conf)
        config = AnalysisConfig(model_path)
+        logger.info("load_model_config params: model_path:{}, use_gpu:{},\
+            gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
+            use_trt:{}, use_feed_fetch_ops:{}".format(
+            model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
+            ir_optim, use_trt, use_feed_fetch_ops))

        self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
        self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -64,19 +98,43 @@ class Debugger(object):
            self.fetch_names_to_idx_[var.alias_name] = i
            self.fetch_names_to_type_[var.alias_name] = var.fetch_type

-        if not gpu:
-            config.disable_gpu()
-        else:
-            config.enable_use_gpu(100, 0)
-        if profile:
+        if use_profile:
            config.enable_profile()
+        if mem_optim:
+            config.enable_memory_optim()
+        config.switch_ir_optim(ir_optim)
+        config.set_cpu_math_library_num_threads(thread_num)
+        config.switch_use_feed_fetch_ops(use_feed_fetch_ops)
        config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
-        config.set_cpu_math_library_num_threads(cpu_num)
-        config.switch_ir_optim(False)
-        config.switch_use_feed_fetch_ops(False)
+
+        if not use_gpu:
+            config.disable_gpu()
+        else:
+            config.enable_use_gpu(100, gpu_id)
+            if use_trt:
+                config.enable_tensorrt_engine(
+                    workspace_size=1 << 20,
+                    max_batch_size=32,
+                    min_subgraph_size=3,
+                    use_static=False,
+                    use_calib_mode=False)
+
        self.predictor = create_paddle_predictor(config)

-    def predict(self, feed=None, fetch=None):
+    def predict(self, feed=None, fetch=None, batch=False, log_id=0):
+        """
+        Predict locally
+
+        Args:
+            feed: feed var
+            fetch: fetch var
+            batch: batch data or not, False default.If batch is False, a new
+                   dimension is added to header of the shape[np.newaxis].
+            log_id: for logging
+
+        Returns:
+            fetch_map: dict 
+        """
        if feed is None or fetch is None:
            raise ValueError("You should specify feed and fetch for prediction")
        fetch_list = []
@@ -121,10 +179,19 @@ class Debugger(object):
                    name])
            if self.feed_types_[name] == 0:
                feed[name] = feed[name].astype("int64")
-            else:
+            elif self.feed_types_[name] == 1:
                feed[name] = feed[name].astype("float32")
+            elif self.feed_types_[name] == 2:
+                feed[name] = feed[name].astype("int32")
+            else:
+                raise ValueError("local predictor receives wrong data type")
            input_tensor = self.predictor.get_input_tensor(name)
-            input_tensor.copy_from_cpu(feed[name])
+            if "{}.lod".format(name) in feed:
+                input_tensor.set_lod([feed["{}.lod".format(name)]])
+            if batch == False:
+                input_tensor.copy_from_cpu(feed[name][np.newaxis, :])
+            else:
+                input_tensor.copy_from_cpu(feed[name])
        output_tensors = []
        output_names = self.predictor.get_output_names()
        for output_name in output_names:
@@ -139,5 +206,6 @@ class Debugger(object):
        for i, name in enumerate(fetch):
            fetch_map[name] = outputs[i]
            if len(output_tensors[i].lod()) > 0:
-                fetch_map[name + ".lod"] = output_tensors[i].lod()[0]
+                fetch_map[name + ".lod"] = np.array(output_tensors[i].lod()[
+                    0]).astype('int32')
        return fetch_map
--- a/python/paddle_serving_app/reader/__init__.py
+++ b/python/paddle_serving_app/reader/__init__.py
@@ -18,5 +18,5 @@ from .image_reader import RCNNPostprocess, SegPostprocess, PadStride, BlazeFaceP
 from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
 from .lac_reader import LACReader
 from .senta_reader import SentaReader
-from .imdb_reader import IMDBDataset
+#from .imdb_reader import IMDBDataset
 from .ocr_reader import OCRReader
--- a/python/paddle_serving_app/reader/pddet/image_tool.py
+++ b/python/paddle_serving_app/reader/pddet/image_tool.py
@@ -22,18 +22,17 @@ import yaml
 import copy
 import argparse
 import logging
-import paddle.fluid as fluid
 import json

 FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 logger = logging.getLogger(__name__)

-precision_map = {
-    'trt_int8': fluid.core.AnalysisConfig.Precision.Int8,
-    'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
-    'trt_fp16': fluid.core.AnalysisConfig.Precision.Half
-}
+#precision_map = {
+#    'trt_int8': fluid.core.AnalysisConfig.Precision.Int8,
+#    'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
+#    'trt_fp16': fluid.core.AnalysisConfig.Precision.Half
+#}


 class Resize(object):

--- a/python/paddle_serving_client/__init__.py
+++ b/python/paddle_serving_client/__init__.py
@@ -233,7 +233,12 @@ class Client(object):
            #    key))
            pass

-    def predict(self, feed=None, fetch=None, need_variant_tag=False, log_id=0):
+    def predict(self,
+                feed=None,
+                fetch=None,
+                batch=False,
+                need_variant_tag=False,
+                log_id=0):
        self.profile_.record('py_prepro_0')

        if feed is None or fetch is None:
@@ -260,7 +265,10 @@ class Client(object):
        int_feed_names = []
        float_feed_names = []
        int_shape = []
+        int_lod_slot_batch = []
+        float_lod_slot_batch = []
        float_shape = []
+
        fetch_names = []
        counter = 0
        batch_size = len(feed_batch)
@@ -277,31 +285,56 @@ class Client(object):
        for i, feed_i in enumerate(feed_batch):
            int_slot = []
            float_slot = []
+            int_lod_slot = []
+            float_lod_slot = []
            for key in feed_i:
-                if key not in self.feed_names_:
+                if ".lod" not in key and key not in self.feed_names_:
                    raise ValueError("Wrong feed name: {}.".format(key))
+                if ".lod" in key:
+                    continue
                #if not isinstance(feed_i[key], np.ndarray):
                self.shape_check(feed_i, key)
                if self.feed_types_[key] in int_type:
                    if i == 0:
                        int_feed_names.append(key)
+                        shape_lst = []
+                        if batch == False:
+                            feed_i[key] = feed_i[key][np.newaxis, :]
                        if isinstance(feed_i[key], np.ndarray):
-                            int_shape.append(list(feed_i[key].shape))
+                            shape_lst.extend(list(feed_i[key].shape))
+                            int_shape.append(shape_lst)
                        else:
                            int_shape.append(self.feed_shapes_[key])
+                        if "{}.lod".format(key) in feed_i:
+                            int_lod_slot_batch.append(feed_i["{}.lod".format(
+                                key)])
+                        else:
+                            int_lod_slot_batch.append([])
+
                    if isinstance(feed_i[key], np.ndarray):
                        int_slot.append(feed_i[key])
                        self.has_numpy_input = True
                    else:
                        int_slot.append(feed_i[key])
                        self.all_numpy_input = False
+
                elif self.feed_types_[key] in float_type:
                    if i == 0:
                        float_feed_names.append(key)
+                        shape_lst = []
+                        if batch == False:
+                            feed_i[key] = feed_i[key][np.newaxis, :]
                        if isinstance(feed_i[key], np.ndarray):
-                            float_shape.append(list(feed_i[key].shape))
+                            shape_lst.extend(list(feed_i[key].shape))
+                            float_shape.append(shape_lst)
                        else:
                            float_shape.append(self.feed_shapes_[key])
+                        if "{}.lod".format(key) in feed_i:
+                            float_lod_slot_batch.append(feed_i["{}.lod".format(
+                                key)])
+                        else:
+                            float_lod_slot_batch.append([])
+
                    if isinstance(feed_i[key], np.ndarray):
                        float_slot.append(feed_i[key])
                        self.has_numpy_input = True
@@ -310,6 +343,8 @@ class Client(object):
                        self.all_numpy_input = False
            int_slot_batch.append(int_slot)
            float_slot_batch.append(float_slot)
+            int_lod_slot_batch.append(int_lod_slot)
+            float_lod_slot_batch.append(float_lod_slot)

        self.profile_.record('py_prepro_1')
        self.profile_.record('py_client_infer_0')
@@ -317,14 +352,13 @@ class Client(object):
        result_batch_handle = self.predictorres_constructor()
        if self.all_numpy_input:
            res = self.client_handle_.numpy_predict(
-                float_slot_batch, float_feed_names, float_shape, int_slot_batch,
-                int_feed_names, int_shape, fetch_names, result_batch_handle,
-                self.pid, log_id)
+                float_slot_batch, float_feed_names, float_shape,
+                float_lod_slot_batch, int_slot_batch, int_feed_names, int_shape,
+                int_lod_slot_batch, fetch_names, result_batch_handle, self.pid,
+                log_id)
        elif self.has_numpy_input == False:
-            res = self.client_handle_.batch_predict(
-                float_slot_batch, float_feed_names, float_shape, int_slot_batch,
-                int_feed_names, int_shape, fetch_names, result_batch_handle,
-                self.pid, log_id)
+            raise ValueError(
+                "Please make sure all of your inputs are numpy array")
        else:
            raise ValueError(
                "Please make sure the inputs are all in list type or all in numpy.array type"

--- a/python/paddle_serving_client/io/__init__.py
+++ b/python/paddle_serving_client/io/__init__.py
@@ -92,9 +92,12 @@ def save_model(server_model_folder,
            fetch_var.shape.extend(tmp_shape)
        config.fetch_var.extend([fetch_var])

-    cmd = "mkdir -p {}".format(client_config_folder)
-
-    os.system(cmd)
+    try:
+        save_dirname = os.path.normpath(client_config_folder)
+        os.makedirs(save_dirname)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
    with open("{}/serving_client_conf.prototxt".format(client_config_folder),
              "w") as fout:
        fout.write(str(config))

--- a/python/paddle_serving_server/__init__.py
+++ b/python/paddle_serving_server/__init__.py
@@ -584,7 +584,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                    else:
                        raise Exception("error type.")
                tensor.shape.extend(list(model_result[name].shape))
-                if name in self.lod_tensor_set_:
+                if "{}.lod".format(name) in model_result:
                    tensor.lod.extend(model_result["{}.lod".format(name)]
                                      .tolist())
                inst.tensor_array.append(tensor)

--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -52,6 +52,20 @@ class WebService(object):
    def load_model_config(self, model_config):
        print("This API will be deprecated later. Please do not use it")
        self.model_config = model_config
+        import os
+        from .proto import general_model_config_pb2 as m_config
+        import google.protobuf.text_format
+        if os.path.isdir(model_config):
+            client_config = "{}/serving_server_conf.prototxt".format(
+                model_config)
+        elif os.path.isfile(path):
+            client_config = model_config
+        model_conf = m_config.GeneralModelConfig()
+        f = open(client_config, 'r')
+        model_conf = google.protobuf.text_format.Merge(
+            str(f.read()), model_conf)
+        self.feed_names = [var.alias_name for var in model_conf.feed_var]
+        self.fetch_names = [var.alias_name for var in model_conf.fetch_var]

    def _launch_rpc_service(self):
        op_maker = OpMaker()
@@ -112,13 +126,14 @@ class WebService(object):
        if "fetch" not in request.json:
            abort(400)
        try:
-            feed, fetch = self.preprocess(request.json["feed"],
-                                          request.json["fetch"])
+            feed, fetch, is_batch = self.preprocess(request.json["feed"],
+                                                    request.json["fetch"])
            if isinstance(feed, dict) and "fetch" in feed:
                del feed["fetch"]
            if len(feed) == 0:
                raise ValueError("empty input")
-            fetch_map = self.client.predict(feed=feed, fetch=fetch)
+            fetch_map = self.client.predict(
+                feed=feed, fetch=fetch, batch=is_batch)
            result = self.postprocess(
                feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
            result = {"result": result}
@@ -171,24 +186,22 @@ class WebService(object):
        self.app_instance = app_instance

    def _launch_local_predictor(self):
-        from paddle_serving_app.local_predict import Debugger
-        self.client = Debugger()
+        from paddle_serving_app.local_predict import LocalPredictor
+        self.client = LocalPredictor()
        self.client.load_model_config(
            "{}".format(self.model_config), gpu=False, profile=False)

    def run_web_service(self):
        print("This API will be deprecated later. Please do not use it")
-        self.app_instance.run(host="0.0.0.0",
-                              port=self.port,
-                              threaded=False,
-                              processes=1)
+        self.app_instance.run(host="0.0.0.0", port=self.port, threaded=True)

    def get_app_instance(self):
        return self.app_instance

    def preprocess(self, feed=[], fetch=[]):
        print("This API will be deprecated later. Please do not use it")
-        return feed, fetch
+        is_batch = True
+        return feed, fetch, is_batch

    def postprocess(self, feed=[], fetch=[], fetch_map=None):
        print("This API will be deprecated later. Please do not use it")

--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -73,6 +73,8 @@ def serve_args():
        default=False,
        action="store_true",
        help="Use Multi-language-service")
+    parser.add_argument(
+        "--use_trt", default=False, action="store_true", help="Use TensorRT")
    parser.add_argument(
        "--product_name",
        type=str,
@@ -205,6 +207,7 @@ class Server(object):
        self.cur_path = os.getcwd()
        self.use_local_bin = False
        self.gpuid = 0
+        self.use_trt = False
        self.model_config_paths = None  # for multi-model in a workflow
        self.product_name = None
        self.container_id = None
@@ -271,6 +274,9 @@ class Server(object):
    def set_gpuid(self, gpuid=0):
        self.gpuid = gpuid

+    def set_trt(self):
+        self.use_trt = True
+
    def _prepare_engine(self, model_config_paths, device):
        if self.model_toolkit_conf == None:
            self.model_toolkit_conf = server_sdk.ModelToolkitConf()
@@ -290,6 +296,7 @@ class Server(object):
            engine.enable_ir_optimization = self.ir_optimization
            engine.static_optimization = False
            engine.force_update_static_cache = False
+            engine.use_trt = self.use_trt

            if device == "cpu":
                engine.type = "FLUID_CPU_ANALYSIS_DIR"
@@ -396,7 +403,10 @@ class Server(object):
        for line in version_file.readlines():
            if re.match("cuda_version", line):
                cuda_version = line.split("\"")[1]
-                device_version = "serving-gpu-cuda" + cuda_version + "-"
+                if cuda_version != "trt":
+                    device_version = "serving-gpu-cuda" + cuda_version + "-"
+                else:
+                    device_version = "serving-gpu-" + cuda_version + "-"

        folder_name = device_version + serving_server_version
        tar_name = folder_name + ".tar.gz"
@@ -645,7 +655,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                    else:
                        raise Exception("error type.")
                tensor.shape.extend(list(model_result[name].shape))
-                if name in self.lod_tensor_set_:
+                if "{}.lod".format(name) in model_result:
                    tensor.lod.extend(model_result["{}.lod".format(name)]
                                      .tolist())
                inst.tensor_array.append(tensor)

--- a/python/paddle_serving_server_gpu/serve.py
+++ b/python/paddle_serving_server_gpu/serve.py
@@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args):  # pylint: disable=doc-string-miss
    server.set_memory_optimize(mem_optim)
    server.set_ir_optimize(ir_optim)
    server.set_max_body_size(max_body_size)
+    if args.use_trt:
+        server.set_trt()

    if args.product_name != None:
        server.set_product_name(args.product_name)

--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -58,6 +58,20 @@ class WebService(object):
    def load_model_config(self, model_config):
        print("This API will be deprecated later. Please do not use it")
        self.model_config = model_config
+        import os
+        from .proto import general_model_config_pb2 as m_config
+        import google.protobuf.text_format
+        if os.path.isdir(model_config):
+            client_config = "{}/serving_server_conf.prototxt".format(
+                model_config)
+        elif os.path.isfile(path):
+            client_config = model_config
+        model_conf = m_config.GeneralModelConfig()
+        f = open(client_config, 'r')
+        model_conf = google.protobuf.text_format.Merge(
+            str(f.read()), model_conf)
+        self.feed_names = [var.alias_name for var in model_conf.feed_var]
+        self.fetch_names = [var.alias_name for var in model_conf.fetch_var]

    def set_gpus(self, gpus):
        print("This API will be deprecated later. Please do not use it")
@@ -167,13 +181,14 @@ class WebService(object):
        if "fetch" not in request.json:
            abort(400)
        try:
-            feed, fetch = self.preprocess(request.json["feed"],
-                                          request.json["fetch"])
+            feed, fetch, is_batch = self.preprocess(request.json["feed"],
+                                                    request.json["fetch"])
            if isinstance(feed, dict) and "fetch" in feed:
                del feed["fetch"]
            if len(feed) == 0:
                raise ValueError("empty input")
-            fetch_map = self.client.predict(feed=feed, fetch=fetch)
+            fetch_map = self.client.predict(
+                feed=feed, fetch=fetch, batch=is_batch)
            result = self.postprocess(
                feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
            result = {"result": result}
@@ -232,24 +247,22 @@ class WebService(object):
        self.app_instance = app_instance

    def _launch_local_predictor(self, gpu):
-        from paddle_serving_app.local_predict import Debugger
-        self.client = Debugger()
+        from paddle_serving_app.local_predict import LocalPredictor
+        self.client = LocalPredictor()
        self.client.load_model_config(
            "{}".format(self.model_config), gpu=gpu, profile=False)

    def run_web_service(self):
        print("This API will be deprecated later. Please do not use it")
-        self.app_instance.run(host="0.0.0.0",
-                              port=self.port,
-                              threaded=False,
-                              processes=4)
+        self.app_instance.run(host="0.0.0.0", port=self.port, threaded=True)

    def get_app_instance(self):
        return self.app_instance

    def preprocess(self, feed=[], fetch=[]):
        print("This API will be deprecated later. Please do not use it")
-        return feed, fetch
+        is_batch = True
+        return feed, fetch, is_batch

    def postprocess(self, feed=[], fetch=[], fetch_map=None):
        print("This API will be deprecated later. Please do not use it")

--- a/python/pipeline/__init__.py
+++ b/python/pipeline/__init__.py
@@ -15,5 +15,5 @@ from . import logger  # this module must be the first to import
 from .operator import Op, RequestOp, ResponseOp
 from .pipeline_server import PipelineServer
 from .pipeline_client import PipelineClient
-from .local_rpc_service_handler import LocalRpcServiceHandler
+from .local_service_handler import LocalServiceHandler
 from .analyse import Analyst
--- a/python/pipeline/channel.py
+++ b/python/pipeline/channel.py
--- a/python/pipeline/dag.py
+++ b/python/pipeline/dag.py
--- a/python/pipeline/gateway/proto/gateway.proto
+++ b/python/pipeline/gateway/proto/gateway.proto
@@ -19,22 +19,25 @@ option go_package = ".;pipeline_serving";
 import "google/api/annotations.proto";

 message Response {
-  repeated string key = 1;
-  repeated string value = 2;
-  int32 ecode = 3;
-  string error_info = 4;
+  int32 err_no = 1;
+  string err_msg = 2;
+  repeated string key = 3;
+  repeated string value = 4;
 };

 message Request {
  repeated string key = 1;
  repeated string value = 2;
  string name = 3;
-}
+  string method = 4;
+  int64 logid = 5;
+  string clientip = 6;
+};

 service PipelineService {
  rpc inference(Request) returns (Response) {
    option (google.api.http) = {
-      post : "/{name=*}/prediction"
+      post : "/{name=*}/{method=*}"
      body : "*"
    };
  }

--- a/python/pipeline/gateway/proxy_server.go
+++ b/python/pipeline/gateway/proxy_server.go
@@ -38,7 +38,8 @@ func run_proxy_server(grpc_port int, http_port int) error {
  ctx, cancel := context.WithCancel(ctx)
  defer cancel()

-  mux := runtime.NewServeMux()
+  //EmitDefaults=true, does not filter out the default inputs 
+  mux := runtime.NewServeMux(runtime.WithMarshalerOption(runtime.MIMEWildcard, &runtime.JSONPb{OrigName: true, EmitDefaults: true}))
  opts := []grpc.DialOption{grpc.WithInsecure()}
  err := gw.RegisterPipelineServiceHandlerFromEndpoint(ctx, mux, *pipelineEndpoint, opts)
  if err != nil {

--- a/python/pipeline/local_rpc_service_handler.py
+++ b/python/pipeline/local_rpc_service_handler.py
--- a/python/pipeline/operator.py
+++ b/python/pipeline/operator.py
--- a/python/pipeline/pipeline_client.py
+++ b/python/pipeline/pipeline_client.py
@@ -18,14 +18,20 @@ import numpy as np
 from numpy import *
 import logging
 import functools
-from .channel import ChannelDataEcode
+import json
+import socket
+from .channel import ChannelDataErrcode
 from .proto import pipeline_service_pb2
 from .proto import pipeline_service_pb2_grpc
-
+import six
 _LOGGER = logging.getLogger(__name__)


 class PipelineClient(object):
+    """
+    PipelineClient provides the basic capabilities of the pipeline SDK
+    """
+
    def __init__(self):
        self._channel = None
        self._profile_key = "pipeline.profile"
@@ -42,6 +48,26 @@ class PipelineClient(object):

    def _pack_request_package(self, feed_dict, profile):
        req = pipeline_service_pb2.Request()
+
+        logid = feed_dict.get("logid")
+        if logid is None:
+            req.logid = 0
+        else:
+            if six.PY2:
+                req.logid = long(logid)
+            elif six.PY3:
+                req.logid = int(log_id)
+            feed_dict.pop("logid")
+
+        clientip = feed_dict.get("clientip")
+        if clientip is None:
+            hostname = socket.gethostname()
+            ip = socket.gethostbyname(hostname)
+            req.clientip = ip
+        else:
+            req.clientip = clientip
+            feed_dict.pop("clientip")
+
        np.set_printoptions(threshold=sys.maxsize)
        for key, value in feed_dict.items():
            req.key.append(key)
@@ -60,29 +86,7 @@ class PipelineClient(object):
        return req

    def _unpack_response_package(self, resp, fetch):
-        if resp.ecode != 0:
-            return {
-                "ecode": resp.ecode,
-                "ecode_desc": ChannelDataEcode(resp.ecode),
-                "error_info": resp.error_info,
-            }
-        fetch_map = {"ecode": resp.ecode}
-        for idx, key in enumerate(resp.key):
-            if key == self._profile_key:
-                if resp.value[idx] != "":
-                    sys.stderr.write(resp.value[idx])
-                continue
-            if fetch is not None and key not in fetch:
-                continue
-            data = resp.value[idx]
-            try:
-                evaled_data = eval(data)
-                if isinstance(evaled_data, np.ndarray):
-                    data = evaled_data
-            except Exception as e:
-                pass
-            fetch_map[key] = data
-        return fetch_map
+        return resp

    def predict(self, feed_dict, fetch=None, asyn=False, profile=False):
        if not isinstance(feed_dict, dict):

--- a/python/pipeline/pipeline_server.py
+++ b/python/pipeline/pipeline_server.py
--- a/python/pipeline/proto/pipeline_service.proto
+++ b/python/pipeline/proto/pipeline_service.proto
--- a/python/setup.py.server_gpu.in
+++ b/python/setup.py.server_gpu.in
--- a/tools/Dockerfile.cuda10.0-cudnn7.devel
+++ b/tools/Dockerfile.cuda10.0-cudnn7.devel
--- a/tools/serving_build.sh
+++ b/tools/serving_build.sh