diff --git a/README.md b/README.md index 870f2f587f35ba1526c49f58698ae4db17ff0f81..44cee7bac8087a60e08754f007ad33bdebad98e3 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,10 @@ nvidia-docker exec -it test bash ``` ```shell -pip install paddle-serving-client -pip install paddle-serving-server # CPU -pip install paddle-serving-server-gpu # GPU +pip install paddle-serving-client==0.3.2 +pip install paddle-serving-server==0.3.2 # CPU +pip install paddle-serving-server-gpu==0.3.2.post9 # GPU with CUDA9.0 +pip install paddle-serving-server-gpu==0.3.2.post10 # GPU with CUDA10.0 ``` You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source, add `-i https://pypi.tuna.tsinghua.edu.cn/simple` to pip command) to speed up the download. @@ -172,6 +173,11 @@ Here, `client.predict` function has two arguments. `feed` is a `python dict` wit - [An End-to-end tutorial from training to inference service deployment](doc/TRAIN_TO_SERVICE.md) - [Write Bert-as-Service in 10 minutes](doc/BERT_10_MINS.md) +### Tutorial at AIStudio +- [Introduction to PaddleServing](https://aistudio.baidu.com/aistudio/projectdetail/605819) +- [Image Segmentation on Paddle Serving](https://aistudio.baidu.com/aistudio/projectdetail/457715) +- [Sentimental Analysis](https://aistudio.baidu.com/aistudio/projectdetail/509014) + ### Developers - [How to config Serving native operators on server side?](doc/SERVER_DAG.md) - [How to develop a new Serving operator?](doc/NEW_OPERATOR.md) diff --git a/README_CN.md b/README_CN.md index 6317a79513a3d5e3247d249885d8bfe06de0e1c9..8bdc2702a68ed88437495fe8b4ced3817742d13a 100644 --- a/README_CN.md +++ b/README_CN.md @@ -47,9 +47,10 @@ nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/se nvidia-docker exec -it test bash ``` ```shell -pip install paddle-serving-client -pip install paddle-serving-server # CPU -pip install paddle-serving-server-gpu # GPU +pip install paddle-serving-client==0.3.2 +pip install paddle-serving-server==0.3.2 # CPU +pip install paddle-serving-server-gpu==0.3.2.post9 # GPU with CUDA9.0 +pip install paddle-serving-server-gpu==0.3.2.post10 # GPU with CUDA10.0 ``` 您可能需要使用国内镜像源(例如清华源, 在pip命令中添加`-i https://pypi.tuna.tsinghua.edu.cn/simple`)来加速下载。 @@ -169,6 +170,11 @@ print(fetch_map) - [端到端完成从训练到部署全流程](doc/TRAIN_TO_SERVICE_CN.md) - [十分钟构建Bert-As-Service](doc/BERT_10_MINS_CN.md) +### AIStudio教程 +- [PaddleServing作业](https://aistudio.baidu.com/aistudio/projectdetail/605819) +- [PaddleServing图像分割](https://aistudio.baidu.com/aistudio/projectdetail/457715) +- [PaddleServing情感分析](https://aistudio.baidu.com/aistudio/projectdetail/509014) + ### 开发者教程 - [如何配置Server端的计算图?](doc/SERVER_DAG_CN.md) - [如何开发一个新的General Op?](doc/NEW_OPERATOR_CN.md) diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake index c1a7ce0a7599bec9d14c01da9b7af99765a95aba..13f776621ff4bbefcbd22c2c0732f1e1a8a41b05 100644 --- a/cmake/paddlepaddle.cmake +++ b/cmake/paddlepaddle.cmake @@ -31,7 +31,7 @@ message( "WITH_GPU = ${WITH_GPU}") # Paddle Version should be one of: # latest: latest develop build # version number like 1.5.2 -SET(PADDLE_VERSION "1.8.3") +SET(PADDLE_VERSION "1.8.4") if (WITH_GPU) if (WITH_TRT) diff --git a/core/configure/proto/multi_lang_general_model_service.proto b/core/configure/proto/multi_lang_general_model_service.proto index b83450aed666b96de324050d53b10c56e059a8d5..18fbcf760647e1694e738c0832fe45f4f7d9934f 100644 --- a/core/configure/proto/multi_lang_general_model_service.proto +++ 
b/core/configure/proto/multi_lang_general_model_service.proto @@ -14,6 +14,8 @@ syntax = "proto2"; +package baidu.paddle_serving.multi_lang; + option java_multiple_files = true; option java_package = "io.paddle.serving.grpc"; option java_outer_classname = "ServingProto"; @@ -37,6 +39,7 @@ message InferenceRequest { repeated string feed_var_names = 2; repeated string fetch_var_names = 3; required bool is_python = 4 [ default = false ]; + required uint64 log_id = 5 [ default = 0 ]; }; message InferenceResponse { diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto index 3dfc1db4412c95c9e82c7c5c2a21a29519b84267..c008ee857bb7c69672e399ce44b2420d5db7fb3c 100644 --- a/core/configure/proto/server_configure.proto +++ b/core/configure/proto/server_configure.proto @@ -59,6 +59,8 @@ message ResourceConf { optional string cube_config_path = 5; optional string cube_config_file = 6; optional int32 cube_quant_bits = 7; // set 0 if no quant. + optional string auth_product_name = 8; + optional string auth_container_id = 9; }; // DAG node depency info diff --git a/core/cube/cube-api/include/meta.h b/core/cube/cube-api/include/meta.h index 69bbb8ccc12e423d286183ed5dd87e90bf2e59de..ec872a38d8b0294f7b06e8557848f6e8ca79aa2b 100644 --- a/core/cube/cube-api/include/meta.h +++ b/core/cube/cube-api/include/meta.h @@ -22,7 +22,8 @@ #ifdef BCLOUD #include "baidu/rpc/channel.h" #include "baidu/rpc/parallel_channel.h" -#include "rapidjson/document.h" +#include "rapidjson_1.0/document.h" +#include "rapidjson_1.0/rapidjson.h" #else #include "brpc/channel.h" #include "brpc/parallel_channel.h" diff --git a/core/general-client/include/general_model.h b/core/general-client/include/general_model.h index b5d27df5edbaf9278ecb8614e282d104347206f8..a81a0005473f3eb4039dd77aa430957e52eda687 100644 --- a/core/general-client/include/general_model.h +++ b/core/general-client/include/general_model.h @@ -227,7 +227,8 @@ class PredictorClient { const std::vector>& int_shape, const std::vector& fetch_name, PredictorRes& predict_res_batch, // NOLINT - const int& pid); + const int& pid, + const uint64_t log_id); int numpy_predict( const std::vector>>& float_feed_batch, @@ -238,7 +239,8 @@ class PredictorClient { const std::vector>& int_shape, const std::vector& fetch_name, PredictorRes& predict_res_batch, // NOLINT - const int& pid); + const int& pid, + const uint64_t log_id); private: PredictorApi _api; diff --git a/core/general-client/src/general_model.cpp b/core/general-client/src/general_model.cpp index 9f709c71045577f7b043777a7ad1528a0e2ccc28..a3160830a71c1244af209671da3f96d559c47f02 100644 --- a/core/general-client/src/general_model.cpp +++ b/core/general-client/src/general_model.cpp @@ -39,7 +39,9 @@ using configure::GeneralModelConfig; void PredictorClient::init_gflags(std::vector argv) { std::call_once(gflags_init_flag, [&]() { +#ifndef BCLOUD FLAGS_logtostderr = true; +#endif argv.insert(argv.begin(), "dummy"); int argc = argv.size(); char **arr = new char *[argv.size()]; @@ -144,7 +146,8 @@ int PredictorClient::batch_predict( const std::vector> &int_shape, const std::vector &fetch_name, PredictorRes &predict_res_batch, - const int &pid) { + const int &pid, + const uint64_t log_id) { int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size()); predict_res_batch.clear(); @@ -162,6 +165,7 @@ int PredictorClient::batch_predict( VLOG(2) << "int feed name size: " << int_feed_name.size(); VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size; Request req; + 
req.set_log_id(log_id); for (auto &name : fetch_name) { req.add_fetch_var_names(name); } @@ -356,7 +360,8 @@ int PredictorClient::numpy_predict( const std::vector> &int_shape, const std::vector &fetch_name, PredictorRes &predict_res_batch, - const int &pid) { + const int &pid, + const uint64_t log_id) { int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size()); VLOG(2) << "batch size: " << batch_size; predict_res_batch.clear(); @@ -374,6 +379,7 @@ int PredictorClient::numpy_predict( VLOG(2) << "int feed name size: " << int_feed_name.size(); VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size; Request req; + req.set_log_id(log_id); for (auto &name : fetch_name) { req.add_fetch_var_names(name); } diff --git a/core/general-client/src/pybind_general_model.cpp b/core/general-client/src/pybind_general_model.cpp index 3e065e4de1ff3c01ff6bc05cb39a2607620915b4..1e79a8d2489a9ebc2024402bada32a4be2000146 100644 --- a/core/general-client/src/pybind_general_model.cpp +++ b/core/general-client/src/pybind_general_model.cpp @@ -107,7 +107,8 @@ PYBIND11_MODULE(serving_client, m) { const std::vector> &int_shape, const std::vector &fetch_name, PredictorRes &predict_res_batch, - const int &pid) { + const int &pid, + const uint64_t log_id) { return self.batch_predict(float_feed_batch, float_feed_name, float_shape, @@ -116,7 +117,8 @@ PYBIND11_MODULE(serving_client, m) { int_shape, fetch_name, predict_res_batch, - pid); + pid, + log_id); }, py::call_guard()) .def("numpy_predict", @@ -131,7 +133,8 @@ PYBIND11_MODULE(serving_client, m) { const std::vector> &int_shape, const std::vector &fetch_name, PredictorRes &predict_res_batch, - const int &pid) { + const int &pid, + const uint64_t log_id) { return self.numpy_predict(float_feed_batch, float_feed_name, float_shape, @@ -140,7 +143,8 @@ PYBIND11_MODULE(serving_client, m) { int_shape, fetch_name, predict_res_batch, - pid); + pid, + log_id); }, py::call_guard()); } diff --git a/core/general-server/op/general_copy_op.cpp b/core/general-server/op/general_copy_op.cpp index 322bcc07795f1b053847991eae17cb3922dd7a7b..0391a98bcb7f471c0a0687dd9deb7b404a15a2bf 100644 --- a/core/general-server/op/general_copy_op.cpp +++ b/core/general-server/op/general_copy_op.cpp @@ -45,36 +45,41 @@ int GeneralCopyOp::inference() { const std::string pre_name = pre_node_names[0]; const GeneralBlob *input_blob = get_depend_argument(pre_name); - VLOG(2) << "precedent name: " << pre_name; + uint64_t log_id = input_blob->GetLogId(); + + VLOG(2) << "(logid=" << log_id << ") precedent name: " << pre_name; const TensorVector *in = &input_blob->tensor_vector; - VLOG(2) << "input size: " << in->size(); + VLOG(2) << "(logid=" << log_id << ") input size: " << in->size(); int batch_size = input_blob->GetBatchSize(); int input_var_num = 0; GeneralBlob *res = mutable_data(); + res->SetLogId(log_id); TensorVector *out = &res->tensor_vector; - VLOG(2) << "input batch size: " << batch_size; + VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size; res->SetBatchSize(batch_size); if (!res) { - LOG(ERROR) << "Failed get op tls reader object output"; + LOG(ERROR) << "(logid=" << log_id + << ") Failed get op tls reader object output"; } Timer timeline; int64_t start = timeline.TimeStampUS(); - VLOG(2) << "Going to init lod tensor"; + VLOG(2) << "(logid=" << log_id << ") Going to init lod tensor"; for (int i = 0; i < in->size(); ++i) { paddle::PaddleTensor lod_tensor; CopyLod(&in->at(i), &lod_tensor); lod_tensor.dtype = in->at(i).dtype; lod_tensor.name = in->at(i).name; 
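
For readers following the log_id plumbing in the op changes around here: the blob passed between ops now carries the request's log_id, each op copies it from its input blob to its output blob, and every log line is prefixed with `(logid=...)`. The snippet below is a minimal standalone sketch of that pattern, assuming a simplified `Blob` stand-in rather than the real `GeneralBlob`/op classes.

```cpp
#include <cstdint>
#include <iostream>

// Simplified stand-in for GeneralBlob: only the log_id plumbing added in this
// change is modeled (the real struct also carries tensors, timestamps, etc.).
struct Blob {
  uint64_t _log_id = -1;  // same sentinel default as GeneralBlob::_log_id
  void SetLogId(uint64_t log_id) { _log_id = log_id; }
  uint64_t GetLogId() const { return _log_id; }
};

// Op-style function: read log_id from the predecessor's blob, stamp it on the
// output blob, and prefix every log line with "(logid=...)".
int copy_op_inference(const Blob& input_blob, Blob* output_blob) {
  uint64_t log_id = input_blob.GetLogId();
  output_blob->SetLogId(log_id);
  std::cout << "(logid=" << log_id << ") copy op done." << std::endl;
  return 0;
}

int main() {
  Blob in, out;
  in.SetLogId(12345);  // normally the reader op sets this from Request.log_id
  return copy_op_inference(in, &out);
}
```

The real ops in this diff (general_copy_op.cpp, general_infer_op.cpp, the dist-kv ops, the response ops) follow this same shape via `GetLogId()`/`SetLogId()` on `GeneralBlob`.
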
- VLOG(2) << "lod tensor [" << i << "].name = " << lod_tensor.name; + VLOG(2) << "(logid=" << log_id << ") lod tensor [" << i + << "].name = " << lod_tensor.name; out->push_back(lod_tensor); } - VLOG(2) << "pack done."; + VLOG(2) << "(logid=" << log_id << ") pack done."; for (int i = 0; i < out->size(); ++i) { int64_t *src_ptr = static_cast(in->at(i).data.data()); @@ -86,7 +91,7 @@ int GeneralCopyOp::inference() { } } - VLOG(2) << "output done."; + VLOG(2) << "(logid=" << log_id << ") output done."; timeline.Pause(); int64_t end = timeline.TimeStampUS(); @@ -94,7 +99,7 @@ int GeneralCopyOp::inference() { AddBlobInfo(res, start); AddBlobInfo(res, end); - VLOG(2) << "read data from client success"; + VLOG(2) << "(logid=" << log_id << ") read data from client success"; return 0; } diff --git a/core/general-server/op/general_copy_op.h b/core/general-server/op/general_copy_op.h index 89627ffb9e4d15bbcbfa6c7fc3a608ada03dad6e..9b4caadc6a82f1f1a601ab66394b3f629af703ff 100644 --- a/core/general-server/op/general_copy_op.h +++ b/core/general-server/op/general_copy_op.h @@ -13,20 +13,12 @@ // limitations under the License. #pragma once -#include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include +#include #include "core/general-server/general_model_service.pb.h" #include "core/general-server/op/general_infer_helper.h" #include "core/predictor/framework/resource.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/op/general_dist_kv_infer_op.cpp b/core/general-server/op/general_dist_kv_infer_op.cpp index adaa6cbc1818fc5300faf662d98ad47c9af4c468..6809907226511f7de576f1e2bbdc21b7ac401422 100644 --- a/core/general-server/op/general_dist_kv_infer_op.cpp +++ b/core/general-server/op/general_dist_kv_infer_op.cpp @@ -50,18 +50,20 @@ int GeneralDistKVInferOp::inference() { const std::string pre_name = pre_node_names[0]; const GeneralBlob *input_blob = get_depend_argument(pre_name); - VLOG(2) << "Get precedent op name: " << pre_name; + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; GeneralBlob *output_blob = mutable_data(); if (!input_blob) { - LOG(ERROR) << "Failed mutable depended argument, op:" << pre_name; + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; return -1; } const TensorVector *in = &input_blob->tensor_vector; TensorVector *out = &output_blob->tensor_vector; int batch_size = input_blob->GetBatchSize(); - VLOG(2) << "input batch size: " << batch_size; + VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size; std::vector keys; std::vector values; int sparse_count = 0; @@ -96,13 +98,14 @@ int GeneralDistKVInferOp::inference() { rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance(); std::vector table_names = cube->get_table_names(); if (table_names.size() == 0) { - LOG(ERROR) << "cube init error or cube config not given."; + LOG(ERROR) << "(logid=" << log_id + << ") cube init error or cube config not given."; return -1; } int ret = cube->seek(table_names[0], keys, &values); int64_t cube_end = timeline.TimeStampUS(); if (values.size() != keys.size() || values[0].buff.size() == 0) { - LOG(ERROR) << "cube value return null"; + LOG(ERROR) << "(logid=" << log_id << ") cube value return null"; } size_t EMBEDDING_SIZE = 
values[0].buff.size() / sizeof(float); TensorVector sparse_out; @@ -153,14 +156,16 @@ int GeneralDistKVInferOp::inference() { infer_in.insert(infer_in.end(), sparse_out.begin(), sparse_out.end()); output_blob->SetBatchSize(batch_size); + output_blob->SetLogId(log_id); - VLOG(2) << "infer batch size: " << batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; int64_t start = timeline.TimeStampUS(); if (InferManager::instance().infer( engine_name().c_str(), &infer_in, out, batch_size)) { - LOG(ERROR) << "Failed do infer in fluid model: " << engine_name(); + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name(); return -1; } diff --git a/core/general-server/op/general_dist_kv_infer_op.h b/core/general-server/op/general_dist_kv_infer_op.h index 2dee5bca6f9e12dbb8b36a6c39aa0a8e77763d23..56d19ee366feaf000d7b24f4017b39155b7e65c1 100644 --- a/core/general-server/op/general_dist_kv_infer_op.h +++ b/core/general-server/op/general_dist_kv_infer_op.h @@ -15,17 +15,9 @@ #pragma once #include #include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include "core/general-server/general_model_service.pb.h" #include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/op/general_dist_kv_quant_infer_op.cpp b/core/general-server/op/general_dist_kv_quant_infer_op.cpp index 8752e8a72085c946b097cecf62a0bdbf90d682c4..93ce76f3d3399ac62435352d2271154ab7f84235 100644 --- a/core/general-server/op/general_dist_kv_quant_infer_op.cpp +++ b/core/general-server/op/general_dist_kv_quant_infer_op.cpp @@ -59,10 +59,13 @@ int GeneralDistKVQuantInferOp::inference() { return -1; } + uint64_t log_id = input_blob->GetLogId(); + output_blob->SetLogId(log_id); + const TensorVector *in = &input_blob->tensor_vector; TensorVector *out = &output_blob->tensor_vector; int batch_size = input_blob->GetBatchSize(); - VLOG(2) << "input batch size: " << batch_size; + VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size; std::vector keys; std::vector values; int sparse_count = 0; @@ -94,13 +97,14 @@ int GeneralDistKVQuantInferOp::inference() { rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance(); std::vector table_names = cube->get_table_names(); if (table_names.size() == 0) { - LOG(ERROR) << "cube init error or cube config not given."; + LOG(ERROR) << "(logid=" << log_id + << ") cube init error or cube config not given."; return -1; } int ret = cube->seek(table_names[0], keys, &values); if (values.size() != keys.size() || values[0].buff.size() == 0) { - LOG(ERROR) << "cube value return null"; + LOG(ERROR) << "(logid=" << log_id << ") cube value return null"; } TensorVector sparse_out; @@ -182,7 +186,7 @@ int GeneralDistKVQuantInferOp::inference() { output_blob->SetBatchSize(batch_size); - VLOG(2) << "infer batch size: " << batch_size; + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; Timer timeline; int64_t start = timeline.TimeStampUS(); @@ -190,7 +194,8 @@ int GeneralDistKVQuantInferOp::inference() { if (InferManager::instance().infer( engine_name().c_str(), &infer_in, out, batch_size)) { - LOG(ERROR) << "Failed do infer in fluid model: " << engine_name(); + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << 
engine_name(); return -1; } diff --git a/core/general-server/op/general_dist_kv_quant_infer_op.h b/core/general-server/op/general_dist_kv_quant_infer_op.h index e153311a2a2e2df1bd12720e2ce6cbe9ddb31ec0..0f99e2072374bc4bc0b76a1ca876a152f98488b6 100644 --- a/core/general-server/op/general_dist_kv_quant_infer_op.h +++ b/core/general-server/op/general_dist_kv_quant_infer_op.h @@ -15,17 +15,9 @@ #pragma once #include #include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include "core/general-server/general_model_service.pb.h" #include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/op/general_infer_helper.h b/core/general-server/op/general_infer_helper.h index 4fa1995664a2dca449ebc228079c86919a32d328..40320348349a43aa79ce0d599f3aebeb764dc10e 100644 --- a/core/general-server/op/general_infer_helper.h +++ b/core/general-server/op/general_infer_helper.h @@ -15,17 +15,9 @@ #pragma once #include +#include #include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else #include "paddle_inference_api.h" // NOLINT -#endif -#include namespace baidu { namespace paddle_serving { @@ -35,6 +27,7 @@ struct GeneralBlob { std::vector tensor_vector; int64_t time_stamp[20]; int p_size = 0; + uint64_t _log_id = -1; // for logging int _batch_size; @@ -46,9 +39,11 @@ struct GeneralBlob { tensor_vector.clear(); } - int SetBatchSize(int batch_size) { _batch_size = batch_size; } + void SetBatchSize(int batch_size) { _batch_size = batch_size; } + void SetLogId(uint64_t log_id) { _log_id = log_id; } int GetBatchSize() const { return _batch_size; } + uint64_t GetLogId() const { return _log_id; } std::string ShortDebugString() const { return "Not implemented!"; } }; diff --git a/core/general-server/op/general_infer_op.cpp b/core/general-server/op/general_infer_op.cpp index a9ff2e7226b25842889e391d82217b3b6a140170..b9478542c71e04b0f3f80b277da7d8d41f636d3d 100644 --- a/core/general-server/op/general_infer_op.cpp +++ b/core/general-server/op/general_infer_op.cpp @@ -47,22 +47,26 @@ int GeneralInferOp::inference() { const std::string pre_name = pre_node_names[0]; const GeneralBlob *input_blob = get_depend_argument(pre_name); - VLOG(2) << "Get precedent op name: " << pre_name; + uint64_t log_id = input_blob->GetLogId(); + VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name; GeneralBlob *output_blob = mutable_data(); + output_blob->SetLogId(log_id); if (!input_blob) { - LOG(ERROR) << "Failed mutable depended argument, op:" << pre_name; + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op:" << pre_name; return -1; } const TensorVector *in = &input_blob->tensor_vector; TensorVector *out = &output_blob->tensor_vector; - int batch_size = input_blob->GetBatchSize(); - VLOG(2) << "input batch size: " << batch_size; - output_blob->SetBatchSize(batch_size); + int batch_size = input_blob->_batch_size; + VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size; - VLOG(2) << "infer batch size: " << batch_size; + output_blob->_batch_size = batch_size; + + VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size; Timer timeline; int64_t start = timeline.TimeStampUS(); @@ -70,7 
+74,8 @@ int GeneralInferOp::inference() { if (InferManager::instance().infer( engine_name().c_str(), in, out, batch_size)) { - LOG(ERROR) << "Failed do infer in fluid model: " << engine_name().c_str(); + LOG(ERROR) << "(logid=" << log_id + << ") Failed do infer in fluid model: " << engine_name().c_str(); return -1; } diff --git a/core/general-server/op/general_infer_op.h b/core/general-server/op/general_infer_op.h index ff0b210ad7c6824a7e8a61e9ac504a65eafa4c58..b41784185ff445c540774b8b24ef897caf6fbf96 100644 --- a/core/general-server/op/general_infer_op.h +++ b/core/general-server/op/general_infer_op.h @@ -15,17 +15,9 @@ #pragma once #include #include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include "core/general-server/general_model_service.pb.h" #include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/op/general_reader_op.cpp b/core/general-server/op/general_reader_op.cpp index 380f861606a7719a33407dd946c5ac476629fdb7..14fd617e058ccc392a673678d03145ec1f6fd6d2 100644 --- a/core/general-server/op/general_reader_op.cpp +++ b/core/general-server/op/general_reader_op.cpp @@ -37,9 +37,9 @@ int conf_check(const Request *req, const std::shared_ptr &model_config) { int var_num = req->insts(0).tensor_array_size(); if (var_num != model_config->_feed_type.size()) { - VLOG(2) << "var num: " << var_num; - VLOG(2) << "model config var num: " << model_config->_feed_type.size(); - LOG(ERROR) << "feed var number not match."; + LOG(ERROR) << "feed var number not match: model config[" + << model_config->_feed_type.size() << "] vs. actual[" << var_num + << "]"; return -1; } @@ -72,6 +72,7 @@ int conf_check(const Request *req, int GeneralReaderOp::inference() { // reade request from client const Request *req = dynamic_cast(get_request_message()); + uint64_t log_id = req->log_id(); int batch_size = req->insts_size(); int input_var_num = 0; @@ -83,25 +84,29 @@ int GeneralReaderOp::inference() { TensorVector *out = &res->tensor_vector; res->SetBatchSize(batch_size); + res->SetLogId(log_id); if (!res) { - LOG(ERROR) << "Failed get op tls reader object output"; + LOG(ERROR) << "(logid=" << log_id + << ") Failed get op tls reader object output"; } Timer timeline; int64_t start = timeline.TimeStampUS(); int var_num = req->insts(0).tensor_array_size(); - VLOG(2) << "var num: " << var_num; + VLOG(2) << "(logid=" << log_id << ") var num: " << var_num; - VLOG(2) << "start to call load general model_conf op"; + VLOG(2) << "(logid=" << log_id + << ") start to call load general model_conf op"; baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource::instance(); - VLOG(2) << "get resource pointer done."; + VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; + std::shared_ptr model_config = resource.get_general_model_config(); - VLOG(2) << "print general model config done."; + VLOG(2) << "(logid=" << log_id << ") print general model config done."; // TODO(guru4elephant): how to do conditional check? 
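
The reader op below dispatches on the request's `elem_type` code when choosing tensor dtypes and element sizes (0 = int64, 1 = float32, 2 = int32 in the branches that follow). As a reference, here is a hedged sketch of that mapping; the type codes and `PaddleDType` names come from the surrounding diff, while the helper itself (and the float/int32 element sizes, which are not all spelled out in this excerpt) is illustrative only.

```cpp
#include <cstdint>
#include <cstdlib>
#include <iostream>

// Illustrative mirror of the elem_type dispatch used in GeneralReaderOp:
// each feed var on the wire is tagged with an integer type code.
enum class PaddleDType { INT64, FLOAT32, INT32 };

struct ElemInfo {
  PaddleDType dtype;
  size_t elem_size;  // bytes per element, used to Resize() the data buffer
};

ElemInfo elem_info_from_code(int elem_type) {
  switch (elem_type) {
    case 0: return {PaddleDType::INT64, sizeof(int64_t)};
    case 1: return {PaddleDType::FLOAT32, sizeof(float)};
    case 2: return {PaddleDType::INT32, sizeof(int32_t)};
    default:
      std::cerr << "unsupported elem_type: " << elem_type << std::endl;
      std::exit(1);
  }
}

int main() {
  ElemInfo info = elem_info_from_code(0);
  // buffer bytes would then be batch_size * capacity * info.elem_size
  std::cout << "int64 elem size: " << info.elem_size << std::endl;
  return 0;
}
```
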
/* @@ -122,7 +127,8 @@ int GeneralReaderOp::inference() { for (int i = 0; i < var_num; ++i) { paddle::PaddleTensor lod_tensor; elem_type[i] = req->insts(0).tensor_array(i).elem_type(); - VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i]; + VLOG(2) << "(logid=" << log_id << ") var[" << i + << "] has elem type: " << elem_type[i]; if (elem_type[i] == 0) { // int64 elem_size[i] = sizeof(int64_t); lod_tensor.dtype = paddle::PaddleDType::INT64; @@ -137,17 +143,19 @@ int GeneralReaderOp::inference() { if (model_config->_is_lod_feed[i]) { lod_tensor.lod.resize(1); lod_tensor.lod[0].push_back(0); - VLOG(2) << "var[" << i << "] is lod_tensor"; + VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor"; } else { lod_tensor.shape.push_back(batch_size); capacity[i] = 1; for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) { int dim = req->insts(0).tensor_array(i).shape(k); - VLOG(2) << "shape for var[" << i << "]: " << dim; + VLOG(2) << "(logid=" << log_id << ") shape for var[" << i + << "]: " << dim; capacity[i] *= dim; lod_tensor.shape.push_back(dim); } - VLOG(2) << "var[" << i << "] is tensor, capacity: " << capacity[i]; + VLOG(2) << "(logid=" << log_id << ") var[" << i + << "] is tensor, capacity: " << capacity[i]; } lod_tensor.name = model_config->_feed_name[i]; out->push_back(lod_tensor); @@ -167,11 +175,12 @@ int GeneralReaderOp::inference() { } else if (tensor.int_data_size() > 0) { data_len = tensor.int_data_size(); } - VLOG(2) << "tensor size for var[" << i << "]: " << data_len; + VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i + << "]: " << data_len; tensor_size += data_len; int cur_len = out->at(i).lod[0].back(); - VLOG(2) << "current len: " << cur_len; + VLOG(2) << "(logid=" << log_id << ") current len: " << cur_len; int sample_len = 0; if (tensor.shape_size() == 1) { @@ -180,7 +189,7 @@ int GeneralReaderOp::inference() { sample_len = tensor.shape(0); } out->at(i).lod[0].push_back(cur_len + sample_len); - VLOG(2) << "new len: " << cur_len + sample_len; + VLOG(2) << "(logid=" << log_id << ") new len: " << cur_len + sample_len; } out->at(i).data.Resize(tensor_size * elem_size[i]); out->at(i).shape = {out->at(i).lod[0].back()}; @@ -190,11 +199,11 @@ int GeneralReaderOp::inference() { if (out->at(i).shape.size() == 1) { out->at(i).shape.push_back(1); } - VLOG(2) << "var[" << i + VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor and len=" << out->at(i).lod[0].back(); } else { out->at(i).data.Resize(batch_size * capacity[i] * elem_size[i]); - VLOG(2) << "var[" << i + VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is tensor and capacity=" << batch_size * capacity[i]; } } @@ -203,8 +212,8 @@ int GeneralReaderOp::inference() { for (int i = 0; i < var_num; ++i) { if (elem_type[i] == 0) { int64_t *dst_ptr = static_cast(out->at(i).data.data()); - VLOG(2) << "first element data in var[" << i << "] is " - << req->insts(0).tensor_array(i).int64_data(0); + VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i + << "] is " << req->insts(0).tensor_array(i).int64_data(0); int offset = 0; for (int j = 0; j < batch_size; ++j) { int elem_num = req->insts(j).tensor_array(i).int64_data_size(); @@ -219,8 +228,8 @@ int GeneralReaderOp::inference() { } } else if (elem_type[i] == 1) { float *dst_ptr = static_cast(out->at(i).data.data()); - VLOG(2) << "first element data in var[" << i << "] is " - << req->insts(0).tensor_array(i).float_data(0); + VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i + << "] 
is " << req->insts(0).tensor_array(i).float_data(0); int offset = 0; for (int j = 0; j < batch_size; ++j) { int elem_num = req->insts(j).tensor_array(i).float_data_size(); @@ -235,8 +244,8 @@ int GeneralReaderOp::inference() { } } else if (elem_type[i] == 2) { int32_t *dst_ptr = static_cast(out->at(i).data.data()); - VLOG(2) << "first element data in var[" << i << "] is " - << req->insts(0).tensor_array(i).int_data(0); + VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i + << "] is " << req->insts(0).tensor_array(i).int_data(0); int offset = 0; for (int j = 0; j < batch_size; ++j) { int elem_num = req->insts(j).tensor_array(i).int_data_size(); @@ -252,15 +261,16 @@ int GeneralReaderOp::inference() { } } - VLOG(2) << "output size: " << out->size(); + VLOG(2) << "(logid=" << log_id << ") output size: " << out->size(); timeline.Pause(); int64_t end = timeline.TimeStampUS(); res->p_size = 0; + res->_batch_size = batch_size; AddBlobInfo(res, start); AddBlobInfo(res, end); - VLOG(2) << "read data from client success"; + VLOG(2) << "(logid=" << log_id << ") read data from client success"; return 0; } DEFINE_OP(GeneralReaderOp); diff --git a/core/general-server/op/general_reader_op.h b/core/general-server/op/general_reader_op.h index c45d6ad5139a7a9a267f1c6556028a99295500de..cb9693982ff659214dd21ff09f189f86b6b3a339 100644 --- a/core/general-server/op/general_reader_op.h +++ b/core/general-server/op/general_reader_op.h @@ -13,21 +13,13 @@ // limitations under the License. #pragma once -#include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include +#include #include "core/general-server/general_model_service.pb.h" #include "core/general-server/load_general_model_service.pb.h" #include "core/general-server/op/general_infer_helper.h" #include "core/predictor/framework/resource.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/op/general_response_op.cpp b/core/general-server/op/general_response_op.cpp index b2d918bef0f3c715aa69f52a65edd48cdcc5e87b..5f80510f79f8acf09aed9f7f65e84b9cfaa9a8ed 100644 --- a/core/general-server/op/general_response_op.cpp +++ b/core/general-server/op/general_response_op.cpp @@ -42,6 +42,9 @@ using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; int GeneralResponseOp::inference() { const std::vector pre_node_names = pre_names(); VLOG(2) << "pre node names size: " << pre_node_names.size(); + const GeneralBlob *input_blob; + uint64_t log_id = + get_depend_argument(pre_node_names[0])->GetLogId(); const Request *req = dynamic_cast(get_request_message()); // response inst with only fetch_var_names @@ -52,15 +55,17 @@ int GeneralResponseOp::inference() { // timeline.Start(); int64_t start = timeline.TimeStampUS(); - VLOG(2) << "start to call load general model_conf op"; + VLOG(2) << "(logid=" << log_id + << ") start to call load general model_conf op"; baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource::instance(); - VLOG(2) << "get resource pointer done."; + VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; std::shared_ptr model_config = resource.get_general_model_config(); - VLOG(2) << "max body size : " << brpc::fLU64::FLAGS_max_body_size; + VLOG(2) << "(logid=" << log_id + << ") max body size : " << brpc::fLU64::FLAGS_max_body_size; std::vector 
fetch_index; fetch_index.resize(req->fetch_var_names_size()); @@ -69,16 +74,16 @@ int GeneralResponseOp::inference() { model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; } - const GeneralBlob *input_blob; for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { const std::string &pre_name = pre_node_names[pi]; - VLOG(2) << "pre names[" << pi << "]: " << pre_name << " (" - << pre_node_names.size() << ")"; + VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name + << " (" << pre_node_names.size() << ")"; input_blob = get_depend_argument(pre_name); // fprintf(stderr, "input(%s) blob address %x\n", pre_names.c_str(), // input_blob); if (!input_blob) { - LOG(ERROR) << "Failed mutable depended argument, op: " << pre_name; + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op: " << pre_name; return -1; } @@ -92,17 +97,19 @@ int GeneralResponseOp::inference() { for (auto &idx : fetch_index) { Tensor *tensor = fetch_inst->add_tensor_array(); if (model_config->_is_lod_fetch[idx]) { - VLOG(2) << "out[" << idx << "] " << model_config->_fetch_name[idx] - << " is lod_tensor"; + VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] " + << model_config->_fetch_name[idx] << " is lod_tensor"; for (int k = 0; k < in->at(idx).shape.size(); ++k) { - VLOG(2) << "shape[" << k << "]: " << in->at(idx).shape[k]; + VLOG(2) << "(logid=" << log_id << ") shape[" << k + << "]: " << in->at(idx).shape[k]; tensor->add_shape(in->at(idx).shape[k]); } } else { - VLOG(2) << "out[" << idx << "] " << model_config->_fetch_name[idx] - << " is tensor"; + VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] " + << model_config->_fetch_name[idx] << " is tensor"; for (int k = 0; k < in->at(idx).shape.size(); ++k) { - VLOG(2) << "shape[" << k << "]: " << in->at(idx).shape[k]; + VLOG(2) << "(logid=" << log_id << ") shape[" << k + << "]: " << in->at(idx).shape[k]; tensor->add_shape(in->at(idx).shape[k]); } } @@ -119,8 +126,8 @@ int GeneralResponseOp::inference() { auto dtype = in->at(idx).dtype; if (dtype == paddle::PaddleDType::INT64) { - VLOG(2) << "Prepare int64 var [" << model_config->_fetch_name[idx] - << "]."; + VLOG(2) << "(logid=" << log_id << ") Prepare int64 var [" + << model_config->_fetch_name[idx] << "]."; int64_t *data_ptr = static_cast(in->at(idx).data.data()); // from // https://stackoverflow.com/questions/15499641/copy-a-stdvector-to-a-repeated-field-from-protobuf-with-memcpy @@ -130,16 +137,16 @@ int GeneralResponseOp::inference() { fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap( &tmp_data); } else if (dtype == paddle::PaddleDType::FLOAT32) { - VLOG(2) << "Prepare float var [" << model_config->_fetch_name[idx] - << "]."; + VLOG(2) << "(logid=" << log_id << ") Prepare float var [" + << model_config->_fetch_name[idx] << "]."; float *data_ptr = static_cast(in->at(idx).data.data()); google::protobuf::RepeatedField tmp_data(data_ptr, data_ptr + cap); fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap( &tmp_data); } else if (dtype == paddle::PaddleDType::INT32) { - VLOG(2) << "Prepare int32 var [" << model_config->_fetch_name[idx] - << "]."; + VLOG(2) << "(logid=" << log_id << ")Prepare int32 var [" + << model_config->_fetch_name[idx] << "]."; int32_t *data_ptr = static_cast(in->at(idx).data.data()); google::protobuf::RepeatedField tmp_data(data_ptr, data_ptr + cap); @@ -148,13 +155,16 @@ int GeneralResponseOp::inference() { } if (model_config->_is_lod_fetch[idx]) { - for (int j = 0; j < in->at(idx).lod[0].size(); ++j) { 
- fetch_p->mutable_tensor_array(var_idx)->add_lod( - in->at(idx).lod[0][j]); + if (in->at(idx).lod.size() > 0) { + for (int j = 0; j < in->at(idx).lod[0].size(); ++j) { + fetch_p->mutable_tensor_array(var_idx)->add_lod( + in->at(idx).lod[0][j]); + } } } - VLOG(2) << "fetch var [" << model_config->_fetch_name[idx] << "] ready"; + VLOG(2) << "(logid=" << log_id << ") fetch var [" + << model_config->_fetch_name[idx] << "] ready"; var_idx++; } } @@ -167,7 +177,8 @@ int GeneralResponseOp::inference() { // a more elegant way. for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { input_blob = get_depend_argument(pre_node_names[pi]); - VLOG(2) << "p size for input blob: " << input_blob->p_size; + VLOG(2) << "(logid=" << log_id + << ") p size for input blob: " << input_blob->p_size; int profile_time_idx = -1; if (pi == 0) { profile_time_idx = 0; diff --git a/core/general-server/op/general_response_op.h b/core/general-server/op/general_response_op.h index 4b0f6ed17b5a66dbda7bccef25cec03bf044e6c5..0f72b8f98df336dd515560129a8cfd27650738bb 100644 --- a/core/general-server/op/general_response_op.h +++ b/core/general-server/op/general_response_op.h @@ -15,16 +15,8 @@ #pragma once #include #include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include "core/general-server/general_model_service.pb.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/op/general_text_reader_op.cpp b/core/general-server/op/general_text_reader_op.cpp index 154e975d314a72515624b7bbf1aff85f70b8b5d3..3fa433c6cc31a3dbce331013780212d50e7f643c 100644 --- a/core/general-server/op/general_text_reader_op.cpp +++ b/core/general-server/op/general_text_reader_op.cpp @@ -35,6 +35,7 @@ using baidu::paddle_serving::predictor::PaddleGeneralModelConfig; int GeneralTextReaderOp::inference() { // reade request from client const Request *req = dynamic_cast(get_request_message()); + uint64_t log_id = req->log_id(); int batch_size = req->insts_size(); int input_var_num = 0; @@ -44,16 +45,18 @@ int GeneralTextReaderOp::inference() { std::vector capacity; GeneralBlob *res = mutable_data(); - TensorVector *out = &res->tensor_vector; - - res->SetBatchSize(batch_size); if (!res) { - LOG(ERROR) << "Failed get op tls reader object output"; + LOG(ERROR) << "(logid=" << log_id + << ") Failed get op tls reader object output"; } + TensorVector *out = &res->tensor_vector; + res->SetBatchSize(batch_size); + res->SetLogId(log_id); + if (batch_size <= 0) { - LOG(ERROR) << "Batch size < 0"; + LOG(ERROR) << "(logid=" << log_id << ") Batch size < 0"; return -1; } @@ -61,17 +64,18 @@ int GeneralTextReaderOp::inference() { int64_t start = timeline.TimeStampUS(); int var_num = req->insts(0).tensor_array_size(); - VLOG(2) << "var num: " << var_num; + VLOG(2) << "(logid=" << log_id << ") var num: " << var_num; - VLOG(2) << "start to call load general model_conf op"; + VLOG(2) << "(logid=" << log_id + << ") start to call load general model_conf op"; baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource::instance(); - VLOG(2) << "get resource pointer done."; + VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; std::shared_ptr model_config = resource.get_general_model_config(); - VLOG(2) << "print general model config done."; + VLOG(2) << "(logid=" << log_id << ") print general 
model config done."; elem_type.resize(var_num); elem_size.resize(var_num); @@ -79,7 +83,8 @@ int GeneralTextReaderOp::inference() { for (int i = 0; i < var_num; ++i) { paddle::PaddleTensor lod_tensor; elem_type[i] = req->insts(0).tensor_array(i).elem_type(); - VLOG(2) << "var[" << i << "] has elem type: " << elem_type[i]; + VLOG(2) << "(logid=" << log_id << ") var[" << i + << "] has elem type: " << elem_type[i]; if (elem_type[i] == 0) { // int64 elem_size[i] = sizeof(int64_t); lod_tensor.dtype = paddle::PaddleDType::INT64; @@ -91,17 +96,19 @@ int GeneralTextReaderOp::inference() { if (req->insts(0).tensor_array(i).shape(0) == -1) { lod_tensor.lod.resize(1); lod_tensor.lod[0].push_back(0); - VLOG(2) << "var[" << i << "] is lod_tensor"; + VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor"; } else { lod_tensor.shape.push_back(batch_size); capacity[i] = 1; for (int k = 0; k < req->insts(0).tensor_array(i).shape_size(); ++k) { int dim = req->insts(0).tensor_array(i).shape(k); - VLOG(2) << "shape for var[" << i << "]: " << dim; + VLOG(2) << "(logid=" << log_id << ") shape for var[" << i + << "]: " << dim; capacity[i] *= dim; lod_tensor.shape.push_back(dim); } - VLOG(2) << "var[" << i << "] is tensor, capacity: " << capacity[i]; + VLOG(2) << "(logid=" << log_id << ") var[" << i + << "] is tensor, capacity: " << capacity[i]; } lod_tensor.name = model_config->_feed_name[i]; out->push_back(lod_tensor); @@ -117,11 +124,11 @@ int GeneralTextReaderOp::inference() { } out->at(i).data.Resize(out->at(i).lod[0].back() * elem_size[i]); out->at(i).shape = {out->at(i).lod[0].back(), 1}; - VLOG(2) << "var[" << i + VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is lod_tensor and len=" << out->at(i).lod[0].back(); } else { out->at(i).data.Resize(batch_size * capacity[i] * elem_size[i]); - VLOG(2) << "var[" << i + VLOG(2) << "(logid=" << log_id << ") var[" << i << "] is tensor and capacity=" << batch_size * capacity[i]; } } @@ -163,7 +170,7 @@ int GeneralTextReaderOp::inference() { AddBlobInfo(res, start); AddBlobInfo(res, end); - VLOG(2) << "read data from client success"; + VLOG(2) << "(logid=" << log_id << ") read data from client success"; return 0; } DEFINE_OP(GeneralTextReaderOp); diff --git a/core/general-server/op/general_text_reader_op.h b/core/general-server/op/general_text_reader_op.h index ca134256fce4aaa003f4b07033d4c471ebdb59b7..af822993dc37fae23c1fa584d640cbfe8d9950c8 100644 --- a/core/general-server/op/general_text_reader_op.h +++ b/core/general-server/op/general_text_reader_op.h @@ -13,21 +13,13 @@ // limitations under the License. 
#pragma once -#include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include +#include #include "core/general-server/general_model_service.pb.h" #include "core/general-server/load_general_model_service.pb.h" #include "core/general-server/op/general_infer_helper.h" #include "core/predictor/framework/resource.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/op/general_text_response_op.cpp b/core/general-server/op/general_text_response_op.cpp index ae194119f1fc3edad01662041035f7011873998a..03eea7d76c83782b661ea4553fc5fc0eee99e372 100644 --- a/core/general-server/op/general_text_response_op.cpp +++ b/core/general-server/op/general_text_response_op.cpp @@ -40,6 +40,9 @@ int GeneralTextResponseOp::inference() { VLOG(2) << "Going to run inference"; const std::vector pre_node_names = pre_names(); VLOG(2) << "pre node names size: " << pre_node_names.size(); + const GeneralBlob *input_blob; + uint64_t log_id = + get_depend_argument(pre_node_names[0])->GetLogId(); const Request *req = dynamic_cast(get_request_message()); // response inst with only fetch_var_names @@ -48,11 +51,12 @@ int GeneralTextResponseOp::inference() { Timer timeline; int64_t start = timeline.TimeStampUS(); - VLOG(2) << "start to call load general model_conf op"; + VLOG(2) << "(logid=" << log_id + << ") start to call load general model_conf op"; baidu::paddle_serving::predictor::Resource &resource = baidu::paddle_serving::predictor::Resource::instance(); - VLOG(2) << "get resource pointer done."; + VLOG(2) << "(logid=" << log_id << ") get resource pointer done."; std::shared_ptr model_config = resource.get_general_model_config(); @@ -63,20 +67,20 @@ int GeneralTextResponseOp::inference() { model_config->_fetch_alias_name_to_index[req->fetch_var_names(i)]; } - const GeneralBlob *input_blob; for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { const std::string &pre_name = pre_node_names[pi]; - VLOG(2) << "pre names[" << pi << "]: " << pre_name << " (" - << pre_node_names.size() << ")"; + VLOG(2) << "(logid=" << log_id << ") pre names[" << pi << "]: " << pre_name + << " (" << pre_node_names.size() << ")"; input_blob = get_depend_argument(pre_name); if (!input_blob) { - LOG(ERROR) << "Failed mutable depended argument, op: " << pre_name; + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable depended argument, op: " << pre_name; return -1; } const TensorVector *in = &input_blob->tensor_vector; int batch_size = input_blob->GetBatchSize(); - VLOG(2) << "input batch size: " << batch_size; + VLOG(2) << "(logid=" << log_id << ") input batch size: " << batch_size; ModelOutput *output = res->add_outputs(); output->set_engine_name( @@ -88,12 +92,13 @@ int GeneralTextResponseOp::inference() { // currently only response float tensor or lod_tensor tensor->set_elem_type(1); if (model_config->_is_lod_fetch[idx]) { - VLOG(2) << "out[" << idx << " is lod_tensor"; + VLOG(2) << "(logid=" << log_id << ") out[" << idx << " is lod_tensor"; tensor->add_shape(-1); } else { - VLOG(2) << "out[" << idx << "] is tensor"; + VLOG(2) << "(logid=" << log_id << ") out[" << idx << "] is tensor"; for (int k = 1; k < in->at(idx).shape.size(); ++k) { - VLOG(2) << "shape[" << k - 1 << "]: " << in->at(idx).shape[k]; + VLOG(2) << "(logid=" << log_id << ") shape[" << k - 1 + << "]: " << in->at(idx).shape[k]; 
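
Earlier in this diff, general_response_op.cpp guards the lod copy with `if (in->at(idx).lod.size() > 0)` before touching `lod[0]`. The sketch below illustrates why, using a simplified tensor stand-in (not the real PaddleTensor/response classes): a dense fetch var arrives with an empty `lod` vector, and indexing `lod[0]` unguarded would read out of range.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Simplified stand-in for a fetched tensor: only the lod field matters here.
struct FetchedTensor {
  // lod[0] holds level-0 offsets for lod tensors; dense tensors leave lod empty.
  std::vector<std::vector<size_t>> lod;
};

// Mirror of the guarded copy: only copy level-0 lod offsets when they exist.
std::vector<size_t> copy_lod(const FetchedTensor& t) {
  std::vector<size_t> out;
  if (t.lod.size() > 0) {  // the guard added in this change
    for (size_t j = 0; j < t.lod[0].size(); ++j) {
      out.push_back(t.lod[0][j]);
    }
  }
  // without the guard, t.lod[0] on a dense tensor would index an empty vector
  return out;
}

int main() {
  FetchedTensor dense;             // no lod at all
  FetchedTensor seq{{{0, 3, 5}}};  // lod tensor with offsets 0, 3, 5
  std::cout << copy_lod(dense).size() << " " << copy_lod(seq).size() << std::endl;
  return 0;
}
```
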
tensor->add_shape(in->at(idx).shape[k]); } } @@ -137,7 +142,8 @@ int GeneralTextResponseOp::inference() { // a more elegant way. for (uint32_t pi = 0; pi < pre_node_names.size(); ++pi) { input_blob = get_depend_argument(pre_node_names[pi]); - VLOG(2) << "p size for input blob: " << input_blob->p_size; + VLOG(2) << "(logid=" << log_id + << ") p size for input blob: " << input_blob->p_size; int profile_time_idx = -1; if (pi == 0) { profile_time_idx = 0; diff --git a/core/general-server/op/general_text_response_op.h b/core/general-server/op/general_text_response_op.h index 52f7bbf0f7d76122bad14cf513302f70c35aa1d8..334d98476e67f745635f7d66d7b8682de62da355 100644 --- a/core/general-server/op/general_text_response_op.h +++ b/core/general-server/op/general_text_response_op.h @@ -15,17 +15,9 @@ #pragma once #include #include -#ifdef BCLOUD -#ifdef WITH_GPU -#include "paddle/paddle_inference_api.h" -#else -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#endif -#else -#include "paddle_inference_api.h" // NOLINT -#endif #include "core/general-server/general_model_service.pb.h" #include "core/general-server/op/general_infer_helper.h" +#include "paddle_inference_api.h" // NOLINT namespace baidu { namespace paddle_serving { diff --git a/core/general-server/proto/general_model_service.proto b/core/general-server/proto/general_model_service.proto index 8581ecb2a2e10deced910a20ce26c2beaca956fa..e7dd5fccf54be43db8e65a9ed1112ceaece93700 100644 --- a/core/general-server/proto/general_model_service.proto +++ b/core/general-server/proto/general_model_service.proto @@ -37,6 +37,7 @@ message Request { repeated FeedInst insts = 1; repeated string fetch_var_names = 2; optional bool profile_server = 3 [ default = false ]; + required uint64 log_id = 4 [ default = 0 ]; }; message Response { diff --git a/core/general-server/proto/load_general_model_service.proto b/core/general-server/proto/load_general_model_service.proto index b8a86497f8c0b683f4e95f4517d83f576e79baad..f844bd5b2c0ddb34a32d00559b087c2fbb2ebfed 100644 --- a/core/general-server/proto/load_general_model_service.proto +++ b/core/general-server/proto/load_general_model_service.proto @@ -21,6 +21,7 @@ option cc_generic_services = true; message RequestAndResponse { required int32 a = 1; required float b = 2; + required uint64 log_id = 3 [ default = 0 ]; }; service LoadGeneralModelService { diff --git a/core/pdcodegen/plugin/pdcodegen b/core/pdcodegen/plugin/pdcodegen deleted file mode 100755 index bb81217121a15b99cda8a320f357f716357f96c5..0000000000000000000000000000000000000000 Binary files a/core/pdcodegen/plugin/pdcodegen and /dev/null differ diff --git a/core/pdcodegen/src/pdcodegen.cpp b/core/pdcodegen/src/pdcodegen.cpp index af4081a985ece584f82120799fc9a384f83830be..c505ca66385dd363ad0a76470012f07a925bcd17 100644 --- a/core/pdcodegen/src/pdcodegen.cpp +++ b/core/pdcodegen/src/pdcodegen.cpp @@ -280,25 +280,29 @@ class PdsCodeGenerator : public CodeGenerator { " baidu::rpc::ClosureGuard done_guard(done);\n" " baidu::rpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " uint64_t log_id = request->log_id();\n" + " cntl->set_log_id(log_id);\n" " ::baidu::paddle_serving::predictor::InferService* svr = \n" " " "::baidu::paddle_serving::predictor::InferServiceManager::instance(" ").item(\"$service$\");\n" " if (svr == NULL) {\n" - " LOG(ERROR) << \"Not found service: $service$\";\n" + " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: " + "$service$\";\n" " cntl->SetFailed(404, \"Not found service: $service$\");\n" " return 
;\n" " }\n" - " LOG(INFO) << \" remote_side=\[\" << cntl->remote_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \" local_side=\[\" << cntl->local_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \" service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT - " LOG(INFO) << \" log_id=\[\" << cntl->log_id() << \"\]\";\n" // NOLINT - " int err_code = svr->inference(request, response);\n" + " LOG(INFO) << \"(logid=\" << log_id << \") remote_side=\[\" " // NOLINT + "<< cntl->remote_side() << \"\]\";\n" + " LOG(INFO) << \"(logid=\" << log_id << \") local_side=\[\" " // NOLINT + "<< cntl->local_side() << \"\]\";\n" + " LOG(INFO) << \"(logid=\" << log_id << \") service_name=\[\" " // NOLINT + "<< \"$name$\" << \"\]\";\n" + " int err_code = svr->inference(request, response, log_id);\n" " if (err_code != 0) {\n" " LOG(WARNING)\n" - " << \"Failed call inferservice[$name$], name[$service$]\"\n" + " << \"(logid=\" << log_id << \") Failed call " + "inferservice[$name$], name[$service$]\"\n" " << \", error_code: \" << err_code;\n" " cntl->SetFailed(err_code, \"InferService inference " "failed!\");\n" @@ -306,7 +310,8 @@ class PdsCodeGenerator : public CodeGenerator { " gettimeofday(&tv, NULL);\n" " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n" " // flush notice log\n" - " LOG(INFO) << \" tc=\[\" << (end - start) << \"\]\";\n", // NOLINT + " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT + "start) << \"\]\";\n", // NOLINT "name", class_name, "service", @@ -317,26 +322,31 @@ class PdsCodeGenerator : public CodeGenerator { " baidu::rpc::ClosureGuard done_guard(done);\n" " baidu::rpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " uint64_t log_id = equest->log_id();\n" + " cntl->set_log_id(log_id);\n" " ::baidu::paddle_serving::predictor::InferService* svr = \n" " " "::baidu::paddle_serving::predictor::InferServiceManager::instance(" ").item(\"$service$\");\n" " if (svr == NULL) {\n" - " LOG(ERROR) << \"Not found service: $service$\";\n" + " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: " + "$service$\";\n" " cntl->SetFailed(404, \"Not found service: $service$\");\n" " return ;\n" " }\n" - " LOG(INFO) << \" remote_side=\[\" << cntl->remote_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \" local_side=\[\" << cntl->local_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \" service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT - " LOG(INFO) << \" log_id=\[\" << cntl->log_id() << \"\]\";\n" // NOLINT + " LOG(INFO) << \"(logid=\" << log_id << \") remote_side=\[\" " // NOLINT + "<< cntl->remote_side() << \"\]\";\n" + " LOG(INFO) << \"(logid=\" << log_id << \") local_side=\[\" " // NOLINT + "<< cntl->local_side() << \"\]\";\n" + " LOG(INFO) << \"(logid=\" << log_id << \") service_name=\[\" " // NOLINT + "<< \"$name$\" << \"\]\";\n" " butil::IOBufBuilder debug_os;\n" - " int err_code = svr->inference(request, response, &debug_os);\n" + " int err_code = svr->inference(request, response, log_id, " + "&debug_os);\n" " if (err_code != 0) {\n" " LOG(WARNING)\n" - " << \"Failed call inferservice[$name$], name[$service$]\"\n" + " << \"(logid=\" << log_id << \") Failed call " + "inferservice[$name$], name[$service$]\"\n" " << \", error_code: \" << err_code;\n" " cntl->SetFailed(err_code, \"InferService inference " "failed!\");\n" @@ -345,9 +355,11 @@ class PdsCodeGenerator : public CodeGenerator { " gettimeofday(&tv, NULL);\n" " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n" " // flush notice log\n" - " LOG(INFO) << \" tc=\[\" << (end - start) << 
\"\]\";\n" // NOLINT + " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT + "start) << \"\]\";\n" " LOG(INFO)\n" - " << \"TC=[\" << (end - start) << \"] Received debug " + " << \"(logid=\" << log_id << \") TC=[\" << (end - start) << " + "\"] Received debug " "request[log_id=\" << cntl->log_id()\n" " << \"] from \" << cntl->remote_side()\n" " << \" to \" << cntl->local_side();\n", @@ -1011,25 +1023,31 @@ class PdsCodeGenerator : public CodeGenerator { " brpc::ClosureGuard done_guard(done);\n" " brpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " uint64_t log_id = request->log_id();\n" + " cntl->set_log_id(log_id);\n" " ::baidu::paddle_serving::predictor::InferService* svr = \n" " " "::baidu::paddle_serving::predictor::InferServiceManager::instance(" ").item(\"$service$\");\n" " if (svr == NULL) {\n" - " LOG(ERROR) << \"Not found service: $service$\";\n" + " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: " + "$service$\";\n" " cntl->SetFailed(404, \"Not found service: $service$\");\n" " return ;\n" " }\n" - " LOG(INFO) << \" remote_side=\[\" << cntl->remote_side() << " // NOLINT + " LOG(INFO) << \"(logid=\" << log_id << \") " + "remote_side=\[\" << cntl->remote_side() << " // NOLINT "\"\]\";\n" - " LOG(INFO) << \" local_side=\[\" << cntl->local_side() << " // NOLINT + " LOG(INFO) << \"(logid=\" << log_id << \") " + "local_side=\[\" << cntl->local_side() << " // NOLINT "\"\]\";\n" - " LOG(INFO) << \" service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT - " LOG(INFO) << \" log_id=\[\" << cntl->log_id() << \"\]\";\n" // NOLINT - " int err_code = svr->inference(request, response);\n" + " LOG(INFO) << \"(logid=\" << log_id << \") " + "service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT + " int err_code = svr->inference(request, response, log_id);\n" " if (err_code != 0) {\n" " LOG(WARNING)\n" - " << \"Failed call inferservice[$name$], name[$service$]\"\n" + " << \"(logid=\" << log_id << \") Failed call " + "inferservice[$name$], name[$service$]\"\n" " << \", error_code: \" << err_code;\n" " cntl->SetFailed(err_code, \"InferService inference " "failed!\");\n" @@ -1037,7 +1055,8 @@ class PdsCodeGenerator : public CodeGenerator { " gettimeofday(&tv, NULL);\n" " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n" " // flush notice log\n" - " LOG(INFO) << \" tc=\[\" << (end - start) << \"\]\";\n", // NOLINT + " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT + "start) << \"\]\";\n", // NOLINT "name", class_name, "service", @@ -1048,26 +1067,31 @@ class PdsCodeGenerator : public CodeGenerator { " brpc::ClosureGuard done_guard(done);\n" " brpc::Controller* cntl = \n" " static_cast(cntl_base);\n" + " uint64_t log_id = request->log_id();\n" + " cntl->set_log_id(log_id);\n" " ::baidu::paddle_serving::predictor::InferService* svr = \n" " " "::baidu::paddle_serving::predictor::InferServiceManager::instance(" ").item(\"$service$\");\n" " if (svr == NULL) {\n" - " LOG(ERROR) << \"Not found service: $service$\";\n" + " LOG(ERROR) << \"(logid=\" << log_id << \") Not found service: " + "$service$\";\n" " cntl->SetFailed(404, \"Not found service: $service$\");\n" " return ;\n" " }\n" - " LOG(INFO) << \" remote_side=\[\" << cntl->remote_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \" local_side=\[\" << cntl->local_side() << " // NOLINT - "\"\]\";\n" - " LOG(INFO) << \" service_name=\[\" << \"$name$\" << \"\]\";\n" // NOLINT - " LOG(INFO) << \" log_id=\[\" << cntl->log_id() << \"\]\";\n" // NOLINT + " LOG(INFO) << \"(logid=\" << 
log_id << \") remote_side=\[\" " // NOLINT + " << cntl->remote_side() << \"\]\";\n" + " LOG(INFO) << \"(logid=\" << log_id << \") local_side=\[\" " // NOLINT + "<< cntl->local_side() << \"\]\";\n" + " LOG(INFO) << \"(logid=\" << log_id << \") service_name=\[\" " // NOLINT + "<< \"$name$\" << \"\]\";\n" " butil::IOBufBuilder debug_os;\n" - " int err_code = svr->inference(request, response, &debug_os);\n" + " int err_code = svr->inference(request, response, log_id, " + "&debug_os);\n" " if (err_code != 0) {\n" " LOG(WARNING)\n" - " << \"Failed call inferservice[$name$], name[$service$]\"\n" + " << \"(logid=\" << log_id << \") Failed call " + "inferservice[$name$], name[$service$]\"\n" " << \", error_code: \" << err_code;\n" " cntl->SetFailed(err_code, \"InferService inference " "failed!\");\n" @@ -1076,9 +1100,11 @@ class PdsCodeGenerator : public CodeGenerator { " gettimeofday(&tv, NULL);\n" " long end = tv.tv_sec * 1000000 + tv.tv_usec;\n" " // flush notice log\n" - " LOG(INFO) << \" tc=\[\" << (end - start) << \"\]\";\n" // NOLINT + " LOG(INFO) << \"(logid=\" << log_id << \") tc=\[\" << (end - " // NOLINT + "start) << \"\]\";\n" // NOLINT " LOG(INFO)\n" - " << \"TC=[\" << (end - start) << \"] Received debug " + " << \"(logid=\" << log_id << \") TC=[\" << (end - start) << " + "\"] Received debug " "request[log_id=\" << cntl->log_id()\n" " << \"] from \" << cntl->remote_side()\n" " << \" to \" << cntl->local_side();\n", diff --git a/core/predictor/CMakeLists.txt b/core/predictor/CMakeLists.txt index 1b9dc7b29845a2b8c7f958c1d8e836cb57e91d41..6b5013c3edadb4592df40db539fa75fb9364d02f 100644 --- a/core/predictor/CMakeLists.txt +++ b/core/predictor/CMakeLists.txt @@ -6,7 +6,7 @@ include(framework/CMakeLists.txt) include(tools/CMakeLists.txt) include(src/CMakeLists.txt) - +add_definitions(-D__STDC_FORMAT_MACROS) add_library(pdserving ${pdserving_srcs}) set_source_files_properties( ${pdserving_srcs} diff --git a/core/predictor/common/inner_common.h b/core/predictor/common/inner_common.h index 96b8a8027070da559e239cdc5f6057d534ff3412..f6847146ba14b2b9fc1b07485c748e6e8300d7bd 100644 --- a/core/predictor/common/inner_common.h +++ b/core/predictor/common/inner_common.h @@ -50,7 +50,7 @@ #include "butil/time.h" #endif -#include "glog/raw_logging.h" +#define ERROR_STRING_LEN 10240 #include "core/configure/general_model_config.pb.h" #include "core/configure/include/configure_parser.h" diff --git a/core/predictor/framework/channel.h b/core/predictor/framework/channel.h index a48368329469f36ab7881972e6a7059ab8066b5d..67808be16409cdf0610363d0039accf0f3a9d5cb 100644 --- a/core/predictor/framework/channel.h +++ b/core/predictor/framework/channel.h @@ -72,9 +72,10 @@ class Channel { const std::string& op() { return _op; } - int share_to_bus(Bus* bus) { + int share_to_bus(Bus* bus, const uint64_t log_id) { if (bus->regist(_op, this) != 0) { - LOG(ERROR) << "Failed regist channel[" << _op << "] to bus!"; + LOG(ERROR) << "(logid=" << log_id << ") Failed regist channel[" << _op + << "] to bus!"; return -1; } diff --git a/core/predictor/framework/dag.cpp b/core/predictor/framework/dag.cpp index f039ac70ffe2e55a59f926d754ca411a034058f4..c45952f8fb8f3b6d48c2e1295d6a43d45ad185e5 100644 --- a/core/predictor/framework/dag.cpp +++ b/core/predictor/framework/dag.cpp @@ -155,13 +155,11 @@ int Dag::init(const configure::Workflow& conf, const std::string& name) { } if (FLAGS_el_log_level == 16) { - LOG(INFO) << "DAG: " << _dag_name; - LOG(INFO) << ", Op Num: " << _index_nodes.size(); + LOG(INFO) << "DAG: " << _dag_name << ", 
Op Num: " << _index_nodes.size(); for (uint32_t nid = 0; nid < _index_nodes.size(); nid++) { DagNode* node = _index_nodes[nid]; - LOG(INFO) << ", OP-" << node->id << "-" << node->name << "-" - << node->type; - LOG(INFO) << " depends: " << node->depends.size(); + LOG(INFO) << "OP-" << node->id << "-" << node->name << "-" << node->type + << " depends: " << node->depends.size(); boost::unordered_map::iterator it; for (it = node->depends.begin(); it != node->depends.end(); it++) { @@ -214,8 +212,8 @@ int Dag::topo_sort() { } } for (int i = 0; i < in_degree.size(); ++i) { - LOG(INFO) << "(" << _index_nodes[i]->name << ") in_degree[" << i - << "]: " << in_degree[i]; + VLOG(2) << "(" << _index_nodes[i]->name << ") in_degree[" << i + << "]: " << in_degree[i]; } int sorted_num = 0; DagStage* stage = new (std::nothrow) DagStage(); diff --git a/core/predictor/framework/dag_view.cpp b/core/predictor/framework/dag_view.cpp index bde8084b41fee00bc95d2a35444a15258d2a12a8..29a4e97378c20d6f9caae8a97de7dc5f714960e9 100644 --- a/core/predictor/framework/dag_view.cpp +++ b/core/predictor/framework/dag_view.cpp @@ -26,7 +26,9 @@ namespace baidu { namespace paddle_serving { namespace predictor { -int DagView::init(Dag* dag, const std::string& service_name) { +int DagView::init(Dag* dag, + const std::string& service_name, + const uint64_t log_id) { _name = dag->name(); _full_name = service_name + NAME_DELIMITER + dag->name(); _bus = butil::get_object(); @@ -36,17 +38,20 @@ int DagView::init(Dag* dag, const std::string& service_name) { for (uint32_t si = 0; si < stage_size; si++) { const DagStage* stage = dag->stage_by_index(si); if (stage == NULL) { - LOG(ERROR) << "Failed get stage by index:" << si; + LOG(ERROR) << "(logid=" << log_id << ") Failed get stage by index:" << si; return ERR_INTERNAL_FAILURE; } ViewStage* vstage = butil::get_object(); if (vstage == NULL) { - LOG(ERROR) << "Failed get vstage from object pool" + LOG(ERROR) << "(logid=" << log_id + << ") Failed get vstage from object pool" << "at:" << si; return ERR_MEM_ALLOC_FAILURE; } - VLOG(2) << "stage[" << si << "] name: " << stage->full_name; - VLOG(2) << "stage[" << si << "] node size: " << stage->nodes.size(); + VLOG(2) << "(logid=" << log_id << ") stage[" << si + << "] name: " << stage->full_name; + VLOG(2) << "(logid=" << log_id << ") stage[" << si + << "] node size: " << stage->nodes.size(); vstage->full_name = service_name + NAME_DELIMITER + stage->full_name; uint32_t node_size = stage->nodes.size(); // create tls view node @@ -54,31 +59,39 @@ int DagView::init(Dag* dag, const std::string& service_name) { DagNode* node = stage->nodes[ni]; ViewNode* vnode = butil::get_object(); if (vnode == NULL) { - LOG(ERROR) << "Failed get vnode at:" << ni; + LOG(ERROR) << "(logid=" << log_id << ") Failed get vnode at:" << ni; return ERR_MEM_ALLOC_FAILURE; } // factory type Op* op = OpRepository::instance().get_op(node->type); if (op == NULL) { - LOG(ERROR) << "Failed get op with type:" << node->type; + LOG(ERROR) << "(logid=" << log_id + << ") Failed get op with type:" << node->type; return ERR_INTERNAL_FAILURE; } // initialize a TLS op object - VLOG(2) << "dag view initialized: \n" + VLOG(2) << "(logid=" << log_id << ") dag view initialized: \n" << "node id: " << node->id << "\n" << "node name: " << node->name << "\n" << "node type: " << node->type; - if (op->init(_bus, dag, node->id, node->name, node->type, node->conf) != - 0) { - LOG(WARNING) << "Failed init op, type:" << node->type; + if (op->init(_bus, + dag, + node->id, + node->name, + node->type, + 
node->conf, + log_id) != 0) { + LOG(WARNING) << "(logid=" << log_id + << ") Failed init op, type:" << node->type; return ERR_INTERNAL_FAILURE; } op->set_full_name(service_name + NAME_DELIMITER + node->full_name); // Set the name of the Op as the key of the matching engine. - VLOG(2) << "op->set_engine_name(" << node->name.c_str() << ")"; + VLOG(2) << "(logid=" << log_id << ") op->set_engine_name(" + << node->name.c_str() << ")"; op->set_engine_name(node->name); vnode->conf = node; @@ -88,7 +101,7 @@ int DagView::init(Dag* dag, const std::string& service_name) { it != vnode->conf->depends.end(); ++it) { std::string pre_node_name = it->first; - VLOG(2) << "add op pre name: \n" + VLOG(2) << "(logid=" << log_id << ") add op pre name: \n" << "current op name: " << vnode->op->op_name() << ", previous op name: " << pre_node_name; vnode->op->add_pre_node_name(pre_node_name); @@ -102,7 +115,7 @@ int DagView::init(Dag* dag, const std::string& service_name) { //<< " previous op name: " //<< _view[si - 1]->nodes.back()->op->op_name(); // vstage->nodes.back()->op->set_pre_node_name( - //_view[si - 1]->nodes.back()->op->op_name()); + // _view[si - 1]->nodes.back()->op->op_name()); /*}*/ _view.push_back(vstage); } @@ -133,14 +146,15 @@ int DagView::deinit() { return ERR_OK; } -int DagView::execute(butil::IOBufBuilder* debug_os) { +int DagView::execute(const uint64_t log_id, butil::IOBufBuilder* debug_os) { uint32_t stage_size = _view.size(); for (uint32_t si = 0; si < stage_size; si++) { - TRACEPRINTF("start to execute stage[%u]", si); - int errcode = execute_one_stage(_view[si], debug_os); - TRACEPRINTF("finish to execute stage[%u]", si); + TRACEPRINTF("(logid=%" PRIu64 ") start to execute stage[%u]", log_id, si); + int errcode = execute_one_stage(_view[si], log_id, debug_os); + TRACEPRINTF("(logid=%" PRIu64 ") finish to execute stage[%u]", log_id, si); if (errcode < 0) { - LOG(ERROR) << "failed execute stage[" << _view[si]->debug(); + LOG(ERROR) << "(logid=" << log_id << ") Failed execute stage[" + << _view[si]->debug(); return errcode; } } @@ -151,29 +165,34 @@ int DagView::execute(butil::IOBufBuilder* debug_os) { // You can derive a subclass to implement this func. // ParallelDagView maybe the one you want. 
int DagView::execute_one_stage(ViewStage* vstage, + const uint64_t log_id, butil::IOBufBuilder* debug_os) { butil::Timer stage_time(butil::Timer::STARTED); uint32_t node_size = vstage->nodes.size(); - VLOG(2) << "vstage->nodes.size(): " << node_size; + VLOG(2) << "(logid=" << log_id << ") vstage->nodes.size(): " << node_size; for (uint32_t ni = 0; ni < node_size; ni++) { ViewNode* vnode = vstage->nodes[ni]; DagNode* conf = vnode->conf; Op* op = vnode->op; - TRACEPRINTF("start to execute op[%s]", op->name()); - int errcode = op->process(debug_os != NULL); - TRACEPRINTF("finish to execute op[%s]", op->name()); + TRACEPRINTF( + "(logid=%" PRIu64 ") start to execute op[%s]", log_id, op->name()); + int errcode = op->process(log_id, debug_os != NULL); + TRACEPRINTF( + "(logid=%" PRIu64 ") finish to execute op[%s]", log_id, op->name()); if (errcode < 0) { - LOG(ERROR) << "Execute failed, Op:" << op->debug_string(); + LOG(ERROR) << "(logid=" << log_id + << ") Execute failed, Op:" << op->debug_string(); return errcode; } if (errcode > 0) { - LOG(INFO) << "Execute ignore, Op:" << op->debug_string(); + LOG(INFO) << "(logid=" << log_id + << ") Execute ignore, Op:" << op->debug_string(); continue; } if (debug_os) { - (*debug_os) << "{\"op_name\": \"" << op->name() + (*debug_os) << "(logid=" << log_id << ") {\"op_name\": \"" << op->name() << "\", \"debug_str:\": \"" << op->debug_string() << "\", \"time_info\": \"" << op->time_info() << "\"}"; } @@ -186,34 +205,34 @@ int DagView::execute_one_stage(ViewStage* vstage, return ERR_OK; } -int DagView::set_request_channel(Channel& request) { +int DagView::set_request_channel(Channel& request, const uint64_t log_id) { // Each workflow should get the very beginning // request (channel), and commit it to bus, for // the first stage ops consuming. - request.share_to_bus(_bus); + request.share_to_bus(_bus, log_id); return ERR_OK; } -const Channel* DagView::get_response_channel() const { +const Channel* DagView::get_response_channel(const uint64_t log_id) const { // Caller obtains response channel from bus, and // writes it to rpc response(protbuf/json) if (_view.size() < 1) { - LOG(ERROR) << "invalid empty view stage!"; + LOG(ERROR) << "(logid=" << log_id << ") invalid empty view stage!"; return NULL; } ViewStage* last_stage = _view[_view.size() - 1]; if (last_stage->nodes.size() != 1 || last_stage->nodes[0] == NULL) { - LOG(ERROR) << "Invalid last stage, size[" << last_stage->nodes.size() - << "] != 1"; + LOG(ERROR) << "(logid=" << log_id << ") Invalid last stage, size[" + << last_stage->nodes.size() << "] != 1"; return NULL; } Op* last_op = last_stage->nodes[0]->op; if (last_op == NULL) { - LOG(ERROR) << "Last op is NULL"; + LOG(ERROR) << "(logid=" << log_id << ") Last op is NULL"; return NULL; } return last_op->mutable_channel(); diff --git a/core/predictor/framework/dag_view.h b/core/predictor/framework/dag_view.h index 4999f64b47eb667e90437d387a5ac5ba5337fc64..8ba9d224c577b475d0a52b79e92f72bd1abaa187 100644 --- a/core/predictor/framework/dag_view.h +++ b/core/predictor/framework/dag_view.h @@ -47,21 +47,22 @@ class DagView { ~DagView() {} - int init(Dag* dag, const std::string& service_name); + int init(Dag* dag, const std::string& service_name, const uint64_t log_id); int deinit(); - int execute(butil::IOBufBuilder* debug_os); + int execute(const uint64_t log_id, butil::IOBufBuilder* debug_os); // The default execution strategy is in sequencing // You can derive a subclass to implement this func. // ParallelDagView maybe the one you want. 
virtual int execute_one_stage(ViewStage* vstage, + const uint64_t log_id, butil::IOBufBuilder* debug_os); - int set_request_channel(Channel& request); // NOLINT + int set_request_channel(Channel& request, const uint64_t log_id); // NOLINT - const Channel* get_response_channel() const; + const Channel* get_response_channel(const uint64_t log_id) const; const std::string& name() const { return _name; } diff --git a/core/predictor/framework/factory.h b/core/predictor/framework/factory.h index 8d5fc9a1c40b047351f38a1136728ee179a191ed..fde95eaa1565c8d0f4fca7f846c7c8a49b383163 100644 --- a/core/predictor/framework/factory.h +++ b/core/predictor/framework/factory.h @@ -17,7 +17,7 @@ #include #include #include "core/predictor/common/inner_common.h" -#include "glog/raw_logging.h" + namespace baidu { namespace paddle_serving { namespace predictor { @@ -28,7 +28,12 @@ namespace predictor { FactoryDerive* factory = new (std::nothrow) FactoryDerive(); \ if (factory == NULL || \ FactoryPool::instance().register_factory(tag, factory) != 0) { \ - RAW_LOG_FATAL("Failed regist factory: %s in macro!", #D); \ + char err_str[ERROR_STRING_LEN]; \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s in macro!", \ + #D); \ + RAW_LOG(FATAL, err_str); \ return -1; \ } \ return 0; \ @@ -54,7 +59,13 @@ namespace predictor { if (factory == NULL || \ ::baidu::paddle_serving::predictor::FactoryPool::instance() \ .register_factory(#D, factory) != 0) { \ - RAW_LOG_FATAL("Failed regist factory: %s->%s in macro!", #D, #B); \ + char err_str[ERROR_STRING_LEN]; \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s->%s in macro!", \ + #D, \ + #B); \ + RAW_LOG(FATAL, err_str); \ return; \ } \ return; \ @@ -66,15 +77,26 @@ namespace predictor { ::baidu::paddle_serving::predictor::FactoryDerive* factory = new ( \ ::std::nothrow)::baidu::paddle_serving::predictor::FactoryDerive(); \ + char err_str[ERROR_STRING_LEN]; \ if (factory == NULL || \ ::baidu::paddle_serving::predictor::FactoryPool::instance() \ .register_factory(N, factory) != 0) { \ - RAW_LOG_FATAL( \ - "Failed regist factory: %s->%s, tag: %s in macro!", #D, #B, N); \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s->%s, tag: %s in macro!", \ + #D, \ + #B, \ + N); \ + RAW_LOG(FATAL, err_str); \ return; \ } \ - RAW_LOG_WARNING( \ - "Succ regist factory: %s->%s, tag: %s in macro!", #D, #B, N); \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Succ regist factory: %s->%s, tag: %s in macro!", \ + #D, \ + #B, \ + N); \ + RAW_LOG(WARNING, err_str); \ return; \ } @@ -102,24 +124,35 @@ class FactoryPool { } int register_factory(const std::string& tag, FactoryBase* factory) { + char err_str[ERROR_STRING_LEN]; typename std::map*>::iterator it = _pool.find(tag); if (it != _pool.end()) { - RAW_LOG_FATAL("Insert duplicate with tag: %s", tag.c_str()); + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Insert duplicate with tag: %s", + tag.c_str()); + RAW_LOG(FATAL, err_str); return -1; } std::pair*>::iterator, bool> r = _pool.insert(std::make_pair(tag, factory)); if (!r.second) { - RAW_LOG_FATAL("Failed insert new factory with: %s", tag.c_str()); + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Failed insert new factory with: %s", + tag.c_str()); + RAW_LOG(FATAL, err_str); return -1; } - RAW_LOG_INFO("Succ insert one factory, tag: %s, base type %s", - tag.c_str(), - typeid(B).name()); - + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Succ insert one factory, tag: %s, base type %s", + tag.c_str(), + 
typeid(B).name()); + RAW_LOG(INFO, err_str); return 0; } @@ -127,9 +160,13 @@ class FactoryPool { typename std::map*>::iterator it = _pool.find(tag); if (it == _pool.end() || it->second == NULL) { - RAW_LOG_FATAL("Not found factory pool, tag: %s, pool size %u", - tag.c_str(), - _pool.size()); + char err_str[ERROR_STRING_LEN]; + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Not found factory pool, tag: %s, pool size %u", + tag.c_str(), + _pool.size()); + RAW_LOG(FATAL, err_str); return NULL; } diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h index 1ab49f36a8b84fbb122204f5beb8d16290206674..9fc79bd54b1a718bfb969622d4046214e215ea95 100644 --- a/core/predictor/framework/infer.h +++ b/core/predictor/framework/infer.h @@ -615,7 +615,8 @@ class VersionedInferEngine : public InferEngine { LOG(ERROR) << "Failed generate engine with type:" << engine_type; return -1; } - VLOG(2) << "FLGS_logtostderr " << FLAGS_logtostderr; +#ifndef BCLOUD + VLOG(2) << "FLAGS_logtostderr " << FLAGS_logtostderr; int tmp = FLAGS_logtostderr; if (engine->proc_initialize(conf, version) != 0) { LOG(ERROR) << "Failed initialize engine, type:" << engine_type; @@ -623,6 +624,12 @@ class VersionedInferEngine : public InferEngine { } VLOG(2) << "FLGS_logtostderr " << FLAGS_logtostderr; FLAGS_logtostderr = tmp; +#else + if (engine->proc_initialize(conf, version) != 0) { + LOG(ERROR) << "Failed initialize engine, type:" << engine_type; + return -1; + } +#endif auto r = _versions.insert(std::make_pair(engine->version(), engine)); if (!r.second) { LOG(ERROR) << "Failed insert item: " << engine->version() diff --git a/core/predictor/framework/op_repository.h b/core/predictor/framework/op_repository.h index d27e68c1dbcd98e7393aac6e8b0f001e7300a2bc..bf3b2327cd4a1f0af83c98a5bfe529c37ceb403e 100644 --- a/core/predictor/framework/op_repository.h +++ b/core/predictor/framework/op_repository.h @@ -62,7 +62,10 @@ class OpRepository { template void regist_op(std::string op_type) { _repository[op_type] = &OpFactory::instance(); - RAW_LOG_INFO("Succ regist op: %s", op_type.c_str()); + char err_str[ERROR_STRING_LEN]; + snprintf( + err_str, ERROR_STRING_LEN - 1, "Succ regist op: %s", op_type.c_str()); + RAW_LOG(INFO, err_str); } Op* get_op(std::string op_type); diff --git a/core/predictor/framework/resource.cpp b/core/predictor/framework/resource.cpp index ca219519e2dcf20bc961d991e3f2eb0ad060f38f..cdb21097fdf40ca6060d99088ed5649a08507720 100644 --- a/core/predictor/framework/resource.cpp +++ b/core/predictor/framework/resource.cpp @@ -17,6 +17,9 @@ #include #include "core/predictor/common/inner_common.h" #include "core/predictor/framework/kv_manager.h" +#ifdef BCLOUD +#include "aipe_sec_client.h" // NOLINT +#endif namespace baidu { namespace paddle_serving { namespace predictor { @@ -109,6 +112,42 @@ int Resource::initialize(const std::string& path, const std::string& file) { } LOG(WARNING) << "Successfully proc initialized mempool wrapper"; +#ifdef WITH_AUTH + std::string product_name_str = resource_conf.auth_product_name(); + std::string container_id_str = resource_conf.auth_container_id(); + + char* product_name = new char[product_name_str.size() + 1]; + snprintf(product_name, + product_name_str.size() + 1, + "%s", + product_name_str.c_str()); + char* container_id = new char[container_id_str.size() + 1]; + snprintf(container_id, + container_id_str.size() + 1, + "%s", + container_id_str.c_str()); + + aipe_auth_request request; + request.product_name = product_name; + request.container_id = container_id; + 
request.request_ts = (int64_t)time(NULL); + + LOG(INFO) << "\nEasypack info" + << "\nproduct name: " << request.product_name + << "\ncontainer_id: " << request.container_id + << "\nrequest time stamp: " << request.request_ts; + + aipe_auth_response response; + response = check_auth(request); + + if (response.result == 0) { + LOG(INFO) << "Authentication succeed."; + } else { + LOG(ERROR) << "Authentication failed. Error code: " << response.result; + return -1; + } +#endif + if (FLAGS_enable_model_toolkit) { int err = 0; std::string model_toolkit_path = resource_conf.model_toolkit_path(); diff --git a/core/predictor/framework/service.cpp b/core/predictor/framework/service.cpp index 95c7db9f96a6e78522190e3f522d38669423475b..cb02a3278b37bd76631193fbd78cf026eed633c9 100644 --- a/core/predictor/framework/service.cpp +++ b/core/predictor/framework/service.cpp @@ -19,6 +19,7 @@ #include // butil::Timer #endif +#include #include #include #include @@ -135,50 +136,63 @@ const std::string& InferService::name() const { return _infer_service_format; } // ´®ÐÐÖ´ÐÐÿ¸öworkflow int InferService::inference(const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os) { - TRACEPRINTF("start to inference"); + TRACEPRINTF("(logid=%" PRIu64 ") start to inference", log_id); // when funtion call begins, framework will reset // thread local variables&resources automatically. if (Resource::instance().thread_clear() != 0) { - LOG(ERROR) << "Failed thread clear whole resource"; + LOG(ERROR) << "(logid=" << log_id << ") Failed thread clear whole resource"; return ERR_INTERNAL_FAILURE; } - TRACEPRINTF("finish to thread clear"); + TRACEPRINTF("(logid=%" PRIu64 ") finish to thread clear", log_id); if (_enable_map_request_to_workflow) { - LOG(INFO) << "enable map request == True"; - std::vector* workflows = _map_request_to_workflow(request); + VLOG(2) << "(logid=" << log_id << ") enable map request == True"; + std::vector* workflows = + _map_request_to_workflow(request, log_id); if (!workflows || workflows->size() == 0) { - LOG(ERROR) << "Failed to map request to workflow"; + LOG(ERROR) << "(logid=" << log_id + << ") Failed to map request to workflow"; return ERR_INTERNAL_FAILURE; } size_t fsize = workflows->size(); for (size_t fi = 0; fi < fsize; ++fi) { Workflow* workflow = (*workflows)[fi]; if (workflow == NULL) { - LOG(ERROR) << "Failed to get valid workflow at: " << fi; + LOG(ERROR) << "(logid=" << log_id + << ") Failed to get valid workflow at: " << fi; return ERR_INTERNAL_FAILURE; } - TRACEPRINTF("start to execute workflow[%s]", workflow->name().c_str()); - int errcode = _execute_workflow(workflow, request, response, debug_os); - TRACEPRINTF("finish to execute workflow[%s]", workflow->name().c_str()); + TRACEPRINTF("(logid=%" PRIu64 ") start to execute workflow[%s]", + log_id, + workflow->name().c_str()); + int errcode = + _execute_workflow(workflow, request, response, log_id, debug_os); + TRACEPRINTF("(logid=%" PRIu64 ") finish to execute workflow[%s]", + log_id, + workflow->name().c_str()); if (errcode < 0) { - LOG(ERROR) << "Failed execute workflow[" << workflow->name() - << "] in:" << name(); + LOG(ERROR) << "(logid=" << log_id << ") Failed execute workflow[" + << workflow->name() << "] in:" << name(); return errcode; } } } else { - LOG(INFO) << "enable map request == False"; - TRACEPRINTF("start to execute one workflow"); + VLOG(2) << "(logid=" << log_id << ") enable map request == False"; + TRACEPRINTF("(logid=%" PRIu64 ") start to 
execute one workflow", log_id); size_t fsize = _flows.size(); for (size_t fi = 0; fi < fsize; ++fi) { - TRACEPRINTF("start to execute one workflow-%lu", fi); - int errcode = execute_one_workflow(fi, request, response, debug_os); - TRACEPRINTF("finish to execute one workflow-%lu", fi); + TRACEPRINTF( + "(logid=%" PRIu64 ") start to execute one workflow-%lu", log_id, fi); + int errcode = + execute_one_workflow(fi, request, response, log_id, debug_os); + TRACEPRINTF( + "(logid=%" PRIu64 ") finish to execute one workflow-%lu", log_id, fi); if (errcode < 0) { - LOG(ERROR) << "Failed execute 0-th workflow in:" << name(); + LOG(ERROR) << "(logid=" << log_id + << ") Failed execute 0-th workflow in:" << name(); return errcode; } } @@ -188,26 +202,30 @@ int InferService::inference(const google::protobuf::Message* request, int InferService::debug(const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os) { - return inference(request, response, debug_os); + return inference(request, response, log_id, debug_os); } int InferService::execute_one_workflow(uint32_t index, const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os) { if (index >= _flows.size()) { - LOG(ERROR) << "Faield execute workflow, index: " << index + LOG(ERROR) << "(logid=" << log_id + << ") Faield execute workflow, index: " << index << " >= max:" << _flows.size(); return ERR_OVERFLOW_FAILURE; } Workflow* workflow = _flows[index]; - return _execute_workflow(workflow, request, response, debug_os); + return _execute_workflow(workflow, request, response, log_id, debug_os); } int InferService::_execute_workflow(Workflow* workflow, const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os) { butil::Timer workflow_time(butil::Timer::STARTED); // create and submit beginer channel @@ -215,54 +233,62 @@ int InferService::_execute_workflow(Workflow* workflow, req_channel.init(0, START_OP_NAME); req_channel = request; - DagView* dv = workflow->fetch_dag_view(full_name()); - dv->set_request_channel(req_channel); + DagView* dv = workflow->fetch_dag_view(full_name(), log_id); + dv->set_request_channel(req_channel, log_id); // call actual inference interface - int errcode = dv->execute(debug_os); + int errcode = dv->execute(log_id, debug_os); if (errcode < 0) { - LOG(ERROR) << "Failed execute dag for workflow:" << workflow->name(); + LOG(ERROR) << "(logid=" << log_id + << ") Failed execute dag for workflow:" << workflow->name(); return errcode; } - TRACEPRINTF("finish to dv execute"); + TRACEPRINTF("(logid=%" PRIu64 ") finish to dv execute", log_id); // create ender channel and copy - const Channel* res_channel = dv->get_response_channel(); + const Channel* res_channel = dv->get_response_channel(log_id); + if (res_channel == NULL) { + LOG(ERROR) << "(logid=" << log_id << ") Failed get response channel"; + return ERR_INTERNAL_FAILURE; + } + if (!_merger || !_merger->merge(res_channel->message(), response)) { - LOG(ERROR) << "Failed merge channel res to response"; + LOG(ERROR) << "(logid=" << log_id + << ") Failed merge channel res to response"; return ERR_INTERNAL_FAILURE; } - TRACEPRINTF("finish to copy from"); + TRACEPRINTF("(logid=%" PRIu64 ") finish to copy from", log_id); workflow_time.stop(); - LOG(INFO) << "workflow total time: " << workflow_time.u_elapsed(); + LOG(INFO) << "(logid=" << log_id + << ") workflow total 
time: " << workflow_time.u_elapsed(); PredictorMetric::GetInstance()->update_latency_metric( WORKFLOW_METRIC_PREFIX + dv->full_name(), workflow_time.u_elapsed()); // return tls data to object pool workflow->return_dag_view(dv); - TRACEPRINTF("finish to return dag view"); + TRACEPRINTF("(logid=%" PRIu64 ") finish to return dag view", log_id); return ERR_OK; } std::vector* InferService::_map_request_to_workflow( - const google::protobuf::Message* request) { + const google::protobuf::Message* request, const uint64_t log_id) { const google::protobuf::Descriptor* desc = request->GetDescriptor(); const google::protobuf::FieldDescriptor* field = desc->FindFieldByName(_request_field_key); if (field == NULL) { - LOG(ERROR) << "No field[" << _request_field_key << "] in [" - << desc->full_name() << "]."; + LOG(ERROR) << "(logid=" << log_id << ") No field[" << _request_field_key + << "] in [" << desc->full_name() << "]."; return NULL; } if (field->is_repeated()) { - LOG(ERROR) << "field[" << desc->full_name() << "." << _request_field_key - << "] is repeated."; + LOG(ERROR) << "(logid=" << log_id << ") field[" << desc->full_name() << "." + << _request_field_key << "] is repeated."; return NULL; } if (field->cpp_type() != google::protobuf::FieldDescriptor::CPPTYPE_STRING) { - LOG(ERROR) << "field[" << desc->full_name() << "." << _request_field_key - << "] should be string"; + LOG(ERROR) << "(logid=" << log_id << ") field[" << desc->full_name() << "." + << _request_field_key << "] should be string"; return NULL; } const std::string& field_value = @@ -270,7 +296,7 @@ std::vector* InferService::_map_request_to_workflow( std::vector* p_workflow = _request_to_workflow_map.seek(field_value); if (p_workflow == NULL) { - LOG(ERROR) << "cannot find key[" << field_value + LOG(ERROR) << "(logid=" << log_id << ") cannot find key[" << field_value << "] in _request_to_workflow_map"; return NULL; } diff --git a/core/predictor/framework/service.h b/core/predictor/framework/service.h index ef6d3a3a468d1fc47c3012ad5d664bb64595a52c..d3fb0b988f002ab68d28173f9993c02b8eb76813 100644 --- a/core/predictor/framework/service.h +++ b/core/predictor/framework/service.h @@ -52,25 +52,29 @@ class InferService { // Execute each workflow serially virtual int inference(const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os = NULL); int debug(const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os); int execute_one_workflow(uint32_t index, const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os); private: int _execute_workflow(Workflow* workflow, const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os); std::vector* _map_request_to_workflow( - const google::protobuf::Message* request); + const google::protobuf::Message* request, const uint64_t log_id); private: std::vector _flows; @@ -88,6 +92,7 @@ class ParallelInferService : public InferService { // Execute workflows in parallel int inference(const google::protobuf::Message* request, google::protobuf::Message* response, + const uint64_t log_id, butil::IOBufBuilder* debug_os) { return 0; } diff --git a/core/predictor/framework/service_manager.h b/core/predictor/framework/service_manager.h index fa5e872625739ce233d7dd5efe11e1a0fa61d49d..b6b301dd3dc88dc064e0b17739fa059f3366f023 
100644 --- a/core/predictor/framework/service_manager.h +++ b/core/predictor/framework/service_manager.h @@ -23,17 +23,24 @@ namespace predictor { #define REGIST_FORMAT_SERVICE(svr_name, svr) \ do { \ + char err_str[ERROR_STRING_LEN]; \ int ret = \ ::baidu::paddle_serving::predictor::FormatServiceManager::instance() \ .regist_service(svr_name, svr); \ if (ret != 0) { \ - RAW_LOG_ERROR("Failed regist service[%s][%s]", \ - svr_name.c_str(), \ - typeid(svr).name()); \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist service[%s][%s]", \ + svr_name.c_str(), \ + typeid(svr).name()); \ + RAW_LOG(ERROR, err_str); \ } else { \ - RAW_LOG_INFO("Success regist service[%s][%s]", \ - svr_name.c_str(), \ - typeid(svr).name()); \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Success regist service[%s][%s]", \ + svr_name.c_str(), \ + typeid(svr).name()); \ + RAW_LOG(INFO, err_str); \ } \ } while (0) @@ -42,31 +49,46 @@ class FormatServiceManager { typedef google::protobuf::Service Service; int regist_service(const std::string& svr_name, Service* svr) { + char err_str[ERROR_STRING_LEN]; if (_service_map.find(svr_name) != _service_map.end()) { - RAW_LOG_ERROR("Service[%s][%s] already exist!", - svr_name.c_str(), - typeid(svr).name()); + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Service[%s][%s] already exist!", + svr_name.c_str(), + typeid(svr).name()); + RAW_LOG(ERROR, err_str); return -1; } std::pair::iterator, bool> ret; ret = _service_map.insert(std::make_pair(svr_name, svr)); if (ret.second == false) { - RAW_LOG_ERROR("Service[%s][%s] insert failed!", - svr_name.c_str(), - typeid(svr).name()); + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Service[%s][%s] insert failed!", + svr_name.c_str(), + typeid(svr).name()); + RAW_LOG(ERROR, err_str); return -1; } - RAW_LOG_INFO("Service[%s] insert successfully!", svr_name.c_str()); + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Service[%s] insert successfully!", + svr_name.c_str()); + RAW_LOG(INFO, err_str); return 0; } Service* get_service(const std::string& svr_name) { + char err_str[ERROR_STRING_LEN]; boost::unordered_map::iterator res; if ((res = _service_map.find(svr_name)) == _service_map.end()) { - RAW_LOG_WARNING("Service[%s] not found in service manager!", - svr_name.c_str()); + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Service[%s] not found in service manager!", + svr_name.c_str()); + RAW_LOG(WARNING, err_str); return NULL; } return (*res).second; diff --git a/core/predictor/framework/workflow.cpp b/core/predictor/framework/workflow.cpp index 16c4a6e9f475bf575f84bd24764d6348ac65120c..147ab36b79330c781c605d2d29ffb04c4f761aa7 100644 --- a/core/predictor/framework/workflow.cpp +++ b/core/predictor/framework/workflow.cpp @@ -32,21 +32,22 @@ int Workflow::init(const configure::Workflow& conf) { return 0; } -DagView* Workflow::fetch_dag_view(const std::string& service_name) { +DagView* Workflow::fetch_dag_view(const std::string& service_name, + const uint64_t log_id) { DagView* view = NULL; if (_type == "Sequence") { view = butil::get_object(); } else if (_type == "Parallel") { view = butil::get_object(); } else { - LOG(ERROR) << "Unknown dag type:" << _type << "!"; + LOG(ERROR) << "(logid=" << log_id << ") Unknown dag type:" << _type << "!"; return NULL; } if (view == NULL) { - LOG(ERROR) << "create dag view from pool failed!"; + LOG(ERROR) << "(logid=" << log_id << ") create dag view from pool failed!"; return NULL; } - view->init(&_dag, service_name); + view->init(&_dag, service_name, log_id); return view; } diff --git 
a/core/predictor/framework/workflow.h b/core/predictor/framework/workflow.h index a4b3ed1dadccaa24cbeb6813ec7bcc18bac2aad8..14e4d567a540a19579208c91d046ba83de1679e3 100644 --- a/core/predictor/framework/workflow.h +++ b/core/predictor/framework/workflow.h @@ -36,7 +36,8 @@ class Workflow { // different apps. int init(const configure::Workflow& conf); - DagView* fetch_dag_view(const std::string& service_name); + DagView* fetch_dag_view(const std::string& service_name, + const uint64_t log_id); int deinit() { return 0; } diff --git a/core/predictor/op/op.cpp b/core/predictor/op/op.cpp index 59ef6aed71977a3f762ff4fbe9480db19cb4057e..33dba2b506543ed1103cb0b456f5f054969f17fa 100644 --- a/core/predictor/op/op.cpp +++ b/core/predictor/op/op.cpp @@ -35,7 +35,8 @@ int Op::init(Bus* bus, uint32_t id, const std::string& name, const std::string& type, - void* conf) { + void* conf, + const uint64_t log_id) { _bus = bus; _dag = dag; _id = id; @@ -45,7 +46,8 @@ int Op::init(Bus* bus, _timer = butil::get_object(); if (!_timer) { - LOG(ERROR) << "Invalid timerflow in op:" << this->name(); + LOG(ERROR) << "(logid=" << log_id + << ") Invalid timerflow in op:" << this->name(); return -1; } @@ -55,7 +57,8 @@ int Op::init(Bus* bus, Channel* channel = mutable_channel(); if (channel == NULL) { - LOG(ERROR) << "Failed mutable channel in op: " << this->id() << ", " + LOG(ERROR) << "(logid=" << log_id + << ") Failed mutable channel in op: " << this->id() << ", " << this->name() << "!"; return -1; } @@ -96,18 +99,20 @@ int Op::check_time(const char* tag) { return 0; } -int Op::process(bool debug) { +int Op::process(const uint64_t log_id, bool debug) { butil::Timer op_time(butil::Timer::STARTED); if (debug && _timer) { _timer->start(); } if (!_has_init) { - LOG(ERROR) << "Make sure op has been init before inference"; + LOG(ERROR) << "(logid=" << log_id + << ") Make sure op has been init before inference"; return ERR_INTERNAL_FAILURE; } if (_has_calc) { - LOG(INFO) << "Op: " << _name << " already processed before"; + LOG(INFO) << "(logid=" << log_id << ") Op: " << _name + << " already processed before"; return ERR_OK; } @@ -143,7 +148,7 @@ int Op::process(bool debug) { // 3. share output to bus Channel* channel = mutable_channel(); - channel->share_to_bus(_bus); + channel->share_to_bus(_bus, log_id); // 4. 
mark has calculated _has_calc = true; @@ -156,7 +161,8 @@ int Op::process(bool debug) { op_time.stop(); PredictorMetric::GetInstance()->update_latency_metric( OP_METRIC_PREFIX + full_name(), op_time.u_elapsed()); - LOG(INFO) << " " << name() << "_time=[" << op_time.u_elapsed() << "]"; + LOG(INFO) << "(logid=" << log_id << ") " << name() << "_time=[" + << op_time.u_elapsed() << "]"; return ERR_OK; } diff --git a/core/predictor/op/op.h b/core/predictor/op/op.h index ae52975fe6f2506fb0bf483318f607df137c8a96..ea700cce164805d04ddd10b72311f068245e2f10 100644 --- a/core/predictor/op/op.h +++ b/core/predictor/op/op.h @@ -113,13 +113,14 @@ class Op { uint32_t id, const std::string& name, const std::string& type, - void* conf); + void* conf, + const uint64_t log_id); int deinit(); int check_time(const char* tag); - int process(bool debug); + int process(const uint64_t log_id, bool debug); std::string time_info(); diff --git a/core/predictor/src/pdserving.cpp b/core/predictor/src/pdserving.cpp index 157d52cee1adaea0524ebde01f75a90a6b2adc2f..59ec59d9012c94c322eee2ab3f357218deeedbb4 100644 --- a/core/predictor/src/pdserving.cpp +++ b/core/predictor/src/pdserving.cpp @@ -202,8 +202,6 @@ int main(int argc, char** argv) { } VLOG(2) << "Succ call pthread worker start function"; -#ifndef BCLOUD - if (Resource::instance().general_model_initialize(FLAGS_resource_path, FLAGS_resource_file) != 0) { LOG(ERROR) << "Failed to initialize general model conf: " @@ -213,6 +211,7 @@ int main(int argc, char** argv) { VLOG(2) << "Succ initialize general model"; +#ifndef BCLOUD // FATAL messages are output to stderr FLAGS_stderrthreshold = 3; #endif diff --git a/core/sdk-cpp/include/abtest.h b/core/sdk-cpp/include/abtest.h index 4833325416cfd6418bf33444001917d887f08cc0..47a502745ae8aa6297729a0a3695600402cf5cfe 100644 --- a/core/sdk-cpp/include/abtest.h +++ b/core/sdk-cpp/include/abtest.h @@ -50,9 +50,9 @@ class WeightedRandomRender : public EndpointRouterBase { Factory* factory = new (std::nothrow) Factory(); if (factory == NULL) { - RAW_LOG_ERROR( - "Failed regist factory: WeightedRandomRender->EndpointRouterBase in " - "macro!"); + RAW_LOG(ERROR, + "Failed regist factory: WeightedRandomRender->EndpointRouterBase " + "in macro!"); return -1; } @@ -62,9 +62,9 @@ class WeightedRandomRender : public EndpointRouterBase { // together. 
if (FactoryPool::instance().register_factory( "WeightedRandomRender", factory) != 0) { - RAW_LOG_INFO( - "Factory has been registed: " - "WeightedRandomRender->EndpointRouterBase."); + RAW_LOG(INFO, + "Factory has been registed: " + "WeightedRandomRender->EndpointRouterBase."); } return 0; diff --git a/core/sdk-cpp/include/factory.h b/core/sdk-cpp/include/factory.h index 4a3d988afcd981dd92eca5f65c3f254d5f2255d5..89c8aae3ef6bd7b296a8a953f2db88786b501352 100644 --- a/core/sdk-cpp/include/factory.h +++ b/core/sdk-cpp/include/factory.h @@ -18,7 +18,6 @@ #include #include "core/sdk-cpp/include/common.h" #include "core/sdk-cpp/include/stub_impl.h" -#include "glog/raw_logging.h" namespace baidu { namespace paddle_serving { @@ -28,12 +27,20 @@ namespace sdk_cpp { namespace brpc = baidu::rpc; #endif +#define ERROR_STRING_LEN 10240 + #define INLINE_REGIST_OBJECT(D, B, E) \ do { \ Factory* factory = new (std::nothrow) Factory(); \ if (factory == NULL || \ FactoryPool::instance().register_factory(#D, factory) != 0) { \ - RAW_LOG_ERROR("Failed regist factory: %s->%s in macro!", #D, #B); \ + char err_str[ERROR_STRING_LEN]; \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s->%s in macro!", \ + #D, \ + #B); \ + RAW_LOG(ERROR, err_str); \ return E; \ } \ } while (0) @@ -43,7 +50,12 @@ namespace brpc = baidu::rpc; Factory* factory = new (std::nothrow) Factory(); \ if (factory == NULL || \ FactoryPool::instance().register_factory(tag, factory) != 0) { \ - RAW_LOG_ERROR("Failed regist factory: %s in macro!", #D); \ + char err_str[ERROR_STRING_LEN]; \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s in macro!", \ + #D); \ + RAW_LOG(ERROR, err_str); \ return -1; \ } \ return 0; \ @@ -66,7 +78,13 @@ namespace brpc = baidu::rpc; if (factory == NULL || \ ::baidu::paddle_serving::sdk_cpp::FactoryPool::instance() \ .register_factory(#D, factory) != 0) { \ - RAW_LOG_ERROR("Failed regist factory: %s->%s in macro!", #D, #B); \ + char err_str[ERROR_STRING_LEN]; \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s->%s in macro!", \ + #D, \ + #B); \ + RAW_LOG(ERROR, err_str); \ return; \ } \ return; \ @@ -80,8 +98,14 @@ namespace brpc = baidu::rpc; if (factory == NULL || \ ::baidu::paddle_serving::sdk_cpp::FactoryPool::instance() \ .register_factory(T, factory) != 0) { \ - RAW_LOG_ERROR( \ - "Failed regist factory: %s->%s, tag %s in macro!", #D, #B, T); \ + char err_str[ERROR_STRING_LEN]; \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s->%s, tag %s in macro!", \ + #D, \ + #B, \ + T); \ + RAW_LOG(ERROR, err_str); \ return; \ } \ return; \ @@ -108,8 +132,13 @@ namespace brpc = baidu::rpc; ::baidu::paddle_serving::sdk_cpp::FactoryPool< \ ::baidu::paddle_serving::sdk_cpp::Stub>::instance() \ .register_factory(T, factory) != 0) { \ - RAW_LOG_ERROR( \ - "Failed regist factory: %s->Stub, tag: %s in macro!", #D, T); \ + char err_str[ERROR_STRING_LEN]; \ + snprintf(err_str, \ + ERROR_STRING_LEN - 1, \ + "Failed regist factory: %s->Stub, tag: %s in macro!", \ + #D, \ + T); \ + RAW_LOG(ERROR, err_str); \ return; \ } \ return; \ @@ -146,14 +175,24 @@ class FactoryPool { typename std::map*>::iterator it = _pool.find(tag); if (it != _pool.end()) { - RAW_LOG_ERROR("Insert duplicate with tag: %s", tag.c_str()); + char err_str[ERROR_STRING_LEN]; + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Insert duplicate with tag: %s", + tag.c_str()); + RAW_LOG(ERROR, err_str); return -1; } std::pair*>::iterator, bool> r = 
_pool.insert(std::make_pair(tag, factory)); if (!r.second) { - RAW_LOG_ERROR("Failed insert new factory with: %s", tag.c_str()); + char err_str[ERROR_STRING_LEN]; + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Failed insert new factory with: %s", + tag.c_str()); + RAW_LOG(ERROR, err_str); return -1; } @@ -164,9 +203,13 @@ class FactoryPool { typename std::map*>::iterator it = _pool.find(tag); if (it == _pool.end() || it->second == NULL) { - RAW_LOG_ERROR("Not found factory pool, tag: %s, pool size: %u", - tag.c_str(), - _pool.size()); + char err_str[ERROR_STRING_LEN]; + snprintf(err_str, + ERROR_STRING_LEN - 1, + "Not found factory pool, tag: %s, pool size: %u", + tag.c_str(), + _pool.size()); + RAW_LOG(ERROR, err_str); return NULL; } diff --git a/core/sdk-cpp/proto/general_model_service.proto b/core/sdk-cpp/proto/general_model_service.proto index 51c0335a9db896e1260e83915de81e51451a904b..9988b298bdd22210fbe3127b9e4b57c89077f3ff 100644 --- a/core/sdk-cpp/proto/general_model_service.proto +++ b/core/sdk-cpp/proto/general_model_service.proto @@ -37,6 +37,7 @@ message Request { repeated FeedInst insts = 1; repeated string fetch_var_names = 2; optional bool profile_server = 3 [ default = false ]; + required uint64 log_id = 4 [ default = 0 ]; }; message Response { diff --git a/doc/COMPILE.md b/doc/COMPILE.md index 84b1b65cbdbb0dcf6079d30bd7ebc9baf4a8c6e1..abb66084ac6f6c57c13c940eb10a87e2aba2daa2 100644 --- a/doc/COMPILE.md +++ b/doc/COMPILE.md @@ -4,26 +4,25 @@ ## Compilation environment requirements -| module | version | -| :--------------------------: | :----------------------------------------------------------: | -| OS | CentOS 7 | -| gcc | 4.8.5 and later | -| gcc-c++ | 4.8.5 and later | -| git | 3.82 and later | -| cmake | 3.2.0 and later | -| Python | 2.7.2 and later / 3.6 and later | -| Go | 1.9.2 and later | -| git | 2.17.1 and later | -| glibc-static | 2.17 | -| openssl-devel | 1.0.2k | -| bzip2-devel | 1.0.6 and later | -| python-devel / python3-devel | 2.7.5 and later / 3.6.8 and later | -| sqlite-devel | 3.7.17 and later | -| patchelf | 0.9 and later | -| libXext | 1.3.3 | -| libSM | 1.2.2 | -| libXrender | 0.9.10 | -| python-whl | numpy>=1.12, <=1.16.4
google>=2.0.3<br>protobuf>=3.12.2<br>grpcio-tools>=1.28.1<br>grpcio>=1.28.1<br>func-timeout>=4.3.5<br>pyyaml>=1.3.0<br>sentencepiece==0.1.92<br>flask>=1.1.2
ujson>=2.0.3 | +| module | version | +| :--------------------------: | :-------------------------------: | +| OS | CentOS 7 | +| gcc | 4.8.5 and later | +| gcc-c++ | 4.8.5 and later | +| git | 3.82 and later | +| cmake | 3.2.0 and later | +| Python | 2.7.2 and later / 3.6 and later | +| Go | 1.9.2 and later | +| git | 2.17.1 and later | +| glibc-static | 2.17 | +| openssl-devel | 1.0.2k | +| bzip2-devel | 1.0.6 and later | +| python-devel / python3-devel | 2.7.5 and later / 3.6.8 and later | +| sqlite-devel | 3.7.17 and later | +| patchelf | 0.9 and later | +| libXext | 1.3.3 | +| libSM | 1.2.2 | +| libXrender | 0.9.10 | It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you, see [this document](DOCKER_IMAGES.md). @@ -62,6 +61,25 @@ pip install -r python/requirements.txt If Python3 is used, replace `pip` with `pip3`. +## GOPATH Setting + + +## Compile Arguments + +The default GOPATH is `$HOME/go`, which you can set to other values. +```shell +export GOPATH=$HOME/go +export PATH=$PATH:$GOPATH/bin +``` + +## Get go packages + +```shell +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger +go get -u github.com/golang/protobuf/protoc-gen-go +go get -u google.golang.org/grpc +``` ## Compile Server @@ -70,7 +88,10 @@ If Python3 is used, replace `pip` with `pip3`. ``` shell mkdir server-build-cpu && cd server-build-cpu -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DSERVER=ON .. make -j10 ``` @@ -80,7 +101,11 @@ you can execute `make install` to put targets under directory `./output`, you ne ``` shell mkdir server-build-gpu && cd server-build-gpu -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON -DWITH_GPU=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DSERVER=ON \ + -DWITH_GPU=ON .. make -j10 ``` @@ -94,7 +119,10 @@ execute `make install` to put targets under directory `./output` ``` shell mkdir client-build && cd client-build -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DCLIENT=ON .. make -j10 ``` @@ -114,7 +142,7 @@ make ## Install wheel package -Regardless of the client, server or App part, after compiling, install the whl package under `python/dist/`. +Regardless of the client, server or App part, after compiling, install the whl package in `python/dist/` in the temporary directory(`server-build-cpu`, `server-build-gpu`, `client-build`,`app-build`) of the compilation process. @@ -124,6 +152,12 @@ When running the python server, it will check the `SERVING_BIN` environment vari +## Verify + +Please use the example under `python/examples` to verify. 
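A minimal smoke test, assuming the compiled wheels have been installed and the `fit_a_line` example is available under `python/examples` (the model, script, and path names below come from that example and should be checked against its README; other examples follow the same pattern):

```shell
# Optional: point SERVING_BIN at the freshly built serving binary to verify the local build
# instead of the binary bundled with the wheel (the exact build-output path is an assumption).
export SERVING_BIN=$PWD/server-build-cpu/core/general-server/serving

cd python/examples/fit_a_line
sh get_data.sh    # downloads the uci_housing model and client config used by this example

# Start an RPC server on port 9292, then run the bundled client against it.
python -m paddle_serving_server.serve --model uci_housing_model --port 9292 &
python test_client.py uci_housing_client/serving_client_conf.prototxt
```

If the client prints a prediction result without errors, the compiled packages are working.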
+ + + ## CMake Option Description | Compile Options | Description | Default | diff --git a/doc/COMPILE_CN.md b/doc/COMPILE_CN.md index a38faff4289a4946d82f8b4a71afd521c7cd48fd..2ddaaf71f23b0199c7458d068139a6b7169c25d8 100644 --- a/doc/COMPILE_CN.md +++ b/doc/COMPILE_CN.md @@ -4,26 +4,25 @@ ## 编译环境设置 -| 组件 | 版本要求 | -| :--------------------------: | :----------------------------------------------------------: | -| OS | CentOS 7 | -| gcc | 4.8.5 and later | -| gcc-c++ | 4.8.5 and later | -| git | 3.82 and later | -| cmake | 3.2.0 and later | -| Python | 2.7.2 and later / 3.6 and later | -| Go | 1.9.2 and later | -| git | 2.17.1 and later | -| glibc-static | 2.17 | -| openssl-devel | 1.0.2k | -| bzip2-devel | 1.0.6 and later | -| python-devel / python3-devel | 2.7.5 and later / 3.6.8 and later | -| sqlite-devel | 3.7.17 and later | -| patchelf | 0.9 | -| libXext | 1.3.3 | -| libSM | 1.2.2 | -| libXrender | 0.9.10 | -| python-whl | numpy>=1.12, <=1.16.4
google>=2.0.3<br>protobuf>=3.12.2<br>grpcio-tools>=1.28.1<br>grpcio>=1.28.1<br>func-timeout>=4.3.5<br>pyyaml>=1.3.0<br>sentencepiece==0.1.92<br>flask>=1.1.2
ujson>=2.0.3 | +| 组件 | 版本要求 | +| :--------------------------: | :-------------------------------: | +| OS | CentOS 7 | +| gcc | 4.8.5 and later | +| gcc-c++ | 4.8.5 and later | +| git | 3.82 and later | +| cmake | 3.2.0 and later | +| Python | 2.7.2 and later / 3.6 and later | +| Go | 1.9.2 and later | +| git | 2.17.1 and later | +| glibc-static | 2.17 | +| openssl-devel | 1.0.2k | +| bzip2-devel | 1.0.6 and later | +| python-devel / python3-devel | 2.7.5 and later / 3.6.8 and later | +| sqlite-devel | 3.7.17 and later | +| patchelf | 0.9 | +| libXext | 1.3.3 | +| libSM | 1.2.2 | +| libXrender | 0.9.10 | 推荐使用Docker编译,我们已经为您准备好了Paddle Serving编译环境,详见[该文档](DOCKER_IMAGES_CN.md)。 @@ -62,6 +61,22 @@ pip install -r python/requirements.txt 如果使用 Python3,请以 `pip3` 替换 `pip`。 +## GOPATH 设置 + +默认 GOPATH 设置为 `$HOME/go`,您也可以设置为其他值。 +```shell +export GOPATH=$HOME/go +export PATH=$PATH:$GOPATH/bin +``` + +## 获取 Go packages + +```shell +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger +go get -u github.com/golang/protobuf/protoc-gen-go +go get -u google.golang.org/grpc +``` ## 编译Server部分 @@ -114,7 +129,7 @@ make ## 安装wheel包 -无论是Client端,Server端还是App部分,编译完成后,安装`python/dist/`下的whl包即可。 +无论是Client端,Server端还是App部分,编译完成后,安装编译过程临时目录(`server-build-cpu`、`server-build-gpu`、`client-build`、`app-build`)下的`python/dist/` 中的whl包即可。 @@ -124,6 +139,12 @@ make +## 如何验证 + +请使用 `python/examples` 下的例子进行验证。 + + + ## CMake选项说明 | 编译选项 | 说明 | 默认 | diff --git a/doc/FAQ.md b/doc/FAQ.md index eb4f05a28594effcf59aac880cf4d81846a3a925..daf5cb8613d0754a966ce09f1a8f7dac5cfd7a78 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -1,8 +1,8 @@ # FAQ -- Q:如何调整RPC服务的等待时间,避免超时? +- Q: 如何调整RPC服务的等待时间,避免超时? - A:使用set_rpc_timeout_ms设置更长的等待时间,单位为毫秒,默认时间为20秒。 + A: 使用set_rpc_timeout_ms设置更长的等待时间,单位为毫秒,默认时间为20秒。 示例: ``` @@ -15,4 +15,25 @@ ``` - Q: 如何使用自己编译的Paddle Serving进行预测? - A:通过pip命令安装自己编译出的whl包,并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。 + + A: 通过pip命令安装自己编译出的whl包,并设置SERVING_BIN环境变量为编译出的serving二进制文件路径。 + +- Q: 执行GPU预测时遇到InvalidArgumentError: Device id must be less than GPU count, but received id is: 0. GPU count is: 0. + + A: 将显卡驱动对应的libcuda.so的目录添加到LD_LIBRARY_PATH环境变量中 + +- Q: 执行GPU预测时遇到ExternalError: Cudnn error, CUDNN_STATUS_BAD_PARAM at (/home/scmbuild/workspaces_cluster.dev/baidu.lib.paddlepaddle/baidu/lib/paddlepaddle/Paddle/paddle/fluid/operators/batch_norm_op.cu:198) + + A: 将cudnn的lib64路径添加到LD_LIBRARY_PATH,安装自pypi的Paddle Serving中post9版使用的是cudnn 7.3,post10使用的是cudnn 7.5。如果是使用自己编译的Paddle Serving,可以在log/serving.INFO日志文件中查看对应的cudnn版本。 + +- Q: 执行GPU预测时遇到Error: Failed to find dynamic library: libcublas.so + + A: 将cuda的lib64路径添加到LD_LIBRARY_PATH, post9版本的Paddle Serving使用的是cuda 9.0,post10版本使用的cuda 10.0。 + +- Q: 部署和预测中的日志信息在哪里查看? 
+ +- A: server端的日志分为两部分,一部分打印到标准输出,一部分打印到启动服务时的目录下的log/serving.INFO文件中。 + + client端的日志直接打印到标准输出。 + + 通过在部署服务之前 'export GLOG_v=3'可以输出更为详细的日志信息。 diff --git a/doc/INFERENCE_TO_SERVING.md b/doc/INFERENCE_TO_SERVING.md index 8334159ea255ca65241a2b567e43682a148bb775..e10ee976fb455c8cc49a0d5fa44ed4cc1f300ba9 100644 --- a/doc/INFERENCE_TO_SERVING.md +++ b/doc/INFERENCE_TO_SERVING.md @@ -2,6 +2,20 @@ ([简体中文](./INFERENCE_TO_SERVING_CN.md)|English) +We should know something before converting to serving model + +**inference_model_dir**:the directory of Paddle inference model + +**serving_client_dir**: the directory of server side configuration + +**serving_client_dir**: the directory of client side configuration + +**model_filename**: this is model description file whose default value is `__model__`, if it's not default name, set `model_filename` explicitly + +**params_filename**: during `save_inference_model` every Variable will be save as a single file. If we have the inference model whose params are compressed into one file, please set `params_filename` explicitly + + + ## Example ``` python @@ -12,3 +26,11 @@ serving_server_dir = "serving_server_dir" feed_var_names, fetch_var_names = inference_model_to_serving( inference_model_dir, serving_client_dir, serving_server_dir) ``` + +if your model file and params file are both standalone, please use the following api. + +``` +feed_var_names, fetch_var_names = inference_model_to_serving( + inference_model_dir, serving_client_dir, serving_server_dir, + model_filename="model", params_filename="params") +``` diff --git a/doc/INFERENCE_TO_SERVING_CN.md b/doc/INFERENCE_TO_SERVING_CN.md index 94d1def424db467e200020c69fbd6d1599a5ffde..e7e909ac04be3b1a0885b3390d99a153dfbd170e 100644 --- a/doc/INFERENCE_TO_SERVING_CN.md +++ b/doc/INFERENCE_TO_SERVING_CN.md @@ -4,6 +4,19 @@ ## 示例 +在下列代码中,我们需要知道以下信息。 + +**模型文件夹**:这个文件夹就是Paddle的inference_model所在的文件夹 + +**serving_client_dir**: 这个文件夹是inference_model转换成Serving模型后,服务端配置的保存路径 + +**serving_client_dir**: 这个文件夹是inference_model转换成Serving模型后,客户端配置的保存路径 + +**模型描述文件**: 模型描述文件也就是`model_filename`默认值为`__model__`,是一个pb2文本文件,如果是别的文件名需要显式指定 + +**模型参数文件**: 在`save_inference_model`阶段,默认方式是每一个Variable保存一个二进制文件,如果是这种情况就不需要做指定。如果所有参数用压缩成一个文件的形式保存,则需要显式指定`params_filename` + + ``` python from paddle_serving_client.io import inference_model_to_serving inference_model_dir = "your_inference_model" @@ -12,3 +25,9 @@ serving_server_dir = "serving_server_dir" feed_var_names, fetch_var_names = inference_model_to_serving( inference_model_dir, serving_client_dir, serving_server_dir) ``` +如果模型中有模型描述文件`model_filename` 和 模型参数文件`params_filename`,那么请用 +``` +feed_var_names, fetch_var_names = inference_model_to_serving( + inference_model_dir, serving_client_dir, serving_server_dir, + model_filename="model", params_filename="params") +``` diff --git a/doc/LATEST_PACKAGES.md b/doc/LATEST_PACKAGES.md index 038641afd38192da5b99f714d278232d3ad79fb4..247c04c000404944e7021093ff8bf3280c2f2539 100644 --- a/doc/LATEST_PACKAGES.md +++ b/doc/LATEST_PACKAGES.md @@ -3,51 +3,55 @@ ## CPU server ### Python 3 ``` -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.3.2-py3-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py3-none-any.whl ``` ### Python 2 ``` -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.3.2-py2-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py2-none-any.whl ``` ## GPU server ### Python 3 ``` #cuda 9.0 
-https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post9-py3-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl #cuda 10.0 -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post10-py3-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl ``` ### Python 2 ``` #cuda 9.0 -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post9-py2-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl #cuda 10.0 -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.3.2.post10-py2-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl ``` ## Client ### Python 3.7 ``` -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.2-cp37-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp37-none-any.whl ``` ### Python 3.6 ``` -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.2-cp36-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp36-none-any.whl +``` +### Python 3.5 +``` +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp35-none-any.whl ``` ### Python 2.7 ``` -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.3.2-cp27-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp27-none-any.whl ``` ## App ### Python 3 ``` -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.1.2-py3-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.0.0-py3-none-any.whl ``` ### Python 2 ``` -https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.1.2-py2-none-any.whl +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.0.0-py2-none-any.whl ``` diff --git a/doc/NEW_WEB_SERVICE.md b/doc/NEW_WEB_SERVICE.md index 39bca98a3bdfbc1b2cadb5d2c3d60395b4592b34..86e53b843eb18d28057f69a39934682d797e4de5 100644 --- a/doc/NEW_WEB_SERVICE.md +++ b/doc/NEW_WEB_SERVICE.md @@ -1,56 +1,152 @@ # How to develop a new Web service? + ([简体中文](NEW_WEB_SERVICE_CN.md)|English) -This document will take the image classification service based on the Imagenet data set as an example to introduce how to develop a new web service. The complete code can be visited at [here](../python/examples/imagenet/resnet50_web_service.py). +This document will take Uci service as an example to introduce how to develop a new Web Service. You can check out the complete code [here](../python/examples/pipeline/simple_web_service/web_service.py). -## WebService base class +## Op base class + +In some services, a single model may not meet business needs, requiring multiple models to be concatenated or parallel to complete the entire service. We call a single model operation Op and provide a simple set of interfaces to implement the complex logic of Op concatenation or parallelism. -Paddle Serving implements the [WebService](https://github.com/PaddlePaddle/Serving/blob/develop/python/paddle_serving_server/web_service.py#L23) base class. You need to override its `preprocess` and `postprocess` method. The default implementation is as follows: +Data between Ops is passed as a dictionary, Op can be started as threads or process, and Op can be configured for the number of concurrencies, etc. 
+ +Typically, you need to inherit the Op base class and override its `init_op`, `preprocess` and `postprocess` methods, which are implemented by default as follows: ```python -class WebService(object): - - def preprocess(self, feed={}, fetch=[]): - return feed, fetch - def postprocess(self, feed={}, fetch=[], fetch_map=None): - return fetch_map +class Op(object): + def init_op(self): + pass + def preprocess(self, input_dicts): + # multiple previous Op + if len(input_dicts) != 1: + _LOGGER.critical( + "Failed to run preprocess: this Op has multiple previous " + "inputs. Please override this func.") + os._exit(-1) + (_, input_dict), = input_dicts.items() + return input_dict + def postprocess(self, input_dicts, fetch_dict): + return fetch_dict ``` +### init_op + +This method is used to load user-defined resources such as dictionaries. A separator is loaded in the [UciOp](../python/examples/pipeline/simple_web_service/web_service.py). + +**Note**: If Op is launched in threaded mode, different threads of the same Op execute `init_op` only once and share `init_op` loaded resources when Op is multi-concurrent. + ### preprocess -The preprocess method has two input parameters, `feed` and `fetch`. For an HTTP request `request`: +This method is used to preprocess the data before model prediction. It has an `input_dicts` parameter, `input_dicts` is a dictionary, key is the `name` of the previous Op, and value is the data transferred from the corresponding previous op (the data is also in dictionary format). -- The value of `feed` is the feed part `request.json["feed"]` in the request data -- The value of `fetch` is the fetch part `request.json["fetch"]` in the request data +The `preprocess` method needs to process the data into a ndarray dictionary (key is the feed variable name, and value is the corresponding ndarray value). Op will take the return value as the input of the model prediction and pass the output to the `postprocess` method. -The return values are the feed and fetch values used in the prediction. +**Note**: if Op does not have a model configuration file, the return value of `preprocess` will be directly passed to `postprocess`. ### postprocess -The postprocess method has three input parameters, `feed`, `fetch` and `fetch_map`: +This method is used for data post-processing after model prediction. It has two parameters, `input_dicts` and `fetch_dict`. + +Where the `input_dicts` parameter is consistent with the parameter in `preprocess` method, and `fetch_dict` is the output of the model prediction (key is the name of the fetch variable, and value is the corresponding ndarray value). Op will take the return value of `postprocess` as the input of subsequent Op `preprocess`. -- The value of `feed` is the feed part `request.json["feed"]` in the request data -- The value of `fetch` is the fetch part `request.json["fetch"]` in the request data -- The value of `fetch_map` is the model output value. +**Note**: if Op does not have a model configuration file, `fetch_dict` will be the return value of `preprocess`. -The return value will be processed as `{"reslut": fetch_map}` as the return of the HTTP request. 
-## Develop ImageService class + +Here is the op of the UCI example: + +```python +class UciOp(Op): + def init_op(self): + self.separator = "," + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + x_value = input_dict["x"] + if isinstance(x_value, (str, unicode)): + input_dict["x"] = np.array( + [float(x.strip()) for x in x_value.split(self.separator)]) + return input_dict + + def postprocess(self, input_dicts, fetch_dict): + fetch_dict["price"] = str(fetch_dict["price"][0][0]) + return fetch_dict +``` + + + +## WebService base class + +Paddle Serving implements the [WebService](https://github.com/PaddlePaddle/Serving/blob/develop/python/paddle_serving_server/web_service.py#L23) base class. You need to override its `get_pipeline_response` method to define the topological relationship between Ops. The default implementation is as follows: ```python -class ImageService(WebService): - - def preprocess(self, feed={}, fetch=[]): - reader = ImageReader() - feed_batch = [] - for ins in feed: - if "image" not in ins: - raise ("feed data error!") - sample = base64.b64decode(ins["image"]) - img = reader.process_image(sample) - feed_batch.append({"image": img}) - return feed_batch, fetch +class WebService(object): + def get_pipeline_response(self, read_op): + return None +``` + +Where `read_op` serves as the entry point of the topology map of the whole service (that is, the first op defined by the user is followed by `read_op`). + +For single Op service (single model), take Uci service as an example (there is only one Uci prediction model in the whole service): + +```python +class UciService(WebService): + def get_pipeline_response(self, read_op): + uci_op = UciOp(name="uci", input_ops=[read_op]) + return uci_op +``` + +For multiple Op services (multiple models), take Ocr service as an example (the whole service is completed in series by Det model and Rec model): + +```python +class OcrService(WebService): + def get_pipeline_response(self, read_op): + det_op = DetOp(name="det", input_ops=[read_op]) + rec_op = RecOp(name="rec", input_ops=[det_op]) + return rec_op +``` + + + +WebService objects need to load a yaml configuration file through the `prepare_pipeline_config` to configure each Op and the entire service. The simplest configuration file is as follows (Uci example): + +```yaml +http_port: 18080 +op: + uci: + local_service_conf: + model_config: uci_housing_model # path +``` + +All field names of yaml file are as follows: + +```yaml +rpc_port: 18080 # gRPC port +build_dag_each_worker: false # Whether to use process server or not. The default is false +worker_num: 1 # gRPC thread pool size (the number of processes in the process version servicer). The default is 1 +http_port: 0 # HTTP service port. Do not start HTTP service when the value is less or equals 0. The default value is 0. +dag: + is_thread_op: true # Whether to use the thread version of OP. The default is true + client_type: brpc # Use brpc or grpc client. The default is brpc + retry: 1 # The number of times DAG executor retries after failure. The default value is 1, that is, no retrying + use_profile: false # Whether to print the log on the server side. The default is false + tracer: + interval_s: -1 # Monitoring time interval of Tracer (in seconds). Do not start monitoring when the value is less than 1. The default value is -1 +op: + : # op name, corresponding to the one defined in the program + concurrency: 1 # op concurrency number, the default is 1 + timeout: -1 # predict timeout in milliseconds. 
The default value is -1, that is, no timeout + retry: 1 # timeout retransmissions. The default value is 1, that is, do not try again + batch_size: 1 # If this field is set, Op will merge multiple request outputs into a single batch + auto_batching_timeout: -1 # auto-batching timeout in milliseconds. The default value is -1, that is, no timeout + local_service_conf: + model_config: # the path of the corresponding model file. There is no default value(None). If this item is not configured, the model file will not be loaded. + workdir: "" # working directory of corresponding model + thread_num: 2 # the corresponding model is started with thread_num threads + devices: "" # on which device does the model launched. You can specify the GPU card number(such as "0,1,2"), which is CPU by default + mem_optim: true # mem optimization option, the default is true + ir_optim: false # ir optimization option, the default is false ``` -For the above `ImageService`, only the `preprocess` method is rewritten to process the image data in Base64 format into the data format required by prediction. +All fields of Op can be defined when Op is created in the program (which will override yaml fields). diff --git a/doc/NEW_WEB_SERVICE_CN.md b/doc/NEW_WEB_SERVICE_CN.md index 43ca7fb61f2c70f13019574a7984e3665bd1b6fa..af6730a89badd8214323ea08bbb799033f57f09b 100644 --- a/doc/NEW_WEB_SERVICE_CN.md +++ b/doc/NEW_WEB_SERVICE_CN.md @@ -1,56 +1,152 @@ # 如何开发一个新的Web Service? + (简体中文|[English](NEW_WEB_SERVICE.md)) -本文档将以Imagenet图像分类服务为例,来介绍如何开发一个新的Web Service。您可以在[这里](../python/examples/imagenet/resnet50_web_service.py)查阅完整的代码。 +本文档将以 Uci 房价预测服务为例,来介绍如何开发一个新的Web Service。您可以在[这里](../python/examples/pipeline/simple_web_service/web_service.py)查阅完整的代码。 + +## Op 基类 + +在一些服务中,单个模型可能无法满足需求,需要多个模型串联或并联来完成整个服务。我们将单个模型操作称为 Op,并提供了一套简单的接口来实现 Op 串联或并联的复杂逻辑。 -## WebService基类 +Op 间数据是以字典形式进行传递的,Op 可以以线程或进程方式启动,同时可以对 Op 的并发数等进行配置。 -Paddle Serving实现了[WebService](https://github.com/PaddlePaddle/Serving/blob/develop/python/paddle_serving_server/web_service.py#L23)基类,您需要重写它的`preprocess`方法和`postprocess`方法,默认实现如下: +通常情况下,您需要继承 Op 基类,重写它的 `init_op`、`preprocess` 和 `postprocess` 方法,默认实现如下: ```python -class WebService(object): - - def preprocess(self, feed={}, fetch=[]): - return feed, fetch - def postprocess(self, feed={}, fetch=[], fetch_map=None): - return fetch_map +class Op(object): + def init_op(self): + pass + def preprocess(self, input_dicts): + # multiple previous Op + if len(input_dicts) != 1: + _LOGGER.critical( + "Failed to run preprocess: this Op has multiple previous " + "inputs. 
Please override this func.") + os._exit(-1) + (_, input_dict), = input_dicts.items() + return input_dict + def postprocess(self, input_dicts, fetch_dict): + return fetch_dict ``` -### preprocess方法 +### init_op 方法 + +该方法用于加载用户自定义资源(如字典等),在 [UciOp](../python/examples/pipeline/simple_web_service/web_service.py) 中加载了一个分隔符。 + +**注意**:如果 Op 是以线程模式加载的,那么在 Op 多并发时,同种 Op 的不同线程只执行一次 `init_op`,且共用 `init_op` 加载的资源。 + +### preprocess 方法 + +该方法用于模型预测前对数据的预处理,它有一个 `input_dicts` 参数,`input_dicts` 是一个字典,key 为前继 Op 的 `name`,value 为对应前继 Op 传递过来的数据(数据同样是字典格式)。 + +`preprocess` 方法需要将数据处理成 ndarray 字典(key 为 feed 变量名,value 为对应的 ndarray 值),Op 会将该返回值作为模型预测的输入,并将输出传递给 `postprocess` 方法。 -preprocess方法有两个输入参数,`feed`和`fetch`。对于一个HTTP请求`request`: +**注意**:如果 Op 没有配置模型,则 `preprocess` 的返回值会直接传递给 `postprocess`。 -- `feed`的值为请求数据中的feed部分`request.json["feed"]` -- `fetch`的值为请求数据中的fetch部分`request.json["fetch"]` +### postprocess 方法 -返回值分别是预测过程中用到的feed和fetch值。 +该方法用于模型预测后对数据的后处理,它有两个参数,`input_dicts` 和 `fetch_dict`。 -### postprocess方法 +其中,`input_dicts` 与 `preprocess` 的参数相同,`fetch_dict` 为模型预测的输出(key 为 fetch 变量名,value 为对应的 ndarray 值)。Op 会将 `postprocess` 的返回值作为后继 Op `preprocess` 的输入。 -postprocess方法有三个输入参数,`feed`、`fetch`和`fetch_map`: +**注意**:如果 Op 没有配置模型,则 `fetch_dict` 将为 `preprocess` 的返回值。 -- `feed`的值为请求数据中的feed部分`request.json["feed"]` -- `fetch`的值为请求数据中的fetch部分`request.json["fetch"]` -- `fetch_map`的值为fetch到的模型输出值 -返回值将会被处理成`{"reslut": fetch_map}`作为HTTP请求的返回。 -## 开发ImageService类 +下面是 Uci 例子的 Op: ```python -class ImageService(WebService): - - def preprocess(self, feed={}, fetch=[]): - reader = ImageReader() - feed_batch = [] - for ins in feed: - if "image" not in ins: - raise ("feed data error!") - sample = base64.b64decode(ins["image"]) - img = reader.process_image(sample) - feed_batch.append({"image": img}) - return feed_batch, fetch +class UciOp(Op): + def init_op(self): + self.separator = "," + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + x_value = input_dict["x"] + if isinstance(x_value, (str, unicode)): + input_dict["x"] = np.array( + [float(x.strip()) for x in x_value.split(self.separator)]) + return input_dict + + def postprocess(self, input_dicts, fetch_dict): + fetch_dict["price"] = str(fetch_dict["price"][0][0]) + return fetch_dict +``` + + + +## WebService 基类 + +Paddle Serving 实现了 [WebService](https://github.com/PaddlePaddle/Serving/blob/develop/python/paddle_serving_server/web_service.py#L28) 基类,您需要重写它的 `get_pipeline_response` 方法来定义 Op 间的拓扑关系,并返回作为 Response 的 Op,默认实现如下: + +```python +class WebService(object): + def get_pipeline_response(self, read_op): + return None +``` + +其中,`read_op` 作为整个服务拓扑图的入口(即用户自定义的第一个 Op 的前继为 `read_op`)。 + +对于单 Op 服务(单模型),以 Uci 服务为例(整个服务中只有一个 Uci 房价预测模型): + +```python +class UciService(WebService): + def get_pipeline_response(self, read_op): + uci_op = UciOp(name="uci", input_ops=[read_op]) + return uci_op +``` + +对于多 Op 服务(多模型),以 Ocr 服务为例(整个服务由 Det 模型和 Rec 模型串联完成): + +```python +class OcrService(WebService): + def get_pipeline_response(self, read_op): + det_op = DetOp(name="det", input_ops=[read_op]) + rec_op = RecOp(name="rec", input_ops=[det_op]) + return rec_op +``` + + + +WebService 对象需要通过 `prepare_pipeline_config` 加载一个 yaml 配置文件,用来对各个 Op 以及整个服务进行配置,最简单的配置文件如下(Uci 例子): + +```yaml +http_port: 18080 +op: + uci: + local_service_conf: + model_config: uci_housing_model # 路径 +``` + +yaml 文件的所有字段名详见下面: + +```yaml +rpc_port: 18080 # gRPC端口号 +build_dag_each_worker: false # 是否使用进程版 Servicer,默认为 false +worker_num: 1 # gRPC线程池大小(进程版 Servicer 中为进程数),默认为 1 +http_port: 0 # 
HTTP 服务的端口号,若该值小于或等于 0 则不开启 HTTP 服务,默认为 0 +dag: + is_thread_op: true # 是否使用线程版Op,默认为 true + client_type: brpc # 使用 brpc 或 grpc client,默认为 brpc + retry: 1 # DAG Executor 在失败后重试次数,默认为 1,即不重试 + use_profile: false # 是否在 Server 端打印日志,默认为 false + tracer: + interval_s: -1 # Tracer 监控的时间间隔,单位为秒。当该值小于 1 时不启动监控,默认为 -1 +op: + : # op 名,与程序中定义的相对应 + concurrency: 1 # op 并发数,默认为 1 + timeout: -1 # 预测超时时间,单位为毫秒。默认为 -1 即不超时 + retry: 1 # 超时重发次数。默认为 1 即不重试 + batch_size: 1 # auto-batching 中的 batch_size,若设置该字段则 Op 会将多个请求输出合并为一个 batch + auto_batching_timeout: -1 # auto-batching 超时时间,单位为毫秒。默认为 -1 即不超时 + local_service_conf: + model_config: # 对应模型文件的路径,无默认值(None)。若不配置该项则不会加载模型文件。 + workdir: "" # 对应模型的工作目录 + thread_num: 2 # 对应模型用几个线程启动 + devices: "" # 模型启动在哪个设备上,可以指定 gpu 卡号(如 "0,1,2"),默认为 cpu + mem_optim: true # mem 优化选项,默认为 true + ir_optim: false # ir 优化选项,默认为 false ``` -对于上述的`ImageService`,只重写了前处理方法,将base64格式的图片数据处理成模型预测需要的数据格式。 +其中,Op 的所有字段均可以在程序中创建 Op 时定义(会覆盖 yaml 的字段)。 diff --git a/doc/PIPELINE_SERVING.md b/doc/PIPELINE_SERVING.md index bfc408c2a37813a4b5c301cff3ee9e1efa6be656..4205aa15723d3625c0fea43eb9d0fd67f32f4a3f 100644 --- a/doc/PIPELINE_SERVING.md +++ b/doc/PIPELINE_SERVING.md @@ -33,6 +33,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s - The default function of a single OP is to access a single Paddle Serving Service based on the input Channel data and put the result into the output Channel. - OP supports user customization, including preprocess, process, postprocess functions that can be inherited and implemented by the user. - OP can set the number of concurrencies to increase the number of concurrencies processed. +- OP can obtain data from multiple different RPC requests for Auto-Batching. - OP can be started by a thread or process. ### Channel Design @@ -46,6 +47,7 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s + ### Extreme Case Consideration - Request timeout @@ -59,9 +61,9 @@ The graph execution engine consists of OPs and Channels, and the connected OPs s - Whether input buffers and output buffers in Channel will increase indefinitely - It will not increase indefinitely. The input to the entire graph execution engine is placed inside a Channel's internal queue, directly acting as a traffic control buffer queue for the entire service. - - For input buffer, adjust the number of concurrencies of OP1 and OP2 according to the amount of computation, so that the number of input buffers from each input OP is relatively balanced. - - For output buffer, you can use a similar process as input buffer, which adjusts the concurrency of OP3 and OP4 to control the buffer length of output buffer. - - Note: The length of the input buffer depends on the speed at which each item in the internal queue is ready, and the length of the output buffer depends on the speed at which downstream OPs obtain data from the output buffer. + - For input buffer, adjust the number of concurrencies of OP1 and OP2 according to the amount of computation, so that the number of input buffers from each input OP is relatively balanced. (The length of the input buffer depends on the speed at which each item in the internal queue is ready) + - For output buffer, you can use a similar process as input buffer, which adjusts the concurrency of OP3 and OP4 to control the buffer length of output buffer. 
(The length of the output buffer depends on the speed at which downstream OPs obtain data from the output buffer) + - The amount of data in the Channel will not exceed `worker_num` of gRPC, that is, it will not exceed the thread pool size. ## Detailed Design @@ -79,31 +81,36 @@ def __init__(name=None, client_config=None, concurrency=1, timeout=-1, - retry=1) + retry=1, + batch_size=1, + auto_batching_timeout=None) ``` The meaning of each parameter is as follows: -| Parameter | Meaning | -| :--------------: | :----------------------------------------------------------: | -| name | (str) String used to identify the OP type, which must be globally unique. | -| input_ops | (list) A list of all previous OPs of the current Op. | -| server_endpoints | (list) List of endpoints for remote Paddle Serving Service. If this parameter is not set, the OP will not access the remote Paddle Serving Service, that is, the process operation will not be performed. | -| fetch_list | (list) List of fetch variable names for remote Paddle Serving Service. | -| client_config | (str) The path of the client configuration file corresponding to the Paddle Serving Service. | -| concurrency | (int) The number of concurrent OPs. | -| timeout | (int) The timeout time of the process operation, in seconds. If the value is less than zero, no timeout is considered. | -| retry | (int) Timeout number of retries. When the value is 1, no retries are made. | +| Parameter | Meaning | +| :-------------------: | :----------------------------------------------------------: | +| name | (str) String used to identify the OP type, which must be globally unique. | +| input_ops | (list) A list of all previous OPs of the current Op. | +| server_endpoints | (list) List of endpoints for remote Paddle Serving Service. If this parameter is not set, the OP will not access the remote Paddle Serving Service, that is, the process operation will not be performed. | +| fetch_list | (list) List of fetch variable names for remote Paddle Serving Service. | +| client_config | (str) The path of the client configuration file corresponding to the Paddle Serving Service. | +| concurrency | (int) The number of concurrent OPs. | +| timeout | (int) The timeout time of the process operation, in ms. If the value is less than zero, no timeout is considered. | +| retry | (int) Timeout number of retries. When the value is 1, no retries are made. | +| batch_size | (int) The expected batch_size of Auto-Batching, since building batches may time out, the actual batch_size may be less than the set value. | +| auto_batching_timeout | (float) Timeout for building batches of Auto-Batching (the unit is ms). | + #### 2. General OP Secondary Development Interface -| Interface or Variable | Explain | -| :--------------------------------------------: | :----------------------------------------------------------: | -| def preprocess(self, input_dicts) | Process the data obtained from the channel, and the processed data will be used as the input of the **process** function. | -| def process(self, feed_dict) | The RPC prediction process is based on the Paddle Serving Client, and the processed data will be used as the input of the **postprocess** function. | -| def postprocess(self, input_dicts, fetch_dict) | After processing the prediction results, the processed data will be put into the subsequent Channel to be obtained by the subsequent OP. | -| def init_op(self) | Used to load resources (such as word dictionary). 
| -| self.concurrency_idx | Concurrency index of current thread / process (different kinds of OP are calculated separately). | +| Interface or Variable | Explain | +| :----------------------------------------------: | :----------------------------------------------------------: | +| def preprocess(self, input_dicts) | Process the data obtained from the channel, and the processed data will be used as the input of the **process** function. (This function handles a **sample**) | +| def process(self, feed_dict_list, typical_logid) | The RPC prediction process is based on the Paddle Serving Client, and the processed data will be used as the input of the **postprocess** function. (This function handles a **batch**) | +| def postprocess(self, input_dicts, fetch_dict) | After processing the prediction results, the processed data will be put into the subsequent Channel to be obtained by the subsequent OP. (This function handles a **sample**) | +| def init_op(self) | Used to load resources (such as word dictionary). | +| self.concurrency_idx | Concurrency index of current process(not thread) (different kinds of OP are calculated separately). | In a running cycle, OP will execute three operations: preprocess, process, and postprocess (when the `server_endpoints` parameter is not set, the process operation is not executed). Users can rewrite these three functions. The default implementation is as follows: @@ -117,24 +124,28 @@ def preprocess(self, input_dicts): (_, input_dict), = input_dicts.items() return input_dict -def process(self, feed_dict): - err, err_info = ChannelData.check_npdata(feed_dict) +def process(self, feed_dict_list, typical_logid): + err, err_info = ChannelData.check_batch_npdata(feed_dict_list) if err != 0: raise NotImplementedError( "{} Please override preprocess func.".format(err_info)) call_result = self.client.predict( - feed=feed_dict, fetch=self._fetch_names) + feed=feed_dict_list, fetch=self._fetch_names, log_id=typical_logid) + if isinstance(self.client, MultiLangClient): + if call_result is None or call_result["serving_status_code"] != 0: + return None + call_result.pop("serving_status_code") return call_result def postprocess(self, input_dicts, fetch_dict): return fetch_dict ``` -The parameter of **preprocess** is the data `input_dicts` in the previous Channel. This variable is a dictionary with the name of the previous OP as key and the output of the corresponding OP as value. +The parameter of **preprocess** is the data `input_dicts` in the previous Channel. This variable (as a **sample**) is a dictionary with the name of the previous OP as key and the output of the corresponding OP as value. -The parameter of **process** is the input variable `fetch_dict` (the return value of the preprocess function) of the Paddle Serving Client prediction interface. This variable is a dictionary with feed_name as the key and the data in the ndarray format as the value. +The parameter of **process** is the input variable `fetch_dict_list` (a list of the return value of the preprocess function) of the Paddle Serving Client prediction interface. This variable (as a **batch**) is a list of dictionaries with feed_name as the key and the data in the ndarray format as the value. `typical_logid` is used as the logid that penetrates to PaddleServingService. -The parameters of **postprocess** are `input_dicts` and `fetch_dict`. 
`input_dicts` is consistent with the parameter of preprocess, and `fetch_dict` is the return value of the process function (if process is not executed, this value is the return value of preprocess). +The parameters of **postprocess** are `input_dicts` and `fetch_dict`. `input_dicts` is consistent with the parameter of preprocess, and `fetch_dict` (as a **sample**) is a sample of the return batch of the process function (if process is not executed, this value is the return value of preprocess). Users can also rewrite the **init_op** function to load some custom resources (such as word dictionary). The default implementation is as follows: @@ -143,7 +154,7 @@ def init_op(self): pass ``` -It should be noted that in the threaded version of OP, each OP will only call this function once, so the loaded resources must be thread safe. +It should be **noted** that in the threaded version of OP, each OP will only call this function once, so the loaded resources must be thread safe. #### 3. RequestOp Definition @@ -240,14 +251,17 @@ server.run_server() Where `response_op` is the responseop mentioned above, PipelineServer will initialize Channels according to the topology relationship of each OP and build the calculation graph. `config_yml_path` is the configuration file of PipelineServer. The example file is as follows: ```yaml -port: 18080 # gRPC port +rpc_port: 18080 # gRPC port worker_num: 1 # gRPC thread pool size (the number of processes in the process version servicer). The default is 1 build_dag_each_worker: false # Whether to use process server or not. The default is false +http_port: 0 # HTTP service port. Do not start HTTP service when the value is less or equals 0. The default value is 0. dag: is_thread_op: true # Whether to use the thread version of OP. The default is true client_type: brpc # Use brpc or grpc client. The default is brpc retry: 1 # The number of times DAG executor retries after failure. The default value is 1, that is, no retrying use_profile: false # Whether to print the log on the server side. The default is false + tracer: + interval_s: 600 # Monitoring time interval of Tracer (in seconds). Do not start monitoring when the value is less than 1. The default value is -1 ``` @@ -272,6 +286,8 @@ python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn. python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log & ``` +PipelineServing also supports local automatic startup of PaddleServingService. Please refer to the example `python/examples/pipeline/ocr`. + ### Start PipelineServer Run the following code @@ -282,14 +298,8 @@ from paddle_serving_server.pipeline import PipelineServer from paddle_serving_server.pipeline.proto import pipeline_service_pb2 from paddle_serving_server.pipeline.channel import ChannelDataEcode import numpy as np -import logging from paddle_serving_app.reader import IMDBDataset -logging.basicConfig(level=logging.DEBUG) - -_LOGGER = logging.getLogger() - - class ImdbRequestOp(RequestOp): def init_op(self): self.imdb_dataset = IMDBDataset() @@ -377,7 +387,7 @@ for f in futures: -## How to optimize through the timeline tool +## How to optimize with the timeline tool In order to better optimize the performance, PipelineServing provides a timeline tool to monitor the time of each stage of the whole service. @@ -390,15 +400,23 @@ dag: use_profile: true ``` -After the function is enabled, the server will print the corresponding log information to the standard output in the process of prediction. 
In order to show the time consumption of each stage more intuitively, scripts are provided for further analysis and processing of log files. +After the function is enabled, the server will print the corresponding log information to the standard output in the process of prediction. In order to show the time consumption of each stage more intuitively, Analyst module is provided for further analysis and processing of log files. -The output of the server is first saved to a file. Taking profile as an example, the script converts the time monitoring information in the log into JSON format and saves it to the trace file. The trace file can be visualized through the tracing function of Chrome browser. +The output of the server is first saved to a file. Taking `profile.txt` as an example, the script converts the time monitoring information in the log into JSON format and saves it to the `trace` file. The `trace` file can be visualized through the tracing function of Chrome browser. ```shell -python timeline_trace.py profile trace +from paddle_serving_server.pipeline import Analyst +import json +import sys + +if __name__ == "__main__": + log_filename = "profile.txt" + trace_filename = "trace" + analyst = Analyst(log_filename) + analyst.save_trace(trace_filename) ``` -Specific operation: open Chrome browser, input in the address bar `chrome://tracing/` , jump to the tracing page, click the load button, open the saved trace file, and then visualize the time information of each stage of the prediction service. +Specific operation: open Chrome browser, input in the address bar `chrome://tracing/` , jump to the tracing page, click the load button, open the saved `trace` file, and then visualize the time information of each stage of the prediction service. ### Output profile information on client side diff --git a/doc/PIPELINE_SERVING_CN.md b/doc/PIPELINE_SERVING_CN.md index 9e3fe9b9883c404eb476e81349d8a2096774bfc1..7cab409b2b8ca5d80eac05827f2e3fb774000998 100644 --- a/doc/PIPELINE_SERVING_CN.md +++ b/doc/PIPELINE_SERVING_CN.md @@ -6,6 +6,7 @@ Paddle Serving 通常用于单模型的一键部署,但端到端的深度学 Paddle Serving 提供了用户友好的多模型组合服务编程框架,Pipeline Serving,旨在降低编程门槛,提高资源使用率(尤其是GPU设备),提升整体的预估效率。 + ## 整体架构设计 Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示。 @@ -30,9 +31,10 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示 ### OP的设计 -- 单个OP默认的功能是根据输入的 Channel 数据,访问一个 Paddle Serving 的单模型服务,并将结果存在输出的 Channel +- 单个 OP 默认的功能是根据输入的 Channel 数据,访问一个 Paddle Serving 的单模型服务,并将结果存在输出的 Channel - 单个 OP 可以支持用户自定义,包括 preprocess,process,postprocess 三个函数都可以由用户继承和实现 - 单个 OP 可以控制并发数,从而增加处理并发数 +- 单个 OP 可以获取多个不同 RPC 请求的数据,以实现 Auto-Batching - OP 可以由线程或进程启动 ### Channel的设计 @@ -59,11 +61,9 @@ Server端基于 gRPC 和图执行引擎构建,两者的关系如下图所示 - Channel 设计中的 input buffer 和 output buffer 是否会无限增加 - 不会。整个图执行引擎的输入会放到一个 Channel 的 internal queue 里面,直接作为整个服务的流量控制缓冲队列 - - 对于 input buffer,根据计算量的情况调整 OP1 和 OP2 的并发数,使得 input buffer 来自各个输入 OP 的数量相对平衡 - - 对于 output buffer,可以采用和 input buffer 类似的处理方法,即调整 OP3 和 OP4 的并发数,使得 output buffer 的缓冲长度得到控制 - - 注:input buffer 的长度取决于 internal queue 中每个 item 完全 ready 的速度,output buffer 的长度取决于下游 OP 从 output buffer 获取数据的速度 - -## 详细设计 + - 对于 input buffer,根据计算量的情况调整 OP1 和 OP2 的并发数,使得 input buffer 来自各个输入 OP 的数量相对平衡(input buffer 的长度取决于 internal queue 中每个 item 完全 ready 的速度) + - 对于 output buffer,可以采用和 input buffer 类似的处理方法,即调整 OP3 和 OP4 的并发数,使得 output buffer 的缓冲长度得到控制(output buffer 的长度取决于下游 OP 从 output buffer 获取数据的速度) + - 同时 Channel 中数据量不会超过 gRPC 的 `worker_num`,即线程池大小 ### 用户接口设计 @@ -79,31 +79,36 @@ def __init__(name=None, client_config=None, concurrency=1, timeout=-1, - retry=1) + 
retry=1, + batch_size=1, + auto_batching_timeout=None) ``` 各参数含义如下 -| 参数名 | 含义 | -| :--------------: | :----------------------------------------------------------: | -| name | (str)用于标识 OP 类型的字符串,该字段必须全局唯一。 | -| input_ops | (list)当前 OP 的所有前继 OP 的列表。 | -| server_endpoints | (list)远程 Paddle Serving Service 的 endpoints 列表。如果不设置该参数,则不访问远程 Paddle Serving Service,即 不会执行 process 操作。 | -| fetch_list | (list)远程 Paddle Serving Service 的 fetch 列表。 | -| client_config | (str)Paddle Serving Service 对应的 Client 端配置文件路径。 | -| concurrency | (int)OP 的并发数。 | -| timeout | (int)process 操作的超时时间,单位为秒。若该值小于零,则视作不超时。 | -| retry | (int)超时重试次数。当该值为 1 时,不进行重试。 | +| 参数名 | 含义 | +| :-------------------: | :----------------------------------------------------------: | +| name | (str)用于标识 OP 类型的字符串,该字段必须全局唯一。 | +| input_ops | (list)当前 OP 的所有前继 OP 的列表。 | +| server_endpoints | (list)远程 Paddle Serving Service 的 endpoints 列表。如果不设置该参数,则不访问远程 Paddle Serving Service,即 不会执行 process 操作。 | +| fetch_list | (list)远程 Paddle Serving Service 的 fetch 列表。 | +| client_config | (str)Paddle Serving Service 对应的 Client 端配置文件路径。 | +| concurrency | (int)OP 的并发数。 | +| timeout | (int)process 操作的超时时间,单位为毫秒。若该值小于零,则视作不超时。 | +| retry | (int)超时重试次数。当该值为 1 时,不进行重试。 | +| batch_size | (int)进行 Auto-Batching 的期望 batch_size 大小,由于构建 batch 可能超时,实际 batch_size 可能小于设定值。 | +| auto_batching_timeout | (float)进行 Auto-Batching 构建 batch 的超时时间,单位为毫秒。 | + #### 2. 普通 OP二次开发接口 -| 变量或接口 | 说明 | -| :--------------------------------------------: | :----------------------------------------------------------: | -| def preprocess(self, input_dicts) | 对从 Channel 中获取的数据进行处理,处理完的数据将作为 **process** 函数的输入。 | -| def process(self, feed_dict) | 基于 Paddle Serving Client 进行 RPC 预测,处理完的数据将作为 **postprocess** 函数的输入。 | -| def postprocess(self, input_dicts, fetch_dict) | 处理预测结果,处理完的数据将被放入后继 Channel 中,以被后继 OP 获取。 | -| def init_op(self) | 用于加载资源(如字典等)。 | -| self.concurrency_idx | 当前线程(进程)的并发数索引(不同种类的 OP 单独计算)。 | +| 变量或接口 | 说明 | +| :----------------------------------------------: | :----------------------------------------------------------: | +| def preprocess(self, input_dicts) | 对从 Channel 中获取的数据进行处理,处理完的数据将作为 **process** 函数的输入。(该函数对一个 **sample** 进行处理) | +| def process(self, feed_dict_list, typical_logid) | 基于 Paddle Serving Client 进行 RPC 预测,处理完的数据将作为 **postprocess** 函数的输入。(该函数对一个 **batch** 进行处理) | +| def postprocess(self, input_dicts, fetch_dict) | 处理预测结果,处理完的数据将被放入后继 Channel 中,以被后继 OP 获取。(该函数对一个 **sample** 进行处理) | +| def init_op(self) | 用于加载资源(如字典等)。 | +| self.concurrency_idx | 当前进程(非线程)的并发数索引(不同种类的 OP 单独计算)。 | OP 在一个运行周期中会依次执行 preprocess,process,postprocess 三个操作(当不设置 `server_endpoints` 参数时,不执行 process 操作),用户可以对这三个函数进行重写,默认实现如下: @@ -117,25 +122,28 @@ def preprocess(self, input_dicts): (_, input_dict), = input_dicts.items() return input_dict -def process(self, feed_dict): - err, err_info = ChannelData.check_npdata(feed_dict) +def process(self, feed_dict_list, typical_logid): + err, err_info = ChannelData.check_batch_npdata(feed_dict_list) if err != 0: raise NotImplementedError( "{} Please override preprocess func.".format(err_info)) call_result = self.client.predict( - feed=feed_dict, fetch=self._fetch_names) + feed=feed_dict_list, fetch=self._fetch_names, log_id=typical_logid) + if isinstance(self.client, MultiLangClient): + if call_result is None or call_result["serving_status_code"] != 0: + return None + call_result.pop("serving_status_code") return call_result def postprocess(self, input_dicts, fetch_dict): return fetch_dict ``` +**preprocess** 的参数是前继 Channel 中的数据 `input_dicts`,该变量(作为一个 
**sample**)是一个以前继 OP 的 name 为 Key,对应 OP 的输出为 Value 的字典。 -**preprocess** 的参数是前继 Channel 中的数据 `input_dicts`,该变量是一个以前继 OP 的 name 为 Key,对应 OP 的输出为 Value 的字典。 +**process** 的参数是 Paddle Serving Client 预测接口的输入变量 `fetch_dict_list`(preprocess 函数的返回值的列表),该变量(作为一个 **batch**)是一个列表,列表中的元素为以 feed_name 为 Key,对应 ndarray 格式的数据为 Value 的字典。`typical_logid` 作为向 PaddleServingService 穿透的 logid。 -**process** 的参数是 Paddle Serving Client 预测接口的输入变量 `fetch_dict`(preprocess 函数的返回值),该变量是一个以 feed_name 为 Key,对应 ndarray 格式的数据为 Value 的字典。 - -**postprocess** 的参数是 `input_dicts` 和 `fetch_dict`,`input_dicts` 与 preprocess 的参数一致,`fetch_dict` 是 process 函数的返回值(如果没有执行 process ,则该值为 preprocess 的返回值)。 +**postprocess** 的参数是 `input_dicts` 和 `fetch_dict`,`input_dicts` 与 preprocess 的参数一致,`fetch_dict` (作为一个 **sample**)是 process 函数的返回 batch 中的一个 sample(如果没有执行 process ,则该值为 preprocess 的返回值)。 用户还可以对 **init_op** 函数进行重写,已加载自定义的一些资源(比如字典等),默认实现如下: @@ -144,7 +152,7 @@ def init_op(self): pass ``` -需要注意的是,在线程版 OP 中,每个 OP 只会调用一次该函数,故加载的资源必须要求是线程安全的。 +需要**注意**的是,在线程版 OP 中,每个 OP 只会调用一次该函数,故加载的资源必须要求是线程安全的。 #### 3. RequestOp 定义 @@ -241,14 +249,17 @@ server.run_server() 其中,`response_op` 为上面提到的 ResponseOp,PipelineServer 将会根据各个 OP 的拓扑关系初始化 Channel 并构建计算图。`config_yml_path` 为 PipelineServer 的配置文件,示例文件如下: ```yaml -port: 18080 # gRPC端口号 +rpc_port: 18080 # gRPC端口号 worker_num: 1 # gRPC线程池大小(进程版 Servicer 中为进程数),默认为 1 build_dag_each_worker: false # 是否使用进程版 Servicer,默认为 false +http_port: 0 # HTTP 服务的端口号,若该值小于或等于 0 则不开启 HTTP 服务,默认为 0 dag: is_thread_op: true # 是否使用线程版Op,默认为 true client_type: brpc # 使用 brpc 或 grpc client,默认为 brpc retry: 1 # DAG Executor 在失败后重试次数,默认为 1,即不重试 use_profile: false # 是否在 Server 端打印日志,默认为 false + tracer: + interval_s: 600 # Tracer 监控的时间间隔,单位为秒。当该值小于 1 时不启动监控,默认为 -1 ``` @@ -273,6 +284,8 @@ python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn. 
python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log & ``` +PipelineServing 也支持本地自动启动 PaddleServingService,请参考 `python/examples/pipeline/ocr` 下的例子。 + ### 启动 PipelineServer 运行下面代码 @@ -283,14 +296,8 @@ from paddle_serving_server.pipeline import PipelineServer from paddle_serving_server.pipeline.proto import pipeline_service_pb2 from paddle_serving_server.pipeline.channel import ChannelDataEcode import numpy as np -import logging from paddle_serving_app.reader import IMDBDataset -logging.basicConfig(level=logging.DEBUG) - -_LOGGER = logging.getLogger() - - class ImdbRequestOp(RequestOp): def init_op(self): self.imdb_dataset = IMDBDataset() @@ -311,7 +318,6 @@ class CombineOp(Op): def preprocess(self, input_data): combined_prediction = 0 for op_name, data in input_data.items(): - _LOGGER.info("{}: {}".format(op_name, data["prediction"])) combined_prediction += data["prediction"] data = {"prediction": combined_prediction / 2} return data @@ -391,15 +397,23 @@ dag: use_profile: true ``` -开启该功能后,Server 端在预测的过程中会将对应的日志信息打印到标准输出,为了更直观地展现各阶段的耗时,提供脚本对日志文件做进一步的分析处理。 +开启该功能后,Server 端在预测的过程中会将对应的日志信息打印到标准输出,为了更直观地展现各阶段的耗时,提供 Analyst 模块对日志文件做进一步的分析处理。 -使用时先将 Server 的输出保存到文件,以 profile 为例,脚本将日志中的时间打点信息转换成 json 格式保存到trace 文件,trace 文件可以通过 chrome 浏览器的 tracing 功能进行可视化。 +使用时先将 Server 的输出保存到文件,以 `profile.txt` 为例,脚本将日志中的时间打点信息转换成 json 格式保存到 `trace` 文件,`trace` 文件可以通过 chrome 浏览器的 tracing 功能进行可视化。 -```shell -python timeline_trace.py profile trace +```python +from paddle_serving_server.pipeline import Analyst +import json +import sys + +if __name__ == "__main__": + log_filename = "profile.txt" + trace_filename = "trace" + analyst = Analyst(log_filename) + analyst.save_trace(trace_filename) ``` -具体操作:打开 chrome 浏览器,在地址栏输入 chrome://tracing/ ,跳转至 tracing 页面,点击 load 按钮,打开保存的 trace 文件,即可将预测服务的各阶段时间信息可视化。 +具体操作:打开 chrome 浏览器,在地址栏输入 `chrome://tracing/` ,跳转至 tracing 页面,点击 load 按钮,打开保存的 `trace` 文件,即可将预测服务的各阶段时间信息可视化。 ### 在 Client 端输出 Profile 信息 diff --git a/doc/SAVE.md b/doc/SAVE.md index 54800fa06ab4b8c20c0ffe75d417e1b42ab6ebe6..8ebeb89c536f576bf73414fb06c1eb4bfde63ea0 100644 --- a/doc/SAVE.md +++ b/doc/SAVE.md @@ -38,12 +38,15 @@ If you have saved model files using Paddle's `save_inference_model` API, you can import paddle_serving_client.io as serving_io serving_io.inference_model_to_serving(dirname, serving_server="serving_server", serving_client="serving_client", model_filename=None, params_filename=None ) ``` -dirname (str) - Path of saved model files. Program file and parameter files are saved in this directory. - -serving_server (str, optional) - The path of model files and configuration files for server. Default: "serving_server". - -serving_client (str, optional) - The path of configuration files for client. Default: "serving_client". - -model_filename (str, optional) - The name of file to load the inference program. If it is None, the default filename `__model__` will be used. Default: None. - -paras_filename (str, optional) - The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None. +Or you can use a build-in python module called `paddle_serving_client.convert` to convert it. +```python +python -m paddle_serving_client.convert --dirname ./your_inference_model_dir +``` +Arguments are the same as `inference_model_to_serving` API. 
+| Argument | Type | Default | Description | +|--------------|------|-----------|--------------------------------| +| `dirname` | str | - | Path of saved model files. Program file and parameter files are saved in this directory. | +| `serving_server` | str | `"serving_server"` | The path of model files and configuration files for server. | +| `serving_client` | str | `"serving_client"` | The path of configuration files for client. | +| `model_filename` | str | None | The name of file to load the inference program. If it is None, the default filename `__model__` will be used. | +| `paras_filename` | str | None | The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. | diff --git a/doc/SAVE_CN.md b/doc/SAVE_CN.md index aaf0647fd1c4e95584bb7aa42a6671620adeb6d0..a05729ed9c01f421893403b4fc2a13bd42ad9fd4 100644 --- a/doc/SAVE_CN.md +++ b/doc/SAVE_CN.md @@ -39,12 +39,15 @@ for line in sys.stdin: import paddle_serving_client.io as serving_io serving_io.inference_model_to_serving(dirname, serving_server="serving_server", serving_client="serving_client", model_filename=None, params_filename=None) ``` -dirname (str) – 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。 - -serving_server (str, 可选) - 转换后的模型文件和配置文件的存储路径。默认值为serving_server。 - -serving_client (str, 可选) - 转换后的客户端配置文件存储路径。默认值为serving_client。 - -model_filename (str,可选) – 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名。默认值为None。 - -params_filename (str,可选) – 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None。默认值为None。 +或者你可以使用Paddle Serving提供的名为`paddle_serving_client.convert`的内置模块进行转换。 +```python +python -m paddle_serving_client.convert --dirname ./your_inference_model_dir +``` +模块参数与`inference_model_to_serving`接口参数相同。 +| 参数 | 类型 | 默认值 | 描述 | +|--------------|------|-----------|--------------------------------| +| `dirname` | str | - | 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。| +| `serving_server` | str | `"serving_server"` | 转换后的模型文件和配置文件的存储路径。默认值为serving_server | +| `serving_client` | str | `"serving_client"` | 转换后的客户端配置文件存储路径。默认值为serving_client | +| `model_filename` | str | None | 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 `__model__` 作为默认的文件名 | +| `paras_filename` | str | None | 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None | diff --git a/doc/deprecated/NEW_WEB_SERVICE.md b/doc/deprecated/NEW_WEB_SERVICE.md new file mode 100644 index 0000000000000000000000000000000000000000..39bca98a3bdfbc1b2cadb5d2c3d60395b4592b34 --- /dev/null +++ b/doc/deprecated/NEW_WEB_SERVICE.md @@ -0,0 +1,56 @@ +# How to develop a new Web service? + +([简体中文](NEW_WEB_SERVICE_CN.md)|English) + +This document will take the image classification service based on the Imagenet data set as an example to introduce how to develop a new web service. The complete code can be visited at [here](../python/examples/imagenet/resnet50_web_service.py). + +## WebService base class + +Paddle Serving implements the [WebService](https://github.com/PaddlePaddle/Serving/blob/develop/python/paddle_serving_server/web_service.py#L23) base class. You need to override its `preprocess` and `postprocess` method. 
The default implementation is as follows: + +```python +class WebService(object): + + def preprocess(self, feed={}, fetch=[]): + return feed, fetch + def postprocess(self, feed={}, fetch=[], fetch_map=None): + return fetch_map +``` + +### preprocess + +The preprocess method has two input parameters, `feed` and `fetch`. For an HTTP request `request`: + +- The value of `feed` is the feed part `request.json["feed"]` in the request data +- The value of `fetch` is the fetch part `request.json["fetch"]` in the request data + +The return values are the feed and fetch values used in the prediction. + +### postprocess + +The postprocess method has three input parameters, `feed`, `fetch` and `fetch_map`: + +- The value of `feed` is the feed part `request.json["feed"]` in the request data +- The value of `fetch` is the fetch part `request.json["fetch"]` in the request data +- The value of `fetch_map` is the model output value. + +The return value will be processed as `{"reslut": fetch_map}` as the return of the HTTP request. + +## Develop ImageService class + +```python +class ImageService(WebService): + + def preprocess(self, feed={}, fetch=[]): + reader = ImageReader() + feed_batch = [] + for ins in feed: + if "image" not in ins: + raise ("feed data error!") + sample = base64.b64decode(ins["image"]) + img = reader.process_image(sample) + feed_batch.append({"image": img}) + return feed_batch, fetch +``` + +For the above `ImageService`, only the `preprocess` method is rewritten to process the image data in Base64 format into the data format required by prediction. diff --git a/doc/deprecated/NEW_WEB_SERVICE_CN.md b/doc/deprecated/NEW_WEB_SERVICE_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..43ca7fb61f2c70f13019574a7984e3665bd1b6fa --- /dev/null +++ b/doc/deprecated/NEW_WEB_SERVICE_CN.md @@ -0,0 +1,56 @@ +# 如何开发一个新的Web Service? 
+ +(简体中文|[English](NEW_WEB_SERVICE.md)) + +本文档将以Imagenet图像分类服务为例,来介绍如何开发一个新的Web Service。您可以在[这里](../python/examples/imagenet/resnet50_web_service.py)查阅完整的代码。 + +## WebService基类 + +Paddle Serving实现了[WebService](https://github.com/PaddlePaddle/Serving/blob/develop/python/paddle_serving_server/web_service.py#L23)基类,您需要重写它的`preprocess`方法和`postprocess`方法,默认实现如下: + +```python +class WebService(object): + + def preprocess(self, feed={}, fetch=[]): + return feed, fetch + def postprocess(self, feed={}, fetch=[], fetch_map=None): + return fetch_map +``` + +### preprocess方法 + +preprocess方法有两个输入参数,`feed`和`fetch`。对于一个HTTP请求`request`: + +- `feed`的值为请求数据中的feed部分`request.json["feed"]` +- `fetch`的值为请求数据中的fetch部分`request.json["fetch"]` + +返回值分别是预测过程中用到的feed和fetch值。 + +### postprocess方法 + +postprocess方法有三个输入参数,`feed`、`fetch`和`fetch_map`: + +- `feed`的值为请求数据中的feed部分`request.json["feed"]` +- `fetch`的值为请求数据中的fetch部分`request.json["fetch"]` +- `fetch_map`的值为fetch到的模型输出值 + +返回值将会被处理成`{"reslut": fetch_map}`作为HTTP请求的返回。 + +## 开发ImageService类 + +```python +class ImageService(WebService): + + def preprocess(self, feed={}, fetch=[]): + reader = ImageReader() + feed_batch = [] + for ins in feed: + if "image" not in ins: + raise ("feed data error!") + sample = base64.b64decode(ins["image"]) + img = reader.process_image(sample) + feed_batch.append({"image": img}) + return feed_batch, fetch +``` + +对于上述的`ImageService`,只重写了前处理方法,将base64格式的图片数据处理成模型预测需要的数据格式。 diff --git a/doc/pipeline_serving-image1.png b/doc/pipeline_serving-image1.png index f46765124c8049dddc13092dec8e57dd2d932f73..731f54973946e46eb2d4e8d72d57d00239c4384a 100644 Binary files a/doc/pipeline_serving-image1.png and b/doc/pipeline_serving-image1.png differ diff --git a/java/src/main/java/io/paddle/serving/client/Client.java b/java/src/main/java/io/paddle/serving/client/Client.java index 1e09e0c23c89dd4f0d70e0c93269b2185a69807f..742d4f91ce17555a2ea96f2a629717228ba18cef 100644 --- a/java/src/main/java/io/paddle/serving/client/Client.java +++ b/java/src/main/java/io/paddle/serving/client/Client.java @@ -192,14 +192,16 @@ public class Client { private InferenceRequest _packInferenceRequest( List> feed_batch, - Iterable fetch) throws IllegalArgumentException { + Iterable fetch, + long log_id) throws IllegalArgumentException { List feed_var_names = new ArrayList(); feed_var_names.addAll(feed_batch.get(0).keySet()); InferenceRequest.Builder req_builder = InferenceRequest.newBuilder() .addAllFeedVarNames(feed_var_names) .addAllFetchVarNames(fetch) - .setIsPython(false); + .setIsPython(false) + .setLogId(log_id); for (HashMap feed_data: feed_batch) { FeedInst.Builder inst_builder = FeedInst.newBuilder(); for (String name: feed_var_names) { @@ -332,76 +334,151 @@ public class Client { public Map predict( HashMap feed, Iterable fetch) { - return predict(feed, fetch, false); + return predict(feed, fetch, false, 0); + } + + public Map predict( + HashMap feed, + Iterable fetch, + long log_id) { + return predict(feed, fetch, false, log_id); } public Map> ensemble_predict( HashMap feed, Iterable fetch) { - return ensemble_predict(feed, fetch, false); + return ensemble_predict(feed, fetch, false, 0); + } + + public Map> ensemble_predict( + HashMap feed, + Iterable fetch, + long log_id) { + return ensemble_predict(feed, fetch, false, log_id); } public PredictFuture asyn_predict( HashMap feed, Iterable fetch) { - return asyn_predict(feed, fetch, false); + return asyn_predict(feed, fetch, false, 0); + } + + public PredictFuture asyn_predict( + HashMap feed, + Iterable fetch, + 
long log_id) { + return asyn_predict(feed, fetch, false, log_id); } public Map predict( HashMap feed, Iterable fetch, Boolean need_variant_tag) { + return predict(feed, fetch, need_variant_tag, 0); + } + + public Map predict( + HashMap feed, + Iterable fetch, + Boolean need_variant_tag, + long log_id) { List> feed_batch = new ArrayList>(); feed_batch.add(feed); - return predict(feed_batch, fetch, need_variant_tag); + return predict(feed_batch, fetch, need_variant_tag, log_id); } - + public Map> ensemble_predict( HashMap feed, Iterable fetch, Boolean need_variant_tag) { + return ensemble_predict(feed, fetch, need_variant_tag, 0); + } + + public Map> ensemble_predict( + HashMap feed, + Iterable fetch, + Boolean need_variant_tag, + long log_id) { List> feed_batch = new ArrayList>(); feed_batch.add(feed); - return ensemble_predict(feed_batch, fetch, need_variant_tag); + return ensemble_predict(feed_batch, fetch, need_variant_tag, log_id); } public PredictFuture asyn_predict( HashMap feed, Iterable fetch, Boolean need_variant_tag) { + return asyn_predict(feed, fetch, need_variant_tag, 0); + } + + public PredictFuture asyn_predict( + HashMap feed, + Iterable fetch, + Boolean need_variant_tag, + long log_id) { List> feed_batch = new ArrayList>(); feed_batch.add(feed); - return asyn_predict(feed_batch, fetch, need_variant_tag); + return asyn_predict(feed_batch, fetch, need_variant_tag, log_id); } public Map predict( List> feed_batch, Iterable fetch) { - return predict(feed_batch, fetch, false); + return predict(feed_batch, fetch, false, 0); + } + + public Map predict( + List> feed_batch, + Iterable fetch, + long log_id) { + return predict(feed_batch, fetch, false, log_id); } public Map> ensemble_predict( List> feed_batch, Iterable fetch) { - return ensemble_predict(feed_batch, fetch, false); + return ensemble_predict(feed_batch, fetch, false, 0); + } + + public Map> ensemble_predict( + List> feed_batch, + Iterable fetch, + long log_id) { + return ensemble_predict(feed_batch, fetch, false, log_id); } public PredictFuture asyn_predict( List> feed_batch, Iterable fetch) { - return asyn_predict(feed_batch, fetch, false); + return asyn_predict(feed_batch, fetch, false, 0); + } + + public PredictFuture asyn_predict( + List> feed_batch, + Iterable fetch, + long log_id) { + return asyn_predict(feed_batch, fetch, false, log_id); } public Map predict( List> feed_batch, Iterable fetch, Boolean need_variant_tag) { + return predict(feed_batch, fetch, need_variant_tag, 0); + } + + public Map predict( + List> feed_batch, + Iterable fetch, + Boolean need_variant_tag, + long log_id) { try { profiler_.record("java_prepro_0"); - InferenceRequest req = _packInferenceRequest(feed_batch, fetch); + InferenceRequest req = _packInferenceRequest( + feed_batch, fetch, log_id); profiler_.record("java_prepro_1"); profiler_.record("java_client_infer_0"); @@ -415,7 +492,7 @@ public class Client { = new ArrayList>>( ensemble_result.entrySet()); if (list.size() != 1) { - System.out.format("predict failed: please use ensemble_predict impl.\n"); + System.out.format("Failed to predict: please use ensemble_predict impl.\n"); return null; } profiler_.record("java_postpro_1"); @@ -423,7 +500,7 @@ public class Client { return list.get(0).getValue(); } catch (StatusRuntimeException e) { - System.out.format("predict failed: %s\n", e.toString()); + System.out.format("Failed to predict: %s\n", e.toString()); return null; } } @@ -432,9 +509,18 @@ public class Client { List> feed_batch, Iterable fetch, Boolean need_variant_tag) { + return 
ensemble_predict(feed_batch, fetch, need_variant_tag, 0); + } + + public Map> ensemble_predict( + List> feed_batch, + Iterable fetch, + Boolean need_variant_tag, + long log_id) { try { profiler_.record("java_prepro_0"); - InferenceRequest req = _packInferenceRequest(feed_batch, fetch); + InferenceRequest req = _packInferenceRequest( + feed_batch, fetch, log_id); profiler_.record("java_prepro_1"); profiler_.record("java_client_infer_0"); @@ -449,7 +535,7 @@ public class Client { return ensemble_result; } catch (StatusRuntimeException e) { - System.out.format("predict failed: %s\n", e.toString()); + System.out.format("Failed to predict: %s\n", e.toString()); return null; } } @@ -458,7 +544,16 @@ public class Client { List> feed_batch, Iterable fetch, Boolean need_variant_tag) { - InferenceRequest req = _packInferenceRequest(feed_batch, fetch); + return asyn_predict(feed_batch, fetch, need_variant_tag, 0); + } + + public PredictFuture asyn_predict( + List> feed_batch, + Iterable fetch, + Boolean need_variant_tag, + long log_id) { + InferenceRequest req = _packInferenceRequest( + feed_batch, fetch, log_id); ListenableFuture future = futureStub_.inference(req); PredictFuture predict_future = new PredictFuture(future, (InferenceResponse resp) -> { diff --git a/java/src/main/proto/multi_lang_general_model_service.proto b/java/src/main/proto/multi_lang_general_model_service.proto index b83450aed666b96de324050d53b10c56e059a8d5..18fbcf760647e1694e738c0832fe45f4f7d9934f 100644 --- a/java/src/main/proto/multi_lang_general_model_service.proto +++ b/java/src/main/proto/multi_lang_general_model_service.proto @@ -14,6 +14,8 @@ syntax = "proto2"; +package baidu.paddle_serving.multi_lang; + option java_multiple_files = true; option java_package = "io.paddle.serving.grpc"; option java_outer_classname = "ServingProto"; @@ -37,6 +39,7 @@ message InferenceRequest { repeated string feed_var_names = 2; repeated string fetch_var_names = 3; required bool is_python = 4 [ default = false ]; + required uint64 log_id = 5 [ default = 0 ]; }; message InferenceResponse { diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index edec41573b67f50feca52ee017bae2d7fa2b28ac..4b20cb2001ebb595601f22fa6e4aab8dd5df18f4 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,7 +1,5 @@ if (CLIENT) file(INSTALL pipeline DESTINATION paddle_serving_client) - execute_process(COMMAND ${PYTHON_EXECUTABLE} run_codegen.py - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/paddle_serving_client/pipeline/proto) file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py) set(PY_FILES ${SERVING_CLIENT_PY_FILES}) SET(PACKAGE_NAME "serving_client") @@ -11,13 +9,9 @@ endif() if (SERVER) if (NOT WITH_GPU) file(INSTALL pipeline DESTINATION paddle_serving_server) - execute_process(COMMAND ${PYTHON_EXECUTABLE} run_codegen.py - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/paddle_serving_server/pipeline/proto) file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py) else() file(INSTALL pipeline DESTINATION paddle_serving_server_gpu) - execute_process(COMMAND ${PYTHON_EXECUTABLE} run_codegen.py - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/paddle_serving_server_gpu/pipeline/proto) file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server_gpu/*.py) endif() set(PY_FILES ${SERVING_SERVER_PY_FILES}) @@ -25,6 +19,8 @@ if (SERVER) set(SETUP_LOG_FILE "setup.py.server.log") endif() +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/util.py + ${CMAKE_CURRENT_BINARY_DIR}/util.py) if (CLIENT) 
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) @@ -47,6 +43,9 @@ if (SERVER) endif() endif() +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gen_version.py + ${CMAKE_CURRENT_BINARY_DIR}/gen_version.py) + set (SERVING_CLIENT_CORE ${PADDLE_SERVING_BINARY_DIR}/core/general-client/*.so) message("python env: " ${py_env}) @@ -54,6 +53,7 @@ if (APP) add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/ + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "app" COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) @@ -65,6 +65,7 @@ add_custom_command( COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so COMMAND env ${py_env} ${PYTHON_EXECUTABLE} python_tag.py + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "client" COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp) @@ -75,6 +76,7 @@ if (SERVER) add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server/ ${PADDLE_SERVING_BINARY_DIR}/python/ + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py "server" COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) @@ -83,7 +85,8 @@ if (SERVER) OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/ - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} paddle_serving_server_gpu/gen_cuda_version.py ${CUDA_VERSION_MAJOR} + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py + "server_gpu" ${CUDA_VERSION_MAJOR} COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) diff --git a/python/examples/blazeface/README.md b/python/examples/blazeface/README.md index f569841ce4a3ae69b1ff16041f7fb7d4617177f7..6f9d3c5adab5f3275989479078cb4329d14589fd 100644 --- a/python/examples/blazeface/README.md +++ b/python/examples/blazeface/README.md @@ -3,7 +3,7 @@ ## Get Model ``` python -m paddle_serving_app.package --get_model blazeface -tar -xzvf blazeface.tar.gz +tar -xf blazeface.tar.gz ``` ## RPC Service diff --git a/python/examples/blazeface/test_client.py b/python/examples/blazeface/test_client.py index 27eb185ea90ce72641cef44d9066c46945ad2629..5e22cb866e34cba9fbd38c415215b8985b1584b2 100644 --- a/python/examples/blazeface/test_client.py +++ b/python/examples/blazeface/test_client.py @@ -16,6 +16,7 @@ from paddle_serving_client import Client from paddle_serving_app.reader import * import sys import numpy as np +from paddle_serving_app.reader import BlazeFacePostprocess preprocess = Sequential([ File2Image(), diff --git 
a/python/examples/imagenet/benchmark.py b/python/examples/imagenet/benchmark.py index 0181b873a36c0e65beff1d03f750b5d78c89aa06..12b013bd2554f24430ad1810f971a340c4b6903e 100644 --- a/python/examples/imagenet/benchmark.py +++ b/python/examples/imagenet/benchmark.py @@ -90,6 +90,7 @@ def single_func(idx, resource): image = base64.b64encode( open("./image_data/n01440764/" + file_list[i]).read()) else: + image_path = "./image_data/n01440764/" + file_list[i] image = base64.b64encode(open(image_path, "rb").read()).decode( "utf-8") req = json.dumps({"feed": [{"image": image}], "fetch": ["score"]}) diff --git a/python/examples/imagenet/resnet50_web_service.py b/python/examples/imagenet/resnet50_web_service.py index e7d1914973f2aeb58a912f7d85e35f85718d7a9b..d38dcc0ffc1952193803575c7eb612c4f0bbad28 100644 --- a/python/examples/imagenet/resnet50_web_service.py +++ b/python/examples/imagenet/resnet50_web_service.py @@ -13,7 +13,7 @@ # limitations under the License. import sys from paddle_serving_client import Client -from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize +from paddle_serving_app.reader import Sequential, URL2Image, Resize, CenterCrop, RGB2BGR, Transpose, Div, Normalize, Base64ToImage if len(sys.argv) != 4: print("python resnet50_web_service.py model device port") diff --git a/python/examples/ocr/README.md b/python/examples/ocr/README.md index ca9bbabdf57ce95763b25fec3751a85e4c8f9401..a0fc9f60160506183076233f33face1732a278c7 100644 --- a/python/examples/ocr/README.md +++ b/python/examples/ocr/README.md @@ -21,8 +21,13 @@ tar xf test_imgs.tar ### Start Service ``` +#choose one of cpu/gpu commands as following +#for cpu user +python -m paddle_serving_server.serve --model ocr_det_model --port 9293 +python ocr_web_server.py cpu +#for gpu user python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0 -python ocr_web_server.py +python ocr_web_server.py gpu ``` ### Client Prediction @@ -33,7 +38,11 @@ If you want a faster web service, please try Web Debugger Service ## Web Debugger Service ``` -python ocr_debugger_server.py +#choose one of cpu/gpu commands as following +#for cpu user +python ocr_debugger_server.py cpu +#for gpu user +python ocr_debugger_server.py gpu ``` ## Web Debugger Client Prediction @@ -54,15 +63,17 @@ Dataset: RCTW 500 sample images | Serving web service | 8.69 | 13.41 | 109.97 | 2.82 | 87.76 | 4.29 | 3.98 | 78.51 | 3.66 | 4.12 | 181.02 | 136.49 | 317.51 | | Serving Debugger web service | 8.73 | 16.42 | 115.27 | 2.93 | 20.63 | 3.97 | 4.48 | 13.84 | 3.60 | 6.91 | 49.45 | 147.33 | 196.78 | -## Appendix: Det or Rec only +## Appendix: For Users who want to launch Det or Rec only if you are going to detect images not recognize it or directly recognize the words from images. We also provide Det and Rec server for you. 
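Like the combined OCR servers above, each of these standalone scripts takes a `cpu` or `gpu` argument. That argument essentially switches between the CPU and GPU serving packages at import time, as the server scripts in this change do; roughly:

```python
import sys

# Choose the serving package from the first command-line argument,
# mirroring the det/rec/ocr *_server.py scripts in this repository.
if sys.argv[1] == 'gpu':
    from paddle_serving_server_gpu.web_service import WebService
elif sys.argv[1] == 'cpu':
    from paddle_serving_server.web_service import WebService
```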
### Det Server ``` -python det_web_server.py +python det_web_server.py cpu #for cpu user +python det_web_server.py gpu #for gpu user #or -python det_debugger_server.py +python det_debugger_server.py cpu #for cpu user +python det_debugger_server.py gpu #for gpu user ``` ### Det Client @@ -75,9 +86,11 @@ python ocr_web_client.py ### Rec Server ``` -python rec_web_server.py +python rec_web_server.py cpu #for cpu user +python rec_web_server.py gpu #for gpu user #or -python rec_debugger_server.py +python rec_debugger_server.py cpu #for cpu user +python rec_debugger_server.py gpu #for gpu user ``` ### Rec Client diff --git a/python/examples/ocr/README_CN.md b/python/examples/ocr/README_CN.md index 65bc066a43a34d1a35cb4236473c571106c5f61b..8bdc45cf8e390b378708fbee2dbfe318132aea44 100644 --- a/python/examples/ocr/README_CN.md +++ b/python/examples/ocr/README_CN.md @@ -15,19 +15,18 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/ocr/test_imgs.t tar xf test_imgs.tar ``` -### 客户端预测 - -``` -python ocr_rpc_client.py -``` - ## Web Service服务 ### 启动服务 ``` +#根据CPU/GPU设备选择一种启动方式 +#for cpu user +python -m paddle_serving_server.serve --model ocr_det_model --port 9293 +python ocr_web_server.py cpu +#for gpu user python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0 -python ocr_web_server.py +python ocr_web_server.py gpu ``` ### 启动客户端 @@ -38,7 +37,11 @@ python ocr_web_client.py 如果用户需要更快的执行速度,请尝试Debugger版Web服务 ## 启动Debugger版Web服务 ``` -python ocr_debugger_server.py +#根据CPU/GPU设备选择一种启动方式 +#for cpu user +python ocr_debugger_server.py cpu +#for gpu user +python ocr_debugger_server.py gpu ``` ## 启动客户端 @@ -66,9 +69,11 @@ GPU: Nvidia Tesla V100单卡 ### 启动检测服务 ``` -python det_web_server.py +python det_web_server.py cpu #for cpu user +python det_web_server.py gpu #for gpu user #or -python det_debugger_server.py +python det_debugger_server.py cpu #for cpu user +python det_debugger_server.py gpu #for gpu user ``` ### 检测服务客户端 @@ -81,9 +86,11 @@ python ocr_web_client.py ### 启动识别服务 ``` -python rec_web_server.py +python rec_web_server.py cpu #for cpu user +python rec_web_server.py gpu #for gpu user #or -python rec_debugger_server.py +python rec_debugger_server.py cpu #for cpu user +python rec_debugger_server.py gpu #for gpu user ``` ### 识别服务客户端 diff --git a/python/examples/ocr/det_debugger_server.py b/python/examples/ocr/det_debugger_server.py index acfccdb6d24a7e1ba490705dd147f21dbf921d31..913a0bb4c9a099cbef886beb3889337d024d10d6 100644 --- a/python/examples/ocr/det_debugger_server.py +++ b/python/examples/ocr/det_debugger_server.py @@ -21,7 +21,10 @@ from paddle_serving_client import Client from paddle_serving_app.reader import Sequential, ResizeByFactor from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import DBPostProcess, FilterBoxes -from paddle_serving_server_gpu.web_service import WebService +if sys.argv[1] == 'gpu': + from paddle_serving_server_gpu.web_service import WebService +elif sys.argv[1] == 'cpu': + from paddle_serving_server.web_service import WebService import time import re import base64 @@ -64,8 +67,12 @@ class OCRService(WebService): ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_det_model") -ocr_service.set_gpus("0") -ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) +if sys.argv[1] == 'gpu': + ocr_service.set_gpus("0") + ocr_service.prepare_server( + workdir="workdir", port=9292, device="gpu", gpuid=0) +elif sys.argv[1] == 'cpu': + 
ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.init_det() ocr_service.run_debugger_service() ocr_service.run_web_service() diff --git a/python/examples/ocr/det_web_server.py b/python/examples/ocr/det_web_server.py index dd69be0c70eb0f4dd627aa47ad33045a204f78c0..38c6541c70e9871d13dd81751d4edb2bc771a904 100644 --- a/python/examples/ocr/det_web_server.py +++ b/python/examples/ocr/det_web_server.py @@ -21,7 +21,10 @@ from paddle_serving_client import Client from paddle_serving_app.reader import Sequential, ResizeByFactor from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import DBPostProcess, FilterBoxes -from paddle_serving_server_gpu.web_service import WebService +if sys.argv[1] == 'gpu': + from paddle_serving_server_gpu.web_service import WebService +elif sys.argv[1] == 'cpu': + from paddle_serving_server.web_service import WebService import time import re import base64 @@ -65,8 +68,12 @@ class OCRService(WebService): ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_det_model") -ocr_service.set_gpus("0") -ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) +if sys.argv[1] == 'gpu': + ocr_service.set_gpus("0") + ocr_service.prepare_server( + workdir="workdir", port=9292, device="gpu", gpuid=0) +elif sys.argv[1] == 'cpu': + ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.init_det() ocr_service.run_rpc_service() ocr_service.run_web_service() diff --git a/python/examples/ocr/ocr_debugger_server.py b/python/examples/ocr/ocr_debugger_server.py index 93e2d7a3d1dc64451774ecf790c2ebd3b39f1d91..f7458c3036734e4bb6e554097029270e11912a3a 100644 --- a/python/examples/ocr/ocr_debugger_server.py +++ b/python/examples/ocr/ocr_debugger_server.py @@ -22,7 +22,10 @@ from paddle_serving_client import Client from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes -from paddle_serving_server_gpu.web_service import WebService +if sys.argv[1] == 'gpu': + from paddle_serving_server_gpu.web_service import WebService +elif sys.argv[1] == 'cpu': + from paddle_serving_server.web_service import WebService from paddle_serving_app.local_predict import Debugger import time import re @@ -37,8 +40,12 @@ class OCRService(WebService): (2, 0, 1)) ]) self.det_client = Debugger() - self.det_client.load_model_config( - det_model_config, gpu=True, profile=False) + if sys.argv[1] == 'gpu': + self.det_client.load_model_config( + det_model_config, gpu=True, profile=False) + elif sys.argv[1] == 'cpu': + self.det_client.load_model_config( + det_model_config, gpu=False, profile=False) self.ocr_reader = OCRReader() def preprocess(self, feed=[], fetch=[]): @@ -99,5 +106,8 @@ ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_rec_model") ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.init_det_debugger(det_model_config="ocr_det_model") -ocr_service.run_debugger_service(gpu=True) +if sys.argv[1] == 'gpu': + ocr_service.run_debugger_service(gpu=True) +elif sys.argv[1] == 'cpu': + ocr_service.run_debugger_service() ocr_service.run_web_service() diff --git a/python/examples/ocr/ocr_web_server.py b/python/examples/ocr/ocr_web_server.py index d017f6b9b560dc82158641b9f3a9f80137b40716..de83ca94a4c1f55d886175d9a87b6a34db34c2a5 100644 --- a/python/examples/ocr/ocr_web_server.py +++ 
b/python/examples/ocr/ocr_web_server.py @@ -22,7 +22,10 @@ from paddle_serving_client import Client from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes -from paddle_serving_server_gpu.web_service import WebService +if sys.argv[1] == 'gpu': + from paddle_serving_server_gpu.web_service import WebService +elif sys.argv[1] == 'cpu': + from paddle_serving_server.web_service import WebService import time import re import base64 @@ -90,8 +93,12 @@ class OCRService(WebService): ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_rec_model") -ocr_service.set_gpus("0") -ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) +if sys.argv[1] == 'gpu': + ocr_service.set_gpus("0") + ocr_service.prepare_server( + workdir="workdir", port=9292, device="gpu", gpuid=0) +elif sys.argv[1] == 'cpu': + ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.init_det_client( det_port=9293, det_client_config="ocr_det_client/serving_client_conf.prototxt") diff --git a/python/examples/ocr/rec_web_server.py b/python/examples/ocr/rec_web_server.py index 684c313d4d50cfe00c576c81aad05a810525dcce..aae97fd9e3fbd1d29b6cf2ef160b92a522db2e22 100644 --- a/python/examples/ocr/rec_web_server.py +++ b/python/examples/ocr/rec_web_server.py @@ -22,7 +22,10 @@ from paddle_serving_client import Client from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes -from paddle_serving_server_gpu.web_service import WebService +if sys.argv[1] == 'gpu': + from paddle_serving_server_gpu.web_service import WebService +elif sys.argv[1] == 'cpu': + from paddle_serving_server.web_service import WebService import time import re import base64 @@ -64,8 +67,12 @@ class OCRService(WebService): ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_rec_model") -ocr_service.set_gpus("0") ocr_service.init_rec() -ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) +if sys.argv[1] == 'gpu': + ocr_service.set_gpus("0") + ocr_service.prepare_server( + workdir="workdir", port=9292, device="gpu", gpuid=0) +elif sys.argv[1] == 'cpu': + ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.run_rpc_service() ocr_service.run_web_service() diff --git a/python/examples/pipeline/imdb_model_ensemble/README_CN.md b/python/examples/pipeline/imdb_model_ensemble/README_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..88eeab70c470268775ad22fd65a6d1b999a6b167 --- /dev/null +++ b/python/examples/pipeline/imdb_model_ensemble/README_CN.md @@ -0,0 +1,24 @@ +# IMDB model ensemble 样例 + +## 获取模型 +``` +sh get_data.sh +``` + +## 启动服务 + +``` +python -m paddle_serving_server_gpu.serve --model imdb_cnn_model --port 9292 &> cnn.log & +python -m paddle_serving_server_gpu.serve --model imdb_bow_model --port 9393 &> bow.log & +python test_pipeline_server.py &>pipeline.log & +``` + +## 启动客户端 +``` +python test_pipeline_client.py +``` + +## HTTP 测试 +``` +curl -X POST -k http://localhost:9999/prediction -d '{"key": ["words"], "value": ["i am very sad | 0"]}' +``` diff --git a/python/examples/pipeline/imdb_model_ensemble/config.yml 
b/python/examples/pipeline/imdb_model_ensemble/config.yml index 7dac6eec2a2d7d90be3d4684f2aaaec7496249d9..3447ffd449de59ea76450e95c7f355413d1a12ac 100644 --- a/python/examples/pipeline/imdb_model_ensemble/config.yml +++ b/python/examples/pipeline/imdb_model_ensemble/config.yml @@ -1,8 +1,11 @@ -port: 18080 -worker_num: 1 +rpc_port: 18085 +worker_num: 4 build_dag_each_worker: false +http_port: 9999 dag: - is_thread_op: true + is_thread_op: false client_type: brpc retry: 1 use_profile: false + tracer: + interval_s: 10 diff --git a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py index 9cf476c4705c2267aa42337f90970d9fd051b7fd..765ab7fd5a02a4af59b0773135bc59c802464b42 100644 --- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py +++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddle_serving_client.pipeline import PipelineClient +from paddle_serving_server.pipeline import PipelineClient import numpy as np client = PipelineClient() @@ -20,12 +20,16 @@ client.connect(['127.0.0.1:18080']) words = 'i am very sad | 0' futures = [] -for i in range(100): +for i in range(4): futures.append( client.predict( - feed_dict={"words": words}, fetch=["prediction"], asyn=True)) + feed_dict={"words": words}, + fetch=["prediction"], + asyn=True, + profile=False)) for f in futures: res = f.result() if res["ecode"] != 0: print("predict failed: {}".format(res)) + print(res) diff --git a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py index 47c2ec7ad37b77916c2cc3e168965cec7d3c7e07..92a15379c0b6ae1ad0cdc1401a01556e41c7eed7 100644 --- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py +++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py @@ -12,18 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# pylint: disable=doc-string-missing - from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp from paddle_serving_server.pipeline import PipelineServer from paddle_serving_server.pipeline.proto import pipeline_service_pb2 from paddle_serving_server.pipeline.channel import ChannelDataEcode import numpy as np -import logging from paddle_serving_app.reader import IMDBDataset - -logging.basicConfig(level=logging.DEBUG) +import logging _LOGGER = logging.getLogger() +user_handler = logging.StreamHandler() +user_handler.setLevel(logging.INFO) +user_handler.setFormatter( + logging.Formatter( + "%(levelname)s %(asctime)s [%(filename)s:%(lineno)d] %(message)s")) +_LOGGER.addHandler(user_handler) class ImdbRequestOp(RequestOp): @@ -76,7 +79,9 @@ bow_op = Op(name="bow", client_config="imdb_bow_client_conf/serving_client_conf.prototxt", concurrency=1, timeout=-1, - retry=1) + retry=1, + batch_size=3, + auto_batching_timeout=1000) cnn_op = Op(name="cnn", input_ops=[read_op], server_endpoints=["127.0.0.1:9292"], @@ -84,13 +89,17 @@ cnn_op = Op(name="cnn", client_config="imdb_cnn_client_conf/serving_client_conf.prototxt", concurrency=1, timeout=-1, - retry=1) + retry=1, + batch_size=1, + auto_batching_timeout=None) combine_op = CombineOp( name="combine", input_ops=[bow_op, cnn_op], - concurrency=5, + concurrency=1, timeout=-1, - retry=1) + retry=1, + batch_size=2, + auto_batching_timeout=None) # fetch output of bow_op # response_op = ImdbResponseOp(input_ops=[bow_op]) diff --git a/python/examples/pipeline/ocr/README.md b/python/examples/pipeline/ocr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f51789fc5e419d715141ba59dc49011d4f306e56 --- /dev/null +++ b/python/examples/pipeline/ocr/README.md @@ -0,0 +1,67 @@ +# OCR Pipeline WebService + +(English|[简体中文](./README_CN.md)) + +This document will take OCR as an example to show how to use Pipeline WebService to start multi-model tandem services. 
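The Test step later in this README reduces to a few lines of Python. The sketch below condenses pipeline_http_client.py, which this patch adds next to the service; the port comes from config.yml (http_port: 9999) and the request uses the key/value JSON convention of the pipeline HTTP endpoint.

```python
# Condensed pipeline_http_client.py: base64-encode an image and POST it to
# the OCR pipeline HTTP endpoint started by web_service.py.
import base64
import json
import requests

with open("imgs/1.jpg", "rb") as f:  # sample image shipped with the example
    image = base64.b64encode(f.read()).decode("utf8")

data = {"key": ["image"], "value": [image]}
r = requests.post("http://127.0.0.1:9999/ocr/prediction", data=json.dumps(data))
print(r.json())
```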
+ +## Get Model +``` +python -m paddle_serving_app.package --get_model ocr_rec +tar -xzvf ocr_rec.tar.gz +python -m paddle_serving_app.package --get_model ocr_det +tar -xzvf ocr_det.tar.gz +``` + +## Get Dataset (Optional) +``` +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/ocr/test_imgs.tar +tar xf test_imgs.tar +``` + +## Start Service +``` +python web_service.py &>log.txt & +``` + +## Test +``` +python pipeline_http_client.py +``` + + + + diff --git a/python/examples/pipeline/ocr/README_CN.md b/python/examples/pipeline/ocr/README_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ba1150d32e16298d0c1267d46f7d6e804b53d041 --- /dev/null +++ b/python/examples/pipeline/ocr/README_CN.md @@ -0,0 +1,67 @@ +# OCR Pipeline WebService + +([English](./README.md)|简体中文) + +本文档将以 OCR 为例,介绍如何使用 Pipeline WebService 启动多模型串联的服务。 + +## 获取模型 +``` +python -m paddle_serving_app.package --get_model ocr_rec +tar -xzvf ocr_rec.tar.gz +python -m paddle_serving_app.package --get_model ocr_det +tar -xzvf ocr_det.tar.gz +``` + +## 获取数据集(可选) +``` +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/ocr/test_imgs.tar +tar xf test_imgs.tar +``` + +## 启动 WebService +``` +python web_service.py &>log.txt & +``` + +## 测试 +``` +python pipeline_http_client.py +``` + + diff --git a/python/examples/pipeline/ocr/config.yml b/python/examples/pipeline/ocr/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..48addccfd0e543e04adf6587c5532b2a18bb2810 --- /dev/null +++ b/python/examples/pipeline/ocr/config.yml @@ -0,0 +1,22 @@ +rpc_port: 18080 +worker_num: 4 +build_dag_each_worker: false +http_port: 9999 +dag: + is_thread_op: false + client_type: brpc + retry: 1 + use_profile: false +op: + det: + concurrency: 2 + local_service_conf: + model_config: ocr_det_model + devices: "0" + rec: + concurrency: 1 + timeout: -1 + retry: 1 + local_service_conf: + model_config: ocr_rec_model + devices: "0" diff --git a/python/examples/pipeline/ocr/hybrid_service_pipeline_server.py b/python/examples/pipeline/ocr/hybrid_service_pipeline_server.py new file mode 100644 index 0000000000000000000000000000000000000000..1eea9c3b36f74d04c74618a2012810a1a58d411e --- /dev/null +++ b/python/examples/pipeline/ocr/hybrid_service_pipeline_server.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing +from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp +from paddle_serving_server_gpu.pipeline import PipelineServer +from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2 +from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode +from paddle_serving_server_gpu.pipeline import LocalRpcServiceHandler +import numpy as np +import cv2 +import time +import base64 +import json +from paddle_serving_app.reader import OCRReader +from paddle_serving_app.reader import Sequential, ResizeByFactor +from paddle_serving_app.reader import Div, Normalize, Transpose +from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes +import time +import re +import base64 +import logging + +_LOGGER = logging.getLogger() + + +class DetOp(Op): + def init_op(self): + self.det_preprocess = Sequential([ + ResizeByFactor(32, 960), Div(255), + Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose( + (2, 0, 1)) + ]) + self.filter_func = FilterBoxes(10, 10) + self.post_func = DBPostProcess({ + "thresh": 0.3, + "box_thresh": 0.5, + "max_candidates": 1000, + "unclip_ratio": 1.5, + "min_size": 3 + }) + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + data = base64.b64decode(input_dict["image"].encode('utf8')) + data = np.fromstring(data, np.uint8) + # Note: class variables(self.var) can only be used in process op mode + self.im = cv2.imdecode(data, cv2.IMREAD_COLOR) + self.ori_h, self.ori_w, _ = self.im.shape + det_img = self.det_preprocess(self.im) + _, self.new_h, self.new_w = det_img.shape + return {"image": det_img} + + def postprocess(self, input_dicts, fetch_dict): + det_out = fetch_dict["concat_1.tmp_0"] + ratio_list = [ + float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w + ] + dt_boxes_list = self.post_func(det_out, [ratio_list]) + dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) + out_dict = {"dt_boxes": dt_boxes, "image": self.im} + return out_dict + + +class RecOp(Op): + def init_op(self): + self.ocr_reader = OCRReader() + self.get_rotate_crop_image = GetRotateCropImage() + self.sorted_boxes = SortedBoxes() + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + im = input_dict["image"] + dt_boxes = input_dict["dt_boxes"] + dt_boxes = self.sorted_boxes(dt_boxes) + feed_list = [] + img_list = [] + max_wh_ratio = 0 + for i, dtbox in enumerate(dt_boxes): + boximg = self.get_rotate_crop_image(im, dt_boxes[i]) + img_list.append(boximg) + h, w = boximg.shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for img in img_list: + norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio) + feed = {"image": norm_img} + feed_list.append(feed) + return feed_list + + def postprocess(self, input_dicts, fetch_dict): + rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True) + res_lst = [] + for res in rec_res: + res_lst.append(res[0]) + res = {"res": str(res_lst)} + return res + + +read_op = RequestOp() +det_op = DetOp( + name="det", + input_ops=[read_op], + local_rpc_service_handler=LocalRpcServiceHandler( + model_config="ocr_det_model", + workdir="det_workdir", # defalut: "workdir" + thread_num=2, # defalut: 2 + devices="0", # gpu0. 
defalut: "" (cpu) + mem_optim=True, # defalut: True + ir_optim=False, # defalut: False + available_port_generator=None), # defalut: None + concurrency=1) +rec_op = RecOp( + name="rec", + input_ops=[det_op], + server_endpoints=["127.0.0.1:12001"], + fetch_list=["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"], + client_config="ocr_rec_client/serving_client_conf.prototxt", + concurrency=1) +response_op = ResponseOp(input_ops=[rec_op]) + +server = PipelineServer("ocr") +server.set_response_op(response_op) +server.prepare_server('config.yml') +server.run_server() diff --git a/python/examples/pipeline/ocr/imgs/1.jpg b/python/examples/pipeline/ocr/imgs/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..08010177fed2ee8c3709912c06c0b161ba546313 Binary files /dev/null and b/python/examples/pipeline/ocr/imgs/1.jpg differ diff --git a/python/examples/pipeline/ocr/local_service_pipeline_server.py b/python/examples/pipeline/ocr/local_service_pipeline_server.py new file mode 100644 index 0000000000000000000000000000000000000000..ccbd3b1b07a30422583812b659e1c249b37bcb9e --- /dev/null +++ b/python/examples/pipeline/ocr/local_service_pipeline_server.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing +from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp +from paddle_serving_server_gpu.pipeline import PipelineServer +from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2 +from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode +from paddle_serving_server_gpu.pipeline import LocalRpcServiceHandler +import numpy as np +import cv2 +import time +import base64 +import json +from paddle_serving_app.reader import OCRReader +from paddle_serving_app.reader import Sequential, ResizeByFactor +from paddle_serving_app.reader import Div, Normalize, Transpose +from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes +import time +import re +import base64 +import logging + +_LOGGER = logging.getLogger() + + +class DetOp(Op): + def init_op(self): + self.det_preprocess = Sequential([ + ResizeByFactor(32, 960), Div(255), + Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose( + (2, 0, 1)) + ]) + self.filter_func = FilterBoxes(10, 10) + self.post_func = DBPostProcess({ + "thresh": 0.3, + "box_thresh": 0.5, + "max_candidates": 1000, + "unclip_ratio": 1.5, + "min_size": 3 + }) + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + data = base64.b64decode(input_dict["image"].encode('utf8')) + data = np.fromstring(data, np.uint8) + # Note: class variables(self.var) can only be used in process op mode + self.im = cv2.imdecode(data, cv2.IMREAD_COLOR) + self.ori_h, self.ori_w, _ = self.im.shape + det_img = self.det_preprocess(self.im) + _, self.new_h, self.new_w = det_img.shape + return {"image": det_img} + + def postprocess(self, input_dicts, fetch_dict): + det_out = fetch_dict["concat_1.tmp_0"] + ratio_list = [ + float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w + ] + dt_boxes_list = self.post_func(det_out, [ratio_list]) + dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) + out_dict = {"dt_boxes": dt_boxes, "image": self.im} + return out_dict + + +class RecOp(Op): + def init_op(self): + self.ocr_reader = OCRReader() + self.get_rotate_crop_image = GetRotateCropImage() + self.sorted_boxes = SortedBoxes() + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + im = input_dict["image"] + dt_boxes = input_dict["dt_boxes"] + dt_boxes = self.sorted_boxes(dt_boxes) + feed_list = [] + img_list = [] + max_wh_ratio = 0 + for i, dtbox in enumerate(dt_boxes): + boximg = self.get_rotate_crop_image(im, dt_boxes[i]) + img_list.append(boximg) + h, w = boximg.shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for img in img_list: + norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio) + feed = {"image": norm_img} + feed_list.append(feed) + return feed_list + + def postprocess(self, input_dicts, fetch_dict): + rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True) + res_lst = [] + for res in rec_res: + res_lst.append(res[0]) + res = {"res": str(res_lst)} + return res + + +read_op = RequestOp() +det_op = DetOp( + name="det", + input_ops=[read_op], + local_rpc_service_handler=LocalRpcServiceHandler( + model_config="ocr_det_model", + workdir="det_workdir", # defalut: "workdir" + thread_num=2, # defalut: 2 + devices="0", # gpu0. 
defalut: "" (cpu) + mem_optim=True, # defalut: True + ir_optim=False, # defalut: False + available_port_generator=None), # defalut: None + concurrency=1) +rec_op = RecOp( + name="rec", + input_ops=[det_op], + local_rpc_service_handler=LocalRpcServiceHandler( + model_config="ocr_rec_model"), + concurrency=1) +response_op = ResponseOp(input_ops=[rec_op]) + +server = PipelineServer("ocr") +server.set_response_op(response_op) +server.prepare_server('config.yml') +server.run_server() diff --git a/python/examples/pipeline/ocr/pipeline_http_client.py b/python/examples/pipeline/ocr/pipeline_http_client.py new file mode 100644 index 0000000000000000000000000000000000000000..6d40e6474d6e0e32ac36835de3b69f4f90b6171d --- /dev/null +++ b/python/examples/pipeline/ocr/pipeline_http_client.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle_serving_server_gpu.pipeline import PipelineClient +import numpy as np +import requests +import json +import cv2 +import base64 +import os + + +def cv2_to_base64(image): + return base64.b64encode(image).decode('utf8') + + +url = "http://127.0.0.1:9999/ocr/prediction" +test_img_dir = "imgs/" +for img_file in os.listdir(test_img_dir): + with open(os.path.join(test_img_dir, img_file), 'rb') as file: + image_data1 = file.read() + image = cv2_to_base64(image_data1) + +for i in range(4): + data = {"key": ["image"], "value": [image]} + r = requests.post(url=url, data=json.dumps(data)) + print(r.json()) diff --git a/python/examples/pipeline/ocr/pipeline_rpc_client.py b/python/examples/pipeline/ocr/pipeline_rpc_client.py new file mode 100644 index 0000000000000000000000000000000000000000..93524c36cb300e71bcde57f930cebc62e3d86cba --- /dev/null +++ b/python/examples/pipeline/ocr/pipeline_rpc_client.py @@ -0,0 +1,38 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddle_serving_server_gpu.pipeline import PipelineClient +import numpy as np +import requests +import json +import cv2 +import base64 +import os + +client = PipelineClient() +client.connect(['127.0.0.1:18080']) + + +def cv2_to_base64(image): + return base64.b64encode(image).decode('utf8') + + +test_img_dir = "imgs/" +for img_file in os.listdir(test_img_dir): + with open(os.path.join(test_img_dir, img_file), 'rb') as file: + image_data = file.read() + image = cv2_to_base64(image_data) + +for i in range(4): + ret = client.predict(feed_dict={"image": image}, fetch=["res"]) + print(ret) diff --git a/python/examples/pipeline/ocr/remote_service_pipeline_server.py b/python/examples/pipeline/ocr/remote_service_pipeline_server.py new file mode 100644 index 0000000000000000000000000000000000000000..170e6dd9c4687e10bb4af6278f2f5b0c9ac09878 --- /dev/null +++ b/python/examples/pipeline/ocr/remote_service_pipeline_server.py @@ -0,0 +1,129 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing +from paddle_serving_server_gpu.pipeline import Op, RequestOp, ResponseOp +from paddle_serving_server_gpu.pipeline import PipelineServer +from paddle_serving_server_gpu.pipeline.proto import pipeline_service_pb2 +from paddle_serving_server_gpu.pipeline.channel import ChannelDataEcode +import numpy as np +import cv2 +import time +import base64 +import json +from paddle_serving_app.reader import OCRReader +from paddle_serving_app.reader import Sequential, ResizeByFactor +from paddle_serving_app.reader import Div, Normalize, Transpose +from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes +import time +import re +import base64 +import logging + +_LOGGER = logging.getLogger() + + +class DetOp(Op): + def init_op(self): + self.det_preprocess = Sequential([ + ResizeByFactor(32, 960), Div(255), + Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose( + (2, 0, 1)) + ]) + self.filter_func = FilterBoxes(10, 10) + self.post_func = DBPostProcess({ + "thresh": 0.3, + "box_thresh": 0.5, + "max_candidates": 1000, + "unclip_ratio": 1.5, + "min_size": 3 + }) + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + data = base64.b64decode(input_dict["image"].encode('utf8')) + data = np.fromstring(data, np.uint8) + # Note: class variables(self.var) can only be used in process op mode + self.im = cv2.imdecode(data, cv2.IMREAD_COLOR) + self.ori_h, self.ori_w, _ = self.im.shape + det_img = self.det_preprocess(self.im) + _, self.new_h, self.new_w = det_img.shape + return {"image": det_img} + + def postprocess(self, input_dicts, fetch_dict): + det_out = fetch_dict["concat_1.tmp_0"] + ratio_list = [ + float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w + ] + dt_boxes_list = self.post_func(det_out, [ratio_list]) + dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) + out_dict = {"dt_boxes": dt_boxes, "image": self.im} + return 
out_dict + + +class RecOp(Op): + def init_op(self): + self.ocr_reader = OCRReader() + self.get_rotate_crop_image = GetRotateCropImage() + self.sorted_boxes = SortedBoxes() + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + im = input_dict["image"] + dt_boxes = input_dict["dt_boxes"] + dt_boxes = self.sorted_boxes(dt_boxes) + feed_list = [] + img_list = [] + max_wh_ratio = 0 + for i, dtbox in enumerate(dt_boxes): + boximg = self.get_rotate_crop_image(im, dt_boxes[i]) + img_list.append(boximg) + h, w = boximg.shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for img in img_list: + norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio) + feed = {"image": norm_img} + feed_list.append(feed) + return feed_list + + def postprocess(self, input_dicts, fetch_dict): + rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True) + res_lst = [] + for res in rec_res: + res_lst.append(res[0]) + res = {"res": str(res_lst)} + return res + + +read_op = RequestOp() +det_op = DetOp( + name="det", + input_ops=[read_op], + server_endpoints=["127.0.0.1:12000"], + fetch_list=["concat_1.tmp_0"], + client_config="ocr_det_client/serving_client_conf.prototxt", + concurrency=1) +rec_op = RecOp( + name="rec", + input_ops=[det_op], + server_endpoints=["127.0.0.1:12001"], + fetch_list=["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"], + client_config="ocr_rec_client/serving_client_conf.prototxt", + concurrency=1) +response_op = ResponseOp(input_ops=[rec_op]) + +server = PipelineServer("ocr") +server.set_response_op(response_op) +server.prepare_server('config.yml') +server.run_server() diff --git a/python/examples/pipeline/ocr/web_service.py b/python/examples/pipeline/ocr/web_service.py new file mode 100644 index 0000000000000000000000000000000000000000..d1e6ec808343d62cc7c85b2d78ac1caa57c8cf28 --- /dev/null +++ b/python/examples/pipeline/ocr/web_service.py @@ -0,0 +1,112 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
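The three standalone servers above (local_service_pipeline_server.py, hybrid_service_pipeline_server.py, remote_service_pipeline_server.py) build the same two-stage det/rec DAG and differ only in how each Op is bound to a model; web_service.py below wraps that DAG behind the WebService class instead. The fragment below contrasts the two wirings, lifted from those files but using plain Op rather than the DetOp/RecOp subclasses, so it illustrates construction only, not a working OCR server.

```python
# Two ways an Op is bound to a model in the pipeline OCR examples:
# a locally launched model vs. an already running serving endpoint.
from paddle_serving_server_gpu.pipeline import Op, RequestOp, LocalRpcServiceHandler

read_op = RequestOp()
# local_service_pipeline_server.py: the pipeline launches the det model itself
det_op = Op(name="det",
            input_ops=[read_op],
            local_rpc_service_handler=LocalRpcServiceHandler(
                model_config="ocr_det_model"),
            concurrency=1)
# remote_service_pipeline_server.py: the rec Op calls an external endpoint
rec_op = Op(name="rec",
            input_ops=[det_op],
            server_endpoints=["127.0.0.1:12001"],
            fetch_list=["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"],
            client_config="ocr_rec_client/serving_client_conf.prototxt",
            concurrency=1)
```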
+try: + from paddle_serving_server_gpu.web_service import WebService, Op +except ImportError: + from paddle_serving_server.web_service import WebService, Op +import logging +import numpy as np +import cv2 +import base64 +from paddle_serving_app.reader import OCRReader +from paddle_serving_app.reader import Sequential, ResizeByFactor +from paddle_serving_app.reader import Div, Normalize, Transpose +from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes + +_LOGGER = logging.getLogger() + + +class DetOp(Op): + def init_op(self): + self.det_preprocess = Sequential([ + ResizeByFactor(32, 960), Div(255), + Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose( + (2, 0, 1)) + ]) + self.filter_func = FilterBoxes(10, 10) + self.post_func = DBPostProcess({ + "thresh": 0.3, + "box_thresh": 0.5, + "max_candidates": 1000, + "unclip_ratio": 1.5, + "min_size": 3 + }) + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + data = base64.b64decode(input_dict["image"].encode('utf8')) + data = np.fromstring(data, np.uint8) + # Note: class variables(self.var) can only be used in process op mode + self.im = cv2.imdecode(data, cv2.IMREAD_COLOR) + self.ori_h, self.ori_w, _ = self.im.shape + det_img = self.det_preprocess(self.im) + _, self.new_h, self.new_w = det_img.shape + return {"image": det_img} + + def postprocess(self, input_dicts, fetch_dict): + det_out = fetch_dict["concat_1.tmp_0"] + ratio_list = [ + float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w + ] + dt_boxes_list = self.post_func(det_out, [ratio_list]) + dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) + out_dict = {"dt_boxes": dt_boxes, "image": self.im} + return out_dict + + +class RecOp(Op): + def init_op(self): + self.ocr_reader = OCRReader() + self.get_rotate_crop_image = GetRotateCropImage() + self.sorted_boxes = SortedBoxes() + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + im = input_dict["image"] + dt_boxes = input_dict["dt_boxes"] + dt_boxes = self.sorted_boxes(dt_boxes) + feed_list = [] + img_list = [] + max_wh_ratio = 0 + for i, dtbox in enumerate(dt_boxes): + boximg = self.get_rotate_crop_image(im, dt_boxes[i]) + img_list.append(boximg) + h, w = boximg.shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for img in img_list: + norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio) + feed = {"image": norm_img} + feed_list.append(feed) + return feed_list + + def postprocess(self, input_dicts, fetch_dict): + rec_res = self.ocr_reader.postprocess(fetch_dict, with_score=True) + res_lst = [] + for res in rec_res: + res_lst.append(res[0]) + res = {"res": str(res_lst)} + return res + + +class OcrService(WebService): + def get_pipeline_response(self, read_op): + det_op = DetOp(name="det", input_ops=[read_op]) + rec_op = RecOp(name="rec", input_ops=[det_op]) + return rec_op + + +uci_service = OcrService(name="ocr") +uci_service.prepare_pipeline_config("config.yml") +uci_service.run_service() diff --git a/python/examples/pipeline/simple_web_service/README.md b/python/examples/pipeline/simple_web_service/README.md new file mode 100644 index 0000000000000000000000000000000000000000..049fbf2ec69bb83062f396e59344e29b0094372a --- /dev/null +++ b/python/examples/pipeline/simple_web_service/README.md @@ -0,0 +1,19 @@ +# Simple Pipeline WebService + +This document will takes UCI service as an example to introduce how to use Pipeline WebService. 
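The Http test step at the bottom of this README is equivalent to the short requests-based client below; port 18080 and the key/value request format come from config.yml and the curl command shown later.

```python
# Python equivalent of the curl command in the "Http test" step.
import json
import requests

x = ("0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, "
     "-0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332")
data = {"key": ["x"], "value": [x]}
r = requests.post("http://127.0.0.1:18080/uci/prediction", data=json.dumps(data))
print(r.json())  # the uci Op returns the predicted "price"
```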
+ +## Get model +``` +sh get_data.sh +``` + +## Start server + +``` +python web_service.py &>log.txt & +``` + +## Http test +``` +curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' +``` diff --git a/python/examples/pipeline/simple_web_service/README_CN.md b/python/examples/pipeline/simple_web_service/README_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..c08d642f7c8034e9d326a24636728bff36f8638b --- /dev/null +++ b/python/examples/pipeline/simple_web_service/README_CN.md @@ -0,0 +1,19 @@ +# Simple Pipeline WebService + +这里以 Uci 服务为例来介绍 Pipeline WebService 的使用。 + +## 获取模型 +``` +sh get_data.sh +``` + +## 启动服务 + +``` +python web_service.py &>log.txt & +``` + +## 测试 +``` +curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' +``` diff --git a/python/examples/pipeline/simple_web_service/config.yml b/python/examples/pipeline/simple_web_service/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..72e473e320e792b8fafc46768c8ef38e7a00436c --- /dev/null +++ b/python/examples/pipeline/simple_web_service/config.yml @@ -0,0 +1,9 @@ +worker_num: 4 +http_port: 18080 +dag: + is_thread_op: false +op: + uci: + local_service_conf: + model_config: uci_housing_model + devices: "" # "0,1" diff --git a/python/examples/pipeline/simple_web_service/get_data.sh b/python/examples/pipeline/simple_web_service/get_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..84a3966a0ef323cef4b146d8e9489c70a7a8ae35 --- /dev/null +++ b/python/examples/pipeline/simple_web_service/get_data.sh @@ -0,0 +1,2 @@ +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz +tar -xzf uci_housing.tar.gz diff --git a/python/examples/pipeline/simple_web_service/web_service.py b/python/examples/pipeline/simple_web_service/web_service.py new file mode 100644 index 0000000000000000000000000000000000000000..28197e804ffc08d094d0e33d3d2654ace3093ded --- /dev/null +++ b/python/examples/pipeline/simple_web_service/web_service.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+try: + from paddle_serving_server_gpu.web_service import WebService, Op +except ImportError: + from paddle_serving_server.web_service import WebService, Op +import logging +import numpy as np + +_LOGGER = logging.getLogger() + + +class UciOp(Op): + def init_op(self): + self.separator = "," + + def preprocess(self, input_dicts): + (_, input_dict), = input_dicts.items() + _LOGGER.info(input_dict) + x_value = input_dict["x"] + if isinstance(x_value, (str, unicode)): + input_dict["x"] = np.array( + [float(x.strip()) for x in x_value.split(self.separator)]) + return input_dict + + def postprocess(self, input_dicts, fetch_dict): + # _LOGGER.info(fetch_dict) + fetch_dict["price"] = str(fetch_dict["price"][0][0]) + return fetch_dict + + +class UciService(WebService): + def get_pipeline_response(self, read_op): + uci_op = UciOp(name="uci", input_ops=[read_op]) + return uci_op + + +uci_service = UciService(name="uci") +uci_service.prepare_pipeline_config("config.yml") +uci_service.run_service() diff --git a/python/examples/yolov4/test_client.py b/python/examples/yolov4/test_client.py index 92dcd06552ca1fdd3f2d54060e9de501f052e349..2616e55766192fca676e58efc4f0a2a3d634f1d3 100644 --- a/python/examples/yolov4/test_client.py +++ b/python/examples/yolov4/test_client.py @@ -30,7 +30,6 @@ client.load_client_config("yolov4_client/serving_client_conf.prototxt") client.connect(['127.0.0.1:9393']) im = preprocess(sys.argv[1]) -print(im.shape) fetch_map = client.predict( feed={ "image": im, diff --git a/python/gen_version.py b/python/gen_version.py new file mode 100644 index 0000000000000000000000000000000000000000..258905f5815f6af01398479732b907c80cb9d739 --- /dev/null +++ b/python/gen_version.py @@ -0,0 +1,43 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import re +import os +import subprocess + + +def update_info(file_name, feature, info): + new_str = "" + with open(file_name, "r") as f: + for line in f.readlines(): + if re.match(feature, line): + if isinstance(info, str): + line = feature + " = \"" + info.strip() + "\"\n" + else: + line = feature + " = \"" + info.decode('utf-8').strip( + ) + "\"\n" + new_str = new_str + line + + with open(file_name, "w") as f: + f.write(new_str) + + +if len(sys.argv) > 2: + update_info("paddle_serving_server_gpu/version.py", "cuda_version", + sys.argv[2]) + +path = "paddle_serving_" + sys.argv[1] +commit_id = subprocess.check_output(['git', 'rev-parse', 'HEAD']) +update_info(path + "/version.py", "commit_id", commit_id) diff --git a/python/paddle_serving_app/reader/__init__.py b/python/paddle_serving_app/reader/__init__.py index 93e2cd76102d93f52955060055afda34f9576ed8..05b53fb6aba24522a377dc12634bd1667e966292 100644 --- a/python/paddle_serving_app/reader/__init__.py +++ b/python/paddle_serving_app/reader/__init__.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
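The gen_version.py script added above is what the CMake changes at the top of this section invoke right before each `setup.py bdist_wheel` (as `gen_version.py "app"` / `"client"` / `"server"`, and `gen_version.py "server_gpu" <CUDA major>`). It stamps the current git commit, and for server_gpu also the CUDA version, into paddle_serving_<pkg>/version.py, whose commit_id field is added in this patch as an empty placeholder. For the commit id it does roughly the following:

```python
# What gen_version.py writes into version.py at build time: the current
# git HEAD, formatted the same way update_info() emits it.
import subprocess

commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
print('commit_id = "%s"' % commit_id)
```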
from .chinese_bert_reader import ChineseBertReader -from .image_reader import ImageReader, File2Image, URL2Image, Sequential, Normalize +from .image_reader import ImageReader, File2Image, URL2Image, Sequential, Normalize, Base64ToImage from .image_reader import CenterCrop, Resize, Transpose, Div, RGB2BGR, BGR2RGB, ResizeByFactor -from .image_reader import RCNNPostprocess, SegPostprocess, PadStride +from .image_reader import RCNNPostprocess, SegPostprocess, PadStride, BlazeFacePostprocess from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes from .lac_reader import LACReader from .senta_reader import SentaReader diff --git a/python/paddle_serving_app/reader/image_reader.py b/python/paddle_serving_app/reader/image_reader.py index 50c0753c27f845e784676b54ae7e029bec2a4ec4..38a1766433848c800ad40e1be7e79c2ac7989199 100644 --- a/python/paddle_serving_app/reader/image_reader.py +++ b/python/paddle_serving_app/reader/image_reader.py @@ -317,7 +317,7 @@ class RCNNPostprocess(object): self.clip_bbox([xmin, ymin, xmax, ymax]) w = xmax - xmin h = ymax - ymin - im_shape = t['im_shape'][0][i].tolist() + im_shape = t['im_shape'].tolist() im_height, im_width = int(im_shape[0]), int(im_shape[1]) xmin *= im_width ymin *= im_height @@ -420,7 +420,7 @@ class RCNNPostprocess(object): for key in image_with_bbox: if key == "image": continue - if ".lod" in key: + if ".lod" in key or "im_shape" in key: continue fetch_name = key bbox_result = self._get_bbox_result(image_with_bbox, fetch_name, diff --git a/python/paddle_serving_app/version.py b/python/paddle_serving_app/version.py index 332cba98dd692c4e33da68d4de7763e83e3729b5..d1ccc660c4021d71845f3a68c1c4a7b53d5c323a 100644 --- a/python/paddle_serving_app/version.py +++ b/python/paddle_serving_app/version.py @@ -12,4 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
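The reader package now also exports Base64ToImage and BlazeFacePostprocess, and the resnet50 web service in this patch imports Base64ToImage for its HTTP path. The exact preprocessing chain of that service is not part of this diff, so the pipeline below is only an assumed usage sketch; the no-argument constructor and the Resize/CenterCrop sizes are illustrative assumptions.

```python
# Assumed usage of the newly exported Base64ToImage reader op: decode a
# base64 payload, then run an imagenet-style preprocessing chain on it.
from paddle_serving_app.reader import (Sequential, Base64ToImage, Resize,
                                       CenterCrop, RGB2BGR, Div, Normalize,
                                       Transpose)

seq = Sequential([
    Base64ToImage(),  # base64 string -> image array (assumed no-arg ctor)
    Resize(256),
    CenterCrop(224),
    RGB2BGR(),
    Div(255),
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    Transpose((2, 0, 1)),
])
```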
""" Paddle Serving App version string """ -serving_app_version = "0.1.2" +serving_app_version = "0.0.0" +commit_id = "" diff --git a/python/paddle_serving_client/__init__.py b/python/paddle_serving_client/__init__.py index cf669c54f3492fc739bedcfacc49537a5ecc545f..d350b5842b283af3182bdc0348d977ede1129e6e 100644 --- a/python/paddle_serving_client/__init__.py +++ b/python/paddle_serving_client/__init__.py @@ -233,7 +233,7 @@ class Client(object): # key)) pass - def predict(self, feed=None, fetch=None, need_variant_tag=False): + def predict(self, feed=None, fetch=None, need_variant_tag=False, log_id=0): self.profile_.record('py_prepro_0') if feed is None or fetch is None: @@ -319,12 +319,12 @@ class Client(object): res = self.client_handle_.numpy_predict( float_slot_batch, float_feed_names, float_shape, int_slot_batch, int_feed_names, int_shape, fetch_names, result_batch_handle, - self.pid) + self.pid, log_id) elif self.has_numpy_input == False: res = self.client_handle_.batch_predict( float_slot_batch, float_feed_names, float_shape, int_slot_batch, int_feed_names, int_shape, fetch_names, result_batch_handle, - self.pid) + self.pid, log_id) else: raise ValueError( "Please make sure the inputs are all in list type or all in numpy.array type" @@ -347,28 +347,45 @@ class Client(object): result_map[name] = result_batch_handle.get_int64_by_name( mi, name) shape = result_batch_handle.get_shape(mi, name) + if result_map[name].size == 0: + raise ValueError( + "Failed to fetch, maybe the type of [{}]" + " is wrong, please check the model file".format( + name)) result_map[name].shape = shape if name in self.lod_tensor_set: - result_map["{}.lod".format( - name)] = result_batch_handle.get_lod(mi, name) + tmp_lod = result_batch_handle.get_lod(mi, name) + if np.size(tmp_lod) > 0: + result_map["{}.lod".format(name)] = tmp_lod elif self.fetch_names_to_type_[name] == float32_type: result_map[name] = result_batch_handle.get_float_by_name( mi, name) + if result_map[name].size == 0: + raise ValueError( + "Failed to fetch, maybe the type of [{}]" + " is wrong, please check the model file".format( + name)) shape = result_batch_handle.get_shape(mi, name) result_map[name].shape = shape if name in self.lod_tensor_set: - result_map["{}.lod".format( - name)] = result_batch_handle.get_lod(mi, name) - + tmp_lod = result_batch_handle.get_lod(mi, name) + if np.size(tmp_lod) > 0: + result_map["{}.lod".format(name)] = tmp_lod elif self.fetch_names_to_type_[name] == int32_type: # result_map[name] will be py::array(numpy array) result_map[name] = result_batch_handle.get_int32_by_name( mi, name) + if result_map[name].size == 0: + raise ValueError( + "Failed to fetch, maybe the type of [{}]" + " is wrong, please check the model file".format( + name)) shape = result_batch_handle.get_shape(mi, name) result_map[name].shape = shape if name in self.lod_tensor_set: - result_map["{}.lod".format( - name)] = result_batch_handle.get_lod(mi, name) + tmp_lod = result_batch_handle.get_lod(mi, name) + if np.size(tmp_lod) > 0: + result_map["{}.lod".format(name)] = tmp_lod multi_result_map.append(result_map) ret = None if len(model_engine_names) == 1: @@ -466,10 +483,11 @@ class MultiLangClient(object): if var.is_lod_tensor: self.lod_tensor_set_.add(var.alias_name) - def _pack_inference_request(self, feed, fetch, is_python): + def _pack_inference_request(self, feed, fetch, is_python, log_id): req = multi_lang_general_model_service_pb2.InferenceRequest() req.fetch_var_names.extend(fetch) req.is_python = is_python + req.log_id = log_id feed_batch 
= None if isinstance(feed, dict): feed_batch = [feed] @@ -602,12 +620,13 @@ class MultiLangClient(object): fetch, need_variant_tag=False, asyn=False, - is_python=True): + is_python=True, + log_id=0): if not asyn: try: self.profile_.record('py_prepro_0') req = self._pack_inference_request( - feed, fetch, is_python=is_python) + feed, fetch, is_python=is_python, log_id=log_id) self.profile_.record('py_prepro_1') self.profile_.record('py_client_infer_0') @@ -626,7 +645,8 @@ class MultiLangClient(object): except grpc.RpcError as e: return {"serving_status_code": e.code()} else: - req = self._pack_inference_request(feed, fetch, is_python=is_python) + req = self._pack_inference_request( + feed, fetch, is_python=is_python, log_id=log_id) call_future = self.stub_.Inference.future( req, timeout=self.rpc_timeout_s_) return MultiLangPredictFuture( diff --git a/python/paddle_serving_client/convert.py b/python/paddle_serving_client/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..e3cd3a05f8e09155b0c884e3ddf12b57234de3dd --- /dev/null +++ b/python/paddle_serving_client/convert.py @@ -0,0 +1,66 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Usage: + Convert a paddle inference model into a model file that can be used for Paddle Serving. + Example: + python -m paddle_serving_client.convert --dirname ./inference_model +""" +import argparse +from .io import inference_model_to_serving + + +def parse_args(): # pylint: disable=doc-string-missing + parser = argparse.ArgumentParser("convert") + parser.add_argument( + "--dirname", + type=str, + required=True, + help='Path of saved model files. Program file and parameter files are saved in this directory.' + ) + parser.add_argument( + "--serving_server", + type=str, + default="serving_server", + help='The path of model files and configuration files for server. Default: "serving_server".' + ) + parser.add_argument( + "--serving_client", + type=str, + default="serving_client", + help='The path of configuration files for client. Default: "serving_client".' + ) + parser.add_argument( + "--model_filename", + type=str, + default=None, + help='The name of file to load the inference program. If it is None, the default filename __model__ will be used' + ) + parser.add_argument( + "--params_filename", + type=str, + default=None, + help='The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.' 
+ ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + inference_model_to_serving( + args.dirname, + serving_server=args.serving_server, + serving_client=args.serving_client, + model_filename=args.model_filename, + params_filename=args.params_filename) diff --git a/python/paddle_serving_client/io/__init__.py b/python/paddle_serving_client/io/__init__.py index 69e185be3d2e4d1a579a29d30b59341bfb8666ed..2071e0d1da9e0c12cc431f2d86cfa9d71c79218f 100644 --- a/python/paddle_serving_client/io/__init__.py +++ b/python/paddle_serving_client/io/__init__.py @@ -74,7 +74,8 @@ def save_model(server_model_folder, fetch_var = model_conf.FetchVar() fetch_var.alias_name = key fetch_var.name = fetch_var_dict[key].name - fetch_var.is_lod_tensor = fetch_var_dict[key].lod_level >= 1 + #fetch_var.is_lod_tensor = fetch_var_dict[key].lod_level >= 1 + fetch_var.is_lod_tensor = 1 if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64: fetch_var.fetch_type = 0 if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32: diff --git a/python/paddle_serving_client/version.py b/python/paddle_serving_client/version.py index f7fc14b2a7f0c25b471e8d3bb44e9d6db6839d01..490ba962acf817b9e87f9699afd4b3ae8f61ad0f 100644 --- a/python/paddle_serving_client/version.py +++ b/python/paddle_serving_client/version.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Paddle Serving Client version string """ -serving_client_version = "0.3.2" -serving_server_version = "0.3.2" -module_proto_version = "0.3.2" +serving_client_version = "0.0.0" +serving_server_version = "0.0.0" +module_proto_version = "0.0.0" +commit_id = "" diff --git a/python/paddle_serving_server/__init__.py b/python/paddle_serving_server/__init__.py index 678c0583d1e132791a1199e315ea380a4ae3108b..76694bc5bb864f4c21ff3b9c2cfd07761c5adbea 100644 --- a/python/paddle_serving_server/__init__.py +++ b/python/paddle_serving_server/__init__.py @@ -103,8 +103,8 @@ class OpSeqMaker(object): elif len(node.dependencies) == 1: if node.dependencies[0].name != self.workflow.nodes[-1].name: raise Exception( - 'You must add op in order in OpSeqMaker. The previous op is {}, but the current op is followed by {}.'. - format(node.dependencies[0].name, self.workflow.nodes[ + 'You must add op in order in OpSeqMaker. The previous op is {}, but the current op is followed by {}.' 
+ .format(node.dependencies[0].name, self.workflow.nodes[ -1].name)) self.workflow.nodes.extend([node]) @@ -157,8 +157,14 @@ class Server(object): self.cur_path = os.getcwd() self.use_local_bin = False self.mkl_flag = False + self.product_name = None + self.container_id = None self.model_config_paths = None # for multi-model in a workflow + def get_fetch_list(self): + fetch_names = [var.alias_name for var in self.model_conf.fetch_var] + return fetch_names + def set_max_concurrency(self, concurrency): self.max_concurrency = concurrency @@ -191,6 +197,16 @@ class Server(object): def set_ir_optimize(self, flag=False): self.ir_optimization = flag + def set_product_name(self, product_name=None): + if product_name == None: + raise ValueError("product_name can't be None.") + self.product_name = product_name + + def set_container_id(self, container_id): + if container_id == None: + raise ValueError("container_id can't be None.") + self.container_id = container_id + def check_local_bin(self): if "SERVING_BIN" in os.environ: self.use_local_bin = True @@ -254,6 +270,10 @@ class Server(object): self.resource_conf.model_toolkit_file = self.model_toolkit_fn self.resource_conf.general_model_path = workdir self.resource_conf.general_model_file = self.general_model_config_fn + if self.product_name != None: + self.resource_conf.auth_product_name = self.product_name + if self.container_id != None: + self.resource_conf.auth_container_id = self.container_id def _write_pb_str(self, filepath, pb_obj): with open(filepath, "w") as fout: @@ -351,8 +371,8 @@ class Server(object): if os.path.exists(tar_name): os.remove(tar_name) raise SystemExit( - 'Download failed, please check your network or permission of {}.'. - format(self.module_path)) + 'Download failed, please check your network or permission of {}.' + .format(self.module_path)) else: try: print('Decompressing files ..') @@ -363,8 +383,8 @@ class Server(object): if os.path.exists(exe_path): os.remove(exe_path) raise SystemExit( - 'Decompressing failed, please check your permission of {} or disk space left.'. - format(self.module_path)) + 'Decompressing failed, please check your permission of {} or disk space left.' + .format(self.module_path)) finally: os.remove(tar_name) #release lock @@ -502,6 +522,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. feed_names = list(request.feed_var_names) fetch_names = list(request.fetch_var_names) is_python = request.is_python + log_id = request.log_id feed_batch = [] for feed_inst in request.insts: feed_dict = {} @@ -530,7 +551,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. data.shape = list(feed_inst.tensor_array[idx].shape) feed_dict[name] = data feed_batch.append(feed_dict) - return feed_batch, fetch_names, is_python + return feed_batch, fetch_names, is_python, log_id def _pack_inference_response(self, ret, fetch_names, is_python): resp = multi_lang_general_model_service_pb2.InferenceResponse() @@ -540,7 +561,6 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. results, tag = ret resp.tag = tag resp.err_code = 0 - if not self.is_multi_model_: results = {'general_infer_0': results} for model_name, model_result in results.items(): @@ -583,10 +603,13 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. 
return resp def Inference(self, request, context): - feed_dict, fetch_names, is_python = self._unpack_inference_request( - request) + feed_dict, fetch_names, is_python, log_id = \ + self._unpack_inference_request(request) ret = self.bclient_.predict( - feed=feed_dict, fetch=fetch_names, need_variant_tag=True) + feed=feed_dict, + fetch=fetch_names, + need_variant_tag=True, + log_id=log_id) return self._pack_inference_response(ret, fetch_names, is_python) def GetClientConfig(self, request, context): diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index 704cf0304adf1ac647c244063c2b23049f92b221..d282ac076e377806e9a3b320b880ffed6300b971 100644 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -58,6 +58,16 @@ def parse_args(): # pylint: disable=doc-string-missing default=False, action="store_true", help="Use Multi-language-service") + parser.add_argument( + "--product_name", + type=str, + default=None, + help="product_name for authentication") + parser.add_argument( + "--container_id", + type=str, + default=None, + help="container_id for authentication") return parser.parse_args() @@ -101,6 +111,10 @@ def start_standard_model(): # pylint: disable=doc-string-missing server.use_mkl(use_mkl) server.set_max_body_size(max_body_size) server.set_port(port) + if args.product_name != None: + server.set_product_name(args.product_name) + if args.container_id != None: + server.set_container_id(args.container_id) server.load_model_config(model) server.prepare_server(workdir=workdir, port=port, device=device) diff --git a/python/paddle_serving_server/version.py b/python/paddle_serving_server/version.py index f7fc14b2a7f0c25b471e8d3bb44e9d6db6839d01..490ba962acf817b9e87f9699afd4b3ae8f61ad0f 100644 --- a/python/paddle_serving_server/version.py +++ b/python/paddle_serving_server/version.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Paddle Serving Client version string """ -serving_client_version = "0.3.2" -serving_server_version = "0.3.2" -module_proto_version = "0.3.2" +serving_client_version = "0.0.0" +serving_server_version = "0.0.0" +module_proto_version = "0.0.0" +commit_id = "" diff --git a/python/paddle_serving_server/web_service.py b/python/paddle_serving_server/web_service.py old mode 100755 new mode 100644 index b0c1b79bda5041b4eca114d778a23d3a123c226e..9430da83ef1b3add9b79d305f03d2aef195028a6 --- a/python/paddle_serving_server/web_service.py +++ b/python/paddle_serving_server/web_service.py @@ -21,12 +21,36 @@ from paddle_serving_client import Client from contextlib import closing import socket +from paddle_serving_server import pipeline +from paddle_serving_server.pipeline import Op + class WebService(object): def __init__(self, name="default_service"): self.name = name + # pipeline + self._server = pipeline.PipelineServer(self.name) + + def get_pipeline_response(self, read_op): + return None + + def prepare_pipeline_config(self, yaml_file): + # build dag + read_op = pipeline.RequestOp() + last_op = self.get_pipeline_response(read_op) + if not isinstance(last_op, Op): + raise ValueError("The return value type of `get_pipeline_response` " + "function is not Op type, please check function " + "`get_pipeline_response`.") + response_op = pipeline.ResponseOp(input_ops=[last_op]) + self._server.set_response_op(response_op) + self._server.prepare_server(yaml_file) + + def run_service(self): + self._server.run_server() def load_model_config(self, model_config): + print("This API will be deprecated later. Please do not use it") self.model_config = model_config def _launch_rpc_service(self): @@ -63,6 +87,7 @@ class WebService(object): device="cpu", mem_optim=True, ir_optim=False): + print("This API will be deprecated later. Please do not use it") self.workdir = workdir self.port = port self.device = device @@ -91,15 +116,18 @@ class WebService(object): request.json["fetch"]) if isinstance(feed, dict) and "fetch" in feed: del feed["fetch"] + if len(feed) == 0: + raise ValueError("empty input") fetch_map = self.client.predict(feed=feed, fetch=fetch) result = self.postprocess( feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map) result = {"result": result} except ValueError as err: - result = {"result": err} + result = {"result": str(err)} return result def run_rpc_service(self): + print("This API will be deprecated later. Please do not use it") import socket localIP = socket.gethostbyname(socket.gethostname()) print("web service address:") @@ -122,7 +150,34 @@ class WebService(object): self.app_instance = app_instance + def run_debugger_service(self): + import socket + localIP = socket.gethostbyname(socket.gethostname()) + print("web service address:") + print("http://{}:{}/{}/prediction".format(localIP, self.port, + self.name)) + app_instance = Flask(__name__) + + @app_instance.before_first_request + def init(): + self._launch_local_predictor() + + service_name = "/" + self.name + "/prediction" + + @app_instance.route(service_name, methods=["POST"]) + def run(): + return self.get_prediction(request) + + self.app_instance = app_instance + + def _launch_local_predictor(self): + from paddle_serving_app.local_predict import Debugger + self.client = Debugger() + self.client.load_model_config( + "{}".format(self.model_config), gpu=False, profile=False) + def run_web_service(self): + print("This API will be deprecated later. 
Please do not use it") self.app_instance.run(host="0.0.0.0", port=self.port, threaded=False, @@ -132,9 +187,11 @@ class WebService(object): return self.app_instance def preprocess(self, feed=[], fetch=[]): + print("This API will be deprecated later. Please do not use it") return feed, fetch def postprocess(self, feed=[], fetch=[], fetch_map=None): + print("This API will be deprecated later. Please do not use it") for key in fetch_map: fetch_map[key] = fetch_map[key].tolist() return fetch_map diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py index 8dda9898a07197ae8ef46556ce0b10dd6fe79fb4..3f248ab4d8fb782b847c717faf1a1038de8fa60b 100644 --- a/python/paddle_serving_server_gpu/__init__.py +++ b/python/paddle_serving_server_gpu/__init__.py @@ -75,6 +75,16 @@ def serve_args(): help="Use Multi-language-service") parser.add_argument( "--use_trt", default=False, action="store_true", help="Use TensorRT") + parser.add_argument( + "--product_name", + type=str, + default=None, + help="product_name for authentication") + parser.add_argument( + "--container_id", + type=str, + default=None, + help="container_id for authentication") return parser.parse_args() @@ -143,8 +153,8 @@ class OpSeqMaker(object): elif len(node.dependencies) == 1: if node.dependencies[0].name != self.workflow.nodes[-1].name: raise Exception( - 'You must add op in order in OpSeqMaker. The previous op is {}, but the current op is followed by {}.'. - format(node.dependencies[0].name, self.workflow.nodes[ + 'You must add op in order in OpSeqMaker. The previous op is {}, but the current op is followed by {}.' + .format(node.dependencies[0].name, self.workflow.nodes[ -1].name)) self.workflow.nodes.extend([node]) @@ -199,6 +209,12 @@ class Server(object): self.gpuid = 0 self.use_trt = False self.model_config_paths = None # for multi-model in a workflow + self.product_name = None + self.container_id = None + + def get_fetch_list(self): + fetch_names = [var.alias_name for var in self.model_conf.fetch_var] + return fetch_names def set_max_concurrency(self, concurrency): self.max_concurrency = concurrency @@ -232,6 +248,16 @@ class Server(object): def set_ir_optimize(self, flag=False): self.ir_optimization = flag + def set_product_name(self, product_name=None): + if product_name == None: + raise ValueError("product_name can't be None.") + self.product_name = product_name + + def set_container_id(self, container_id): + if container_id == None: + raise ValueError("container_id can't be None.") + self.container_id = container_id + def check_local_bin(self): if "SERVING_BIN" in os.environ: self.use_local_bin = True @@ -309,6 +335,10 @@ class Server(object): self.resource_conf.model_toolkit_file = self.model_toolkit_fn self.resource_conf.general_model_path = workdir self.resource_conf.general_model_file = self.general_model_config_fn + if self.product_name != None: + self.resource_conf.auth_product_name = self.product_name + if self.container_id != None: + self.resource_conf.auth_container_id = self.container_id def _write_pb_str(self, filepath, pb_obj): with open(filepath, "w") as fout: @@ -400,8 +430,8 @@ class Server(object): if os.path.exists(tar_name): os.remove(tar_name) raise SystemExit( - 'Download failed, please check your network or permission of {}.'. - format(self.module_path)) + 'Download failed, please check your network or permission of {}.' 
+ .format(self.module_path)) else: try: print('Decompressing files ..') @@ -412,8 +442,8 @@ class Server(object): if os.path.exists(exe_path): os.remove(exe_path) raise SystemExit( - 'Decompressing failed, please check your permission of {} or disk space left.'. - format(self.module_path)) + 'Decompressing failed, please check your permission of {} or disk space left.' + .format(self.module_path)) finally: os.remove(tar_name) #release lock @@ -559,6 +589,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. feed_names = list(request.feed_var_names) fetch_names = list(request.fetch_var_names) is_python = request.is_python + log_id = request.log_id feed_batch = [] for feed_inst in request.insts: feed_dict = {} @@ -587,7 +618,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. data.shape = list(feed_inst.tensor_array[idx].shape) feed_dict[name] = data feed_batch.append(feed_dict) - return feed_batch, fetch_names, is_python + return feed_batch, fetch_names, is_python, log_id def _pack_inference_response(self, ret, fetch_names, is_python): resp = multi_lang_general_model_service_pb2.InferenceResponse() @@ -640,10 +671,13 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. return resp def Inference(self, request, context): - feed_dict, fetch_names, is_python = self._unpack_inference_request( - request) + feed_dict, fetch_names, is_python, log_id \ + = self._unpack_inference_request(request) ret = self.bclient_.predict( - feed=feed_dict, fetch=fetch_names, need_variant_tag=True) + feed=feed_dict, + fetch=fetch_names, + need_variant_tag=True, + log_id=log_id) return self._pack_inference_response(ret, fetch_names, is_python) def GetClientConfig(self, request, context): diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py index 7ed5dc0250d6205f1fc0c2a257d989ea389dfe47..c2b170fbeb3f9ee772e86c216fe3776f34187743 100644 --- a/python/paddle_serving_server_gpu/serve.py +++ b/python/paddle_serving_server_gpu/serve.py @@ -67,6 +67,11 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss if args.use_trt: server.set_trt() + if args.product_name != None: + server.set_product_name(args.product_name) + if args.container_id != None: + server.set_container_id(args.container_id) + server.load_model_config(model) server.prepare_server(workdir=workdir, port=port, device=device) if gpuid >= 0: @@ -85,8 +90,8 @@ def start_multi_card(args): # pylint: disable=doc-string-missing for ids in gpus: if int(ids) >= len(env_gpus): print( - " Max index of gpu_ids out of range, the number of CUDA_VISIBLE_DEVICES is {}.". - format(len(env_gpus))) + " Max index of gpu_ids out of range, the number of CUDA_VISIBLE_DEVICES is {}." + .format(len(env_gpus))) exit(-1) else: env_gpus = [] diff --git a/python/paddle_serving_server_gpu/version.py b/python/paddle_serving_server_gpu/version.py index 2272c3aa91f999697ea8ef3e2cdb585b01db8bed..b774c2237242cc488ee14ef85b1142929a3879d7 100644 --- a/python/paddle_serving_server_gpu/version.py +++ b/python/paddle_serving_server_gpu/version.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Paddle Serving Client version string """ -serving_client_version = "0.3.2" -serving_server_version = "0.3.2" -module_proto_version = "0.3.2" +serving_client_version = "0.0.0" +serving_server_version = "0.0.0" +module_proto_version = "0.0.0" cuda_version = "9" +commit_id = "" diff --git a/python/paddle_serving_server_gpu/web_service.py b/python/paddle_serving_server_gpu/web_service.py index 5e9fdf4f4fda84dfb7c4f598fae6cf2381c377ca..ea72f1869f1c0b84a088221a9770366e7432879d 100644 --- a/python/paddle_serving_server_gpu/web_service.py +++ b/python/paddle_serving_server_gpu/web_service.py @@ -24,17 +24,43 @@ import sys import numpy as np import paddle_serving_server_gpu as serving +from paddle_serving_server_gpu import pipeline +from paddle_serving_server_gpu.pipeline import Op + class WebService(object): def __init__(self, name="default_service"): self.name = name - self.gpus = [] - self.rpc_service_list = [] + # pipeline + self._server = pipeline.PipelineServer(self.name) + + self.gpus = [] # deprecated + self.rpc_service_list = [] # deprecated + + def get_pipeline_response(self, read_op): + return None + + def prepare_pipeline_config(self, yaml_file): + # build dag + read_op = pipeline.RequestOp() + last_op = self.get_pipeline_response(read_op) + if not isinstance(last_op, Op): + raise ValueError("The return value type of `get_pipeline_response` " + "function is not Op type, please check function " + "`get_pipeline_response`.") + response_op = pipeline.ResponseOp(input_ops=[last_op]) + self._server.set_response_op(response_op) + self._server.prepare_server(yaml_file) + + def run_service(self): + self._server.run_server() def load_model_config(self, model_config): + print("This API will be deprecated later. Please do not use it") self.model_config = model_config def set_gpus(self, gpus): + print("This API will be deprecated later. Please do not use it") self.gpus = [int(x) for x in gpus.split(",")] def default_rpc_service(self, @@ -88,6 +114,7 @@ class WebService(object): gpuid=0, mem_optim=True, ir_optim=False): + print("This API will be deprecated later. Please do not use it") self.workdir = workdir self.port = port self.device = device @@ -151,10 +178,11 @@ class WebService(object): feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map) result = {"result": result} except ValueError as err: - result = {"result": err} + result = {"result": str(err)} return result def run_rpc_service(self): + print("This API will be deprecated later. Please do not use it") import socket localIP = socket.gethostbyname(socket.gethostname()) print("web service address:") @@ -183,6 +211,7 @@ class WebService(object): # TODO: maybe change another API name: maybe run_local_predictor? def run_debugger_service(self, gpu=False): + print("This API will be deprecated later. Please do not use it") import socket localIP = socket.gethostbyname(socket.gethostname()) print("web service address:") @@ -209,18 +238,21 @@ class WebService(object): "{}".format(self.model_config), gpu=gpu, profile=False) def run_web_service(self): + print("This API will be deprecated later. Please do not use it") self.app_instance.run(host="0.0.0.0", port=self.port, threaded=False, - processes=1) + processes=4) def get_app_instance(self): return self.app_instance def preprocess(self, feed=[], fetch=[]): + print("This API will be deprecated later. Please do not use it") return feed, fetch def postprocess(self, feed=[], fetch=[], fetch_map=None): - for key in fetch_map.iterkeys(): + print("This API will be deprecated later. 
Please do not use it") + for key in fetch_map: fetch_map[key] = fetch_map[key].tolist() return fetch_map diff --git a/python/pipeline/__init__.py b/python/pipeline/__init__.py index f720e4d2c851cec6270d31d6d44a766acc246291..7718016c9989a3b7348c3389c86495537786abb8 100644 --- a/python/pipeline/__init__.py +++ b/python/pipeline/__init__.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from operator import Op, RequestOp, ResponseOp -from pipeline_server import PipelineServer -from pipeline_client import PipelineClient -from analyse import Analyst +from . import logger # this module must be the first to import +from .operator import Op, RequestOp, ResponseOp +from .pipeline_server import PipelineServer +from .pipeline_client import PipelineClient +from .local_rpc_service_handler import LocalRpcServiceHandler +from .analyse import Analyst diff --git a/python/pipeline/analyse.py b/python/pipeline/analyse.py index 0cb4196c53900e77f0d9ba346a6a16a264ef95de..424b7e025394467840ae77a696e42cefc5a06eed 100644 --- a/python/pipeline/analyse.py +++ b/python/pipeline/analyse.py @@ -17,7 +17,7 @@ import copy import re import logging -_LOGGER = logging.getLogger() +_LOGGER = logging.getLogger(__name__) class Analyst(object): @@ -69,7 +69,7 @@ class Analyst(object): with open(self._profile_file) as f: for line in f.readlines(): line = line.strip().split("\t") - if line[0] == "PROFILE": + if line[0] == "PROFILE" and len(line) >= 3: trace_list = self._prase_line(line[1], line[2], counter) counter += 1 for trace in trace_list: @@ -164,7 +164,7 @@ class OpAnalyst(object): def add(self, name_str, ts_list): if self._close: - _LOGGER.error("OpAnalyst is closed.") + _LOGGER.error("Failed to add item: OpAnalyst is closed.") return op_name, curr_idx, step = self._parse(name_str) if op_name not in self.op_time_list_dict: diff --git a/python/pipeline/channel.py b/python/pipeline/channel.py index bff22ad9216fb6e639eb2857e6a189dcb2643d67..51aa0d4b4c33947d85a18f613f897129f85061fd 100644 --- a/python/pipeline/channel.py +++ b/python/pipeline/channel.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# pylint: disable=doc-string-missing +from time import time as _time import threading import multiprocessing import multiprocessing.queues @@ -25,9 +26,10 @@ else: import numpy as np import logging import enum +import os import copy -_LOGGER = logging.getLogger() +_LOGGER = logging.getLogger(__name__) class ChannelDataEcode(enum.Enum): @@ -38,7 +40,8 @@ class ChannelDataEcode(enum.Enum): RPC_PACKAGE_ERROR = 4 CLIENT_ERROR = 5 CLOSED_ERROR = 6 - UNKNOW = 7 + NO_SERVICE = 7 + UNKNOW = 8 class ChannelDataType(enum.Enum): @@ -68,21 +71,25 @@ class ChannelData(object): ''' if ecode is not None: if data_id is None or error_info is None: - raise ValueError("data_id and error_info cannot be None") + _LOGGER.critical("Failed to generate ChannelData: data_id" + " and error_info cannot be None") + os._exit(-1) datatype = ChannelDataType.ERROR.value else: if datatype == ChannelDataType.CHANNEL_NPDATA.value: ecode, error_info = ChannelData.check_npdata(npdata) if ecode != ChannelDataEcode.OK.value: datatype = ChannelDataType.ERROR.value - _LOGGER.error(error_info) + _LOGGER.error("(logid={}) {}".format(data_id, error_info)) elif datatype == ChannelDataType.DICT.value: ecode, error_info = ChannelData.check_dictdata(dictdata) if ecode != ChannelDataEcode.OK.value: datatype = ChannelDataType.ERROR.value - _LOGGER.error(error_info) + _LOGGER.error("(logid={}) {}".format(data_id, error_info)) else: - raise ValueError("datatype not match") + _LOGGER.critical("(logid={}) datatype not match".format( + data_id)) + os._exit(-1) self.datatype = datatype self.npdata = npdata self.dictdata = dictdata @@ -106,14 +113,24 @@ class ChannelData(object): for sample in dictdata: if not isinstance(sample, dict): ecode = ChannelDataEcode.TYPE_ERROR.value - error_info = "the value of data must " \ - "be dict, but get {}.".format(type(sample)) + error_info = "Failed to check data: the type of " \ + "data must be dict, but get {}.".format(type(sample)) break elif not isinstance(dictdata, dict): # batch size = 1 ecode = ChannelDataEcode.TYPE_ERROR.value - error_info = "the value of data must " \ - "be dict, but get {}.".format(type(dictdata)) + error_info = "Failed to check data: the type of data must " \ + "be dict, but get {}.".format(type(dictdata)) + return ecode, error_info + + @staticmethod + def check_batch_npdata(batch): + ecode = ChannelDataEcode.OK.value + error_info = None + for npdata in batch: + ecode, error_info = ChannelData.check_npdata(npdata) + if ecode != ChannelDataEcode.OK.value: + break return ecode, error_info @staticmethod @@ -125,27 +142,30 @@ class ChannelData(object): for sample in npdata: if not isinstance(sample, dict): ecode = ChannelDataEcode.TYPE_ERROR.value - error_info = "the value of data must " \ - "be dict, but get {}.".format(type(sample)) + error_info = "Failed to check data: the " \ + "value of data must be dict, but get {}.".format( + type(sample)) break for _, value in sample.items(): if not isinstance(value, np.ndarray): ecode = ChannelDataEcode.TYPE_ERROR.value - error_info = "the value of data must " \ - "be np.ndarray, but get {}.".format(type(value)) + error_info = "Failed to check data: the" \ + " value of data must be np.ndarray, but get {}.".format( + type(value)) return ecode, error_info elif isinstance(npdata, dict): # batch_size = 1 for _, value in npdata.items(): if not isinstance(value, np.ndarray): ecode = ChannelDataEcode.TYPE_ERROR.value - error_info = "the value of data must " \ - "be np.ndarray, but get {}.".format(type(value)) + error_info = "Failed to check data: the 
value " \ + "of data must be np.ndarray, but get {}.".format( + type(value)) break else: ecode = ChannelDataEcode.TYPE_ERROR.value - error_info = "the value of data must " \ - "be dict, but get {}.".format(type(npdata)) + error_info = "Failed to check data: the value of data " \ + "must be dict, but get {}.".format(type(npdata)) return ecode, error_info def parse(self): @@ -157,9 +177,19 @@ class ChannelData(object): # return dict feed = self.dictdata else: - raise TypeError("Error type({}) in datatype.".format(self.datatype)) + _LOGGER.critical("Failed to parse channeldata: error " \ + "type({}) in datatype.".format(self.datatype)) + os._exit(-1) return feed + def __cmp__(self, other): + if self.id < other.id: + return -1 + elif self.id == other.id: + return 0 + else: + return 1 + def __str__(self): return "type[{}], ecode[{}], id[{}]".format( ChannelDataType(self.datatype).name, self.ecode, self.id) @@ -175,7 +205,7 @@ class ProcessChannel(object): Only when all types of Ops get the data of the same ID, the data will be poped; The Op of the same type will not get the data of the same ID. - 3. (TODO) Timeout and BatchSize are not fully supported. + 3. Function front support timeout param to make auto-batching. Note: 1. The ID of the data in the channel must be different. @@ -194,16 +224,15 @@ class ProcessChannel(object): maintains the data obtained from queue. """ - def __init__(self, manager, name=None, maxsize=0, timeout=None): + def __init__(self, manager, name=None, maxsize=0): # For queue multiprocess: after putting an object on # an empty queue there may be an infinitessimal delay # before the queue's :meth:`~Queue.empty` # see more: # - https://bugs.python.org/issue18277 # - https://hg.python.org/cpython/rev/860fc6a2bd21 - self._que = manager.Queue(maxsize=maxsize) + self._que = manager.PriorityQueue(maxsize=maxsize) self._maxsize = maxsize - self._timeout = timeout self.name = name self._stop = manager.Value('i', 0) @@ -219,6 +248,12 @@ class ProcessChannel(object): self._base_cursor = manager.Value('i', 0) self._output_buf = manager.list() + def get_maxsize(self): + return self._maxsize + + def size(self): + return self._que.qsize() + def get_producers(self): return self._producers @@ -228,37 +263,41 @@ class ProcessChannel(object): def _log(self, info_str): return "[{}] {}".format(self.name, info_str) - def debug(self): - return self._log("p: {}, c: {}".format(self.get_producers(), - self.get_consumers())) - def add_producer(self, op_name): """ not thread safe, and can only be called during initialization. """ if op_name in self._producers: - raise ValueError( - self._log("producer({}) is already in channel".format(op_name))) + _LOGGER.critical( + self._log("Failed to add producer: producer({})" \ + " is already in channel".format(op_name))) + os._exit(-1) self._producers.append(op_name) + _LOGGER.debug(self._log("Succ add a producer: {}".format(op_name))) def add_consumer(self, op_name): """ not thread safe, and can only be called during initialization. 
""" if op_name in self._consumer_cursors: - raise ValueError( - self._log("consumer({}) is already in channel".format(op_name))) + _LOGGER.critical( + self._log("Failed to add consumer: consumer({})" \ + " is already in channel".format(op_name))) + os._exit(-1) self._consumer_cursors[op_name] = 0 if self._cursor_count.get(0) is None: self._cursor_count[0] = 0 self._cursor_count[0] += 1 + _LOGGER.debug(self._log("Succ add a consumer: {}".format(op_name))) def push(self, channeldata, op_name=None): _LOGGER.debug( - self._log("{} try to push data: {}".format(op_name, - channeldata.__str__()))) + self._log("(logid={}) Op({}) Pushing data".format(channeldata.id, + op_name))) if len(self._producers) == 0: - raise Exception( + _LOGGER.critical( self._log( - "expected number of producers to be greater than 0, but the it is 0." - )) + "(logid={}) Op({}) Failed to push data: expected number" + " of producers to be greater than 0, but the it is 0.". + format(channeldata.id, op_name))) + os._exit(-1) elif len(self._producers) == 1: with self._cv: while self._stop.value == 0: @@ -269,23 +308,23 @@ class ProcessChannel(object): self._cv.wait() if self._stop.value == 1: raise ChannelStopError() - _LOGGER.debug( - self._log("{} channel size: {}".format(op_name, - self._que.qsize()))) self._cv.notify_all() - _LOGGER.debug(self._log("{} notify all".format(op_name))) - _LOGGER.debug(self._log("{} push data succ!".format(op_name))) + _LOGGER.debug( + self._log("(logid={}) Op({}) Pushed data into internal queue.". + format(channeldata.id, op_name))) return True elif op_name is None: - raise Exception( + _LOGGER.critical( self._log( - "There are multiple producers, so op_name cannot be None.")) + "(logid={}) Op({}) Failed to push data: there are multiple " + "producers, so op_name cannot be None.".format( + channeldata.id, op_name))) + os._exit(-1) producer_num = len(self._producers) data_id = channeldata.id put_data = None with self._cv: - _LOGGER.debug(self._log("{} get lock".format(op_name))) if data_id not in self._input_buf: self._input_buf[data_id] = { name: None @@ -307,14 +346,12 @@ class ProcessChannel(object): if put_data is None: _LOGGER.debug( - self._log("{} push data succ, but not push to queue.". - format(op_name))) + self._log( + "(logid={}) Op({}) Pushed data into input_buffer.". + format(data_id, op_name))) else: while self._stop.value == 0: try: - _LOGGER.debug( - self._log("{} push data succ: {}".format( - op_name, put_data.__str__()))) self._que.put(put_data, timeout=0) break except Queue.Empty: @@ -323,43 +360,59 @@ class ProcessChannel(object): raise ChannelStopError() _LOGGER.debug( - self._log("multi | {} push data succ!".format(op_name))) + self._log( + "(logid={}) Op({}) Pushed data into internal_queue.". + format(data_id, op_name))) self._cv.notify_all() return True - def front(self, op_name=None): - _LOGGER.debug(self._log("{} try to get data...".format(op_name))) + def front(self, op_name=None, timeout=None): + _LOGGER.debug( + self._log("Op({}) Getting data[?]; timeout(s)={}".format(op_name, + timeout))) + endtime = None + if timeout is not None: + if timeout <= 0: + timeout = None + else: + endtime = _time() + timeout + if len(self._consumer_cursors) == 0: - raise Exception( + _LOGGER.critical( self._log( - "expected number of consumers to be greater than 0, but the it is 0." 
- )) + "Op({}) Failed to get data: expected number of consumers to be " \ + "greater than 0, but the it is 0.".format(op_name))) + os._exit(-1) elif len(self._consumer_cursors) == 1: resp = None with self._cv: while self._stop.value == 0 and resp is None: try: - _LOGGER.debug( - self._log("{} try to get(with channel empty: {})". - format(op_name, self._que.empty()))) resp = self._que.get(timeout=0) break except Queue.Empty: - _LOGGER.debug( - self._log( - "{} wait for empty queue(with channel empty: {})". - format(op_name, self._que.empty()))) - self._cv.wait() + if timeout is not None: + remaining = endtime - _time() + if remaining <= 0.0: + _LOGGER.debug( + self._log("Op({}) Failed to get data: " + "timeout".format(op_name))) + raise ChannelTimeoutError() + self._cv.wait(remaining) + else: + self._cv.wait() if self._stop.value == 1: raise ChannelStopError() _LOGGER.debug( - self._log("{} get data succ: {}".format(op_name, resp.__str__( - )))) + self._log("(logid={}) Op({}) Got data".format(resp.values()[0] + .id, op_name))) return resp elif op_name is None: - raise Exception( + _LOGGER.critical( self._log( - "There are multiple consumers, so op_name cannot be None.")) + "Op({}) Failed to get data: there are multiple consumers, " + "so op_name cannot be None.".format(op_name))) + os._exit(-1) # In output_buf, different Ops (according to op_name) have different # cursors. In addition, there is a base_cursor. Their difference is @@ -376,24 +429,25 @@ class ProcessChannel(object): # it is necessary to obtain a data from queue and add it to output_buf. while self._stop.value == 0 and self._consumer_cursors[ op_name] - self._base_cursor.value >= len(self._output_buf): - _LOGGER.debug( - self._log( - "({}) B self._consumer_cursors: {}, self._base_cursor: {}, len(self._output_buf): {}". - format(op_name, self._consumer_cursors, - self._base_cursor.value, len(self._output_buf)))) try: - _LOGGER.debug( - self._log("{} try to get(with channel size: {})".format( - op_name, self._que.qsize()))) channeldata = self._que.get(timeout=0) self._output_buf.append(channeldata) - break - except Queue.Empty: _LOGGER.debug( self._log( - "{} wait for empty queue(with channel size: {})". - format(op_name, self._que.qsize()))) - self._cv.wait() + "(logid={}) Op({}) Pop ready item into output_buffer". + format(channeldata.values()[0].id, op_name))) + break + except Queue.Empty: + if timeout is not None: + remaining = endtime - _time() + if remaining <= 0.0: + _LOGGER.debug( + self._log("Op({}) Failed to get data: timeout". 
+ format(op_name))) + raise ChannelTimeoutError() + self._cv.wait(remaining) + else: + self._cv.wait() if self._stop.value == 1: raise ChannelStopError() @@ -401,7 +455,6 @@ class ProcessChannel(object): base_cursor = self._base_cursor.value data_idx = consumer_cursor - base_cursor resp = self._output_buf[data_idx] - _LOGGER.debug(self._log("{} get data: {}".format(op_name, resp))) self._cursor_count[consumer_cursor] -= 1 if consumer_cursor == base_cursor and self._cursor_count[ @@ -413,6 +466,7 @@ class ProcessChannel(object): self._base_cursor.value += 1 # to avoid cursor overflow if self._base_cursor.value >= self._reset_max_cursor: + _LOGGER.info(self._log("Reset cursor in Channel")) self._base_cursor.value -= self._reset_max_cursor for name in self._consumer_cursors.keys(): self._consumer_cursors[name] -= self._reset_max_cursor @@ -430,25 +484,21 @@ class ProcessChannel(object): self._cursor_count[new_consumer_cursor] = 0 self._cursor_count[new_consumer_cursor] += 1 - _LOGGER.debug( - self._log( - "({}) A self._consumer_cursors: {}, self._base_cursor: {}, len(self._output_buf): {}". - format(op_name, self._consumer_cursors, - self._base_cursor.value, len(self._output_buf)))) - _LOGGER.debug(self._log("{} notify all".format(op_name))) self._cv.notify_all() - _LOGGER.debug(self._log("multi | {} get data succ!".format(op_name))) - return resp # reference, read only + _LOGGER.debug( + self._log("(logid={}) Op({}) Got data from output_buffer".format( + resp.values()[0].id, op_name))) + return resp def stop(self): - _LOGGER.debug(self._log("stop.")) + _LOGGER.info(self._log("stop.")) self._stop.value = 1 with self._cv: self._cv.notify_all() -class ThreadChannel(Queue.Queue): +class ThreadChannel(Queue.PriorityQueue): """ (Thread version)The channel used for communication between Ops. @@ -458,7 +508,7 @@ class ThreadChannel(Queue.Queue): Only when all types of Ops get the data of the same ID, the data will be poped; The Op of the same type will not get the data of the same ID. - 3. (TODO) Timeout and BatchSize are not fully supported. + 3. Function front support timeout param to make auto-batching. Note: 1. The ID of the data in the channel must be different. @@ -477,10 +527,9 @@ class ThreadChannel(Queue.Queue): maintains the data obtained from queue. """ - def __init__(self, name=None, maxsize=-1, timeout=None): + def __init__(self, name=None, maxsize=-1): Queue.Queue.__init__(self, maxsize=maxsize) self._maxsize = maxsize - self._timeout = timeout self.name = name self._stop = False @@ -496,6 +545,12 @@ class ThreadChannel(Queue.Queue): self._base_cursor = 0 self._output_buf = [] + def get_maxsize(self): + return self._maxsize + + def size(self): + return self.qsize() + def get_producers(self): return self._producers @@ -505,37 +560,41 @@ class ThreadChannel(Queue.Queue): def _log(self, info_str): return "[{}] {}".format(self.name, info_str) - def debug(self): - return self._log("p: {}, c: {}".format(self.get_producers(), - self.get_consumers())) - def add_producer(self, op_name): """ not thread safe, and can only be called during initialization. 
""" if op_name in self._producers: - raise ValueError( - self._log("producer({}) is already in channel".format(op_name))) + _LOGGER.critical( + self._log("Failed to add producer: producer({}) is " + "already in channel".format(op_name))) + os._exit(-1) self._producers.append(op_name) + _LOGGER.debug(self._log("Succ add a producer: {}".format(op_name))) def add_consumer(self, op_name): """ not thread safe, and can only be called during initialization. """ if op_name in self._consumer_cursors: - raise ValueError( - self._log("consumer({}) is already in channel".format(op_name))) + _LOGGER.critical( + self._log("Failed to add consumer: consumer({}) is " + "already in channel".format(op_name))) + os._exit(-1) self._consumer_cursors[op_name] = 0 if self._cursor_count.get(0) is None: self._cursor_count[0] = 0 self._cursor_count[0] += 1 + _LOGGER.debug(self._log("Succ add a consumer: {}".format(op_name))) def push(self, channeldata, op_name=None): _LOGGER.debug( - self._log("{} try to push data: {}".format(op_name, - channeldata.__str__()))) + self._log("(logid={}) Op({}) Pushing data".format(channeldata.id, + op_name))) if len(self._producers) == 0: - raise Exception( + _LOGGER.critical( self._log( - "expected number of producers to be greater than 0, but the it is 0." - )) + "(logid={}) Op({}) Failed to push data: expected number of " + "producers to be greater than 0, but the it is 0.".format( + channeldata.id, op_name))) + os._exit(-1) elif len(self._producers) == 1: with self._cv: while self._stop is False: @@ -547,18 +606,22 @@ class ThreadChannel(Queue.Queue): if self._stop: raise ChannelStopError() self._cv.notify_all() - _LOGGER.debug(self._log("{} push data succ!".format(op_name))) + _LOGGER.debug( + self._log("(logid={}) Op({}) Pushed data into internal_queue.". + format(channeldata.id, op_name))) return True elif op_name is None: - raise Exception( + _LOGGER.critical( self._log( - "There are multiple producers, so op_name cannot be None.")) + "(logid={}) Op({}) Failed to push data: there are multiple" + " producers, so op_name cannot be None.".format( + channeldata.id, op_name))) + os._exit(-1) producer_num = len(self._producers) data_id = channeldata.id put_data = None with self._cv: - _LOGGER.debug(self._log("{} get lock".format(op_name))) if data_id not in self._input_buf: self._input_buf[data_id] = { name: None @@ -575,8 +638,9 @@ class ThreadChannel(Queue.Queue): if put_data is None: _LOGGER.debug( - self._log("{} push data succ, but not push to queue.". - format(op_name))) + self._log( + "(logid={}) Op({}) Pushed data into input_buffer.". + format(data_id, op_name))) else: while self._stop is False: try: @@ -588,17 +652,29 @@ class ThreadChannel(Queue.Queue): raise ChannelStopError() _LOGGER.debug( - self._log("multi | {} push data succ!".format(op_name))) + self._log( + "(logid={}) Op({}) Pushed data into internal_queue.". + format(data_id, op_name))) self._cv.notify_all() return True - def front(self, op_name=None): - _LOGGER.debug(self._log("{} try to get data".format(op_name))) + def front(self, op_name=None, timeout=None): + _LOGGER.debug( + self._log("Op({}) Getting data[?]; timeout(s)={}".format(op_name, + timeout))) + endtime = None + if timeout is not None: + if timeout <= 0: + timeout = None + else: + endtime = _time() + timeout + if len(self._consumer_cursors) == 0: - raise Exception( + _LOGGER.critical( self._log( - "expected number of consumers to be greater than 0, but the it is 0." 
- )) + "Op({}) Failed to get data: expected number of consumers to be " + "greater than 0, but the it is 0.".format(op_name))) + os._exit(-1) elif len(self._consumer_cursors) == 1: resp = None with self._cv: @@ -607,17 +683,29 @@ class ThreadChannel(Queue.Queue): resp = self.get(timeout=0) break except Queue.Empty: - self._cv.wait() + if timeout is not None: + remaining = endtime - _time() + if remaining <= 0.0: + _LOGGER.debug( + self._log( + "Op({}) Failed to get data: timeout". + format(op_name))) + raise ChannelTimeoutError() + self._cv.wait(remaining) + else: + self._cv.wait() if self._stop: raise ChannelStopError() _LOGGER.debug( - self._log("{} get data succ: {}".format(op_name, resp.__str__( - )))) + self._log("(logid={}) Op({}) Got data".format(resp.values()[0] + .id, op_name))) return resp elif op_name is None: - raise Exception( - self._log( - "There are multiple consumers, so op_name cannot be None.")) + _LOGGER.critical( + self._log("Op({}) Failed to get data: there are multiple " + "consumers, so op_name cannot be None.".format( + op_name))) + os._exit(-1) # In output_buf, different Ops (according to op_name) have different # cursors. In addition, there is a base_cursor. Their difference is @@ -637,9 +725,22 @@ class ThreadChannel(Queue.Queue): try: channeldata = self.get(timeout=0) self._output_buf.append(channeldata) + _LOGGER.debug( + self._log( + "(logid={}) Op({}) Pop ready item into output_buffer". + format(channeldata.values()[0].id, op_name))) break except Queue.Empty: - self._cv.wait() + if timeout is not None: + remaining = endtime - _time() + if remaining <= 0.0: + _LOGGER.debug( + self._log("Op({}) Failed to get data: timeout". + format(op_name))) + raise ChannelTimeoutError() + self._cv.wait(remaining) + else: + self._cv.wait() if self._stop: raise ChannelStopError() @@ -659,6 +760,7 @@ class ThreadChannel(Queue.Queue): self._base_cursor += 1 # to avoid cursor overflow if self._base_cursor >= self._reset_max_cursor: + _LOGGER.info(self._log("Reset cursor in Channel")) self._base_cursor -= self._reset_max_cursor for name in self._consumer_cursors: self._consumer_cursors[name] -= self._reset_max_cursor @@ -668,7 +770,6 @@ class ThreadChannel(Queue.Queue): } else: resp = copy.deepcopy(self._output_buf[data_idx]) - _LOGGER.debug(self._log("{} get data: {}".format(op_name, resp))) self._consumer_cursors[op_name] += 1 new_consumer_cursor = self._consumer_cursors[op_name] @@ -678,16 +779,23 @@ class ThreadChannel(Queue.Queue): self._cv.notify_all() - _LOGGER.debug(self._log("multi | {} get data succ!".format(op_name))) + _LOGGER.debug( + self._log("(logid={}) Op({}) Got data from output_buffer".format( + resp.values()[0].id, op_name))) return resp def stop(self): - _LOGGER.debug(self._log("stop.")) + _LOGGER.info(self._log("stop.")) self._stop = True with self._cv: self._cv.notify_all() +class ChannelTimeoutError(RuntimeError): + def __init__(self): + pass + + class ChannelStopError(RuntimeError): def __init__(self): pass diff --git a/python/pipeline/dag.py b/python/pipeline/dag.py index d965b8f18921bd8f617cf42bcbc9dae6e7d4a869..272071f3211ed6029e5ba757da5ee2c780681ac2 100644 --- a/python/pipeline/dag.py +++ b/python/pipeline/dag.py @@ -24,42 +24,45 @@ else: raise Exception("Error Python version") import os import logging +import collections from .operator import Op, RequestOp, ResponseOp, VirtualOp from .channel import (ThreadChannel, ProcessChannel, ChannelData, ChannelDataEcode, ChannelDataType, ChannelStopError) -from .profiler import TimeProfiler -from .util import 
NameGenerator +from .profiler import TimeProfiler, PerformanceTracer +from .util import NameGenerator, ThreadIdGenerator, PipelineProcSyncManager +from .proto import pipeline_service_pb2 -_LOGGER = logging.getLogger() +_LOGGER = logging.getLogger(__name__) class DAGExecutor(object): - def __init__(self, response_op, dag_config, show_info): - self._retry = dag_config.get('retry', 1) + def __init__(self, response_op, server_conf, worker_idx): + build_dag_each_worker = server_conf["build_dag_each_worker"] + server_worker_num = server_conf["worker_num"] + dag_conf = server_conf["dag"] - client_type = dag_config.get('client_type', 'brpc') - self._server_use_profile = dag_config.get('use_profile', False) - channel_size = dag_config.get('channel_size', 0) - self._is_thread_op = dag_config.get('is_thread_op', True) + self._retry = dag_conf["retry"] + client_type = dag_conf["client_type"] + self._server_use_profile = dag_conf["use_profile"] + channel_size = dag_conf["channel_size"] + self._is_thread_op = dag_conf["is_thread_op"] - if show_info and self._server_use_profile: - _LOGGER.info("================= PROFILER ================") - if self._is_thread_op: - _LOGGER.info("op: thread") - _LOGGER.info("profile mode: sync") - else: - _LOGGER.info("op: process") - _LOGGER.info("profile mode: asyn") - _LOGGER.info("-------------------------------------------") + tracer_conf = dag_conf["tracer"] + tracer_interval_s = tracer_conf["interval_s"] - self.name = "@G" + self.name = "@DAGExecutor" self._profiler = TimeProfiler() self._profiler.enable(True) + self._tracer = None + if tracer_interval_s >= 1: + self._tracer = PerformanceTracer( + self._is_thread_op, tracer_interval_s, server_worker_num) + self._dag = DAG(self.name, response_op, self._server_use_profile, self._is_thread_op, client_type, channel_size, - show_info) + build_dag_each_worker, self._tracer) (in_channel, out_channel, pack_rpc_func, unpack_rpc_func) = self._dag.build() self._dag.start() @@ -69,15 +72,23 @@ class DAGExecutor(object): self._pack_rpc_func = pack_rpc_func self._unpack_rpc_func = unpack_rpc_func - _LOGGER.debug(self._log(in_channel.debug())) - _LOGGER.debug(self._log(out_channel.debug())) + if self._tracer is not None: + self._tracer.start() + + # generate id: data_id == request_id == log_id + base_counter = 0 + gen_id_step = 1 + if build_dag_each_worker: + base_counter = worker_idx + gen_id_step = server_worker_num + self._id_generator = ThreadIdGenerator( + max_id=1000000000000000000, + base_counter=base_counter, + step=gen_id_step) - self._id_lock = threading.Lock() - self._id_counter = 0 - self._reset_max_id = 1000000000000000000 self._cv_pool = {} self._cv_for_cv_pool = threading.Condition() - self._fetch_buffer = None + self._fetch_buffer = {} self._recive_func = None self._client_profile_key = "pipeline.profile" @@ -86,32 +97,38 @@ class DAGExecutor(object): def start(self): self._recive_func = threading.Thread( target=DAGExecutor._recive_out_channel_func, args=(self, )) + self._recive_func.daemon = True self._recive_func.start() + _LOGGER.debug("[DAG Executor] Start recive thread") def stop(self): self._dag.stop() self._dag.join() + _LOGGER.info("[DAG Executor] Stop") def _get_next_data_id(self): - with self._id_lock: - if self._id_counter >= self._reset_max_id: - self._id_counter -= self._reset_max_id - self._id_counter += 1 - return self._id_counter - 1 + data_id = self._id_generator.next() + cond_v = threading.Condition() + with self._cv_for_cv_pool: + self._cv_pool[data_id] = cond_v + self._fetch_buffer[data_id] = 
None + return data_id, cond_v def _set_in_channel(self, in_channel): if not isinstance(in_channel, (ThreadChannel, ProcessChannel)): - raise TypeError( - self._log('in_channel must be Channel type, but get {}'.format( - type(in_channel)))) + _LOGGER.critical("[DAG Executor] Failed to set in_channel: " + "in_channel must be Channel type, but get {}". + format(type(in_channel))) + os._exit(-1) in_channel.add_producer(self.name) self._in_channel = in_channel def _set_out_channel(self, out_channel): if not isinstance(out_channel, (ThreadChannel, ProcessChannel)): - raise TypeError( - self._log('out_channel must be Channel type, but get {}'.format( - type(out_channel)))) + _LOGGER.critical("[DAG Executor] Failed to set out_channel: " + "must be Channel type, but get {}".format( + type(out_channel))) + os._exit(-1) out_channel.add_consumer(self.name) self._out_channel = out_channel @@ -121,7 +138,7 @@ class DAGExecutor(object): try: channeldata_dict = self._out_channel.front(self.name) except ChannelStopError: - _LOGGER.debug(self._log("stop.")) + _LOGGER.info("[DAG Executor] Stop.") with self._cv_for_cv_pool: for data_id, cv in self._cv_pool.items(): closed_errror_data = ChannelData( @@ -129,46 +146,61 @@ class DAGExecutor(object): error_info="dag closed.", data_id=data_id) with cv: - self._fetch_buffer = closed_errror_data + self._fetch_buffer[data_id] = closed_errror_data cv.notify_all() break if len(channeldata_dict) != 1: - _LOGGER.error("out_channel cannot have multiple input ops") + _LOGGER.critical( + "[DAG Executor] Failed to fetch result: out_channel " + "cannot have multiple input ops") os._exit(-1) (_, channeldata), = channeldata_dict.items() if not isinstance(channeldata, ChannelData): - raise TypeError( - self._log('data must be ChannelData type, but get {}'. 
- format(type(channeldata)))) + _LOGGER.critical( + '[DAG Executor] Failed to fetch result: data in out_channel" \ + " must be ChannelData type, but get {}' + .format(type(channeldata))) + os._exit(-1) data_id = channeldata.id - _LOGGER.debug("recive thread fetch data: {}".format(data_id)) + _LOGGER.debug("(logid={}) [recive thread] Fetched data".format( + data_id)) with self._cv_for_cv_pool: - cv = self._cv_pool[data_id] - with cv: - self._fetch_buffer = channeldata - cv.notify_all() - - def _get_channeldata_from_fetch_buffer(self, data_id): - resp = None - cv = threading.Condition() - with self._cv_for_cv_pool: - self._cv_pool[data_id] = cv - with cv: - cv.wait() - _LOGGER.debug("resp func get lock (data_id: {})".format(data_id)) - resp = copy.deepcopy(self._fetch_buffer) - with self._cv_for_cv_pool: - self._cv_pool.pop(data_id) - return resp + cond_v = self._cv_pool[data_id] + with cond_v: + self._fetch_buffer[data_id] = channeldata + cond_v.notify_all() + + def _get_channeldata_from_fetch_buffer(self, data_id, cond_v): + ready_data = None + + with cond_v: + with self._cv_for_cv_pool: + if self._fetch_buffer[data_id] is not None: + # The requested data is already ready + ready_data = self._fetch_buffer[data_id] + self._cv_pool.pop(data_id) + self._fetch_buffer.pop(data_id) + if ready_data is None: + # Wait for data ready + cond_v.wait() + with self._cv_for_cv_pool: + ready_data = self._fetch_buffer[data_id] + self._cv_pool.pop(data_id) + self._fetch_buffer.pop(data_id) + _LOGGER.debug("(logid={}) [resp thread] Got data".format(data_id)) + return ready_data def _pack_channeldata(self, rpc_request, data_id): - _LOGGER.debug(self._log('start inferce')) dictdata = None try: dictdata = self._unpack_rpc_func(rpc_request) except Exception as e: + _LOGGER.error( + "(logid={}) Failed to parse RPC request package: {}" + .format(data_id, e), + exc_info=True) return ChannelData( ecode=ChannelDataEcode.RPC_PACKAGE_ERROR.value, error_info="rpc package error: {}".format(e), @@ -181,96 +213,132 @@ class DAGExecutor(object): if key == self._client_profile_key: profile_value = rpc_request.value[idx] break + client_need_profile = (profile_value == self._client_profile_value) + _LOGGER.debug("(logid={}) Need profile in client: {}".format( + data_id, client_need_profile)) return ChannelData( datatype=ChannelDataType.DICT.value, dictdata=dictdata, data_id=data_id, - client_need_profile=( - profile_value == self._client_profile_value)) + client_need_profile=client_need_profile) def call(self, rpc_request): - data_id = self._get_next_data_id() + if self._tracer is not None: + trace_buffer = self._tracer.data_buffer() + + data_id, cond_v = self._get_next_data_id() + _LOGGER.info("(logid={}) Succ generate id".format(data_id)) + + start_call, end_call = None, None if not self._is_thread_op: - self._profiler.record("call_{}#DAG-{}_0".format(data_id, data_id)) + start_call = self._profiler.record("call_{}#DAG-{}_0".format( + data_id, data_id)) else: - self._profiler.record("call_{}#DAG_0".format(data_id)) + start_call = self._profiler.record("call_{}#DAG_0".format(data_id)) + _LOGGER.debug("(logid={}) Parsing RPC request package".format(data_id)) self._profiler.record("prepack_{}#{}_0".format(data_id, self.name)) req_channeldata = self._pack_channeldata(rpc_request, data_id) self._profiler.record("prepack_{}#{}_1".format(data_id, self.name)) resp_channeldata = None for i in range(self._retry): - _LOGGER.debug(self._log('push data')) - #self._profiler.record("push_{}#{}_0".format(data_id, self.name)) + 
_LOGGER.debug("(logid={}) Pushing data into Graph engine".format( + data_id)) try: self._in_channel.push(req_channeldata, self.name) except ChannelStopError: - _LOGGER.debug(self._log("stop.")) + _LOGGER.debug("[DAG Executor] Stop") + with self._cv_for_cv_pool: + self._cv_pool.pop(data_id) return self._pack_for_rpc_resp( ChannelData( ecode=ChannelDataEcode.CLOSED_ERROR.value, error_info="dag closed.", data_id=data_id)) - #self._profiler.record("push_{}#{}_1".format(data_id, self.name)) - _LOGGER.debug(self._log('wait for infer')) - #self._profiler.record("fetch_{}#{}_0".format(data_id, self.name)) - resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id) - #self._profiler.record("fetch_{}#{}_1".format(data_id, self.name)) + _LOGGER.debug("(logid={}) Wait for Graph engine...".format(data_id)) + resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id, + cond_v) if resp_channeldata.ecode == ChannelDataEcode.OK.value: + _LOGGER.info("(logid={}) Succ predict".format(data_id)) break + else: + _LOGGER.error("(logid={}) Failed to predict: {}" + .format(data_id, resp_channeldata.error_info)) + if resp_channeldata.ecode != ChannelDataEcode.TIMEOUT.value: + break + if i + 1 < self._retry: - _LOGGER.warn("retry({}): {}".format( - i + 1, resp_channeldata.error_info)) + _LOGGER.warning("(logid={}) DAGExecutor retry({}/{})".format( + data_id, i + 1, self._retry)) + _LOGGER.debug("(logid={}) Packing RPC response package".format(data_id)) self._profiler.record("postpack_{}#{}_0".format(data_id, self.name)) rpc_resp = self._pack_for_rpc_resp(resp_channeldata) self._profiler.record("postpack_{}#{}_1".format(data_id, self.name)) if not self._is_thread_op: - self._profiler.record("call_{}#DAG-{}_1".format(data_id, data_id)) + end_call = self._profiler.record("call_{}#DAG-{}_1".format(data_id, + data_id)) else: - self._profiler.record("call_{}#DAG_1".format(data_id)) - #self._profiler.print_profile() + end_call = self._profiler.record("call_{}#DAG_1".format(data_id)) + + if self._tracer is not None: + trace_buffer.put({ + "name": "DAG", + "id": data_id, + "succ": resp_channeldata.ecode == ChannelDataEcode.OK.value, + "actions": { + "call_{}".format(data_id): end_call - start_call, + }, + }) profile_str = self._profiler.gen_profile_str() if self._server_use_profile: sys.stderr.write(profile_str) # add profile info into rpc_resp - profile_value = "" if resp_channeldata.client_need_profile: profile_set = resp_channeldata.profile_data_set profile_set.add(profile_str) profile_value = "".join(list(profile_set)) - rpc_resp.key.append(self._client_profile_key) - rpc_resp.value.append(profile_value) + rpc_resp.key.append(self._client_profile_key) + rpc_resp.value.append(profile_value) return rpc_resp def _pack_for_rpc_resp(self, channeldata): - _LOGGER.debug(self._log('get channeldata')) - return self._pack_rpc_func(channeldata) - - def _log(self, info_str): - return "[{}] {}".format(self.name, info_str) + try: + return self._pack_rpc_func(channeldata) + except Exception as e: + _LOGGER.error( + "(logid={}) Failed to pack RPC response package: {}" + .format(channeldata.id, e), + exc_info=True) + resp = pipeline_service_pb2.Response() + resp.ecode = ChannelDataEcode.RPC_PACKAGE_ERROR.value + resp.error_info = "rpc package error: {}".format(e) + return resp class DAG(object): def __init__(self, request_name, response_op, use_profile, is_thread_op, - client_type, channel_size, show_info): + client_type, channel_size, build_dag_each_worker, tracer): self._request_name = request_name self._response_op 
= response_op self._use_profile = use_profile self._is_thread_op = is_thread_op self._channel_size = channel_size self._client_type = client_type - self._show_info = show_info + self._build_dag_each_worker = build_dag_each_worker + self._tracer = tracer if not self._is_thread_op: - self._manager = multiprocessing.Manager() + self._manager = PipelineProcSyncManager() + _LOGGER.info("[DAG] Succ init") - def get_use_ops(self, response_op): + @staticmethod + def get_use_ops(response_op): unique_names = set() used_ops = set() succ_ops_of_use_op = {} # {op_name: succ_ops} @@ -288,8 +356,10 @@ class DAG(object): used_ops.add(pred_op) # check the name of op is globally unique if pred_op.name in unique_names: - raise Exception("the name of Op must be unique: {}". - format(pred_op.name)) + _LOGGER.critical("Failed to get used Ops: the" + " name of Op must be unique: {}". + format(pred_op.name)) + os._exit(-1) unique_names.add(pred_op.name) return used_ops, succ_ops_of_use_op @@ -301,10 +371,13 @@ class DAG(object): else: channel = ProcessChannel( self._manager, name=name_gen.next(), maxsize=self._channel_size) + _LOGGER.debug("[DAG] Generate channel: {}".format(channel.name)) return channel def _gen_virtual_op(self, name_gen): - return VirtualOp(name=name_gen.next()) + vir_op = VirtualOp(name=name_gen.next()) + _LOGGER.debug("[DAG] Generate virtual_op: {}".format(vir_op.name)) + return vir_op def _topo_sort(self, used_ops, response_op, out_degree_ops): out_degree_num = { @@ -318,7 +391,9 @@ class DAG(object): if len(op.get_input_ops()) == 0: zero_indegree_num += 1 if zero_indegree_num != 1: - raise Exception("DAG contains multiple input Ops") + _LOGGER.critical("Failed to topo sort: DAG contains " + "multiple RequestOps") + os._exit(-1) last_op = response_op.get_input_ops()[0] ques[que_idx].put(last_op) @@ -342,37 +417,47 @@ class DAG(object): break que_idx = (que_idx + 1) % 2 if sorted_op_num < len(used_ops): - raise Exception("not legal DAG") + _LOGGER.critical("Failed to topo sort: not legal DAG") + os._exit(-1) return dag_views, last_op def _build_dag(self, response_op): if response_op is None: - raise Exception("response_op has not been set.") - used_ops, out_degree_ops = self.get_use_ops(response_op) - if self._show_info: + _LOGGER.critical("Failed to build DAG: ResponseOp" + " has not been set.") + os._exit(-1) + used_ops, out_degree_ops = DAG.get_use_ops(response_op) + if not self._build_dag_each_worker: _LOGGER.info("================= USED OP =================") for op in used_ops: - if op.name != self._request_name: + if not isinstance(op, RequestOp): _LOGGER.info(op.name) _LOGGER.info("-------------------------------------------") if len(used_ops) <= 1: - raise Exception( - "Besides RequestOp and ResponseOp, there should be at least one Op in DAG." 
- ) + _LOGGER.critical( + "Failed to build DAG: besides RequestOp and ResponseOp, " + "there should be at least one Op in DAG.") + os._exit(-1) + if self._build_dag_each_worker: + _LOGGER.info("Because `build_dag_each_worker` mode is used, " + "Auto-batching is set to the default config: " + "batch_size=1, auto_batching_timeout=None") + for op in used_ops: + op.use_default_auto_batching_config() dag_views, last_op = self._topo_sort(used_ops, response_op, out_degree_ops) dag_views = list(reversed(dag_views)) - if self._show_info: - _LOGGER.info("================== DAG ====================") + if not self._build_dag_each_worker: + _LOGGER.debug("================== DAG ====================") for idx, view in enumerate(dag_views): - _LOGGER.info("(VIEW {})".format(idx)) + _LOGGER.debug("(VIEW {})".format(idx)) for op in view: - _LOGGER.info(" [{}]".format(op.name)) + _LOGGER.debug(" [{}]".format(op.name)) for out_op in out_degree_ops[op.name]: - _LOGGER.info(" - {}".format(out_op.name)) - _LOGGER.info("-------------------------------------------") + _LOGGER.debug(" - {}".format(out_op.name)) + _LOGGER.debug("-------------------------------------------") # create channels and virtual ops virtual_op_name_gen = NameGenerator("vir") @@ -414,7 +499,6 @@ class DAG(object): continue channel = self._gen_channel(channel_name_gen) channels.append(channel) - _LOGGER.debug("{} => {}".format(channel.name, op.name)) op.add_input_channel(channel) pred_ops = pred_op_of_next_view_op[op.name] if v_idx == 0: @@ -422,8 +506,6 @@ class DAG(object): else: # if pred_op is virtual op, it will use ancestors as producers to channel for pred_op in pred_ops: - _LOGGER.debug("{} => {}".format(pred_op.name, - channel.name)) pred_op.add_output_channel(channel) processed_op.add(op.name) # find same input op to combine channel @@ -439,8 +521,6 @@ class DAG(object): same_flag = False break if same_flag: - _LOGGER.debug("{} => {}".format(channel.name, - other_op.name)) other_op.add_input_channel(channel) processed_op.add(other_op.name) output_channel = self._gen_channel(channel_name_gen) @@ -458,14 +538,19 @@ class DAG(object): actual_ops.append(op) for c in channels: - _LOGGER.debug(c.debug()) + _LOGGER.debug("Channel({}):\n\t- producers: {}\n\t- consumers: {}" + .format(c.name, c.get_producers(), c.get_consumers())) return (actual_ops, channels, input_channel, output_channel, pack_func, unpack_func) + def get_channels(self): + return self._channels + def build(self): (actual_ops, channels, input_channel, output_channel, pack_func, unpack_func) = self._build_dag(self._response_op) + _LOGGER.info("[DAG] Succ build DAG") self._actual_ops = actual_ops self._channels = channels @@ -474,18 +559,24 @@ class DAG(object): self._pack_func = pack_func self._unpack_func = unpack_func + if self._tracer is not None: + self._tracer.set_channels(self._channels) + return self._input_channel, self._output_channel, self._pack_func, self._unpack_func def start(self): self._threads_or_proces = [] for op in self._actual_ops: op.use_profiler(self._use_profile) + op.set_tracer(self._tracer) if self._is_thread_op: self._threads_or_proces.extend( op.start_with_thread(self._client_type)) else: self._threads_or_proces.extend( op.start_with_process(self._client_type)) + _LOGGER.info("[DAG] start") + # not join yet return self._threads_or_proces diff --git a/python/paddle_serving_server_gpu/gen_cuda_version.py b/python/pipeline/gateway/__init__.py similarity index 63% rename from python/paddle_serving_server_gpu/gen_cuda_version.py rename to 
python/pipeline/gateway/__init__.py index 4a320a0e4dd9f9145a2c7682d5eecb7f582862b5..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/python/paddle_serving_server_gpu/gen_cuda_version.py +++ b/python/pipeline/gateway/__init__.py @@ -11,17 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import sys -import re -import os - -new_str = "" -with open("paddle_serving_server_gpu/version.py", "r") as f: - for line in f.readlines(): - if re.match("cuda_version", line): - line = re.sub(r"\d+", sys.argv[1], line) - new_str = new_str + line - -with open("paddle_serving_server_gpu/version.py", "w") as f: - f.write(new_str) diff --git a/python/pipeline/gateway/proto/gateway.proto b/python/pipeline/gateway/proto/gateway.proto new file mode 100644 index 0000000000000000000000000000000000000000..9d3d501d06acf731231504a0ba97e89c72519ae4 --- /dev/null +++ b/python/pipeline/gateway/proto/gateway.proto @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; +package baidu.paddle_serving.pipeline_serving; +option go_package = ".;pipeline_serving"; + +import "google/api/annotations.proto"; + +message Response { + repeated string key = 1; + repeated string value = 2; + int32 ecode = 3; + string error_info = 4; +}; + +message Request { + repeated string key = 1; + repeated string value = 2; + string name = 3; +} + +service PipelineService { + rpc inference(Request) returns (Response) { + option (google.api.http) = { + post : "/{name=*}/prediction" + body : "*" + }; + } +}; diff --git a/python/pipeline/gateway/proxy_server.go b/python/pipeline/gateway/proxy_server.go new file mode 100644 index 0000000000000000000000000000000000000000..a74e798463b58efe26ab027c649a07131d4bbf32 --- /dev/null +++ b/python/pipeline/gateway/proxy_server.go @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
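The `gateway.proto` service added above binds the gRPC `inference` method to an HTTP route (`POST /{name=*}/prediction`), and the Go proxy that follows forwards such HTTP requests to the PipelineService gRPC port. A minimal sketch of calling that route from Python, assuming the gateway's HTTP port is 18082 and the pipeline is named `uci` (the port, service name, feed key and value are all placeholders, not values taken from this patch):

```python
import json
import requests  # third-party HTTP client, used here only for illustration

# Request/Response mirror the proto messages above: repeated string `key`/`value`,
# with `name` bound from the URL path by the grpc-gateway http rule.
url = "http://127.0.0.1:18082/uci/prediction"   # placeholder port and service name
payload = {"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692"]}
resp = requests.post(url, data=json.dumps(payload))
print(resp.json())  # expected fields: key, value, ecode, error_info
```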
+ +package main + +import ( + "C" + "flag" + "net/http" + "log" + "strconv" + + "golang.org/x/net/context" + "github.com/grpc-ecosystem/grpc-gateway/runtime" + "google.golang.org/grpc" + + gw "./proto" +) + +//export run_proxy_server +func run_proxy_server(grpc_port int, http_port int) error { + var ( + pipelineEndpoint = flag.String("pipeline_endpoint", "localhost:" + strconv.Itoa(grpc_port), "endpoint of PipelineService") + ) + + ctx := context.Background() + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + mux := runtime.NewServeMux() + opts := []grpc.DialOption{grpc.WithInsecure()} + err := gw.RegisterPipelineServiceHandlerFromEndpoint(ctx, mux, *pipelineEndpoint, opts) + if err != nil { + return err + } + + log.Println("start proxy service") + return http.ListenAndServe(":" + strconv.Itoa(http_port), mux) // proxy port +} + +func main() {} diff --git a/python/pipeline/local_rpc_service_handler.py b/python/pipeline/local_rpc_service_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..376fcaf13af4e5a51ccf3ee6a1bd06a474a33bbd --- /dev/null +++ b/python/pipeline/local_rpc_service_handler.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import logging +import multiprocessing +try: + from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server + PACKAGE_VERSION = "GPU" +except ImportError: + from paddle_serving_server import OpMaker, OpSeqMaker, Server + PACKAGE_VERSION = "CPU" +from . import util + +_LOGGER = logging.getLogger(__name__) +_workdir_name_gen = util.NameGenerator("workdir_") + + +class LocalRpcServiceHandler(object): + def __init__(self, + model_config, + workdir="", + thread_num=2, + devices="", + mem_optim=True, + ir_optim=False, + available_port_generator=None): + if available_port_generator is None: + available_port_generator = util.GetAvailablePortGenerator() + + self._model_config = model_config + self._port_list = [] + if devices == "": + # cpu + devices = [-1] + self._port_list.append(available_port_generator.next()) + _LOGGER.info("Model({}) will be launch in cpu device. Port({})" + .format(model_config, self._port_list)) + else: + # gpu + if PACKAGE_VERSION == "CPU": + raise ValueError( + "You are using the CPU version package(" + "paddle-serving-server), unable to set devices") + devices = [int(x) for x in devices.split(",")] + for _ in devices: + self._port_list.append(available_port_generator.next()) + _LOGGER.info("Model({}) will be launch in gpu device: {}. 
Port({})" + .format(model_config, devices, self._port_list)) + self._workdir = workdir + self._devices = devices + self._thread_num = thread_num + self._mem_optim = mem_optim + self._ir_optim = ir_optim + + self._rpc_service_list = [] + self._server_pros = [] + self._fetch_vars = None + + def get_fetch_list(self): + return self._fetch_vars + + def get_port_list(self): + return self._port_list + + def get_client_config(self): + return os.path.join(self._model_config, "serving_server_conf.prototxt") + + def _prepare_one_server(self, workdir, port, gpuid, thread_num, mem_optim, + ir_optim): + device = "gpu" + if gpuid == -1: + device = "cpu" + op_maker = OpMaker() + read_op = op_maker.create('general_reader') + general_infer_op = op_maker.create('general_infer') + general_response_op = op_maker.create('general_response') + + op_seq_maker = OpSeqMaker() + op_seq_maker.add_op(read_op) + op_seq_maker.add_op(general_infer_op) + op_seq_maker.add_op(general_response_op) + + server = Server() + server.set_op_sequence(op_seq_maker.get_op_sequence()) + server.set_num_threads(thread_num) + server.set_memory_optimize(mem_optim) + server.set_ir_optimize(ir_optim) + + server.load_model_config(self._model_config) + if gpuid >= 0: + server.set_gpuid(gpuid) + server.prepare_server(workdir=workdir, port=port, device=device) + if self._fetch_vars is None: + self._fetch_vars = server.get_fetch_list() + return server + + def _start_one_server(self, service_idx): + self._rpc_service_list[service_idx].run_server() + + def prepare_server(self): + for i, device_id in enumerate(self._devices): + if self._workdir != "": + workdir = "{}_{}".format(self._workdir, i) + else: + workdir = _workdir_name_gen.next() + self._rpc_service_list.append( + self._prepare_one_server( + workdir, + self._port_list[i], + device_id, + thread_num=self._thread_num, + mem_optim=self._mem_optim, + ir_optim=self._ir_optim)) + + def start_server(self): + for i, service in enumerate(self._rpc_service_list): + p = multiprocessing.Process( + target=self._start_one_server, args=(i, )) + p.daemon = True + self._server_pros.append(p) + for p in self._server_pros: + p.start() diff --git a/python/pipeline/logger.py b/python/pipeline/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..b566c012d3ced8f4f1bddd9b1622abc4beb9c8a5 --- /dev/null +++ b/python/pipeline/logger.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
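The `LocalRpcServiceHandler` defined above is normally constructed inside `Op.init_from_dict` from the `local_service_conf` section, but it can also be exercised on its own. A minimal sketch, assuming the class is importable from the installed pipeline package and that `uci_housing_model` is a model directory exported by Paddle Serving (both are assumptions of this example):

```python
# Both the import path and the model directory are assumptions for this sketch.
from pipeline.local_rpc_service_handler import LocalRpcServiceHandler

handler = LocalRpcServiceHandler(
    model_config="uci_housing_model",  # placeholder model directory
    devices="0",                       # one GPU card; "" would select CPU
    thread_num=2)
handler.prepare_server()    # builds one Server per device and records the fetch list
handler.start_server()      # launches the workers as daemon processes
print(handler.get_port_list())      # ports picked by the available-port generator
print(handler.get_client_config())  # <model_config>/serving_server_conf.prototxt
print(handler.get_fetch_list())     # fetch variable names reported by the server
```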
+ +import logging +import logging.config +import os + + +class SectionLevelFilter(object): + def __init__(self, levels): + self._levels = levels + + def filter(self, logRecord): + return logRecord.levelno in self._levels + + +log_dir = "PipelineServingLogs" +if not os.path.exists(log_dir): + os.makedirs(log_dir) + +logger_config = { + "version": 1, + "formatters": { + "normal_fmt": { + "format": + "%(levelname)s %(asctime)s [%(filename)s:%(lineno)d] %(message)s", + }, + "tracer_fmt": { + "format": "%(asctime)s %(message)s", + }, + }, + "handlers": { + "f_pipeline.log": { + "class": "logging.FileHandler", + "level": "INFO", + "formatter": "normal_fmt", + "filename": os.path.join(log_dir, "pipeline.log"), + }, + "f_pipeline.log.wf": { + "class": "logging.FileHandler", + "level": "WARNING", + "formatter": "normal_fmt", + "filename": os.path.join(log_dir, "pipeline.log.wf"), + }, + "f_tracer.log": { + "class": "logging.FileHandler", + "level": "INFO", + "formatter": "tracer_fmt", + "filename": os.path.join(log_dir, "pipeline.tracer"), + }, + }, + "loggers": { + # propagate = True + ".".join(__name__.split(".")[:-1] + ["profiler"]): { + "level": "INFO", + "handlers": ["f_tracer.log"], + }, + }, + "root": { + "level": "DEBUG", + "handlers": ["f_pipeline.log", "f_pipeline.log.wf"], + }, +} + +logging.config.dictConfig(logger_config) diff --git a/python/pipeline/operator.py b/python/pipeline/operator.py index 0e5e66ae326f4ae5b131458121d11b455a721a72..3b928b9cbab28904e6225d88e229e9a0d2da4f56 100644 --- a/python/pipeline/operator.py +++ b/python/pipeline/operator.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # pylint: disable=doc-string-missing - +from time import time as _time +import time import threading import multiprocessing from paddle_serving_client import MultiLangClient, Client @@ -21,16 +22,25 @@ import logging import func_timeout import os import sys +import collections import numpy as np from numpy import * +if sys.version_info.major == 2: + import Queue +elif sys.version_info.major == 3: + import queue as Queue +else: + raise Exception("Error Python version") from .proto import pipeline_service_pb2 from .channel import (ThreadChannel, ProcessChannel, ChannelDataEcode, - ChannelData, ChannelDataType, ChannelStopError) + ChannelData, ChannelDataType, ChannelStopError, + ChannelTimeoutError) from .util import NameGenerator -from .profiler import TimeProfiler +from .profiler import UnsafeTimeProfiler as TimeProfiler +from . 
import local_rpc_service_handler -_LOGGER = logging.getLogger() +_LOGGER = logging.getLogger(__name__) _op_name_gen = NameGenerator("Op") @@ -38,60 +48,187 @@ class Op(object): def __init__(self, name=None, input_ops=[], - server_endpoints=[], - fetch_list=[], + server_endpoints=None, + fetch_list=None, client_config=None, - concurrency=1, - timeout=-1, - retry=1): + concurrency=None, + timeout=None, + retry=None, + batch_size=None, + auto_batching_timeout=None, + local_rpc_service_handler=None): + # In __init__, all the parameters are just saved and Op is not initialized if name is None: name = _op_name_gen.next() self.name = name # to identify the type of OP, it must be globally unique self.concurrency = concurrency # amount of concurrency self.set_input_ops(input_ops) + self._local_rpc_service_handler = local_rpc_service_handler self._server_endpoints = server_endpoints - self.with_serving = False - if len(self._server_endpoints) != 0: - self.with_serving = True - self._client_config = client_config self._fetch_names = fetch_list - + self._client_config = client_config self._timeout = timeout self._retry = max(1, retry) + self._batch_size = batch_size + self._auto_batching_timeout = auto_batching_timeout + self._input = None self._outputs = [] self._server_use_profile = False + self._tracer = None - # only for multithread + # only for thread op self._for_init_op_lock = threading.Lock() self._for_close_op_lock = threading.Lock() self._succ_init_op = False self._succ_close_op = False + def init_from_dict(self, conf): + # init op + if self.concurrency is None: + self.concurrency = conf["concurrency"] + if self._retry is None: + self._retry = conf["retry"] + if self._fetch_names is None: + self._fetch_names = conf.get("fetch_list") + if self._client_config is None: + self._client_config = conf.get("client_config") + + if self._timeout is None: + self._timeout = conf["timeout"] + if self._timeout > 0: + self._timeout = self._timeout / 1000.0 + else: + self._timeout = -1 + + if self._batch_size is None: + self._batch_size = conf["batch_size"] + if self._auto_batching_timeout is None: + self._auto_batching_timeout = conf["auto_batching_timeout"] + if self._auto_batching_timeout <= 0 or self._batch_size == 1: + _LOGGER.warning( + self._log( + "Because auto_batching_timeout <= 0 or batch_size == 1," + " set auto_batching_timeout to None.")) + self._auto_batching_timeout = None + else: + self._auto_batching_timeout = self._auto_batching_timeout / 1000.0 + + if self._server_endpoints is None: + server_endpoints = conf.get("server_endpoints", []) + if len(server_endpoints) != 0: + # remote service + self.with_serving = True + self._server_endpoints = server_endpoints + else: + if self._local_rpc_service_handler is None: + local_service_conf = conf.get("local_service_conf") + _LOGGER.info("local_service_conf: {}".format( + local_service_conf)) + model_config = local_service_conf.get("model_config") + _LOGGER.info("model_config: {}".format(model_config)) + if model_config is None: + self.with_serving = False + else: + # local rpc service + self.with_serving = True + service_handler = local_rpc_service_handler.LocalRpcServiceHandler( + model_config=model_config, + workdir=local_service_conf["workdir"], + thread_num=local_service_conf["thread_num"], + devices=local_service_conf["devices"], + mem_optim=local_service_conf["mem_optim"], + ir_optim=local_service_conf["ir_optim"]) + service_handler.prepare_server() # get fetch_list + serivce_ports = service_handler.get_port_list() + self._server_endpoints = 
[ + "127.0.0.1:{}".format(p) for p in serivce_ports + ] + if self._client_config is None: + self._client_config = service_handler.get_client_config( + ) + if self._fetch_names is None: + self._fetch_names = service_handler.get_fetch_list() + self._local_rpc_service_handler = service_handler + else: + self.with_serving = True + self._local_rpc_service_handler.prepare_server( + ) # get fetch_list + serivce_ports = self._local_rpc_service_handler.get_port_list( + ) + self._server_endpoints = [ + "127.0.0.1:{}".format(p) for p in serivce_ports + ] + if self._client_config is None: + self._client_config = self._local_rpc_service_handler.get_client_config( + ) + if self._fetch_names is None: + self._fetch_names = self._local_rpc_service_handler.get_fetch_list( + ) + else: + self.with_serving = True + + if not isinstance(self, RequestOp) and not isinstance(self, ResponseOp): + _LOGGER.info( + self._log("\n\tinput_ops: {}," + "\n\tserver_endpoints: {}" + "\n\tfetch_list: {}" + "\n\tclient_config: {}" + "\n\tconcurrency: {}," + "\n\ttimeout(s): {}," + "\n\tretry: {}," + "\n\tbatch_size: {}," + "\n\tauto_batching_timeout(s): {}".format( + ", ".join([op.name for op in self._input_ops + ]), self._server_endpoints, + self._fetch_names, self._client_config, + self.concurrency, self._timeout, self._retry, + self._batch_size, self._auto_batching_timeout))) + + def launch_local_rpc_service(self): + if self._local_rpc_service_handler is None: + _LOGGER.warning( + self._log("Failed to launch local rpc" + " service: local_rpc_service_handler is None.")) + return + port = self._local_rpc_service_handler.get_port_list() + self._local_rpc_service_handler.start_server() + _LOGGER.info("Op({}) use local rpc service at port: {}" + .format(self.name, port)) + + def use_default_auto_batching_config(self): + if self._batch_size != 1: + _LOGGER.warning("Op({}) reset batch_size=1 (original: {})" + .format(self.name, self._batch_size)) + self._batch_size = 1 + if self._auto_batching_timeout != None: + _LOGGER.warning( + "Op({}) reset auto_batching_timeout=None (original: {})" + .format(self.name, self._auto_batching_timeout)) + self._auto_batching_timeout = None + def use_profiler(self, use_profile): self._server_use_profile = use_profile - def _profiler_record(self, string): - if self._profiler is None: - return - self._profiler.record(string) + def set_tracer(self, tracer): + self._tracer = tracer def init_client(self, client_type, client_config, server_endpoints, fetch_names): if self.with_serving == False: - _LOGGER.debug("{} no client".format(self.name)) + _LOGGER.info("Op({}) has no client (and it also do not " + "run the process function)".format(self.name)) return None - _LOGGER.debug("{} client_config: {}".format(self.name, client_config)) - _LOGGER.debug("{} fetch_names: {}".format(self.name, fetch_names)) if client_type == 'brpc': client = Client() client.load_client_config(client_config) elif client_type == 'grpc': client = MultiLangClient() else: - raise ValueError("unknow client type: {}".format(client_type)) + raise ValueError("Failed to init client: unknow client " + "type {}".format(client_type)) client.connect(server_endpoints) self._fetch_names = fetch_names return client @@ -105,16 +242,19 @@ class Op(object): self._input_ops = [] for op in ops: if not isinstance(op, Op): - raise TypeError( - self._log('input op must be Op type, not {}'.format( - type(op)))) + _LOGGER.critical( + self._log("Failed to set input_ops: input op " + "must be Op type, not {}".format(type(op)))) + os._exit(-1) 
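`init_from_dict` above fills every constructor argument that was left unset from a per-Op configuration dict, the same shape the pipeline server reads from the `op` section of its YAML. A hedged sketch of such a dict (the model path is a placeholder; both timeout fields are in milliseconds before the conversion performed in `init_from_dict`):

```python
op_conf = {
    "concurrency": 2,
    "timeout": -1,                 # <= 0 disables the per-call timeout
    "retry": 1,
    "batch_size": 2,
    "auto_batching_timeout": 500,  # ms; <= 0 (or batch_size == 1) disables auto-batching
    # no "server_endpoints": fall back to a local rpc service
    "local_service_conf": {
        "model_config": "uci_housing_model",  # placeholder; None means the Op has no serving backend
        "workdir": "",
        "thread_num": 2,
        "devices": "0",
        "mem_optim": True,
        "ir_optim": False,
    },
}
```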
self._input_ops.append(op) def add_input_channel(self, channel): if not isinstance(channel, (ThreadChannel, ProcessChannel)): - raise TypeError( - self._log('input channel must be Channel type, not {}'.format( - type(channel)))) + _LOGGER.critical( + self._log("Failed to set input_channel: input " + "channel must be Channel type, not {}".format( + type(channel)))) + os._exit(-1) channel.add_consumer(self.name) self._input = channel @@ -126,9 +266,10 @@ class Op(object): def add_output_channel(self, channel): if not isinstance(channel, (ThreadChannel, ProcessChannel)): - raise TypeError( - self._log('output channel must be Channel type, not {}'.format( - type(channel)))) + _LOGGER.critical( + self._log("Failed to add output_channel: output channel " + "must be Channel type, not {}".format(type(channel)))) + os._exit(-1) channel.add_producer(self.name) self._outputs.append(channel) @@ -141,21 +282,28 @@ class Op(object): def preprocess(self, input_dicts): # multiple previous Op if len(input_dicts) != 1: - raise NotImplementedError( - 'this Op has multiple previous inputs. Please override this func.' - ) + _LOGGER.critical( + self._log( + "Failed to run preprocess: this Op has multiple previous " + "inputs. Please override this func.")) + os._exit(-1) (_, input_dict), = input_dicts.items() return input_dict - def process(self, feed_dict): - err, err_info = ChannelData.check_npdata(feed_dict) + def process(self, feed_batch, typical_logid): + err, err_info = ChannelData.check_batch_npdata(feed_batch) if err != 0: - raise NotImplementedError( - "{} Please override preprocess func.".format(err_info)) + _LOGGER.critical( + self._log("Failed to run process: {}. Please override " + "preprocess func.".format(err_info))) + os._exit(-1) call_result = self.client.predict( - feed=feed_dict, fetch=self._fetch_names) - _LOGGER.debug(self._log("get call_result")) + feed=feed_batch, fetch=self._fetch_names, log_id=typical_logid) + if isinstance(self.client, MultiLangClient): + if call_result is None or call_result["serving_status_code"] != 0: + return None + call_result.pop("serving_status_code") return call_result def postprocess(self, input_dict, fetch_dict): @@ -184,43 +332,51 @@ class Op(object): data, channels, name=None, + profile_str=None, client_need_profile=False, profile_set=None): if name is None: name = self.name - self._add_profile_into_channeldata(data, client_need_profile, - profile_set) - for channel in channels: - channel.push(data, name) - - def _add_profile_into_channeldata(self, data, client_need_profile, - profile_set): - profile_str = self._profiler.gen_profile_str() - if self._server_use_profile: - sys.stderr.write(profile_str) + # add profile into channeldata if client_need_profile and profile_set is not None: - profile_set.add(profile_str) + if profile_str is not None: + profile_set.add(profile_str) data.add_profile(profile_set) + for channel in channels: + channel.push(data, name) + def start_with_process(self, client_type): + trace_buffer = None + if self._tracer is not None: + trace_buffer = self._tracer.data_buffer() proces = [] for concurrency_idx in range(self.concurrency): p = multiprocessing.Process( target=self._run, args=(concurrency_idx, self._get_input_channel(), - self._get_output_channels(), client_type, False)) + self._get_output_channels(), client_type, False, + trace_buffer)) + p.daemon = True p.start() proces.append(p) return proces def start_with_thread(self, client_type): + trace_buffer = None + if self._tracer is not None: + trace_buffer = 
self._tracer.data_buffer() threads = [] for concurrency_idx in range(self.concurrency): t = threading.Thread( target=self._run, args=(concurrency_idx, self._get_input_channel(), - self._get_output_channels(), client_type, True)) + self._get_output_channels(), client_type, True, + trace_buffer)) + # When a process exits, it attempts to terminate + # all of its daemonic child processes. + t.daemon = True t.start() threads.append(t) return threads @@ -228,252 +384,459 @@ class Op(object): def init_op(self): pass - def _run_preprocess(self, parsed_data, data_id, log_func): - preped_data, error_channeldata = None, None - try: - preped_data = self.preprocess(parsed_data) - except NotImplementedError as e: - # preprocess function not implemented - error_info = log_func(e) - _LOGGER.error(error_info) - error_channeldata = ChannelData( - ecode=ChannelDataEcode.NOT_IMPLEMENTED.value, - error_info=error_info, - data_id=data_id) - except TypeError as e: - # Error type in channeldata.datatype - error_info = log_func(e) - _LOGGER.error(error_info) - error_channeldata = ChannelData( - ecode=ChannelDataEcode.TYPE_ERROR.value, - error_info=error_info, - data_id=data_id) - except Exception as e: - error_info = log_func(e) - _LOGGER.error(error_info) - error_channeldata = ChannelData( - ecode=ChannelDataEcode.UNKNOW.value, - error_info=error_info, - data_id=data_id) - return preped_data, error_channeldata - - def _run_process(self, preped_data, data_id, log_func): - midped_data, error_channeldata = None, None + def _run_preprocess(self, parsed_data_dict, op_info_prefix): + _LOGGER.debug("{} Running preprocess".format(op_info_prefix)) + preped_data_dict = collections.OrderedDict() + err_channeldata_dict = collections.OrderedDict() + for data_id, parsed_data in parsed_data_dict.items(): + preped_data, error_channeldata = None, None + try: + preped_data = self.preprocess(parsed_data) + except TypeError as e: + # Error type in channeldata.datatype + error_info = "(logid={}) {} Failed to preprocess: {}".format( + data_id, op_info_prefix, e) + _LOGGER.error(error_info, exc_info=True) + error_channeldata = ChannelData( + ecode=ChannelDataEcode.TYPE_ERROR.value, + error_info=error_info, + data_id=data_id) + except Exception as e: + error_info = "(logid={}) {} Failed to preprocess: {}".format( + data_id, op_info_prefix, e) + _LOGGER.error(error_info, exc_info=True) + error_channeldata = ChannelData( + ecode=ChannelDataEcode.UNKNOW.value, + error_info=error_info, + data_id=data_id) + if error_channeldata is not None: + err_channeldata_dict[data_id] = error_channeldata + else: + preped_data_dict[data_id] = preped_data + _LOGGER.debug("{} Succ preprocess".format(op_info_prefix)) + return preped_data_dict, err_channeldata_dict + + def _run_process(self, preped_data_dict, op_info_prefix): + _LOGGER.debug("{} Running process".format(op_info_prefix)) + midped_data_dict = collections.OrderedDict() + err_channeldata_dict = collections.OrderedDict() if self.with_serving: + data_ids = preped_data_dict.keys() + typical_logid = data_ids[0] + if len(data_ids) != 1: + for data_id in data_ids: + _LOGGER.info( + "(logid={}) {} During access to PaddleServingService," + " we selected logid={} (from batch: {}) as a " + "representative for logging.".format( + data_id, op_info_prefix, typical_logid, data_ids)) + + # combine samples to batch + one_input = preped_data_dict[data_ids[0]] + feed_batch = [] + input_offset = None + if isinstance(one_input, dict): + # sample input + feed_batch = [preped_data_dict[data_id] for data_id in data_ids] 
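# input_offset records where each data_id's rows begin and end inside feed_batch:
# for "sample" inputs (preprocess returned a dict) every sample is one row, so the
# offsets are simply [0, 1, ..., N]; for "batch" inputs (preprocess returned a list)
# the offsets accumulate each sample's own row count. After the single batched
# predict() call, these offsets (together with the "<name>.lod" arrays for LoD
# tensors) are used to slice the batched fetch map back into one result dict per
# data_id.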
+ input_offset = list(range(len(data_ids) + 1)) + elif isinstance(one_input, list): + # batch input + input_offset = [0] + for data_id in data_ids: + batch_input = preped_data_dict[data_id] + offset = input_offset[-1] + len(batch_input) + feed_batch += batch_input + input_offset.append(offset) + else: + _LOGGER.critical( + "{} Failed to process: expect input type is dict(sample" + " input) or list(batch input), but get {}".format( + op_info_prefix, type(one_input))) + os._exit(-1) + + midped_batch = None ecode = ChannelDataEcode.OK.value if self._timeout <= 0: try: - midped_data = self.process(preped_data) + midped_batch = self.process(feed_batch, typical_logid) except Exception as e: ecode = ChannelDataEcode.UNKNOW.value - error_info = log_func(e) - _LOGGER.error(error_info) + error_info = "(logid={}) {} Failed to process(batch: {}): {}".format( + typical_logid, op_info_prefix, data_ids, e) + _LOGGER.error(error_info, exc_info=True) else: for i in range(self._retry): try: - midped_data = func_timeout.func_timeout( - self._timeout, self.process, args=(preped_data, )) + midped_batch = func_timeout.func_timeout( + self._timeout, + self.process, + args=(feed_batch, typical_logid)) except func_timeout.FunctionTimedOut as e: if i + 1 >= self._retry: ecode = ChannelDataEcode.TIMEOUT.value - error_info = log_func(e) + error_info = "(logid={}) {} Failed to process(batch: {}): " \ + "exceeded retry count.".format( + typical_logid, op_info_prefix, data_ids) _LOGGER.error(error_info) else: - _LOGGER.warn( - log_func("timeout, retry({})".format(i + 1))) + _LOGGER.warning( + "(logid={}) {} Failed to process(batch: {}): timeout," + " and retrying({}/{})...".format( + typical_logid, op_info_prefix, data_ids, i + + 1, self._retry)) except Exception as e: ecode = ChannelDataEcode.UNKNOW.value - error_info = log_func(e) - _LOGGER.error(error_info) + error_info = "(logid={}) {} Failed to process(batch: {}): {}".format( + typical_logid, op_info_prefix, data_ids, e) + _LOGGER.error(error_info, exc_info=True) break else: break if ecode != ChannelDataEcode.OK.value: - error_channeldata = ChannelData( - ecode=ecode, error_info=error_info, data_id=data_id) - elif midped_data is None: + for data_id in data_ids: + err_channeldata_dict[data_id] = ChannelData( + ecode=ecode, error_info=error_info, data_id=data_id) + elif midped_batch is None: # op client return None - error_channeldata = ChannelData( - ecode=ChannelDataEcode.CLIENT_ERROR.value, - error_info=log_func( - "predict failed. 
pls check the server side."), - data_id=data_id) + error_info = "(logid={}) {} Failed to predict, please check if " \ + "PaddleServingService is working properly.".format( + typical_logid, op_info_prefix) + _LOGGER.error(error_info) + for data_id in data_ids: + err_channeldata_dict[data_id] = ChannelData( + ecode=ChannelDataEcode.CLIENT_ERROR.value, + error_info=error_info, + data_id=data_id) + else: + # transform np format to dict format + var_names = midped_batch.keys() + lod_var_names = set() + lod_offset_names = set() + for name in var_names: + lod_offset_name = "{}.lod".format(name) + if lod_offset_name in var_names: + _LOGGER.debug("(logid={}) {} {} is LodTensor".format( + typical_logid, op_info_prefix, name)) + lod_var_names.add(name) + lod_offset_names.add(lod_offset_name) + + for idx, data_id in enumerate(data_ids): + midped_data_dict[data_id] = {} + + for name, value in midped_batch.items(): + if name in lod_offset_names: + continue + if name in lod_var_names: + # lodtensor + lod_offset_name = "{}.lod".format(name) + lod_offset = midped_batch[lod_offset_name] + for idx, data_id in enumerate(data_ids): + data_offset_left = input_offset[idx] + data_offset_right = input_offset[idx + 1] + lod_offset_left = lod_offset[data_offset_left] + lod_offset_right = lod_offset[data_offset_right] + midped_data_dict[data_id][name] = value[ + lod_offset_left:lod_offset_right] + midped_data_dict[data_id][lod_offset_name] = \ + lod_offset[data_offset_left:data_offset_right + 1] - lod_offset[data_offset_left] + else: + # normal tensor + for idx, data_id in enumerate(data_ids): + left = input_offset[idx] + right = input_offset[idx + 1] + midped_data_dict[data_id][name] = value[left:right] else: - midped_data = preped_data - return midped_data, error_channeldata + midped_data_dict = preped_data_dict + _LOGGER.debug("{} Succ process".format(op_info_prefix)) + return midped_data_dict, err_channeldata_dict + + def _run_postprocess(self, parsed_data_dict, midped_data_dict, + op_info_prefix): + _LOGGER.debug("{} Running postprocess".format(op_info_prefix)) + postped_data_dict = collections.OrderedDict() + err_channeldata_dict = collections.OrderedDict() + for data_id, midped_data in midped_data_dict.items(): + postped_data, err_channeldata = None, None + try: + postped_data = self.postprocess(parsed_data_dict[data_id], + midped_data) + except Exception as e: + error_info = "(logid={}) {} Failed to postprocess: {}".format( + data_id, op_info_prefix, e) + _LOGGER.error(error_info, exc_info=True) + err_channeldata = ChannelData( + ecode=ChannelDataEcode.UNKNOW.value, + error_info=error_info, + data_id=data_id) + if err_channeldata is not None: + err_channeldata_dict[data_id] = err_channeldata + continue + else: + if not isinstance(postped_data, dict): + error_info = "(logid={}) {} Failed to postprocess: " \ + "output of postprocess funticon must be " \ + "dict type, but get {}".format( + data_id, op_info_prefix, + type(postped_data)) + _LOGGER.error(error_info) + err_channeldata = ChannelData( + ecode=ChannelDataEcode.UNKNOW.value, + error_info=error_info, + data_id=data_id) + err_channeldata_dict[data_id] = err_channeldata + continue + + output_data = None + err, _ = ChannelData.check_npdata(postped_data) + if err == 0: + output_data = ChannelData( + ChannelDataType.CHANNEL_NPDATA.value, + npdata=postped_data, + data_id=data_id) + else: + output_data = ChannelData( + ChannelDataType.DICT.value, + dictdata=postped_data, + data_id=data_id) + postped_data_dict[data_id] = output_data + _LOGGER.debug("{} Succ 
postprocess".format(op_info_prefix)) + return postped_data_dict, err_channeldata_dict + + def _auto_batching_generator(self, input_channel, op_name, batch_size, + timeout, op_info_prefix): + while True: + batch = [] + while len(batch) == 0: + endtime = None + if timeout is not None: + endtime = _time() + timeout + for idx in range(batch_size): + try: + channeldata_dict = None + if timeout is not None: + remaining = endtime - _time() + if remaining <= 0.0: + _LOGGER.debug("{} Failed to generate batch: " + "timeout".format(op_info_prefix)) + break + channeldata_dict = input_channel.front(op_name, + timeout) + else: + channeldata_dict = input_channel.front(op_name) + batch.append(channeldata_dict) + except ChannelTimeoutError: + _LOGGER.debug("{} Failed to generate batch: " + "timeout".format(op_info_prefix)) + break + _LOGGER.debug("{} Got actual batch_size: {}".format(op_info_prefix, + len(batch))) + yield batch + + def _parse_channeldata_batch(self, batch, output_channels): + parsed_data_dict = collections.OrderedDict() + need_profile_dict = {} + profile_dict = {} + for channeldata_dict in batch: + (data_id, error_channeldata, parsed_data, + client_need_profile, profile_set) = \ + self._parse_channeldata(channeldata_dict) + if error_channeldata is None: + parsed_data_dict[data_id] = parsed_data + need_profile_dict[data_id] = client_need_profile + profile_dict[data_id] = profile_set + else: + # error data in predecessor Op + # (error_channeldata with profile info) + self._push_to_output_channels(error_channeldata, + output_channels) - def _run_postprocess(self, input_dict, midped_data, data_id, log_func): - output_data, error_channeldata = None, None - try: - postped_data = self.postprocess(input_dict, midped_data) - except Exception as e: - error_info = log_func(e) - _LOGGER.error(error_info) - error_channeldata = ChannelData( - ecode=ChannelDataEcode.UNKNOW.value, - error_info=error_info, - data_id=data_id) - return output_data, error_channeldata - - if not isinstance(postped_data, dict): - error_info = log_func("output of postprocess funticon must be " \ - "dict type, but get {}".format(type(postped_data))) - _LOGGER.error(error_info) - error_channeldata = ChannelData( - ecode=ChannelDataEcode.UNKNOW.value, - error_info=error_info, - data_id=data_id) - return output_data, error_channeldata - - err, _ = ChannelData.check_npdata(postped_data) - if err == 0: - output_data = ChannelData( - ChannelDataType.CHANNEL_NPDATA.value, - npdata=postped_data, - data_id=data_id) - else: - output_data = ChannelData( - ChannelDataType.DICT.value, - dictdata=postped_data, - data_id=data_id) - return output_data, error_channeldata + return parsed_data_dict, need_profile_dict, profile_dict def _run(self, concurrency_idx, input_channel, output_channels, client_type, - is_thread_op): - def get_log_func(op_info_prefix): - def log_func(info_str): - return "{} {}".format(op_info_prefix, info_str) - - return log_func - + is_thread_op, trace_buffer): op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx) - log = get_log_func(op_info_prefix) tid = threading.current_thread().ident # init op - self.concurrency_idx = concurrency_idx + profiler = None try: - if is_thread_op: - with self._for_init_op_lock: - if not self._succ_init_op: - # init profiler - self._profiler = TimeProfiler() - self._profiler.enable(True) - # init client - self.client = self.init_client( - client_type, self._client_config, - self._server_endpoints, self._fetch_names) - # user defined - self.init_op() - self._succ_init_op = True - 
self._succ_close_op = False - else: - # init profiler - self._profiler = TimeProfiler() - self._profiler.enable(True) - # init client - self.client = self.init_client(client_type, self._client_config, - self._server_endpoints, - self._fetch_names) - # user defined - self.init_op() + profiler = self._initialize(is_thread_op, client_type, + concurrency_idx) except Exception as e: - _LOGGER.error(log(e)) + _LOGGER.critical( + "{} Failed to init op: {}".format(op_info_prefix, e), + exc_info=True) os._exit(-1) + _LOGGER.info("{} Succ init".format(op_info_prefix)) + + batch_generator = self._auto_batching_generator( + input_channel=input_channel, + op_name=self.name, + batch_size=self._batch_size, + timeout=self._auto_batching_timeout, + op_info_prefix=op_info_prefix) + start, end = None, None + trace_que = collections.deque() while True: - #self._profiler_record("get#{}_0".format(op_info_prefix)) + start = int(round(_time() * 1000000)) try: - channeldata_dict = input_channel.front(self.name) + channeldata_dict_batch = next(batch_generator) except ChannelStopError: - _LOGGER.debug(log("stop.")) - if is_thread_op: - with self._for_close_op_lock: - if not self._succ_close_op: - self._profiler = None - self.client = None - self._succ_init_op = False - self._succ_close_op = True + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) break - #self._profiler_record("get#{}_1".format(op_info_prefix)) - _LOGGER.debug(log("input_data: {}".format(channeldata_dict))) + end = int(round(_time() * 1000000)) + in_time = end - start - (data_id, error_channeldata, parsed_data, client_need_profile, - profile_set) = self._parse_channeldata(channeldata_dict) - # error data in predecessor Op - if error_channeldata is not None: - try: - # error_channeldata with profile info - self._push_to_output_channels(error_channeldata, - output_channels) - except ChannelStopError: - _LOGGER.debug(log("stop.")) - break + # parse channeldata batch + try: + parsed_data_dict, need_profile_dict, profile_dict \ + = self._parse_channeldata_batch( + channeldata_dict_batch, output_channels) + except ChannelStopError: + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) + break + if len(parsed_data_dict) == 0: + # data in the whole batch is all error data continue # preprecess - self._profiler_record("prep#{}_0".format(op_info_prefix)) - preped_data, error_channeldata = self._run_preprocess(parsed_data, - data_id, log) - self._profiler_record("prep#{}_1".format(op_info_prefix)) - if error_channeldata is not None: - try: + start = profiler.record("prep#{}_0".format(op_info_prefix)) + preped_data_dict, err_channeldata_dict \ + = self._run_preprocess(parsed_data_dict, op_info_prefix) + end = profiler.record("prep#{}_1".format(op_info_prefix)) + prep_time = end - start + try: + for data_id, err_channeldata in err_channeldata_dict.items(): self._push_to_output_channels( - error_channeldata, - output_channels, - client_need_profile=client_need_profile, - profile_set=profile_set) - except ChannelStopError: - _LOGGER.debug(log("stop.")) - break + data=err_channeldata, + channels=output_channels, + client_need_profile=need_profile_dict[data_id], + profile_set=profile_dict[data_id]) + except ChannelStopError: + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) + break + if len(preped_data_dict) == 0: continue # process - self._profiler_record("midp#{}_0".format(op_info_prefix)) - midped_data, error_channeldata = self._run_process(preped_data, - data_id, log) - 
self._profiler_record("midp#{}_1".format(op_info_prefix)) - if error_channeldata is not None: - try: + start = profiler.record("midp#{}_0".format(op_info_prefix)) + midped_data_dict, err_channeldata_dict \ + = self._run_process(preped_data_dict, op_info_prefix) + end = profiler.record("midp#{}_1".format(op_info_prefix)) + midp_time = end - start + try: + for data_id, err_channeldata in err_channeldata_dict.items(): self._push_to_output_channels( - error_channeldata, - output_channels, - client_need_profile=client_need_profile, - profile_set=profile_set) - except ChannelStopError: - _LOGGER.debug(log("stop.")) - break + data=err_channeldata, + channels=output_channels, + client_need_profile=need_profile_dict[data_id], + profile_set=profile_dict[data_id]) + except ChannelStopError: + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) + break + if len(midped_data_dict) == 0: continue # postprocess - self._profiler_record("postp#{}_0".format(op_info_prefix)) - output_data, error_channeldata = self._run_postprocess( - parsed_data, midped_data, data_id, log) - self._profiler_record("postp#{}_1".format(op_info_prefix)) - if error_channeldata is not None: - try: + start = profiler.record("postp#{}_0".format(op_info_prefix)) + postped_data_dict, err_channeldata_dict \ + = self._run_postprocess( + parsed_data_dict, midped_data_dict, op_info_prefix) + end = profiler.record("postp#{}_1".format(op_info_prefix)) + postp_time = end - start + try: + for data_id, err_channeldata in err_channeldata_dict.items(): self._push_to_output_channels( - error_channeldata, - output_channels, - client_need_profile=client_need_profile, - profile_set=profile_set) - except ChannelStopError: - _LOGGER.debug(log("stop.")) - break + data=err_channeldata, + channels=output_channels, + client_need_profile=need_profile_dict[data_id], + profile_set=profile_dict[data_id]) + except ChannelStopError: + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) + break + if len(postped_data_dict) == 0: continue # push data to channel (if run succ) - #self._profiler_record("push#{}_0".format(op_info_prefix)) + start = int(round(_time() * 1000000)) try: - self._push_to_output_channels( - output_data, - output_channels, - client_need_profile=client_need_profile, - profile_set=profile_set) + profile_str = profiler.gen_profile_str() + for data_id, postped_data in postped_data_dict.items(): + if self._server_use_profile: + sys.stderr.write(profile_str) + self._push_to_output_channels( + data=postped_data, + channels=output_channels, + profile_str=profile_str, + client_need_profile=need_profile_dict[data_id], + profile_set=profile_dict[data_id]) except ChannelStopError: - _LOGGER.debug(log("stop.")) + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) break - #self._profiler_record("push#{}_1".format(op_info_prefix)) + end = int(round(_time() * 1000000)) + out_time = end - start + if trace_buffer is not None: + trace_que.append({ + "name": self.name, + "actions": { + "in": in_time, + "prep": prep_time, + "midp": midp_time, + "postp": postp_time, + "out": out_time, + } + }) + while trace_que: + info = trace_que[0] + try: + trace_buffer.put_nowait(info) + trace_que.popleft() + except Queue.Full: + break + + def _initialize(self, is_thread_op, client_type, concurrency_idx): + if is_thread_op: + with self._for_init_op_lock: + if not self._succ_init_op: + # for the threaded version of Op, each thread cannot get its concurrency_idx + self.concurrency_idx = None + # 
init client + self.client = self.init_client( + client_type, self._client_config, + self._server_endpoints, self._fetch_names) + # user defined + self.init_op() + self._succ_init_op = True + self._succ_close_op = False + else: + self.concurrency_idx = concurrency_idx + # init client + self.client = self.init_client(client_type, self._client_config, + self._server_endpoints, + self._fetch_names) + # user defined + self.init_op() + + # use a separate TimeProfiler per thread or process + profiler = TimeProfiler() + profiler.enable(True) + return profiler + + def _finalize(self, is_thread_op): + if is_thread_op: + with self._for_close_op_lock: + if not self._succ_close_op: + self._profiler = None + self.client = None + self._succ_init_op = False + self._succ_close_op = True def _log(self, info): return "{} {}".format(self.name, info) @@ -483,13 +846,13 @@ class RequestOp(Op): """ RequestOp do not run preprocess, process, postprocess. """ def __init__(self): - # PipelineService.name = "@G" - super(RequestOp, self).__init__(name="@G", input_ops=[]) + # PipelineService.name = "@DAGExecutor" + super(RequestOp, self).__init__(name="@DAGExecutor", input_ops=[]) # init op try: self.init_op() except Exception as e: - _LOGGER.error(e) + _LOGGER.critical("Op(Request) Failed to init: {}".format(e)) os._exit(-1) def unpack_request_package(self, request): @@ -497,7 +860,9 @@ class RequestOp(Op): for idx, key in enumerate(request.key): data = request.value[idx] try: - data = eval(data) + evaled_data = eval(data) + if isinstance(evaled_data, np.ndarray): + data = evaled_data except Exception as e: pass dictdata[key] = data @@ -508,12 +873,14 @@ class ResponseOp(Op): """ ResponseOp do not run preprocess, process, postprocess. """ def __init__(self, input_ops): - super(ResponseOp, self).__init__(name="@R", input_ops=input_ops) + super(ResponseOp, self).__init__( + name="@DAGExecutor", input_ops=input_ops) # init op try: self.init_op() except Exception as e: - _LOGGER.error(e) + _LOGGER.critical("Op(ResponseOp) Failed to init: {}".format( + e, exc_info=True)) os._exit(-1) def pack_response_package(self, channeldata): @@ -524,7 +891,7 @@ class ResponseOp(Op): feed = channeldata.parse() # ndarray to string: # https://stackoverflow.com/questions/30167538/convert-a-numpy-ndarray-to-stringor-bytes-and-convert-it-back-to-numpy-ndarray - np.set_printoptions(threshold=np.nan) + np.set_printoptions(threshold=sys.maxsize) for name, var in feed.items(): resp.value.append(var.__repr__()) resp.key.append(name) @@ -536,14 +903,19 @@ class ResponseOp(Op): resp.error_info = self._log( "fetch var type must be str({}).".format( type(var))) + _LOGGER.error("(logid={}) Failed to pack RPC " + "response package: {}".format( + channeldata.id, resp.error_info)) break resp.value.append(var) resp.key.append(name) else: resp.ecode = ChannelDataEcode.TYPE_ERROR.value resp.error_info = self._log( - "Error type({}) in datatype.".format(channeldata.datatype)) - _LOGGER.error(resp.error_info) + "error type({}) in datatype.".format(channeldata.datatype)) + _LOGGER.error("(logid={}) Failed to pack RPC response" + " package: {}".format(channeldata.id, + resp.error_info)) else: resp.error_info = channeldata.error_info return resp @@ -561,6 +933,7 @@ class VirtualOp(Op): self._virtual_pred_ops.append(op) def _actual_pred_op_names(self, op): + # can use disjoint-set, but it's not necessary if not isinstance(op, VirtualOp): return [op.name] names = [] @@ -570,9 +943,11 @@ class VirtualOp(Op): def add_output_channel(self, channel): if not 
isinstance(channel, (ThreadChannel, ProcessChannel)): - raise TypeError( - self._log('output channel must be Channel type, not {}'.format( - type(channel)))) + _LOGGER.critical( + self._log("Failed to add output_channel: output_channel" + " must be Channel type, not {}".format( + type(channel)))) + os._exit(-1) for op in self._virtual_pred_ops: for op_name in self._actual_pred_op_names(op): channel.add_producer(op_name) @@ -580,27 +955,31 @@ class VirtualOp(Op): def _run(self, concurrency_idx, input_channel, output_channels, client_type, is_thread_op): - def get_log_func(op_info_prefix): - def log_func(info_str): - return "{} {}".format(op_info_prefix, info_str) - - return log_func - op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx) log = get_log_func(op_info_prefix) tid = threading.current_thread().ident + batch_generator = self._auto_batching_generator( + input_channel=input_channel, + op_name=self.name, + batch_size=1, + timeout=None, + log_func=log) + while True: try: - channeldata_dict = input_channel.front(self.name) + channeldata_dict_batch = next(batch_generator) except ChannelStopError: - _LOGGER.debug(log("stop.")) + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) break try: - for name, data in channeldata_dict.items(): - self._push_to_output_channels( - data, channels=output_channels, name=name) + for channeldata_dict in channeldata_dict_batch: + for name, data in channeldata_dict.items(): + self._push_to_output_channels( + data, channels=output_channels, name=name) except ChannelStopError: - _LOGGER.debug(log("stop.")) + _LOGGER.debug("{} Stop.".format(op_info_prefix)) + self._finalize(is_thread_op) break diff --git a/python/pipeline/pipeline_client.py b/python/pipeline/pipeline_client.py index 6d96b9264773b861ad9480cf59449e3fbf562b5e..48368dd81459de98f21af4048a2b694a54e80b75 100644 --- a/python/pipeline/pipeline_client.py +++ b/python/pipeline/pipeline_client.py @@ -18,10 +18,11 @@ import numpy as np from numpy import * import logging import functools +from .channel import ChannelDataEcode from .proto import pipeline_service_pb2 from .proto import pipeline_service_pb2_grpc -_LOGGER = logging.getLogger() +_LOGGER = logging.getLogger(__name__) class PipelineClient(object): @@ -41,11 +42,12 @@ class PipelineClient(object): def _pack_request_package(self, feed_dict, profile): req = pipeline_service_pb2.Request() + np.set_printoptions(threshold=sys.maxsize) for key, value in feed_dict.items(): req.key.append(key) if isinstance(value, np.ndarray): req.value.append(value.__repr__()) - elif isinstance(value, str): + elif isinstance(value, (str, unicode)): req.value.append(value) elif isinstance(value, list): req.value.append(np.array(value).__repr__()) @@ -59,7 +61,11 @@ class PipelineClient(object): def _unpack_response_package(self, resp, fetch): if resp.ecode != 0: - return {"ecode": resp.ecode, "error_info": resp.error_info} + return { + "ecode": resp.ecode, + "ecode_desc": ChannelDataEcode(resp.ecode), + "error_info": resp.error_info, + } fetch_map = {"ecode": resp.ecode} for idx, key in enumerate(resp.key): if key == self._profile_key: @@ -70,7 +76,9 @@ class PipelineClient(object): continue data = resp.value[idx] try: - data = eval(data) + evaled_data = eval(data) + if isinstance(evaled_data, np.ndarray): + data = evaled_data except Exception as e: pass fetch_map[key] = data diff --git a/python/pipeline/pipeline_server.py b/python/pipeline/pipeline_server.py index 
3f8492c9da728446486a9cf076ecf26394235f91..a6d4f9ed66fd8f563cb1526c136cba11b06fd6b3 100644 --- a/python/pipeline/pipeline_server.py +++ b/python/pipeline/pipeline_server.py @@ -15,34 +15,41 @@ from concurrent import futures import grpc import logging +import json import socket import contextlib from contextlib import closing import multiprocessing import yaml -from .proto import pipeline_service_pb2_grpc -from .operator import ResponseOp -from .dag import DAGExecutor +from .proto import pipeline_service_pb2_grpc, pipeline_service_pb2 +from . import operator +from . import dag +from . import util +from . import channel -_LOGGER = logging.getLogger() +_LOGGER = logging.getLogger(__name__) -class PipelineService(pipeline_service_pb2_grpc.PipelineServiceServicer): - def __init__(self, response_op, dag_config, show_info): - super(PipelineService, self).__init__() +class PipelineServicer(pipeline_service_pb2_grpc.PipelineServiceServicer): + def __init__(self, name, response_op, dag_conf, worker_idx=-1): + super(PipelineServicer, self).__init__() + self._name = name + # init dag executor - self._dag_executor = DAGExecutor( - response_op, dag_config, show_info=show_info) + self._dag_executor = dag.DAGExecutor(response_op, dag_conf, worker_idx) self._dag_executor.start() + _LOGGER.info("[PipelineServicer] succ init") def inference(self, request, context): + if request.name != "" and request.name != self._name: + resp = pipeline_service_pb2.Response() + resp.ecode = channel.ChannelDataEcode.NO_SERVICE.value + resp.error_info = "Failed to inference: Service name error." + return resp resp = self._dag_executor.call(request) return resp - def __del__(self): - self._dag_executor.stop() - @contextlib.contextmanager def _reserve_port(port): @@ -59,80 +66,375 @@ def _reserve_port(port): class PipelineServer(object): - def __init__(self): - self._port = None + def __init__(self, name=None): + self._name = name # for grpc-gateway path + self._rpc_port = None self._worker_num = None self._response_op = None + self._proxy_server = None + + def _grpc_gateway(self, grpc_port, http_port): + import os + from ctypes import cdll + from . 
import gateway + lib_path = os.path.join( + os.path.dirname(gateway.__file__), "libproxy_server.so") + proxy_server = cdll.LoadLibrary(lib_path) + proxy_server.run_proxy_server(grpc_port, http_port) + + def _run_grpc_gateway(self, grpc_port, http_port): + if http_port <= 0: + _LOGGER.info("Ignore grpc_gateway configuration.") + return + if not util.AvailablePortGenerator.port_is_available(http_port): + raise SystemExit("Failed to run grpc-gateway: prot {} " + "is already used".format(http_port)) + if self._proxy_server is not None: + raise RuntimeError("Proxy server has been started.") + self._proxy_server = multiprocessing.Process( + target=self._grpc_gateway, args=( + grpc_port, + http_port, )) + self._proxy_server.daemon = True + self._proxy_server.start() def set_response_op(self, response_op): - if not isinstance(response_op, ResponseOp): - raise Exception("response_op must be ResponseOp type.") + if not isinstance(response_op, operator.ResponseOp): + raise Exception("Failed to set response_op: response_op " + "must be ResponseOp type.") if len(response_op.get_input_ops()) != 1: - raise Exception("response_op can only have one previous op.") + raise Exception("Failed to set response_op: response_op " + "can only have one previous op.") self._response_op = response_op + self._used_op, _ = dag.DAG.get_use_ops(self._response_op) + + def prepare_server(self, yml_file=None, yml_dict=None): + conf = ServerYamlConfChecker.load_server_yaml_conf( + yml_file=yml_file, yml_dict=yml_dict) + + self._rpc_port = conf.get("rpc_port") + self._http_port = conf.get("http_port") + if self._rpc_port is None: + if self._http_port is None: + raise SystemExit("Failed to prepare_server: rpc_port or " + "http_port can not be None.") + else: + # http mode: generate rpc_port + if not util.AvailablePortGenerator.port_is_available( + self._http_port): + raise SystemExit("Failed to prepare_server: http_port({}) " + "is already used".format(self._http_port)) + self._rpc_port = util.GetAvailablePortGenerator().next() + else: + if not util.AvailablePortGenerator.port_is_available( + self._rpc_port): + raise SystemExit("Failed to prepare_server: prot {} " + "is already used".format(self._rpc_port)) + if self._http_port is None: + # rpc mode + pass + else: + # http mode + if not util.AvailablePortGenerator.port_is_available( + self._http_port): + raise SystemExit("Failed to prepare_server: http_port({}) " + "is already used".format(self._http_port)) + + self._worker_num = conf["worker_num"] + self._build_dag_each_worker = conf["build_dag_each_worker"] + self._init_ops(conf["op"]) - def _port_is_available(self, port): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - sock.settimeout(2) - result = sock.connect_ex(('0.0.0.0', port)) - return result != 0 - - def prepare_server(self, yml_file): - with open(yml_file) as f: - yml_config = yaml.load(f.read()) - self._port = yml_config.get('port') - if self._port is None: - raise SystemExit("Please set *port* in [{}] yaml file.".format( - yml_file)) - if not self._port_is_available(self._port): - raise SystemExit("Prot {} is already used".format(self._port)) - self._worker_num = yml_config.get('worker_num', 1) - self._build_dag_each_worker = yml_config.get('build_dag_each_worker', - False) _LOGGER.info("============= PIPELINE SERVER =============") - _LOGGER.info("port: {}".format(self._port)) - _LOGGER.info("worker_num: {}".format(self._worker_num)) - servicer_info = "build_dag_each_worker: {}".format( - self._build_dag_each_worker) + 
_LOGGER.info("\n{}".format( + json.dumps( + conf, indent=4, separators=(',', ':')))) if self._build_dag_each_worker is True: - servicer_info += " (Make sure that install grpcio whl with --no-binary flag)" - _LOGGER.info(servicer_info) + _LOGGER.warning( + "(Make sure that install grpcio whl with --no-binary flag: " + "pip install grpcio --no-binary grpcio)") _LOGGER.info("-------------------------------------------") - self._dag_config = yml_config.get("dag", {}) + self._conf = conf + self._start_local_rpc_service() + + def _init_ops(self, op_conf): + default_conf = { + "concurrency": 1, + "timeout": -1, + "retry": 1, + "batch_size": 1, + "auto_batching_timeout": -1, + "local_service_conf": { + "workdir": "", + "thread_num": 2, + "devices": "", + "mem_optim": True, + "ir_optim": False, + }, + } + for op in self._used_op: + if not isinstance(op, operator.RequestOp) and not isinstance( + op, operator.ResponseOp): + conf = op_conf.get(op.name, default_conf) + op.init_from_dict(conf) + + def _start_local_rpc_service(self): + # only brpc now + if self._conf["dag"]["client_type"] != "brpc": + _LOGGER.warning("Local service version must be brpc type now.") + for op in self._used_op: + if not isinstance(op, operator.RequestOp): + op.launch_local_rpc_service() def run_server(self): if self._build_dag_each_worker: - with _reserve_port(self._port) as port: + with _reserve_port(self._rpc_port) as port: bind_address = 'localhost:{}'.format(port) workers = [] for i in range(self._worker_num): show_info = (i == 0) worker = multiprocessing.Process( target=self._run_server_func, - args=(bind_address, self._response_op, - self._dag_config)) + args=(bind_address, self._response_op, self._conf, i)) worker.start() workers.append(worker) + self._run_grpc_gateway( + grpc_port=self._rpc_port, + http_port=self._http_port) # start grpc_gateway for worker in workers: worker.join() else: server = grpc.server( - futures.ThreadPoolExecutor(max_workers=self._worker_num)) + futures.ThreadPoolExecutor(max_workers=self._worker_num), + options=[('grpc.max_send_message_length', 256 * 1024 * 1024), + ('grpc.max_receive_message_length', 256 * 1024 * 1024) + ]) pipeline_service_pb2_grpc.add_PipelineServiceServicer_to_server( - PipelineService(self._response_op, self._dag_config, True), + PipelineServicer(self._name, self._response_op, self._conf), server) - server.add_insecure_port('[::]:{}'.format(self._port)) + server.add_insecure_port('[::]:{}'.format(self._rpc_port)) server.start() + self._run_grpc_gateway( + grpc_port=self._rpc_port, + http_port=self._http_port) # start grpc_gateway server.wait_for_termination() - def _run_server_func(self, bind_address, response_op, dag_config): - options = (('grpc.so_reuseport', 1), ) + def _run_server_func(self, bind_address, response_op, dag_conf, worker_idx): + options = [('grpc.so_reuseport', 1), + ('grpc.max_send_message_length', 256 * 1024 * 1024), + ('grpc.max_send_message_length', 256 * 1024 * 1024)] server = grpc.server( futures.ThreadPoolExecutor( max_workers=1, ), options=options) pipeline_service_pb2_grpc.add_PipelineServiceServicer_to_server( - PipelineService(response_op, dag_config, False), server) + PipelineServicer(self._name, response_op, dag_conf, worker_idx), + server) server.add_insecure_port(bind_address) server.start() server.wait_for_termination() + + +class ServerYamlConfChecker(object): + def __init__(self): + pass + + @staticmethod + def load_server_yaml_conf(yml_file=None, yml_dict=None): + if yml_file is not None and yml_dict is not None: + raise 
SystemExit("Failed to prepare_server: only one of yml_file" + " or yml_dict can be selected as the parameter.") + if yml_file is not None: + with open(yml_file) as f: + conf = yaml.load(f.read()) + elif yml_dict is not None: + conf = yml_dict + else: + raise SystemExit("Failed to prepare_server: yml_file or yml_dict" + " can not be None.") + ServerYamlConfChecker.check_server_conf(conf) + ServerYamlConfChecker.check_dag_conf(conf["dag"]) + ServerYamlConfChecker.check_tracer_conf(conf["dag"]["tracer"]) + for op_name in conf["op"]: + ServerYamlConfChecker.check_op_conf(conf["op"][op_name]) + ServerYamlConfChecker.check_local_service_conf(conf["op"][op_name][ + "local_service_conf"]) + return conf + + @staticmethod + def check_conf(conf, default_conf, conf_type, conf_qualification): + ServerYamlConfChecker.fill_with_default_conf(conf, default_conf) + ServerYamlConfChecker.check_conf_type(conf, conf_type) + ServerYamlConfChecker.check_conf_qualification(conf, conf_qualification) + + @staticmethod + def check_server_conf(conf): + default_conf = { + # "rpc_port": 9292, + "worker_num": 1, + "build_dag_each_worker": False, + #"http_port": 0, + "dag": {}, + "op": {}, + } + + conf_type = { + "rpc_port": int, + "http_port": int, + "worker_num": int, + "build_dag_each_worker": bool, + "grpc_gateway_port": int, + } + + conf_qualification = { + "rpc_port": [(">=", 1024), ("<=", 65535)], + "http_port": [(">=", 1024), ("<=", 65535)], + "worker_num": (">=", 1), + } + + ServerYamlConfChecker.check_conf(conf, default_conf, conf_type, + conf_qualification) + + @staticmethod + def check_local_service_conf(conf): + default_conf = { + "workdir": "", + "thread_num": 2, + "devices": "", + "mem_optim": True, + "ir_optim": False, + } + conf_type = { + "model_config": str, + "workdir": str, + "thread_num": int, + "devices": str, + "mem_optim": bool, + "ir_optim": bool, + } + conf_qualification = {"thread_num": (">=", 1), } + ServerYamlConfChecker.check_conf(conf, default_conf, conf_type, + conf_qualification) + + @staticmethod + def check_op_conf(conf): + default_conf = { + "concurrency": 1, + "timeout": -1, + "retry": 1, + "batch_size": 1, + "auto_batching_timeout": -1, + "local_service_conf": {}, + } + conf_type = { + "server_endpoints": list, + "fetch_list": list, + "client_config": str, + "concurrency": int, + "timeout": int, + "retry": int, + "batch_size": int, + "auto_batching_timeout": int, + } + conf_qualification = { + "concurrency": (">=", 1), + "retry": (">=", 1), + "batch_size": (">=", 1), + } + ServerYamlConfChecker.check_conf(conf, default_conf, conf_type, + conf_qualification) + + @staticmethod + def check_tracer_conf(conf): + default_conf = {"interval_s": -1, } + + conf_type = {"interval_s": int, } + + conf_qualification = {} + + ServerYamlConfChecker.check_conf(conf, default_conf, conf_type, + conf_qualification) + + @staticmethod + def check_dag_conf(conf): + default_conf = { + "retry": 1, + "client_type": "brpc", + "use_profile": False, + "channel_size": 0, + "is_thread_op": True, + "tracer": {}, + } + + conf_type = { + "retry": int, + "client_type": str, + "use_profile": bool, + "channel_size": int, + "is_thread_op": bool, + } + + conf_qualification = { + "retry": (">=", 1), + "client_type": ("in", ["brpc", "grpc"]), + "channel_size": (">=", 0), + } + + ServerYamlConfChecker.check_conf(conf, default_conf, conf_type, + conf_qualification) + + @staticmethod + def fill_with_default_conf(conf, default_conf): + for key, val in default_conf.items(): + if conf.get(key) is None: + _LOGGER.warning("[CONF] 
{} not set, use default: {}" + .format(key, val)) + conf[key] = val + + @staticmethod + def check_conf_type(conf, conf_type): + for key, val in conf_type.items(): + if key not in conf: + continue + if not isinstance(conf[key], val): + raise SystemExit("[CONF] {} must be {} type, but got {}." + .format(key, val, type(conf[key]))) + + @staticmethod + def check_conf_qualification(conf, conf_qualification): + for key, qualification in conf_qualification.items(): + if key not in conf: + continue + if not isinstance(qualification, list): + qualification = [qualification] + if not ServerYamlConfChecker.qualification_check(conf[key], + qualification): + raise SystemExit("[CONF] {} must be {}, but got {}." + .format(key, ", ".join([ + "{} {}" + .format(q[0], q[1]) for q in qualification + ]), conf[key])) + + @staticmethod + def qualification_check(value, qualifications): + if not isinstance(qualifications, list): + qualifications = [qualifications] + ok = True + for q in qualifications: + operator, limit = q + if operator == "<": + ok = value < limit + elif operator == "==": + ok = value == limit + elif operator == ">": + ok = value > limit + elif operator == "<=": + ok = value <= limit + elif operator == ">=": + ok = value >= limit + elif operator == "in": + ok = value in limit + else: + raise SystemExit("unknown operator: {}".format(operator)) + if not ok: + break + return ok diff --git a/python/pipeline/profiler.py b/python/pipeline/profiler.py index e6fe43b10a17aafc2b4058c47e72f1e640e7912e..b83bdd1dc8c5c948353c8ee95f51fe325e38dbfc 100644 --- a/python/pipeline/profiler.py +++ b/python/pipeline/profiler.py @@ -22,10 +22,165 @@ elif sys.version_info.major == 3: import queue as Queue else: raise Exception("Error Python version") +from time import time as _time import time import threading +import multiprocessing -_LOGGER = logging.getLogger() +_LOGGER = logging.getLogger(__name__) +_LOGGER.propagate = False + + +class PerformanceTracer(object): + def __init__(self, is_thread_mode, interval_s, server_worker_num): + self._is_thread_mode = is_thread_mode + if is_thread_mode: + # Because a Channel in thread mode cannot be + # accessed across processes, the PerformanceTracer + # also runs in thread mode in that case. + # However, performance may be affected by the GIL.
+ self._data_buffer = Queue.Queue() + else: + self._data_buffer = multiprocessing.Manager().Queue() + self._interval_s = interval_s + self._thrd = None + self._proc = None + self._channels = [] + # The size of data in Channel will not exceed server_worker_num + self._server_worker_num = server_worker_num + + def data_buffer(self): + return self._data_buffer + + def start(self): + if self._is_thread_mode: + self._thrd = threading.Thread( + target=self._trace_func, args=(self._channels, )) + self._thrd.daemon = True + self._thrd.start() + else: + self._proc = multiprocessing.Process( + target=self._trace_func, args=(self._channels, )) + self._proc.daemon = True + self._proc.start() + + def set_channels(self, channels): + self._channels = channels + + def _trace_func(self, channels): + all_actions = ["in", "prep", "midp", "postp", "out"] + calcu_actions = ["prep", "midp", "postp"] + while True: + op_cost = {} + err_request = [] + err_count = 0 + + _LOGGER.info("==================== TRACER ======================") + # op + while True: + try: + item = self._data_buffer.get_nowait() + name = item["name"] + actions = item["actions"] + + if name == "DAG": + succ = item["succ"] + req_id = item["id"] + if not succ: + err_count += 1 + err_request.append(req_id) + + if name not in op_cost: + op_cost[name] = {} + + for action, cost in actions.items(): + if action not in op_cost[name]: + op_cost[name][action] = [] + op_cost[name][action].append(cost) + except Queue.Empty: + break + + if len(op_cost) != 0: + for name in op_cost: + tot_cost, calcu_cost = 0.0, 0.0 + for action, costs in op_cost[name].items(): + op_cost[name][action] = sum(costs) / (1e3 * len(costs)) + tot_cost += op_cost[name][action] + + if name != "DAG": + _LOGGER.info("Op({}):".format(name)) + for action in all_actions: + if action in op_cost[name]: + _LOGGER.info("\t{}[{} ms]".format( + action, op_cost[name][action])) + for action in calcu_actions: + if action in op_cost[name]: + calcu_cost += op_cost[name][action] + _LOGGER.info("\tidle[{}]".format(1 - 1.0 * calcu_cost / + tot_cost)) + + if "DAG" in op_cost: + calls = list(op_cost["DAG"].values()) + calls.sort() + tot = len(calls) + qps = 1.0 * tot / self._interval_s + ave_cost = sum(calls) / tot + latencys = [50, 60, 70, 80, 90, 95, 99] + _LOGGER.info("DAGExecutor:") + _LOGGER.info("\tQuery count[{}]".format(tot)) + _LOGGER.info("\tQPS[{} q/s]".format(qps)) + _LOGGER.info("\tSucc[{}]".format(1 - 1.0 * err_count / tot)) + _LOGGER.info("\tError req[{}]".format(", ".join( + [str(x) for x in err_request]))) + _LOGGER.info("\tLatency:") + _LOGGER.info("\t\tave[{} ms]".format(ave_cost)) + for latency in latencys: + _LOGGER.info("\t\t.{}[{} ms]".format(latency, calls[int( + tot * latency / 100.0)])) + + # channel + _LOGGER.info("Channel (server worker num[{}]):".format( + self._server_worker_num)) + for channel in channels: + _LOGGER.info("\t{}(In: {}, Out: {}) size[{}/{}]".format( + channel.name, + channel.get_producers(), + channel.get_consumers(), + channel.size(), channel.get_maxsize())) + time.sleep(self._interval_s) + + +class UnsafeTimeProfiler(object): + """ thread unsafe profiler """ + + def __init__(self): + self.pid = os.getpid() + self.print_head = 'PROFILE\tpid:{}\t'.format(self.pid) + self.time_record = [self.print_head] + self._enable = False + + def enable(self, enable): + self._enable = enable + + def record(self, name): + if self._enable is False: + return + timestamp = int(round(_time() * 1000000)) + self.time_record.append('{}:{} '.format(name, timestamp)) + return timestamp +
+ def print_profile(self): + if self._enable is False: + return + sys.stderr.write(self.gen_profile_str()) + + def gen_profile_str(self): + if self._enable is False: + return + self.time_record.append('\n') + profile_str = ''.join(self.time_record) + self.time_record = [self.print_head] + return profile_str class TimeProfiler(object): @@ -42,12 +197,13 @@ class TimeProfiler(object): def record(self, name_with_tag): if self._enable is False: return - timestamp = int(round(time.time() * 1000000)) + timestamp = int(round(_time() * 1000000)) name_with_tag = name_with_tag.split("_") tag = name_with_tag[-1] name = '_'.join(name_with_tag[:-1]) with self._lock: self._time_record.put((name, tag, timestamp)) + return timestamp def print_profile(self): if self._enable is False: diff --git a/python/pipeline/proto/pipeline_service.proto b/python/pipeline/proto/pipeline_service.proto index a920d5618ce36a191390d5140bee0a42c7394a6b..02c922027ea6c00a3831137b55604950378b84fe 100644 --- a/python/pipeline/proto/pipeline_service.proto +++ b/python/pipeline/proto/pipeline_service.proto @@ -18,6 +18,7 @@ package baidu.paddle_serving.pipeline_serving; message Request { repeated string key = 1; repeated string value = 2; + optional string name = 3; }; message Response { diff --git a/python/pipeline/util.py b/python/pipeline/util.py index a24c1a057ca4bbb5cf33f2402559ce3d14f3e6b9..d7847f179de7557b5446958536008adc3c981f95 100644 --- a/python/pipeline/util.py +++ b/python/pipeline/util.py @@ -13,13 +13,131 @@ # limitations under the License. import sys +import logging +import threading +import multiprocessing +import multiprocessing.managers +from contextlib import closing +import socket +if sys.version_info.major == 2: + import Queue + from Queue import PriorityQueue +elif sys.version_info.major == 3: + import queue as Queue + from queue import PriorityQueue +else: + raise Exception("Error Python version") + +_LOGGER = logging.getLogger(__name__) + + +class AvailablePortGenerator(object): + def __init__(self, start_port=12000): + self._curr_port = start_port + + @staticmethod + def port_is_available(port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex(('0.0.0.0', port)) + if result != 0: + return True + else: + return False + + def next(self): + while not AvailablePortGenerator.port_is_available(self._curr_port): + self._curr_port += 1 + self._curr_port += 1 + return self._curr_port - 1 + + +_AvailablePortGenerator = AvailablePortGenerator() + + +def GetAvailablePortGenerator(): + return _AvailablePortGenerator class NameGenerator(object): + # use unsafe-id-generator def __init__(self, prefix): self._idx = -1 self._prefix = prefix + self._id_generator = UnsafeIdGenerator(1000000000000000000) + + def next(self): + next_id = self._id_generator.next() + return "{}{}".format(self._prefix, next_id) + + +class UnsafeIdGenerator(object): + def __init__(self, max_id, base_counter=0, step=1): + self._base_counter = base_counter + self._counter = self._base_counter + self._step = step + self._max_id = max_id # for reset def next(self): - self._idx += 1 - return "{}{}".format(self._prefix, self._idx) + if self._counter >= self._max_id: + self._counter = self._base_counter + _LOGGER.info("Reset Id: {}".format(self._counter)) + next_id = self._counter + self._counter += self._step + return next_id + + +class ThreadIdGenerator(UnsafeIdGenerator): + def __init__(self, max_id, base_counter=0, step=1, lock=None): + # if you want to use your lock, you may 
need to use Reentrant-Lock + self._lock = lock + if self._lock is None: + self._lock = threading.Lock() + super(ThreadIdGenerator, self).__init__(max_id, base_counter, step) + + def next(self): + next_id = None + with self._lock: + if self._counter >= self._max_id: + self._counter = self._base_counter + _LOGGER.info("Reset Id: {}".format(self._counter)) + next_id = self._counter + self._counter += self._step + return next_id + + +class ProcessIdGenerator(UnsafeIdGenerator): + def __init__(self, max_id, base_counter=0, step=1, lock=None): + # if you want to use your lock, you may need to use Reentrant-Lock + self._lock = lock + if self._lock is None: + self._lock = multiprocessing.Lock() + self._base_counter = base_counter + self._counter = multiprocessing.Manager().Value('i', 0) + self._step = step + self._max_id = max_id + + def next(self): + next_id = None + with self._lock: + if self._counter.value >= self._max_id: + self._counter.value = self._base_counter + _LOGGER.info("Reset Id: {}".format(self._counter.value)) + next_id = self._counter.value + self._counter.value += self._step + return next_id + + +def PipelineProcSyncManager(): + """ + add PriorityQueue into SyncManager, see more: + https://stackoverflow.com/questions/25324560/strange-queue-priorityqueue-behaviour-with-multiprocessing-in-python-2-7-6?answertab=active#tab-top + """ + + class PipelineManager(multiprocessing.managers.SyncManager): + pass + + PipelineManager.register("PriorityQueue", PriorityQueue) + m = PipelineManager() + m.start() + return m diff --git a/python/requirements.txt b/python/requirements.txt index 697b24fd4db6aff6b30913d8a5d23416dc208c80..6771d1adea85c0fd7ac32c26fcfd7dfe3f2cbdd4 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,10 @@ numpy>=1.12, <=1.16.4 ; python_version<"3.5" +shapely==1.7.0 +wheel>=0.34.0, <0.35.0 +setuptools>=44.1.0 +opencv-python==4.2.0.32 google>=2.0.3 +opencv-python==4.2.0.32 protobuf>=3.12.2 grpcio-tools>=1.28.1 grpcio>=1.28.1 diff --git a/python/setup.py.app.in b/python/setup.py.app.in index 1ee1cabb5a572536e6869852e3ab638cda6adcb8..1a06b0d352c1da4cdd09f74cb900853d4016afa8 100644 --- a/python/setup.py.app.in +++ b/python/setup.py.app.in @@ -16,7 +16,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import platform import os from setuptools import setup, Distribution, Extension @@ -24,26 +23,17 @@ from setuptools import find_packages from setuptools import setup from paddle_serving_app.version import serving_app_version from pkg_resources import DistributionNotFound, get_distribution +import util -def python_version(): - return [int(v) for v in platform.python_version().split(".")] - -def find_package(pkgname): - try: - get_distribution(pkgname) - return True - except DistributionNotFound: - return False - -max_version, mid_version, min_version = python_version() +max_version, mid_version, min_version = util.python_version() if '${PACK}' == 'ON': copy_lib() REQUIRED_PACKAGES = [ - 'six >= 1.10.0', 'sentencepiece', 'opencv-python', 'pillow', - 'shapely', 'pyclipper' + 'six >= 1.10.0', 'sentencepiece', 'opencv-python<=4.2.0.32', 'pillow', + 'shapely<=1.6.1', 'pyclipper' ] packages=['paddle_serving_app', diff --git a/python/setup.py.client.in b/python/setup.py.client.in index 96773c38dc950c0b8357274dff30d7c952ecdc25..196ff6c56b0dc049e3f2b27368f2a08de74b5c09 100644 --- a/python/setup.py.client.in +++ b/python/setup.py.client.in @@ -16,7 +16,6 @@ from __future__ import absolute_import from 
__future__ import division from __future__ import print_function -import platform import os import sys @@ -24,45 +23,39 @@ from setuptools import setup, Distribution, Extension from setuptools import find_packages from setuptools import setup from paddle_serving_client.version import serving_client_version -from pkg_resources import DistributionNotFound, get_distribution +import util py_version = sys.version_info -def python_version(): - return [int(v) for v in platform.python_version().split(".")] - -def find_package(pkgname): - try: - get_distribution(pkgname) - return True - except DistributionNotFound: - return False - def copy_lib(): if py_version[0] == 2: lib_list = ['libpython2.7.so.1.0', 'libssl.so.10', 'libcrypto.so.10'] + elif py_version[1] == 5: + lib_list = ['libpython3.5m.so.1.0', 'libssl.so.10', 'libcrypto.so.10'] elif py_version[1] == 6: lib_list = ['libpython3.6m.so.1.0', 'libssl.so.10', 'libcrypto.so.10'] elif py_version[1] == 7: lib_list = ['libpython3.7m.so.1.0', 'libssl.so.10', 'libcrypto.so.10'] os.popen('mkdir -p paddle_serving_client/lib') for lib in lib_list: - r = os.popen('whereis {}'.format(lib)) + r = os.popen('which {}'.format(lib)) text = r.read() - os.popen('cp {} ./paddle_serving_client/lib'.format(text.strip().split(' ')[1])) + os.popen('cp {} ./paddle_serving_client/lib'.format(text.strip())) -max_version, mid_version, min_version = python_version() +max_version, mid_version, min_version = util.python_version() + +# gen pipeline proto code +util.gen_pipeline_code("paddle_serving_client") if '${PACK}' == 'ON': copy_lib() - REQUIRED_PACKAGES = [ 'six >= 1.10.0', 'protobuf >= 3.11.0', 'numpy >= 1.12', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1' ] -if not find_package("paddlepaddle") and not find_package("paddlepaddle-gpu"): +if not util.find_package("paddlepaddle") and not util.find_package("paddlepaddle-gpu"): REQUIRED_PACKAGES.append("paddlepaddle") @@ -72,8 +65,10 @@ packages=['paddle_serving_client', 'paddle_serving_client.metric', 'paddle_serving_client.utils', 'paddle_serving_client.pipeline', - 'paddle_serving_client.pipeline.proto'] -package_data={'paddle_serving_client': ['serving_client.so','lib/*'],} + 'paddle_serving_client.pipeline.proto', + 'paddle_serving_client.pipeline.gateway', + 'paddle_serving_client.pipeline.gateway.proto'] +package_data={'paddle_serving_client': ['serving_client.so', 'lib/*', 'pipeline/gateway/libproxy_server.so'],} package_dir={'paddle_serving_client': '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client', 'paddle_serving_client.proto': @@ -87,7 +82,11 @@ package_dir={'paddle_serving_client': 'paddle_serving_client.pipeline': '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/pipeline', 'paddle_serving_client.pipeline.proto': - '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/pipeline/proto'} + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/pipeline/proto', + 'paddle_serving_client.pipeline.gateway': + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/pipeline/gateway', + 'paddle_serving_client.pipeline.gateway.proto': + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/pipeline/gateway/proto'} setup( name='paddle-serving-client', diff --git a/python/setup.py.in b/python/setup.py.in index af7036bdd99e05966156064dd2bcf1bb8463b716..fa7051db94ebdd69778f7957f50b1301697398fe 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -16,17 +16,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import 
platform - from setuptools import setup, Distribution, Extension from setuptools import find_packages from setuptools import setup from paddle_serving.version import serving_client_version +from grpc_tools import protoc +import util -def python_version(): - return [int(v) for v in platform.python_version().split(".")] - -max_version, mid_version, min_version = python_version() +max_version, mid_version, min_version = util.python_version() REQUIRED_PACKAGES = [ 'six >= 1.10.0', 'protobuf >= 3.1.0','paddlepaddle' diff --git a/python/setup.py.server.in b/python/setup.py.server.in index db679edbab8e6ba6929ed631c2bbc5a731146d0d..6733f1a4788818c530e3be0719686cea54cace49 100644 --- a/python/setup.py.server.in +++ b/python/setup.py.server.in @@ -16,25 +16,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import platform - from setuptools import setup, Distribution, Extension from setuptools import find_packages from setuptools import setup from paddle_serving_server.version import serving_server_version -from pkg_resources import DistributionNotFound, get_distribution - -def find_package(pkgname): - try: - get_distribution(pkgname) - return True - except DistributionNotFound: - return False +import util -def python_version(): - return [int(v) for v in platform.python_version().split(".")] +max_version, mid_version, min_version = util.python_version() -max_version, mid_version, min_version = python_version() +# gen pipeline proto code +util.gen_pipeline_code("paddle_serving_server") REQUIRED_PACKAGES = [ 'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1', @@ -44,7 +35,9 @@ REQUIRED_PACKAGES = [ packages=['paddle_serving_server', 'paddle_serving_server.proto', 'paddle_serving_server.pipeline', - 'paddle_serving_server.pipeline.proto'] + 'paddle_serving_server.pipeline.proto', + 'paddle_serving_server.pipeline.gateway', + 'paddle_serving_server.pipeline.gateway.proto'] package_dir={'paddle_serving_server': '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server', @@ -53,7 +46,13 @@ package_dir={'paddle_serving_server': 'paddle_serving_server.pipeline': '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/pipeline', 'paddle_serving_server.pipeline.proto': - '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/pipeline/proto'} + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/pipeline/proto', + 'paddle_serving_server.pipeline.gateway': + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/pipeline/gateway', + 'paddle_serving_server.pipeline.gateway.proto': + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/pipeline/gateway/proto'} + +package_data={'paddle_serving_server': ['pipeline/gateway/libproxy_server.so'],} setup( name='paddle-serving-server', @@ -65,6 +64,7 @@ setup( author_email='guru4elephant@gmail.com', install_requires=REQUIRED_PACKAGES, packages=packages, + package_data=package_data, package_dir=package_dir, # PyPI package information. 
classifiers=[ diff --git a/python/setup.py.server_gpu.in b/python/setup.py.server_gpu.in index 4554c1d368f70a32d16ceeabb54d63625f9f256d..523615b8e782c29ebdedadc54a9473a0b672aac0 100644 --- a/python/setup.py.server_gpu.in +++ b/python/setup.py.server_gpu.in @@ -16,25 +16,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import platform - from setuptools import setup, Distribution, Extension from setuptools import find_packages from setuptools import setup from paddle_serving_server_gpu.version import serving_server_version -from pkg_resources import DistributionNotFound, get_distribution - -def find_package(pkgname): - try: - get_distribution(pkgname) - return True - except DistributionNotFound: - return False +import util -def python_version(): - return [int(v) for v in platform.python_version().split(".")] +max_version, mid_version, min_version = util.python_version() -max_version, mid_version, min_version = python_version() +# gen pipeline proto code +util.gen_pipeline_code("paddle_serving_server_gpu") REQUIRED_PACKAGES = [ 'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1', @@ -44,7 +35,9 @@ REQUIRED_PACKAGES = [ packages=['paddle_serving_server_gpu', 'paddle_serving_server_gpu.proto', 'paddle_serving_server_gpu.pipeline', - 'paddle_serving_server_gpu.pipeline.proto'] + 'paddle_serving_server_gpu.pipeline.proto', + 'paddle_serving_server_gpu.pipeline.gateway', + 'paddle_serving_server_gpu.pipeline.gateway.proto'] package_dir={'paddle_serving_server_gpu': '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu', @@ -53,7 +46,13 @@ package_dir={'paddle_serving_server_gpu': 'paddle_serving_server_gpu.pipeline': '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline', 'paddle_serving_server_gpu.pipeline.proto': - '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/proto'} + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/proto', + 'paddle_serving_server_gpu.pipeline.gateway': + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/gateway', + 'paddle_serving_server_gpu.pipeline.gateway.proto': + '${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/gateway/proto'} + +package_data={'paddle_serving_server_gpu': ['pipeline/gateway/libproxy_server.so'],} setup( name='paddle-serving-server-gpu', @@ -65,6 +64,7 @@ setup( author_email='guru4elephant@gmail.com', install_requires=REQUIRED_PACKAGES, packages=packages, + package_data=package_data, package_dir=package_dir, # PyPI package information. classifiers=[ diff --git a/python/util.py b/python/util.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae68c1ed53766cb7f4f623e3a5f4fb50f7eb095 --- /dev/null +++ b/python/util.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pkg_resources import DistributionNotFound, get_distribution +from grpc_tools import protoc +import os +import platform + + +def python_version(): + return [int(v) for v in platform.python_version().split(".")] + + +def find_package(pkgname): + try: + get_distribution(pkgname) + return True + except DistributionNotFound: + return False + + +def gen_pipeline_code(package_name): + # pipeline service proto + protoc.main(( + '', + '-I.', + '--python_out=.', + '--grpc_python_out=.', + '{}/pipeline/proto/pipeline_service.proto'.format(package_name), )) + + # pipeline grpc-gateway proto + # *.pb.go + ret = os.system( + "cd {}/pipeline/gateway/proto/ && " + "../../../../../third_party/install/protobuf/bin/protoc -I. " + "-I$GOPATH/src " + "-I$GOPATH/src/github.com/grpc-ecosystem/grpc-gateway/third_party/googleapis " + "--go_out=plugins=grpc:. " + "gateway.proto".format(package_name)) + if ret != 0: + exit(1) + # *.gw.go + ret = os.system( + "cd {}/pipeline/gateway/proto/ && " + "../../../../../third_party/install/protobuf/bin/protoc -I. " + "-I$GOPATH/src " + "-I$GOPATH/src/github.com/grpc-ecosystem/grpc-gateway/third_party/googleapis " + "--grpc-gateway_out=logtostderr=true:. " + "gateway.proto".format(package_name)) + if ret != 0: + exit(1) + + # pipeline grpc-gateway shared-lib + ret = os.system( + "cd {}/pipeline/gateway && " + "go build -buildmode=c-shared -o libproxy_server.so proxy_server.go". + format(package_name)) + if ret != 0: + exit(1) diff --git a/tools/Dockerfile b/tools/Dockerfile index 6c61937755ea5e0257e70ce27cab528b76222b12..bf4254495e5a1163455887008540945d5898182e 100644 --- a/tools/Dockerfile +++ b/tools/Dockerfile @@ -12,4 +12,5 @@ RUN yum -y install wget && \ RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ python get-pip.py && rm get-pip.py && \ localedef -c -i en_US -f UTF-8 en_US.UTF-8 && \ - echo "export LANG=en_US.utf8" >> /root/.bashrc + echo "export LANG=en_US.utf8" >> /root/.bashrc && \ + echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc diff --git a/tools/Dockerfile.centos6.cuda9.0-cudnn7.devel b/tools/Dockerfile.centos6.cuda9.0-cudnn7.devel index 9ee3591b9a1e2ea5881106cf7e67ca28b24c1890..b4af571c19cec131800dd871b558c00d601b2f5e 100644 --- a/tools/Dockerfile.centos6.cuda9.0-cudnn7.devel +++ b/tools/Dockerfile.centos6.cuda9.0-cudnn7.devel @@ -31,7 +31,6 @@ RUN yum -y install wget && \ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ python get-pip.py && \ rm get-pip.py && \ - pip install google protobuf setuptools wheel flask numpy==1.16.4 && \ wget https://www.python.org/ftp/python/3.6.8/Python-3.6.8.tgz && \ tar -zxf Python-3.6.8.tgz && \ cd Python-3.6.8 && \ @@ -42,7 +41,7 @@ RUN yum -y install wget && \ echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \ source /root/.bashrc && \ cd .. 
&& rm -rf Python-3.6.8* && \ - pip3 install google protobuf setuptools wheel flask numpy==1.16.4 && \ yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \ yum clean all && \ - echo "export LANG=en_US.utf8" >> /root/.bashrc + echo "export LANG=en_US.utf8" >> /root/.bashrc && \ + echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc diff --git a/tools/Dockerfile.centos6.devel b/tools/Dockerfile.centos6.devel index 83981dcc4731252dfc75270b5ce6fc623a0266a8..6dfc304cb281f5bb7c7d60a0b38354118c893f39 100644 --- a/tools/Dockerfile.centos6.devel +++ b/tools/Dockerfile.centos6.devel @@ -31,7 +31,6 @@ RUN yum -y install wget && \ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ python get-pip.py && \ rm get-pip.py && \ - pip install google protobuf setuptools wheel flask numpy==1.16.4 && \ wget https://www.python.org/ftp/python/3.6.8/Python-3.6.8.tgz && \ tar -zxf Python-3.6.8.tgz && \ cd Python-3.6.8 && \ @@ -42,8 +41,8 @@ RUN yum -y install wget && \ echo 'export LD_LIBRARY_PATH=/usr/local/python3.6/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \ source /root/.bashrc && \ cd .. && rm -rf Python-3.6.8* && \ - pip3 install google protobuf setuptools wheel flask numpy==1.16.4 && \ yum -y install epel-release && yum -y install patchelf libXext libSM libXrender && \ yum clean all && \ localedef -c -i en_US -f UTF-8 en_US.UTF-8 && \ - echo "export LANG=en_US.utf8" >> /root/.bashrc + echo "export LANG=en_US.utf8" >> /root/.bashrc && \ + echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc diff --git a/tools/Dockerfile.ci b/tools/Dockerfile.ci index 92aee9e777387374397682b3a22dafe1294ccde4..0eb695d301f6b45c73d3f3b4af018bd2c83860ac 100644 --- a/tools/Dockerfile.ci +++ b/tools/Dockerfile.ci @@ -23,7 +23,6 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \ RUN yum -y install python-devel sqlite-devel >/dev/null \ && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \ && python get-pip.py >/dev/null \ - && pip install google protobuf setuptools wheel flask >/dev/null \ && rm get-pip.py RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2 \ @@ -35,8 +34,7 @@ RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2 && cd .. 
\ && rm -rf patchelf-0.10* -RUN yum install -y python3 python3-devel \ - && pip3 install google protobuf setuptools wheel flask +RUN yum install -y python3 python3-devel RUN yum -y update >/dev/null \ && yum -y install dnf >/dev/null \ diff --git a/tools/Dockerfile.cuda10.0-cudnn7 b/tools/Dockerfile.cuda10.0-cudnn7 index d2a5b2c93a3e78b807c7828c984a5fc29f50fd2d..c26eaeb986cbd7d66ee51bac1444cf293800d839 100644 --- a/tools/Dockerfile.cuda10.0-cudnn7 +++ b/tools/Dockerfile.cuda10.0-cudnn7 @@ -18,6 +18,7 @@ RUN ln -s /usr/local/cuda-10.0/lib64/libcublas.so.10.0 /usr/local/cuda-10.0/lib6 ln -s /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudnn.so.7 /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudnn.so && \ echo 'export LD_LIBRARY_PATH=/usr/local/cuda-10.0/targets/x86_64-linux/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \ echo "export LANG=en_US.utf8" >> /root/.bashrc && \ + echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc && \ mkdir -p /usr/local/cuda/extras COPY --from=builder /usr/local/cuda/extras/CUPTI /usr/local/cuda/extras/CUPTI diff --git a/tools/Dockerfile.cuda10.0-cudnn7.devel b/tools/Dockerfile.cuda10.0-cudnn7.devel index b46f9b96cf0d081cf9cdfc12cb46be037677ac86..d65e1e256c8772101a51fcc58ab525aff9f182a1 100644 --- a/tools/Dockerfile.cuda10.0-cudnn7.devel +++ b/tools/Dockerfile.cuda10.0-cudnn7.devel @@ -23,13 +23,12 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \ RUN yum -y install python-devel sqlite-devel \ && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \ && python get-pip.py >/dev/null \ - && pip install google protobuf setuptools wheel flask >/dev/null \ && rm get-pip.py RUN yum install -y python3 python3-devel \ - && pip3 install google protobuf setuptools wheel flask \ && yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\ && yum clean all RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \ - && echo "export LANG=en_US.utf8" >> /root/.bashrc + && echo "export LANG=en_US.utf8" >> /root/.bashrc \ + && echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc diff --git a/tools/Dockerfile.cuda9.0-cudnn7 b/tools/Dockerfile.cuda9.0-cudnn7 index 145cd53eb0b9848349d591fdc078833ec5a89740..ead1cfe77d7148789d84bc01fb05cedda5ff1fe6 100644 --- a/tools/Dockerfile.cuda9.0-cudnn7 +++ b/tools/Dockerfile.cuda9.0-cudnn7 @@ -18,6 +18,7 @@ RUN ln -s /usr/local/cuda-9.0/lib64/libcublas.so.9.0 /usr/local/cuda-9.0/lib64/l ln -s /usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudnn.so.7 /usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudnn.so && \ echo 'export LD_LIBRARY_PATH=/usr/local/cuda-9.0/targets/x86_64-linux/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \ echo "export LANG=en_US.utf8" >> /root/.bashrc && \ + echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc && \ mkdir -p /usr/local/cuda/extras COPY --from=builder /usr/local/cuda/extras/CUPTI /usr/local/cuda/extras/CUPTI diff --git a/tools/Dockerfile.cuda9.0-cudnn7.devel b/tools/Dockerfile.cuda9.0-cudnn7.devel index c101b32118bf735a100bdb07f2e33f7978a8a30c..b33ed58a2a97a6bea7a8f408c2e6a7f6ccc7d448 100644 --- a/tools/Dockerfile.cuda9.0-cudnn7.devel +++ b/tools/Dockerfile.cuda9.0-cudnn7.devel @@ -22,13 +22,12 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \ RUN yum -y install python-devel sqlite-devel \ && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \ && python get-pip.py >/dev/null \ - && pip install google protobuf setuptools wheel flask >/dev/null \ && rm get-pip.py RUN yum install -y python3 python3-devel \ - && pip3 
install google protobuf setuptools wheel flask \ && yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\ && yum clean all RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \ - && echo "export LANG=en_US.utf8" >> /root/.bashrc + && echo "export LANG=en_US.utf8" >> /root/.bashrc \ + && echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc diff --git a/tools/Dockerfile.devel b/tools/Dockerfile.devel index dc00384e39bb742400fee74663a551cf44019d61..83e3b491c30fe99eaa615e836efeef6aad0c0cc4 100644 --- a/tools/Dockerfile.devel +++ b/tools/Dockerfile.devel @@ -19,13 +19,12 @@ RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \ RUN yum -y install python-devel sqlite-devel \ && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \ && python get-pip.py >/dev/null \ - && pip install google protobuf setuptools wheel flask >/dev/null \ && rm get-pip.py RUN yum install -y python3 python3-devel \ - && pip3 install google protobuf setuptools wheel flask \ && yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\ && yum clean all RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \ - && echo "export LANG=en_US.utf8" >> /root/.bashrc + && echo "export LANG=en_US.utf8" >> /root/.bashrc \ + && echo "export LANGUAGE=en_US.utf8" >> /root/.bashrc diff --git a/tools/serving_build.sh b/tools/serving_build.sh index ac6e5f8cb5fcb7db5e7890c09b08d12ba14d0294..ee6e7cdb40ca86f1e4f4921fa4b257cb982337a5 100644 --- a/tools/serving_build.sh +++ b/tools/serving_build.sh @@ -19,6 +19,13 @@ function init() { cd Serving export SERVING_WORKDIR=$PWD $PYTHONROOT/bin/python -m pip install -r python/requirements.txt + export GOPATH=$HOME/go + export PATH=$PATH:$GOPATH/bin + + go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway + go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger + go get -u github.com/golang/protobuf/protoc-gen-go + go get -u google.golang.org/grpc } function check_cmd() { @@ -298,7 +305,6 @@ function python_test_bert() { cd bert # pwd: /Serving/python/examples/bert case $TYPE in CPU) - pip install paddlehub # Because download from paddlehub may timeout, # download the model from bos(max_seq_len=128). wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz @@ -306,14 +312,12 @@ function python_test_bert() { sh get_data.sh check_cmd "python -m paddle_serving_server.serve --model bert_chinese_L-12_H-768_A-12_model --port 9292 &" sleep 5 - pip install paddle_serving_app check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_chinese_L-12_H-768_A-12_client/serving_client_conf.prototxt" kill_server_process echo "bert RPC inference pass" ;; GPU) export CUDA_VISIBLE_DEVICES=0 - pip install paddlehub # Because download from paddlehub may timeout, # download the model from bos(max_seq_len=128). 
wget https://paddle-serving.bj.bcebos.com/paddle_hub_models/text/SemanticModel/bert_chinese_L-12_H-768_A-12.tar.gz @@ -321,7 +325,6 @@ function python_test_bert() { sh get_data.sh check_cmd "python -m paddle_serving_server_gpu.serve --model bert_chinese_L-12_H-768_A-12_model --port 9292 --gpu_ids 0 &" sleep 5 - pip install paddle_serving_app check_cmd "head -n 10 data-c.txt | python bert_client.py --model bert_chinese_L-12_H-768_A-12_client/serving_client_conf.prototxt" kill_server_process echo "bert RPC inference pass" @@ -760,13 +763,14 @@ function python_test_resnet50(){ } function python_test_pipeline(){ - # pwd:/ Serving/python/examples + # pwd: /Serving/python/examples local TYPE=$1 export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving unsetproxy - cd pipeline/imdb_model_ensemble + cd pipeline # pwd: /Serving/python/examples/pipeline case $TYPE in CPU) + cd imdb_model_ensemble # pwd: /Serving/python/examples/pipeline/imdb_model_ensemble # start paddle serving service (brpc) sh get_data.sh python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 --workdir test9292 &> cnn.log & @@ -775,8 +779,8 @@ function python_test_pipeline(){ # test: thread servicer & thread op cat << EOF > config.yml -port: 18080 -worker_num: 2 +rpc_port: 18080 +worker_num: 4 build_dag_each_worker: false dag: is_thread_op: true @@ -792,8 +796,8 @@ EOF # test: thread servicer & process op cat << EOF > config.yml -port: 18080 -worker_num: 2 +rpc_port: 18080 +worker_num: 4 build_dag_each_worker: false dag: is_thread_op: false @@ -807,13 +811,13 @@ EOF ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill kill_process_by_port 18080 - # test: process servicer & thread op + # test: process servicer & process op cat << EOF > config.yml -port: 18080 -worker_num: 2 -build_dag_each_worker: true +rpc_port: 18080 +worker_num: 4 +build_dag_each_worker: false dag: - is_thread_op: flase + is_thread_op: false client_type: brpc retry: 1 use_profile: false @@ -823,12 +827,14 @@ EOF check_cmd "python test_pipeline_client.py" ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill kill_process_by_port 18080 - - # test: process servicer & process op + + # test: process servicer & thread op + pip uninstall grpcio -y + pip install grpcio --no-binary=grpcio cat << EOF > config.yml -port: 18080 -worker_num: 2 -build_dag_each_worker: false +rpc_port: 18080 +worker_num: 4 +build_dag_each_worker: true dag: is_thread_op: false client_type: brpc @@ -840,7 +846,7 @@ EOF check_cmd "python test_pipeline_client.py" ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill kill_process_by_port 18080 - + kill_server_process kill_process_by_port 9292 kill_process_by_port 9393 @@ -850,8 +856,8 @@ EOF python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 --use_multilang --workdir test9393 &> bow.log & sleep 5 cat << EOF > config.yml -port: 18080 -worker_num: 2 +rpc_port: 18080 +worker_num: 4 build_dag_each_worker: false dag: is_thread_op: false @@ -867,16 +873,47 @@ EOF kill_server_process kill_process_by_port 9292 kill_process_by_port 9393 + cd .. 
+ + cd simple_web_service # pwd: /Serving/python/examples/pipeline/simple_web_service + sh get_data.sh + python web_service.py >/dev/null & + sleep 5 + curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' + check http code + http_code=`curl -X POST -k -d '{"key":["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' -s -w "%{http_code}" -o /dev/null http://localhost:18080/uci/prediction` + if [ ${http_code} -ne 200 ]; then + echo "HTTP status code -ne 200" + exit 1 + fi + ps -ef | grep "web_service" | grep -v grep | awk '{print $2}' | xargs kill + ps -ef | grep "pipeline" | grep -v grep | awk '{print $2}' | xargs kill + kill_server_process + cd .. ;; GPU) - echo "pipeline ignore GPU test" + cd simple_web_service # pwd: /Serving/python/examples/pipeline/simple_web_service + sh get_data.sh + python web_service.py >/dev/null & + sleep 5 + curl -X POST -k http://localhost:18080/uci/prediction -d '{"key": ["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' + # check http code + http_code=`curl -X POST -k -d '{"key":["x"], "value": ["0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332"]}' -s -w "%{http_code}" -o /dev/null http://localhost:18080/uci/prediction` + if [ ${http_code} -ne 200 ]; then + echo "HTTP status code -ne 200" + exit 1 + fi + ps -ef | grep "web_service" | grep -v grep | awk '{print $2}' | xargs kill + ps -ef | grep "pipeline" | grep -v grep | awk '{print $2}' | xargs kill + kill_server_process + cd .. # pwd: /Serving/python/examples/pipeline ;; *) echo "error type" exit 1 ;; esac - cd ../../ + cd .. setproxy unset SERVING_BIN } @@ -926,118 +963,8 @@ function monitor_test() { mkdir _monitor_test && cd _monitor_test # pwd: /Serving/_monitor_test case $TYPE in CPU): - pip install pyftpdlib - mkdir remote_path - mkdir local_path - cd remote_path # pwd: /Serving/_monitor_test/remote_path - check_cmd "python -m pyftpdlib -p 8000 &>/dev/null &" - cd .. # pwd: /Serving/_monitor_test - - # type: ftp - # remote_path: / - # remote_model_name: uci_housing.tar.gz - # local_tmp_path: ___tmp - # local_path: local_path - cd remote_path # pwd: /Serving/_monitor_test/remote_path - wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz - touch donefile - cd .. # pwd: /Serving/_monitor_test - mkdir -p local_path/uci_housing_model - python -m paddle_serving_server.monitor \ - --type='ftp' --ftp_host='127.0.0.1' --ftp_port='8000' \ - --remote_path='/' --remote_model_name='uci_housing.tar.gz' \ - --remote_donefile_name='donefile' --local_path='local_path' \ - --local_model_name='uci_housing_model' --local_timestamp_file='fluid_time_file' \ - --local_tmp_path='___tmp' --unpacked_filename='uci_housing_model' \ - --interval='1' >/dev/null & - sleep 10 - if [ ! -f "local_path/uci_housing_model/fluid_time_file" ]; then - echo "local_path/uci_housing_model/fluid_time_file not exist." 
- exit 1 - fi - ps -ef | grep "monitor" | grep -v grep | awk '{print $2}' | xargs kill - rm -rf remote_path/* - rm -rf local_path/* - - # type: ftp - # remote_path: /tmp_dir - # remote_model_name: uci_housing_model - # local_tmp_path: ___tmp - # local_path: local_path - mkdir -p remote_path/tmp_dir && cd remote_path/tmp_dir # pwd: /Serving/_monitor_test/remote_path/tmp_dir - wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz - tar -xzf uci_housing.tar.gz - touch donefile - cd ../.. # pwd: /Serving/_monitor_test - mkdir -p local_path/uci_housing_model - python -m paddle_serving_server.monitor \ - --type='ftp' --ftp_host='127.0.0.1' --ftp_port='8000' \ - --remote_path='/tmp_dir' --remote_model_name='uci_housing_model' \ - --remote_donefile_name='donefile' --local_path='local_path' \ - --local_model_name='uci_housing_model' --local_timestamp_file='fluid_time_file' \ - --local_tmp_path='___tmp' --interval='1' >/dev/null & - sleep 10 - if [ ! -f "local_path/uci_housing_model/fluid_time_file" ]; then - echo "local_path/uci_housing_model/fluid_time_file not exist." - exit 1 - fi - ps -ef | grep "monitor" | grep -v grep | awk '{print $2}' | xargs kill - rm -rf remote_path/* - rm -rf local_path/* - - # type: general - # remote_path: / - # remote_model_name: uci_housing.tar.gz - # local_tmp_path: ___tmp - # local_path: local_path - cd remote_path # pwd: /Serving/_monitor_test/remote_path - wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz - touch donefile - cd .. # pwd: /Serving/_monitor_test - mkdir -p local_path/uci_housing_model - python -m paddle_serving_server.monitor \ - --type='general' --general_host='ftp://127.0.0.1:8000' \ - --remote_path='/' --remote_model_name='uci_housing.tar.gz' \ - --remote_donefile_name='donefile' --local_path='local_path' \ - --local_model_name='uci_housing_model' --local_timestamp_file='fluid_time_file' \ - --local_tmp_path='___tmp' --unpacked_filename='uci_housing_model' \ - --interval='1' >/dev/null & - sleep 10 - if [ ! -f "local_path/uci_housing_model/fluid_time_file" ]; then - echo "local_path/uci_housing_model/fluid_time_file not exist." - exit 1 - fi - ps -ef | grep "monitor" | grep -v grep | awk '{print $2}' | xargs kill - rm -rf remote_path/* - rm -rf local_path/* - - # type: general - # remote_path: /tmp_dir - # remote_model_name: uci_housing_model - # local_tmp_path: ___tmp - # local_path: local_path - mkdir -p remote_path/tmp_dir && cd remote_path/tmp_dir # pwd: /Serving/_monitor_test/remote_path/tmp_dir - wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz - tar -xzf uci_housing.tar.gz - touch donefile - cd ../.. # pwd: /Serving/_monitor_test - mkdir -p local_path/uci_housing_model - python -m paddle_serving_server.monitor \ - --type='general' --general_host='ftp://127.0.0.1:8000' \ - --remote_path='/tmp_dir' --remote_model_name='uci_housing_model' \ - --remote_donefile_name='donefile' --local_path='local_path' \ - --local_model_name='uci_housing_model' --local_timestamp_file='fluid_time_file' \ - --local_tmp_path='___tmp' --interval='1' >/dev/null & - sleep 10 - if [ ! -f "local_path/uci_housing_model/fluid_time_file" ]; then - echo "local_path/uci_housing_model/fluid_time_file not exist." - exit 1 - fi - ps -ef | grep "monitor" | grep -v grep | awk '{print $2}' | xargs kill - rm -rf remote_path/* - rm -rf local_path/* - - ps -ef | grep "pyftpdlib" | grep -v grep | awk '{print $2}' | xargs kill + # The CPU part and GPU part are identical. 
+ # To avoid the Travis CI timeout (50 min), the CPU version is not checked here. ;; GPU): pip install pyftpdlib