Commit f482b551 authored by barrierye

Merge branch 'develop' of https://github.com/PaddlePaddle/Serving into java-sdk

......@@ -143,7 +143,6 @@ function(grpc_protobuf_generate_python SRCS)
set(${SRCS} ${${SRCS}} PARENT_SCOPE)
endfunction()
# Print and set the protobuf library information,
# finish this cmake process and exit from this file.
macro(PROMPT_PROTOBUF_LIB)
......
......@@ -86,6 +86,7 @@ add_custom_command(TARGET general_model_config_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMENT "Copy generated general_model_config proto file into directory paddle_serving_server/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_command(TARGET multi_lang_general_model_service_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
COMMAND cp *.py ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto
......
......@@ -28,16 +28,17 @@ message FeedInst { repeated Tensor tensor_array = 1; };
message FetchInst { repeated Tensor tensor_array = 1; };
message Request {
message InferenceRequest {
repeated FeedInst insts = 1;
repeated string feed_var_names = 2;
repeated string fetch_var_names = 3;
required bool is_python = 4 [ default = false ];
};
message Response {
message InferenceResponse {
repeated ModelOutput outputs = 1;
optional string tag = 2;
required int32 err_code = 3;
};
message ModelOutput {
......@@ -45,6 +46,17 @@ message ModelOutput {
optional string engine_name = 2;
}
message SetTimeoutRequest { required int32 timeout_ms = 1; }
message SimpleResponse { required int32 err_code = 1; }
message GetClientConfigRequest {}
message GetClientConfigResponse { required string client_config_str = 1; }
service MultiLangGeneralModelService {
rpc inference(Request) returns (Response) {}
rpc Inference(InferenceRequest) returns (InferenceResponse) {}
rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {}
rpc GetClientConfig(GetClientConfigRequest)
returns (GetClientConfigResponse) {}
};
......@@ -49,6 +49,8 @@ class ModelRes {
res._int64_value_map.end());
_float_value_map.insert(res._float_value_map.begin(),
res._float_value_map.end());
_int32_value_map.insert(res._int32_value_map.begin(),
res._int32_value_map.end());
_shape_map.insert(res._shape_map.begin(), res._shape_map.end());
_lod_map.insert(res._lod_map.begin(), res._lod_map.end());
}
......@@ -60,6 +62,9 @@ class ModelRes {
_float_value_map.insert(
std::make_move_iterator(std::begin(res._float_value_map)),
std::make_move_iterator(std::end(res._float_value_map)));
_int32_value_map.insert(
std::make_move_iterator(std::begin(res._int32_value_map)),
std::make_move_iterator(std::end(res._int32_value_map)));
_shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)),
std::make_move_iterator(std::end(res._shape_map)));
_lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)),
......@@ -78,6 +83,12 @@ class ModelRes {
std::vector<float>&& get_float_by_name_with_rv(const std::string& name) {
return std::move(_float_value_map[name]);
}
const std::vector<int32_t>& get_int32_by_name(const std::string& name) {
return _int32_value_map[name];
}
std::vector<int32_t>&& get_int32_by_name_with_rv(const std::string& name) {
return std::move(_int32_value_map[name]);
}
const std::vector<int>& get_shape_by_name(const std::string& name) {
return _shape_map[name];
}
......@@ -103,6 +114,9 @@ class ModelRes {
_float_value_map.insert(
std::make_move_iterator(std::begin(res._float_value_map)),
std::make_move_iterator(std::end(res._float_value_map)));
_int32_value_map.insert(
std::make_move_iterator(std::begin(res._int32_value_map)),
std::make_move_iterator(std::end(res._int32_value_map)));
_shape_map.insert(std::make_move_iterator(std::begin(res._shape_map)),
std::make_move_iterator(std::end(res._shape_map)));
_lod_map.insert(std::make_move_iterator(std::begin(res._lod_map)),
......@@ -115,6 +129,7 @@ class ModelRes {
std::string _engine_name;
std::map<std::string, std::vector<int64_t>> _int64_value_map;
std::map<std::string, std::vector<float>> _float_value_map;
std::map<std::string, std::vector<int32_t>> _int32_value_map;
std::map<std::string, std::vector<int>> _shape_map;
std::map<std::string, std::vector<int>> _lod_map;
};
......@@ -145,6 +160,14 @@ class PredictorRes {
const std::string& name) {
return std::move(_models[model_idx].get_float_by_name_with_rv(name));
}
const std::vector<int32_t>& get_int32_by_name(const int model_idx,
const std::string& name) {
return _models[model_idx].get_int32_by_name(name);
}
std::vector<int32_t>&& get_int32_by_name_with_rv(const int model_idx,
const std::string& name) {
return std::move(_models[model_idx].get_int32_by_name_with_rv(name));
}
const std::vector<int>& get_shape_by_name(const int model_idx,
const std::string& name) {
return _models[model_idx].get_shape_by_name(name);
......
......@@ -207,17 +207,28 @@ int PredictorClient::batch_predict(
for (auto &name : int_feed_name) {
int idx = _feed_name_to_idx[name];
Tensor *tensor = tensor_vec[idx];
VLOG(2) << "prepare int feed " << name << " shape size "
<< int_shape[vec_idx].size();
if (_type[idx] == 0) {
VLOG(2) << "prepare int64 feed " << name << " shape size "
<< int_shape[vec_idx].size();
VLOG(3) << "feed var name " << name << " index " << vec_idx
<< "first data " << int_feed[vec_idx][0];
for (uint32_t j = 0; j < int_feed[vec_idx].size(); ++j) {
tensor->add_int64_data(int_feed[vec_idx][j]);
}
} else if (_type[idx] == 2) {
VLOG(2) << "prepare int32 feed " << name << " shape size "
<< int_shape[vec_idx].size();
VLOG(3) << "feed var name " << name << " index " << vec_idx
<< "first data " << int32_t(int_feed[vec_idx][0]);
for (uint32_t j = 0; j < int_feed[vec_idx].size(); ++j) {
tensor->add_int_data(int32_t(int_feed[vec_idx][j]));
}
}
for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
tensor->add_shape(int_shape[vec_idx][j]);
}
tensor->set_elem_type(0);
VLOG(3) << "feed var name " << name << " index " << vec_idx
<< "first data " << int_feed[vec_idx][0];
for (uint32_t j = 0; j < int_feed[vec_idx].size(); ++j) {
tensor->add_int64_data(int_feed[vec_idx][j]);
}
tensor->set_elem_type(_type[idx]);
vec_idx++;
}
......@@ -284,18 +295,25 @@ int PredictorClient::batch_predict(
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
if (_fetch_name_to_type[name] == 0) {
VLOG(2) << "ferch var " << name << "type int";
VLOG(2) << "ferch var " << name << "type int64";
int size = output.insts(0).tensor_array(idx).int64_data_size();
model._int64_value_map[name] = std::vector<int64_t>(
output.insts(0).tensor_array(idx).int64_data().begin(),
output.insts(0).tensor_array(idx).int64_data().begin() + size);
} else {
} else if (_fetch_name_to_type[name] == 1) {
VLOG(2) << "fetch var " << name << "type float";
int size = output.insts(0).tensor_array(idx).float_data_size();
model._float_value_map[name] = std::vector<float>(
output.insts(0).tensor_array(idx).float_data().begin(),
output.insts(0).tensor_array(idx).float_data().begin() + size);
} else if (_fetch_name_to_type[name] == 2) {
VLOG(2) << "fetch var " << name << "type int32";
int size = output.insts(0).tensor_array(idx).int_data_size();
model._int32_value_map[name] = std::vector<int32_t>(
output.insts(0).tensor_array(idx).int_data().begin(),
output.insts(0).tensor_array(idx).int_data().begin() + size);
}
idx += 1;
}
predict_res_batch.add_model_res(std::move(model));
......@@ -442,12 +460,19 @@ int PredictorClient::numpy_predict(
for (auto &name : int_feed_name) {
int idx = _feed_name_to_idx[name];
Tensor *tensor = tensor_vec[idx];
VLOG(2) << "prepare int feed " << name << " shape size "
<< int_shape[vec_idx].size();
for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
tensor->add_shape(int_shape[vec_idx][j]);
}
tensor->set_elem_type(0);
tensor->set_elem_type(_type[idx]);
if (_type[idx] == 0) {
VLOG(2) << "prepare int feed " << name << " shape size "
<< int_shape[vec_idx].size();
} else {
VLOG(2) << "prepare int32 feed " << name << " shape size "
<< int_shape[vec_idx].size();
}
const int int_shape_size = int_shape[vec_idx].size();
switch (int_shape_size) {
......@@ -457,7 +482,11 @@ int PredictorClient::numpy_predict(
for (ssize_t j = 0; j < int_array.shape(1); j++) {
for (ssize_t k = 0; k < int_array.shape(2); k++) {
for (ssize_t l = 0; l < int_array.shape(3); l++) {
tensor->add_int64_data(int_array(i, j, k, l));
if (_type[idx] == 0) {
tensor->add_int64_data(int_array(i, j, k, l));
} else {
tensor->add_int_data(int_array(i, j, k, l));
}
}
}
}
......@@ -469,7 +498,11 @@ int PredictorClient::numpy_predict(
for (ssize_t i = 0; i < int_array.shape(0); i++) {
for (ssize_t j = 0; j < int_array.shape(1); j++) {
for (ssize_t k = 0; k < int_array.shape(2); k++) {
tensor->add_int64_data(int_array(i, j, k));
if (_type[idx] == 0) {
tensor->add_int64_data(int_array(i, j, k));
} else {
tensor->add_int_data(int_array(i, j, k));
}
}
}
}
......@@ -479,7 +512,11 @@ int PredictorClient::numpy_predict(
auto int_array = int_feed[vec_idx].unchecked<2>();
for (ssize_t i = 0; i < int_array.shape(0); i++) {
for (ssize_t j = 0; j < int_array.shape(1); j++) {
tensor->add_int64_data(int_array(i, j));
if (_type[idx] == 0) {
tensor->add_int64_data(int_array(i, j));
} else {
tensor->add_int_data(int_array(i, j));
}
}
}
break;
......@@ -487,7 +524,11 @@ int PredictorClient::numpy_predict(
case 1: {
auto int_array = int_feed[vec_idx].unchecked<1>();
for (ssize_t i = 0; i < int_array.shape(0); i++) {
tensor->add_int64_data(int_array(i));
if (_type[idx] == 0) {
tensor->add_int64_data(int_array(i));
} else {
tensor->add_int_data(int_array(i));
}
}
break;
}
......@@ -557,17 +598,23 @@ int PredictorClient::numpy_predict(
for (auto &name : fetch_name) {
// int idx = _fetch_name_to_idx[name];
if (_fetch_name_to_type[name] == 0) {
VLOG(2) << "ferch var " << name << "type int";
VLOG(2) << "ferch var " << name << "type int64";
int size = output.insts(0).tensor_array(idx).int64_data_size();
model._int64_value_map[name] = std::vector<int64_t>(
output.insts(0).tensor_array(idx).int64_data().begin(),
output.insts(0).tensor_array(idx).int64_data().begin() + size);
} else {
} else if (_fetch_name_to_type[name] == 1) {
VLOG(2) << "fetch var " << name << "type float";
int size = output.insts(0).tensor_array(idx).float_data_size();
model._float_value_map[name] = std::vector<float>(
output.insts(0).tensor_array(idx).float_data().begin(),
output.insts(0).tensor_array(idx).float_data().begin() + size);
} else if (_fetch_name_to_type[name] == 2) {
VLOG(2) << "fetch var " << name << "type int32";
int size = output.insts(0).tensor_array(idx).int_data_size();
model._int32_value_map[name] = std::vector<int32_t>(
output.insts(0).tensor_array(idx).int_data().begin(),
output.insts(0).tensor_array(idx).int_data().begin() + size);
}
idx += 1;
}
......@@ -601,7 +648,6 @@ int PredictorClient::numpy_predict(
_api.thrd_clear();
return 0;
}
} // namespace general_model
} // namespace paddle_serving
} // namespace baidu
......@@ -126,9 +126,12 @@ int GeneralReaderOp::inference() {
if (elem_type[i] == 0) { // int64
elem_size[i] = sizeof(int64_t);
lod_tensor.dtype = paddle::PaddleDType::INT64;
} else {
} else if (elem_type[i] == 1) {
elem_size[i] = sizeof(float);
lod_tensor.dtype = paddle::PaddleDType::FLOAT32;
} else if (elem_type[i] == 2) {
elem_size[i] = sizeof(int32_t);
lod_tensor.dtype = paddle::PaddleDType::INT32;
}
if (model_config->_is_lod_feed[i]) {
......@@ -159,8 +162,10 @@ int GeneralReaderOp::inference() {
int data_len = 0;
if (tensor.int64_data_size() > 0) {
data_len = tensor.int64_data_size();
} else {
} else if (tensor.float_data_size() > 0) {
data_len = tensor.float_data_size();
} else if (tensor.int_data_size() > 0) {
data_len = tensor.int_data_size();
}
VLOG(2) << "tensor size for var[" << i << "]: " << data_len;
tensor_size += data_len;
......@@ -198,6 +203,8 @@ int GeneralReaderOp::inference() {
for (int i = 0; i < var_num; ++i) {
if (elem_type[i] == 0) {
int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
VLOG(2) << "first element data in var[" << i << "] is "
<< req->insts(0).tensor_array(i).int64_data(0);
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
int elem_num = req->insts(j).tensor_array(i).int64_data_size();
......@@ -210,8 +217,10 @@ int GeneralReaderOp::inference() {
offset += capacity[i];
}
}
} else {
} else if (elem_type[i] == 1) {
float *dst_ptr = static_cast<float *>(out->at(i).data.data());
VLOG(2) << "first element data in var[" << i << "] is "
<< req->insts(0).tensor_array(i).float_data(0);
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
int elem_num = req->insts(j).tensor_array(i).float_data_size();
......@@ -224,6 +233,22 @@ int GeneralReaderOp::inference() {
offset += capacity[i];
}
}
} else if (elem_type[i] == 2) {
int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
VLOG(2) << "first element data in var[" << i << "] is "
<< req->insts(0).tensor_array(i).int_data(0);
int offset = 0;
for (int j = 0; j < batch_size; ++j) {
int elem_num = req->insts(j).tensor_array(i).int_data_size();
for (int k = 0; k < elem_num; ++k) {
dst_ptr[offset + k] = req->insts(j).tensor_array(i).int_data(k);
}
if (out->at(i).lod.size() == 1) {
offset = out->at(i).lod[0][j + 1];
} else {
offset += capacity[i];
}
}
}
}
......
......@@ -91,7 +91,6 @@ int GeneralResponseOp::inference() {
for (auto &idx : fetch_index) {
Tensor *tensor = fetch_inst->add_tensor_array();
tensor->set_elem_type(1);
if (model_config->_is_lod_fetch[idx]) {
VLOG(2) << "out[" << idx << "] " << model_config->_fetch_name[idx]
<< " is lod_tensor";
......@@ -116,7 +115,7 @@ int GeneralResponseOp::inference() {
cap *= in->at(idx).shape[j];
}
if (in->at(idx).dtype == paddle::PaddleDType::INT64) {
VLOG(2) << "Prepare float var [" << model_config->_fetch_name[idx]
VLOG(2) << "Prepare int64 var [" << model_config->_fetch_name[idx]
<< "].";
int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
if (model_config->_is_lod_fetch[idx]) {
......@@ -157,6 +156,27 @@ int GeneralResponseOp::inference() {
}
VLOG(2) << "fetch var [" << model_config->_fetch_name[idx] << "] ready";
var_idx++;
} else if (in->at(idx).dtype == paddle::PaddleDType::INT32) {
VLOG(2) << "Prepare int32 var [" << model_config->_fetch_name[idx]
<< "].";
int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data());
if (model_config->_is_lod_fetch[idx]) {
FetchInst *fetch_p = output->mutable_insts(0);
for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
fetch_p->mutable_tensor_array(var_idx)->add_lod(
in->at(idx).lod[0][j]);
}
for (int j = 0; j < cap; ++j) {
fetch_p->mutable_tensor_array(var_idx)->add_int_data(data_ptr[j]);
}
} else {
FetchInst *fetch_p = output->mutable_insts(0);
for (int j = 0; j < cap; ++j) {
fetch_p->mutable_tensor_array(var_idx)->add_int_data(data_ptr[j]);
}
}
VLOG(2) << "fetch var [" << model_config->_fetch_name[idx] << "] ready";
var_idx++;
}
}
}
......
......@@ -603,13 +603,13 @@ class VersionedInferEngine : public InferEngine {
LOG(ERROR) << "Failed generate engine with type:" << engine_type;
return -1;
}
VLOG(2) << "FLGS_logtostderr " << FLAGS_logtostderr;
VLOG(2) << "FLAGS_logtostderr " << FLAGS_logtostderr;
int tmp = FLAGS_logtostderr;
if (engine->proc_initialize(conf, version) != 0) {
LOG(ERROR) << "Failed initialize engine, type:" << engine_type;
return -1;
}
VLOG(2) << "FLGS_logtostderr " << FLAGS_logtostderr;
VLOG(2) << "FLAGS_logtostderr " << FLAGS_logtostderr;
FLAGS_logtostderr = tmp;
auto r = _versions.insert(std::make_pair(engine->version(), engine));
if (!r.second) {
......
......@@ -233,7 +233,7 @@ int compress_parameter_parallel(const char *file1,
greedy_search(
emb_table + k * emb_size, xmin, xmax, loss, emb_size, bits);
// compute the scale corresponding to the minimum loss
float scale = (xmax - xmin) * (pow2bits - 1);
float scale = (xmax - xmin) / (pow2bits - 1);
char *min_ptr = tensor_temp;
char *max_ptr = tensor_temp + sizeof(float);
memcpy(min_ptr, &xmin, sizeof(float));
......
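For reference, a minimal arithmetic sketch of the quantization scale computed above; the bit width and the `xmin`/`xmax` range here are assumed values, not taken from the surrounding code:
```python
# Illustration only (assumed values): (2**bits - 1) is the number of
# quantization steps between xmin and xmax, so the step width (scale)
# is the range divided by that count.
bits = 8                                  # assumed bit width
xmin, xmax = -1.0, 1.0                    # assumed range returned by greedy_search
scale = (xmax - xmin) / (2 ** bits - 1)   # 2.0 / 255 ~= 0.00784
q = round((0.5 - xmin) / scale)           # quantize x = 0.5 -> index 191
x_restored = xmin + q * scale             # ~= 0.498, close to the original 0.5
```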
# gRPC Interface
The gRPC interface is implemented in a way similar to a Web Service:
![](grpc_impl.png)
## Comparison with the bRPC interface
1. The gRPC Server adds a `client_config_path` parameter to the `load_model_config` function:
```python
def load_model_config(self, server_config_paths, client_config_path=None)
```
In some examples the configuration files of the bRPC Server and the bRPC Client differ (e.g. in the cube local example, the Client-side data is first handed to cube and only passed to the inference library after cube has processed it), so the gRPC Server needs the gRPC Client configuration as well. At the same time, to remove the step of manually loading the configuration file on the gRPC Client side, the gRPC Server is designed to load both configuration files. `client_config_path` defaults to `<server_config_path>/serving_server_conf.prototxt`.
2. The gRPC Client drops the `load_client_config` step:
The `connect` step obtains the corresponding prototxt via RPC (fetching it from any one endpoint is sufficient).
3. The gRPC Client sets the timeout via RPC (the call style stays consistent with the bRPC Client):
Because the bRPC Client cannot change its timeout after `connect`, when the gRPC Server receives a request to change the timeout it re-creates the bRPC Client instance with the new timeout, and the gRPC Client also sets the corresponding gRPC deadline.
**Note: the timeout-setting interface and the Inference interface must not be called at the same time (this is not thread-safe); for performance reasons no lock is added for now.**
4. The gRPC Client adds `asyn` and `is_python` parameters to the `predict` function:
```python
def predict(self, feed, fetch, need_variant_tag=False, asyn=False, is_python=True)
```
Here, `asyn` selects asynchronous calling. With `asyn=True` the call is asynchronous and returns a `MultiLangPredictFuture` object; calling `MultiLangPredictFuture.result()` blocks until the prediction is available. With `asyn=False` the call is synchronous.
`is_python` selects the proto format. With `is_python=True`, data is transferred as numpy bytes, which currently only works for Python; with `is_python=False`, data is transferred in the plain format, which is more general. Transferring numpy bytes takes much less time than the plain format (see [#654](https://github.com/PaddlePaddle/Serving/pull/654)).
5. Exception handling: when the bRPC Client inside the gRPC Server fails to predict (returns `None`), the gRPC Client also returns `None`. Other gRPC exceptions are caught inside the Client, and a "status_code" field is added to the returned fetch_map to indicate whether the prediction succeeded (see the timeout example, and the combined sketch in the Python examples section below).
6. Because gRPC only supports the pick_first and round_robin load-balancing policies, the ABTEST feature is not yet fully supported.
7. The gRPC version has been verified to work on Windows and macOS.
8. Planned client languages:
- [x] Python
- [ ] Java
- [ ] Go
- [ ] JavaScript
## Python examples
See the example files under `python/examples/grpc_impl_example`.
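For orientation, the hedged sketch below pieces together the client-side behavior described in items 2-5 above (no `load_client_config`, RPC-based timeout, the `asyn`/`is_python` options, and `serving_status_code` handling). It assumes a `MultiLangServer` is already serving the uci_housing model on port 9393, as in the linear-regression example later in this diff; the timeout value is arbitrary.
```python
from paddle_serving_client import MultiLangClient as Client
import grpc

client = Client()
# No load_client_config step: connect() fetches the client prototxt via RPC.
client.connect(["127.0.0.1:9393"])
# Timeout is set via RPC; the server re-creates its bRPC client with the new value.
client.set_rpc_timeout_ms(100)  # assumed value, in milliseconds

x = [
    0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
    0.4919, 0.1856, 0.0795, -0.0332
]

# Synchronous call using the plain (more general) proto format.
fetch_map = client.predict(feed={"x": x}, fetch=["price"], is_python=False)
if fetch_map["serving_status_code"] == 0:
    print(fetch_map)
elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED:
    print("timeout")

# Asynchronous call: predict() returns a future; result() blocks until done
# and raises grpc.RpcError on failure.
future = client.predict(feed={"x": x}, fetch=["price"], asyn=True)
try:
    print(future.result())
except grpc.RpcError as e:
    print(e.code())
```
The server side (item 1) is shown in full in the `test_server.py` files included later in this diff, e.g. `server.load_model_config(sys.argv[1], sys.argv[2])` in the cube example, where the second argument is the client-side prototxt.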
if (CLIENT)
file(INSTALL pipeline DESTINATION paddle_serving_client)
execute_process(COMMAND ${PYTHON_EXECUTABLE} run_codegen.py
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/paddle_serving_client/pipeline/proto)
file(GLOB_RECURSE SERVING_CLIENT_PY_FILES paddle_serving_client/*.py)
set(PY_FILES ${SERVING_CLIENT_PY_FILES})
SET(PACKAGE_NAME "serving_client")
......@@ -7,8 +10,14 @@ endif()
if (SERVER)
if (NOT WITH_GPU)
file(INSTALL pipeline DESTINATION paddle_serving_server)
execute_process(COMMAND ${PYTHON_EXECUTABLE} run_codegen.py
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/paddle_serving_server/pipeline/proto)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server/*.py)
else()
file(INSTALL pipeline DESTINATION paddle_serving_server_gpu)
execute_process(COMMAND ${PYTHON_EXECUTABLE} run_codegen.py
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/paddle_serving_server_gpu/pipeline/proto)
file(GLOB_RECURSE SERVING_SERVER_PY_FILES paddle_serving_server_gpu/*.py)
endif()
set(PY_FILES ${SERVING_SERVER_PY_FILES})
......
## CTR Prediction Service with a Sparse-Parameter Indexing Service
This example demonstrates the gRPC Server-side `load_model_config` function. Here the configuration files of the bRPC Server and the bRPC Client differ (the bRPC Client-side data is first handed to cube and only passed to the inference library after cube has processed it).
### Get the sample data
```
sh get_data.sh
```
### Download the model and the sparse-parameter sequence files
```
wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
tar xf ctr_cube_unittest.tar.gz
mv models/ctr_client_conf ./
mv models/ctr_serving_model_kv ./
mv models/data ./cube/
```
After the commands above finish, the ctr_serving_model_kv and ctr_client_conf folders will be present in the current directory.
### Start the sparse-parameter indexing service
```
wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
tar xf cube_app.tar.gz
mv cube_app/cube* ./cube/
sh cube_prepare.sh &
```
Here, the sparse parameters of the model are stored in the sparse-parameter indexing service Cube. For an introduction to Cube, see the [standalone Cube user guide](../../../doc/CUBE_LOCAL_CN.md).
### Start the RPC prediction service with 4 server threads (configurable in test_server.py)
```
python test_server.py ctr_serving_model_kv ctr_client_conf/serving_client_conf.prototxt
```
### Run prediction
```
python test_client.py ./raw_data
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
parser.add_argument(
'--train_data_path',
type=str,
default='./data/raw/train.txt',
help="The path of training dataset")
parser.add_argument(
'--sparse_only',
type=bool,
default=False,
help="Whether we use sparse features only")
parser.add_argument(
'--test_data_path',
type=str,
default='./data/raw/valid.txt',
help="The path of testing dataset")
parser.add_argument(
'--batch_size',
type=int,
default=1000,
help="The size of mini-batch (default:1000)")
parser.add_argument(
'--embedding_size',
type=int,
default=10,
help="The size for embedding layer (default:10)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="The number of passes to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store (default: models)')
parser.add_argument(
'--sparse_feature_dim',
type=int,
default=1000001,
help='sparse feature hashing space for index processing')
parser.add_argument(
'--is_local',
type=int,
default=1,
help='Local train or distributed train (default: 1)')
parser.add_argument(
'--cloud_train',
type=int,
default=0,
help='Local train or distributed train on paddlecloud (default: 0)')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
help='Whether start pserver in async mode to support ASGD')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
help='Whether split variables into blocks when update_method is pserver')
parser.add_argument(
'--role',
type=str,
default='pserver', # trainer or pserver
help='The role of this node, trainer or pserver (default: pserver)')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current pserver endpoint (default: 127.0.0.1:6000)')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='The id of the current trainer (default: 0)')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The number of trainers (default: 1)')
return parser.parse_args()
ps -ef | grep cube | awk {'print $2'} | xargs kill -9
rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
ps -ef | grep test | awk {'print $2'} | xargs kill -9
ps -ef | grep serving | awk {'print $2'} | xargs kill -9
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
class CriteoDataset(object):
def setup(self, sparse_feature_dim):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.cont_diff_ = [
20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.hash_dim_ = sparse_feature_dim
# here, training data are lines with line_index < train_idx_
self.train_idx_ = 41256555
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
if features[idx] == '':
dense_feature.append(0.0)
else:
dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
return dense_feature, sparse_feature, [int(features[0])]
def infer_reader(self, filelist, batch, buf_size):
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, label = self._process_line(
line)
#yield dense_feature, sparse_feature, label
yield [dense_feature] + sparse_feature + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
def generate_sample(self, line):
def data_iter():
dense_feature, sparse_feature, label = self._process_line(line)
feature_name = ["dense_input"]
for idx in self.categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
return data_iter
if __name__ == "__main__":
criteo_dataset = CriteoDataset()
criteo_dataset.setup(int(sys.argv[1]))
criteo_dataset.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import sys
import paddle.fluid.incubate.data_generator as dg
class CriteoDataset(dg.MultiSlotDataGenerator):
def setup(self, sparse_feature_dim):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.cont_diff_ = [
20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
]
self.hash_dim_ = sparse_feature_dim
# here, training data are lines with line_index < train_idx_
self.train_idx_ = 41256555
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
if features[idx] == '':
dense_feature.append(0.0)
else:
dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
return dense_feature, sparse_feature, [int(features[0])]
def infer_reader(self, filelist, batch, buf_size):
def local_iter():
for fname in filelist:
with open(fname.strip(), "r") as fin:
for line in fin:
dense_feature, sparse_feature, label = self._process_line(
line)
#yield dense_feature, sparse_feature, label
yield [dense_feature] + sparse_feature + [label]
import paddle
batch_iter = paddle.batch(
paddle.reader.shuffle(
local_iter, buf_size=buf_size),
batch_size=batch)
return batch_iter
def generate_sample(self, line):
def data_iter():
dense_feature, sparse_feature, label = self._process_line(line)
feature_name = ["dense_input"]
for idx in self.categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
return data_iter
if __name__ == "__main__":
criteo_dataset = CriteoDataset()
criteo_dataset.setup(int(sys.argv[1]))
criteo_dataset.run_from_stdin()
[{
"dict_name": "test_dict",
"shard": 1,
"dup": 1,
"timeout": 200,
"retry": 3,
"backup_request": 100,
"type": "ipport_list",
"load_balancer": "rr",
"nodes": [{
"ipport_list": "list://127.0.0.1:8027"
}]
}]
--port=8027
--dict_split=1
--in_mem=true
--log_dir=./log/
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
#! /bin/bash
mkdir -p cube_model
mkdir -p cube/data
./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8
./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false
mv ./cube/data/0_0/test_dict_part0/* ./cube/data/
cd cube && ./cube
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/data/ctr_prediction/ctr_data.tar.gz
tar -zxvf ctr_data.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from __future__ import print_function
from args import parse_args
import os
import paddle.fluid as fluid
import sys
from network_conf import dnn_model
dense_feature_dim = 13
def train():
args = parse_args()
sparse_only = args.sparse_only
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_input_ids = [
fluid.layers.data(
name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
for i in range(1, 27)
]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
#nn_input = None if sparse_only else dense_input
nn_input = dense_input
predict_y, loss, auc_var, batch_auc_var, infer_vars = dnn_model(
nn_input, sparse_input_ids, label, args.embedding_size,
args.sparse_feature_dim)
optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
optimizer.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_use_var([dense_input] + sparse_input_ids + [label])
python_executable = "python"
pipe_command = "{} criteo_reader.py {}".format(python_executable,
args.sparse_feature_dim)
dataset.set_pipe_command(pipe_command)
dataset.set_batch_size(128)
thread_num = 10
dataset.set_thread(thread_num)
whole_filelist = [
"raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
]
print(whole_filelist)
dataset.set_filelist(whole_filelist[:100])
dataset.load_into_memory()
fluid.layers.Print(auc_var)
epochs = 1
for i in range(epochs):
exe.train_from_dataset(
program=fluid.default_main_program(), dataset=dataset, debug=True)
print("epoch {} finished".format(i))
import paddle_serving_client.io as server_io
feed_var_dict = {}
feed_var_dict['dense_input'] = dense_input
for i, sparse in enumerate(sparse_input_ids):
feed_var_dict["embedding_{}.tmp_0".format(i)] = sparse
fetch_var_dict = {"prob": predict_y}
feed_kv_dict = {}
feed_kv_dict['dense_input'] = dense_input
for i, emb in enumerate(infer_vars):
feed_kv_dict["embedding_{}.tmp_0".format(i)] = emb
fetch_var_dict = {"prob": predict_y}
server_io.save_model("ctr_serving_model", "ctr_client_conf", feed_var_dict,
fetch_var_dict, fluid.default_main_program())
server_io.save_model("ctr_serving_model_kv", "ctr_client_conf_kv",
feed_kv_dict, fetch_var_dict,
fluid.default_main_program())
if __name__ == '__main__':
train()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import paddle.fluid as fluid
import math
def dnn_model(dense_input, sparse_inputs, label, embedding_size,
sparse_feature_dim):
def embedding_layer(input):
emb = fluid.layers.embedding(
input=input,
is_sparse=True,
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
x = fluid.layers.sequence_pool(input=emb, pool_type='sum')
return emb, x
def mlp_input_tensor(emb_sums, dense_tensor):
#if isinstance(dense_tensor, fluid.Variable):
# return fluid.layers.concat(emb_sums, axis=1)
#else:
return fluid.layers.concat(emb_sums + [dense_tensor], axis=1)
def mlp(mlp_input):
fc1 = fluid.layers.fc(input=mlp_input,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(mlp_input.shape[1]))))
fc2 = fluid.layers.fc(input=fc1,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))))
fc3 = fluid.layers.fc(input=fc2,
size=400,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))))
pre = fluid.layers.fc(input=fc3,
size=2,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc3.shape[1]))))
return pre
emb_pair_sums = list(map(embedding_layer, sparse_inputs))
emb_sums = [x[1] for x in emb_pair_sums]
infer_vars = [x[0] for x in emb_pair_sums]
mlp_in = mlp_input_tensor(emb_sums, dense_input)
predict = mlp(mlp_in)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
return predict, avg_cost, auc_var, batch_auc_var, infer_vars
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import sys
import os
import criteo as criteo
import time
from paddle_serving_client.metric import auc
import grpc
client = Client()
client.connect(["127.0.0.1:9292"])
batch = 1
buf_size = 100
dataset = criteo.CriteoDataset()
dataset.setup(1000001)
test_filelists = ["{}/part-0".format(sys.argv[1])]
reader = dataset.infer_reader(test_filelists, batch, buf_size)
label_list = []
prob_list = []
start = time.time()
for ei in range(10000):
data = reader().next()
feed_dict = {}
feed_dict['dense_input'] = data[0][0]
for i in range(1, 27):
feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][i]
fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
if fetch_map["serving_status_code"] == 0:
prob_list.append(fetch_map['prob'][0][1])
label_list.append(data[0][-1][0])
print(auc(label_list, prob_list))
end = time.time()
print(end - start)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import MultiLangServer as Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1], sys.argv[2])
server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server_gpu import OpMaker
from paddle_serving_server_gpu import OpSeqMaker
from paddle_serving_server_gpu import MultiLangServer as Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1], sys.argv[2])
server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import MultiLangServer as Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_dist_kv_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.load_model_config(sys.argv[1], sys.argv[2])
server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
server.run_server()
# Linear Regression Prediction Service Example
## Get data
```shell
sh get_data.sh
```
## Start the gRPC server
``` shell
python test_server.py uci_housing_model/
```
The default gRPC service can also be started with the following single command:
```shell
python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang
```
## Client prediction
### Synchronous prediction
``` shell
python test_sync_client.py
```
### Asynchronous prediction
``` shell
python test_asyn_client.py
```
### Batch prediction
``` shell
python test_batch_client.py
```
### General pb prediction
``` shell
python test_general_pb_client.py
```
### Prediction timeout
``` shell
python test_timeout_client.py
```
### List input
``` shell
python test_list_input_client.py
```
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
tar -xzf uci_housing.tar.gz
......@@ -13,38 +13,39 @@
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient
from paddle_serving_client import MultiLangClient as Client
import functools
import sys
import time
import threading
import grpc
client = MultiLangClient()
client.load_client_config(sys.argv[1])
client = Client()
client.connect(["127.0.0.1:9393"])
import paddle
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.test(), buf_size=500),
batch_size=1)
complete_task_count = [0]
lock = threading.Lock()
def call_back(call_future, data):
fetch_map = call_future.result()
print("{} {}".format(fetch_map["price"][0], data[0][1][0]))
with lock:
complete_task_count[0] += 1
def call_back(call_future):
try:
fetch_map = call_future.result()
print(fetch_map)
except grpc.RpcError as e:
print(e.code())
finally:
with lock:
complete_task_count[0] += 1
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
task_count = 0
for data in test_reader():
future = client.predict(feed={"x": data[0][0]}, fetch=["price"], asyn=True)
for i in range(3):
future = client.predict(feed={"x": x}, fetch=["price"], asyn=True)
task_count += 1
future.add_done_callback(functools.partial(call_back, data=data))
future.add_done_callback(functools.partial(call_back))
while complete_task_count[0] != task_count:
time.sleep(0.1)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
client = Client()
client.connect(["127.0.0.1:9393"])
batch_size = 2
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
batch_feed = [{"x": x} for j in range(batch_size)]
fetch_map = client.predict(feed=batch_feed, fetch=["price"])
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
client = Client()
client.connect(["127.0.0.1:9393"])
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"], is_python=False)
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import numpy as np
client = Client()
client.connect(["127.0.0.1:9393"])
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": np.array(x)}, fetch=["price"])
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
......@@ -17,7 +17,7 @@ import os
import sys
from paddle_serving_server import OpMaker
from paddle_serving_server import OpSeqMaker
from paddle_serving_server import MultiLangServer
from paddle_serving_server import MultiLangServer as Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
......@@ -29,7 +29,7 @@ op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(response_op)
server = MultiLangServer()
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.load_model_config(sys.argv[1])
server.prepare_server(workdir="work_dir1", port=9393, device="cpu")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
from paddle_serving_server_gpu import OpMaker
from paddle_serving_server_gpu import OpSeqMaker
from paddle_serving_server_gpu import MultiLangServer as Server
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
general_infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(response_op)
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.load_model_config(sys.argv[1])
server.set_gpuid(0)
server.prepare_server(workdir="work_dir1", port=9393, device="cpu")
server.run_server()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
client = Client()
client.connect(["127.0.0.1:9393"])
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
else:
print(fetch_map["serving_status_code"])
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient as Client
import grpc
client = Client()
client.connect(["127.0.0.1:9393"])
client.set_rpc_timeout_ms(1)
x = [
0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283,
0.4919, 0.1856, 0.0795, -0.0332
]
for i in range(3):
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
if fetch_map["serving_status_code"] == 0:
print(fetch_map)
elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED:
print('timeout')
else:
print(fetch_map["serving_status_code"])
......@@ -29,6 +29,6 @@ imdb_dataset.load_resource(sys.argv[2])
for line in sys.stdin:
word_ids, label = imdb_dataset.get_words_and_label(line)
feed = {"words": word_ids}
fetch = ["acc", "cost", "prediction"]
fetch = ["prediction"]
fetch_map = client.predict(feed=feed, fetch=fetch)
print("{} {}".format(fetch_map["prediction"][0], label[0]))
......@@ -32,11 +32,7 @@ for i in range(3):
line = 'i am very sad | 0'
word_ids, label = imdb_dataset.get_words_and_label(line)
feed = {"words": word_ids}
fetch = ["acc", "cost", "prediction"]
fetch = ["prediction"]
fetch_maps = client.predict(feed=feed, fetch=fetch)
if len(fetch_maps) == 1:
print("step: {}, res: {}".format(i, fetch_maps['prediction'][0][1]))
else:
for model, fetch_map in fetch_maps.items():
print("step: {}, model: {}, res: {}".format(i, model, fetch_map[
'prediction'][0][1]))
for model, fetch_map in fetch_maps.items():
print("step: {}, model: {}, res: {}".format(i, model, fetch_map))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import MultiLangClient
from imdb_reader import IMDBDataset
client = MultiLangClient()
# If you have more than one model, make sure that their
# inputs and outputs are the same.
client.connect(["127.0.0.1:9393"])
# You can define any English sentence or dataset here.
# This example reuses the imdb reader from training; you
# can easily define your own data preprocessing.
imdb_dataset = IMDBDataset()
imdb_dataset.load_resource('imdb.vocab')
for i in range(3):
line = 'i am very sad | 0'
word_ids, label = imdb_dataset.get_words_and_label(line)
feed = {"words": word_ids}
fetch = ["prediction"]
fetch_maps = client.predict(feed=feed, fetch=fetch)
for model, fetch_map in fetch_maps.items():
print("step: {}, model: {}, res: {}".format(i, model, fetch_map))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server import OpMaker
from paddle_serving_server import OpGraphMaker
from paddle_serving_server import MultiLangServer
op_maker = OpMaker()
read_op = op_maker.create('general_reader')
cnn_infer_op = op_maker.create(
'general_infer', engine_name='cnn', inputs=[read_op])
bow_infer_op = op_maker.create(
'general_infer', engine_name='bow', inputs=[read_op])
response_op = op_maker.create(
'general_response', inputs=[cnn_infer_op, bow_infer_op])
op_graph_maker = OpGraphMaker()
op_graph_maker.add_op(read_op)
op_graph_maker.add_op(cnn_infer_op)
op_graph_maker.add_op(bow_infer_op)
op_graph_maker.add_op(response_op)
server = MultiLangServer()
server.set_op_graph(op_graph_maker.get_op_graph())
model_config = {cnn_infer_op: 'imdb_cnn_model', bow_infer_op: 'imdb_bow_model'}
server.load_model_config(model_config)
server.prepare_server(workdir="work_dir1", port=9393, device="cpu")
server.run_server()
use_multithread: true
client_type: brpc
retry: 1
profile: false
prot: 8080
worker_num: 2
wget --no-check-certificate https://fleet.bj.bcebos.com/text_classification_data.tar.gz
wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imdb-demo/imdb_model.tar.gz
tar -zxvf text_classification_data.tar.gz
tar -zxvf imdb_model.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle_serving_client.pipeline import PipelineClient
import numpy as np
from line_profiler import LineProfiler
client = PipelineClient()
client.connect('localhost:8080')
lp = LineProfiler()
lp_wrapper = lp(client.predict)
words = 'i am very sad | 0'
for i in range(1):
fetch_map = lp_wrapper(feed_dict={"words": words}, fetch=["prediction"])
print(fetch_map)
#lp.print_stats()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server.pipeline import PipelineServer
from paddle_serving_server.pipeline.proto import pipeline_service_pb2
from paddle_serving_server.pipeline.channel import ChannelDataEcode
import numpy as np
import logging
from paddle_serving_app.reader import IMDBDataset
_LOGGER = logging.getLogger(__name__)
logging.basicConfig(
format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
datefmt='%Y-%m-%d %H:%M',
level=logging.DEBUG)
class ImdbRequestOp(RequestOp):
def load_user_resources(self):
self.imdb_dataset = IMDBDataset()
self.imdb_dataset.load_resource('imdb.vocab')
def unpack_request_package(self, request):
dictdata = {}
for idx, key in enumerate(request.key):
if key != "words":
continue
words = request.value[idx]
word_ids, _ = self.imdb_dataset.get_words_and_label(words)
dictdata[key] = np.array(word_ids)
return dictdata
class CombineOp(Op):
def preprocess(self, input_data):
combined_prediction = 0
for op_name, data in input_data.items():
_LOGGER.info("{}: {}".format(op_name, data["prediction"]))
combined_prediction += data["prediction"]
data = {"prediction": combined_prediction / 2}
return data
class ImdbResponseOp(ResponseOp):
# Here ImdbResponseOp is consistent with the default ResponseOp implementation
def pack_response_package(self, channeldata):
resp = pipeline_service_pb2.Response()
resp.ecode = channeldata.ecode
if resp.ecode == ChannelDataEcode.OK.value:
feed = channeldata.parse()
# ndarray to string
for name, var in feed.items():
resp.value.append(var.__repr__())
resp.key.append(name)
else:
resp.error_info = channeldata.error_info
return resp
read_op = ImdbRequestOp()
bow_op = Op(name="bow",
input_ops=[read_op],
server_endpoints=["127.0.0.1:9393"],
fetch_list=["prediction"],
client_config="imdb_bow_client_conf/serving_client_conf.prototxt",
concurrency=1,
timeout=-1,
retry=1)
cnn_op = Op(name="cnn",
input_ops=[read_op],
server_endpoints=["127.0.0.1:9292"],
fetch_list=["prediction"],
client_config="imdb_cnn_client_conf/serving_client_conf.prototxt",
concurrency=1,
timeout=-1,
retry=1)
combine_op = CombineOp(
name="combine",
input_ops=[bow_op, cnn_op],
concurrency=1,
timeout=-1,
retry=1)
# fetch output of bow_op
# response_op = ImdbResponseOp(input_ops=[bow_op])
# fetch output of combine_op
response_op = ImdbResponseOp(input_ops=[combine_op])
# use default ResponseOp implementation
# response_op = ResponseOp(input_ops=[combine_op])
server = PipelineServer()
server.set_response_op(response_op)
server.prepare_server('config.yml')
server.run_server()
......@@ -16,10 +16,16 @@ def prase(pid_str, time_str, counter):
if len(name_list) == 2:
name = name_list[0]
else:
name = name_list[0] + "_" + name_list[1]
name = "_".join(name_list[:-1])
name_list = name.split("#")
if len(name_list) > 1:
tid = name_list[-1]
name = "#".join(name_list[:-1])
else:
tid = 0
event_dict = {}
event_dict["name"] = name
event_dict["tid"] = 0
event_dict["tid"] = tid
event_dict["pid"] = pid
event_dict["ts"] = ts
event_dict["ph"] = ph
......
# Yolov4 Detection Service
([简体中文](README_CN.md)|English)
## Get Model
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
```
## Start RPC Service
```
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0
```
## Prediction
```
python test_client.py 000000570688.jpg
```
After the prediction is completed, a json file containing the prediction results and an image with the detected bounding boxes drawn on it will be generated in the `./output` folder.
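
To take a quick look at the saved result, the json file can be loaded directly in Python. The snippet below is only a minimal sketch: the exact file name under `./output` (here assumed to be `000000570688.jpg.json`, named after the input image) depends on how `RCNNPostprocess` writes its output, so adjust the path to whatever file actually appears in the folder.
```
import json
import os

# Assumed output path: RCNNPostprocess is expected to write a json file into
# ./output named after the input image; adjust this to the actual file name.
result_path = os.path.join("output", "000000570688.jpg.json")

with open(result_path) as f:
    detections = json.load(f)

# Print the raw detection records without assuming a particular schema.
print(detections)
```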
# Yolov4 Detection Service
(简体中文|[English](README.md))
## Get Model
```
python -m paddle_serving_app.package --get_model yolov4
tar -xzvf yolov4.tar.gz
```
## Start RPC Service
```
python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0
```
## Prediction
```
python test_client.py 000000570688.jpg
```
After the prediction is completed, a json file containing the prediction results and an image with the detected bounding boxes drawn on it will be generated in the `./output` folder.
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
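
The class names above appear to be the `label_list.txt` consumed by the Yolov4 demo: `test_client.py` below constructs `RCNNPostprocess("label_list.txt", "output", [608, 608])`, and the postprocessor reads this file line by line into a label list. A minimal sketch of that lookup (the 0-based indexing of category ids is an assumption here):
```
# Load label_list.txt into a list, mirroring how RCNNPostprocess reads it,
# and map a category index to its class name.
label_list = []
with open("label_list.txt") as fin:
    for line in fin:
        label_list.append(line.strip())

print(label_list[0])  # expected to print "person" with 0-based indexing
```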
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import numpy as np
from paddle_serving_client import Client
from paddle_serving_app.reader import *
import cv2
preprocess = Sequential([
File2Image(), BGR2RGB(), Resize(
(608, 608), interpolation=cv2.INTER_LINEAR), Div(255.0), Transpose(
(2, 0, 1))
])
postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608])
client = Client()
client.load_client_config("yolov4_client/serving_client_conf.prototxt")
client.connect(['127.0.0.1:9393'])
im = preprocess(sys.argv[1])
print(im.shape)
fetch_map = client.predict(
feed={
"image": im,
"im_size": np.array(list(im.shape[1:])),
},
fetch=["save_infer_model/scale_0.tmp_0"])
fetch_map["image"] = sys.argv[1]
postprocess(fetch_map)
......@@ -24,7 +24,7 @@ class ServingModels(object):
"SentimentAnalysis"] = ["senta_bilstm", "senta_bow", "senta_cnn"]
self.model_dict["SemanticRepresentation"] = ["ernie"]
self.model_dict["ChineseWordSegmentation"] = ["lac"]
self.model_dict["ObjectDetection"] = ["faster_rcnn"]
self.model_dict["ObjectDetection"] = ["faster_rcnn", "yolov4"]
self.model_dict["ImageSegmentation"] = [
"unet", "deeplabv3", "deeplabv3+cityscapes"
]
......
......@@ -280,10 +280,11 @@ class SegPostprocess(object):
class RCNNPostprocess(object):
def __init__(self, label_file, output_dir):
def __init__(self, label_file, output_dir, resize_shape=None):
self.output_dir = output_dir
self.label_file = label_file
self.label_list = []
self.resize_shape = resize_shape
with open(label_file) as fin:
for line in fin:
self.label_list.append(line.strip())
......@@ -378,6 +379,13 @@ class RCNNPostprocess(object):
xmax = xmin + w
ymax = ymin + h
img_w, img_h = image.size
if self.resize_shape is not None:
xmin = xmin * img_w / self.resize_shape[0]
xmax = xmax * img_w / self.resize_shape[0]
ymin = ymin * img_h / self.resize_shape[1]
ymax = ymax * img_h / self.resize_shape[1]
color = tuple(color_list[catid])
# draw bbox
......
......@@ -28,8 +28,11 @@ sys.path.append(
os.path.join(os.path.abspath(os.path.dirname(__file__)), 'proto'))
from .proto import multi_lang_general_model_service_pb2_grpc
int_type = 0
float_type = 1
int64_type = 0
float32_type = 1
int32_type = 2
int_type = set([int64_type, int32_type])
float_type = set([float32_type])
class _NOPProfiler(object):
......@@ -279,7 +282,7 @@ class Client(object):
raise ValueError("Wrong feed name: {}.".format(key))
#if not isinstance(feed_i[key], np.ndarray):
self.shape_check(feed_i, key)
if self.feed_types_[key] == int_type:
if self.feed_types_[key] in int_type:
if i == 0:
int_feed_names.append(key)
if isinstance(feed_i[key], np.ndarray):
......@@ -292,7 +295,7 @@ class Client(object):
else:
int_slot.append(feed_i[key])
self.all_numpy_input = False
elif self.feed_types_[key] == float_type:
elif self.feed_types_[key] in float_type:
if i == 0:
float_feed_names.append(key)
if isinstance(feed_i[key], np.ndarray):
......@@ -339,7 +342,7 @@ class Client(object):
result_map = {}
# result map needs to be a numpy array
for i, name in enumerate(fetch_names):
if self.fetch_names_to_type_[name] == int_type:
if self.fetch_names_to_type_[name] == int64_type:
# result_map[name] will be py::array(numpy array)
result_map[name] = result_batch_handle.get_int64_by_name(
mi, name)
......@@ -348,7 +351,7 @@ class Client(object):
if name in self.lod_tensor_set:
result_map["{}.lod".format(
name)] = result_batch_handle.get_lod(mi, name)
elif self.fetch_names_to_type_[name] == float_type:
elif self.fetch_names_to_type_[name] == float32_type:
result_map[name] = result_batch_handle.get_float_by_name(
mi, name)
shape = result_batch_handle.get_shape(mi, name)
......@@ -356,6 +359,16 @@ class Client(object):
if name in self.lod_tensor_set:
result_map["{}.lod".format(
name)] = result_batch_handle.get_lod(mi, name)
elif self.fetch_names_to_type_[name] == int32_type:
# result_map[name] will be py::array(numpy array)
result_map[name] = result_batch_handle.get_int32_by_name(
mi, name)
shape = result_batch_handle.get_shape(mi, name)
result_map[name].shape = shape
if name in self.lod_tensor_set:
result_map["{}.lod".format(
name)] = result_batch_handle.get_lod(mi, name)
multi_result_map.append(result_map)
ret = None
if len(model_engine_names) == 1:
......@@ -384,22 +397,41 @@ class Client(object):
class MultiLangClient(object):
def __init__(self):
self.channel_ = None
self.stub_ = None
self.rpc_timeout_s_ = 2
def load_client_config(self, path):
if not isinstance(path, str):
raise Exception("GClient only supports multi-model temporarily")
self._parse_model_config(path)
def add_variant(self, tag, cluster, variant_weight):
# TODO
raise Exception("cannot support ABtest yet")
def set_rpc_timeout_ms(self, rpc_timeout):
if self.stub_ is None:
raise Exception("set timeout must be set after connect.")
if not isinstance(rpc_timeout, int):
# for bclient
raise ValueError("rpc_timeout must be int type.")
self.rpc_timeout_s_ = rpc_timeout / 1000.0
timeout_req = multi_lang_general_model_service_pb2.SetTimeoutRequest()
timeout_req.timeout_ms = rpc_timeout
resp = self.stub_.SetTimeout(timeout_req)
return resp.err_code == 0
def connect(self, endpoint):
def connect(self, endpoints):
# https://github.com/tensorflow/serving/issues/1382
options = [('grpc.max_receive_message_length', 512 * 1024 * 1024),
('grpc.max_send_message_length', 512 * 1024 * 1024),
('grpc.max_receive_message_length', 512 * 1024 * 1024)]
self.channel_ = grpc.insecure_channel(
endpoint[0], options=options) #TODO
('grpc.lb_policy_name', 'round_robin')]
# TODO: weight round robin
g_endpoint = 'ipv4:{}'.format(','.join(endpoints))
self.channel_ = grpc.insecure_channel(g_endpoint, options=options)
self.stub_ = multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelServiceStub(
self.channel_)
# get client model config
get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest(
)
resp = self.stub_.GetClientConfig(get_client_config_req)
model_config_str = resp.client_config_str
self._parse_model_config(model_config_str)
def _flatten_list(self, nested_list):
for item in nested_list:
......@@ -409,11 +441,10 @@ class MultiLangClient(object):
else:
yield item
def _parse_model_config(self, model_config_path):
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
f = open(model_config_path, 'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
......@@ -434,8 +465,8 @@ class MultiLangClient(object):
if var.is_lod_tensor:
self.lod_tensor_set_.add(var.alias_name)
def _pack_feed_data(self, feed, fetch, is_python):
req = multi_lang_general_model_service_pb2.Request()
def _pack_inference_request(self, feed, fetch, is_python):
req = multi_lang_general_model_service_pb2.InferenceRequest()
req.fetch_var_names.extend(fetch)
req.is_python = is_python
feed_batch = None
......@@ -460,26 +491,50 @@ class MultiLangClient(object):
data = np.array(var, dtype="int64")
elif v_type == 1: # float32
data = np.array(var, dtype="float32")
elif v_type == 2: # int32
data = np.array(var, dtype="int32")
else:
raise Exception("error type.")
else:
raise Exception("error tensor value type.")
elif isinstance(var, np.ndarray):
data = var
if var.dtype == "float64":
data = data.astype("float32")
if v_type == 0:
if data.dtype != 'int64':
data = data.astype("int64")
elif v_type == 1:
if data.dtype != 'float32':
data = data.astype("float32")
elif v_type == 2:
if data.dtype != 'int32':
data = data.astype("int32")
else:
raise Exception("error tensor value type.")
else:
raise Exception("var must be list or ndarray.")
tensor.data = data.tobytes()
else:
if v_type == 0: # int64
if isinstance(var, np.ndarray):
tensor.int64_data.extend(var.reshape(-1).tolist())
if isinstance(var, np.ndarray):
if v_type == 0: # int64
tensor.int64_data.extend(
var.reshape(-1).astype("int64").tolist())
elif v_type == 1:
tensor.float_data.extend(
var.reshape(-1).astype('float32').tolist())
elif v_type == 2:
tensor.int32_data.extend(
var.reshape(-1).astype('int32').tolist())
else:
raise Exception("error tensor value type.")
elif isinstance(var, list):
if v_type == 0:
tensor.int64_data.extend(self._flatten_list(var))
elif v_type == 1: # float32
if isinstance(var, np.ndarray):
tensor.float_data.extend(var.reshape(-1).tolist())
else:
elif v_type == 1:
tensor.float_data.extend(self._flatten_list(var))
elif v_type == 2:
tensor.int32_data.extend(self._flatten_list(var))
else:
raise Exception("error tensor value type.")
else:
raise Exception("error type.")
raise Exception("var must be list or ndarray.")
if isinstance(var, np.ndarray):
tensor.shape.extend(list(var.shape))
else:
......@@ -488,37 +543,52 @@ class MultiLangClient(object):
req.insts.append(inst)
return req
def _unpack_resp(self, resp, fetch, is_python, need_variant_tag):
result_map = {}
inst = resp.outputs[0].insts[0]
def _unpack_inference_response(self, resp, fetch, is_python,
need_variant_tag):
if resp.err_code != 0:
return None
tag = resp.tag
for i, name in enumerate(fetch):
var = inst.tensor_array[i]
v_type = self.fetch_types_[name]
if is_python:
if v_type == 0: # int64
result_map[name] = np.frombuffer(var.data, dtype="int64")
elif v_type == 1: # float32
result_map[name] = np.frombuffer(var.data, dtype="float32")
else:
raise Exception("error type.")
else:
if v_type == 0: # int64
result_map[name] = np.array(
list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
result_map[name] = np.array(
list(var.float_data), dtype="float32")
multi_result_map = {}
for model_result in resp.outputs:
inst = model_result.insts[0]
result_map = {}
for i, name in enumerate(fetch):
var = inst.tensor_array[i]
v_type = self.fetch_types_[name]
if is_python:
if v_type == 0: # int64
result_map[name] = np.frombuffer(
var.data, dtype="int64")
elif v_type == 1: # float32
result_map[name] = np.frombuffer(
var.data, dtype="float32")
else:
raise Exception("error type.")
else:
raise Exception("error type.")
result_map[name].shape = list(var.shape)
if name in self.lod_tensor_set_:
result_map["{}.lod".format(name)] = np.array(list(var.lod))
return result_map if not need_variant_tag else [result_map, tag]
if v_type == 0: # int64
result_map[name] = np.array(
list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
result_map[name] = np.array(
list(var.float_data), dtype="float32")
else:
raise Exception("error type.")
result_map[name].shape = list(var.shape)
if name in self.lod_tensor_set_:
result_map["{}.lod".format(name)] = np.array(list(var.lod))
multi_result_map[model_result.engine_name] = result_map
ret = None
if len(resp.outputs) == 1:
ret = list(multi_result_map.values())[0]
else:
ret = multi_result_map
ret["serving_status_code"] = 0
return ret if not need_variant_tag else [ret, tag]
def _done_callback_func(self, fetch, is_python, need_variant_tag):
def unpack_resp(resp):
return self._unpack_resp(resp, fetch, is_python, need_variant_tag)
return self._unpack_inference_response(resp, fetch, is_python,
need_variant_tag)
return unpack_resp
......@@ -531,16 +601,20 @@ class MultiLangClient(object):
need_variant_tag=False,
asyn=False,
is_python=True):
req = self._pack_feed_data(feed, fetch, is_python=is_python)
req = self._pack_inference_request(feed, fetch, is_python=is_python)
if not asyn:
resp = self.stub_.inference(req)
return self._unpack_resp(
resp,
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag)
try:
resp = self.stub_.Inference(req, timeout=self.rpc_timeout_s_)
return self._unpack_inference_response(
resp,
fetch,
is_python=is_python,
need_variant_tag=need_variant_tag)
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
else:
call_future = self.stub_.inference.future(req)
call_future = self.stub_.Inference.future(
req, timeout=self.rpc_timeout_s_)
return MultiLangPredictFuture(
call_future,
self._done_callback_func(
......@@ -555,7 +629,10 @@ class MultiLangPredictFuture(object):
self.callback_func_ = callback_func
def result(self):
resp = self.call_future_.result()
try:
resp = self.call_future_.result()
except grpc.RpcError as e:
return {"serving_status_code": e.code()}
return self.callback_func_(resp)
def add_done_callback(self, fn):
......
......@@ -48,16 +48,18 @@ def save_model(server_model_folder,
config = model_conf.GeneralModelConfig()
#int64 = 0; float32 = 1; int32 = 2;
for key in feed_var_dict:
feed_var = model_conf.FeedVar()
feed_var.alias_name = key
feed_var.name = feed_var_dict[key].name
feed_var.is_lod_tensor = feed_var_dict[key].lod_level >= 1
if feed_var_dict[key].dtype == core.VarDesc.VarType.INT32 or \
feed_var_dict[key].dtype == core.VarDesc.VarType.INT64:
if feed_var_dict[key].dtype == core.VarDesc.VarType.INT64:
feed_var.feed_type = 0
if feed_var_dict[key].dtype == core.VarDesc.VarType.FP32:
feed_var.feed_type = 1
if feed_var_dict[key].dtype == core.VarDesc.VarType.INT32:
feed_var.feed_type = 2
if feed_var.is_lod_tensor:
feed_var.shape.extend([-1])
else:
......@@ -73,13 +75,12 @@ def save_model(server_model_folder,
fetch_var.alias_name = key
fetch_var.name = fetch_var_dict[key].name
fetch_var.is_lod_tensor = fetch_var_dict[key].lod_level >= 1
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT32 or \
fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64:
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT64:
fetch_var.fetch_type = 0
if fetch_var_dict[key].dtype == core.VarDesc.VarType.FP32:
fetch_var.fetch_type = 1
if fetch_var_dict[key].dtype == core.VarDesc.VarType.INT32:
fetch_var.fetch_type = 2
if fetch_var.is_lod_tensor:
fetch_var.shape.extend([-1])
else:
......
......@@ -231,6 +231,7 @@ class Server(object):
self.infer_service_conf.services.extend([infer_service])
def _prepare_resource(self, workdir):
self.workdir = workdir
if self.resource_conf == None:
with open("{}/{}".format(workdir, self.general_model_config_fn),
"w") as fout:
......@@ -328,10 +329,10 @@ class Server(object):
os.chdir(self.module_path)
need_download = False
device_version = self.get_device_version()
floder_name = device_version + serving_server_version
tar_name = floder_name + ".tar.gz"
folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz"
bin_url = "https://paddle-serving.bj.bcebos.com/bin/" + tar_name
self.server_path = os.path.join(self.module_path, floder_name)
self.server_path = os.path.join(self.module_path, folder_name)
#acquire lock
version_file = open("{}/version.py".format(self.module_path), "r")
......@@ -357,7 +358,7 @@ class Server(object):
os.remove(exe_path)
raise SystemExit(
'Decompressing failed, please check your permission of {} or disk space left.'.
foemat(self.module_path))
format(self.module_path))
finally:
os.remove(tar_name)
#release lock
......@@ -374,11 +375,11 @@ class Server(object):
os.system("touch {}/fluid_time_file".format(workdir))
if not self.port_is_available(port):
raise SystemExit("Prot {} is already used".format(port))
raise SystemExit("Port {} is already used".format(port))
self.set_port(port)
self._prepare_resource(workdir)
self._prepare_engine(self.model_config_paths, device)
self._prepare_infer_service(port)
self.port = port
self.workdir = workdir
infer_service_fn = "{}/{}".format(workdir, self.infer_service_fn)
......@@ -440,22 +441,29 @@ class Server(object):
os.system(command)
class MultiLangServerService(
multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelService):
def __init__(self, model_config_path, endpoints):
class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
MultiLangGeneralModelServiceServicer):
def __init__(self, model_config_path, is_multi_model, endpoints):
self.is_multi_model_ = is_multi_model
self.model_config_path_ = model_config_path
self.endpoints_ = endpoints
with open(self.model_config_path_) as f:
self.model_config_str_ = str(f.read())
self._parse_model_config(self.model_config_str_)
self._init_bclient(self.model_config_path_, self.endpoints_)
def _init_bclient(self, model_config_path, endpoints, timeout_ms=None):
from paddle_serving_client import Client
self._parse_model_config(model_config_path)
self.bclient_ = Client()
self.bclient_.load_client_config(
"{}/serving_server_conf.prototxt".format(model_config_path))
if timeout_ms is not None:
self.bclient_.set_rpc_timeout_ms(timeout_ms)
self.bclient_.load_client_config(model_config_path)
self.bclient_.connect(endpoints)
def _parse_model_config(self, model_config_path):
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
f = open("{}/serving_server_conf.prototxt".format(model_config_path),
'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
......@@ -480,7 +488,7 @@ class MultiLangServerService(
else:
yield item
def _unpack_request(self, request):
def _unpack_inference_request(self, request):
feed_names = list(request.feed_var_names)
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
......@@ -492,10 +500,12 @@ class MultiLangServerService(
v_type = self.feed_types_[name]
data = None
if is_python:
if v_type == 0:
if v_type == 0: # int64
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1:
elif v_type == 1: # float32
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2: # int32
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
else:
......@@ -503,6 +513,8 @@ class MultiLangServerService(
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2: # int32
data = np.array(list(var.int32_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
......@@ -510,55 +522,132 @@ class MultiLangServerService(
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python
def _pack_resp_package(self, result, fetch_names, is_python, tag):
resp = multi_lang_general_model_service_pb2.Response()
# Only one model is supported temporarily
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(result[name].reshape(-1).tolist())
elif v_type == 1: # float32
tensor.float_data.extend(result[name].reshape(-1).tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(result[name].shape))
if name in self.lod_tensor_set_:
tensor.lod.extend(result["{}.lod".format(name)].tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
resp.outputs.append(model_output)
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
if ret is None:
resp.err_code = 1
return resp
results, tag = ret
resp.tag = tag
resp.err_code = 0
if not self.is_multi_model_:
results = {'general_infer_0': results}
for model_name, model_result in results.items():
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = model_result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 1: # float32
tensor.float_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 2: # int32
tensor.int32_data.extend(model_result[name].reshape(-1)
.tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape))
if name in self.lod_tensor_set_:
tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
model_output.engine_name = model_name
resp.outputs.append(model_output)
return resp
def SetTimeout(self, request, context):
# This process and the Inference process cannot run at the same time.
# For performance reasons, no thread lock is added for now.
timeout_ms = request.timeout_ms
self._init_bclient(self.model_config_path_, self.endpoints_, timeout_ms)
resp = multi_lang_general_model_service_pb2.SimpleResponse()
resp.err_code = 0
return resp
def inference(self, request, context):
feed_dict, fetch_names, is_python = self._unpack_request(request)
data, tag = self.bclient_.predict(
def Inference(self, request, context):
feed_dict, fetch_names, is_python = self._unpack_inference_request(
request)
ret = self.bclient_.predict(
feed=feed_dict, fetch=fetch_names, need_variant_tag=True)
return self._pack_resp_package(data, fetch_names, is_python, tag)
return self._pack_inference_response(ret, fetch_names, is_python)
def GetClientConfig(self, request, context):
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
resp.client_config_str = self.model_config_str_
return resp
class MultiLangServer(object):
def __init__(self, worker_num=2):
def __init__(self):
self.bserver_ = Server()
self.worker_num_ = worker_num
self.worker_num_ = 4
self.body_size_ = 64 * 1024 * 1024
self.concurrency_ = 100000
self.is_multi_model_ = False # for model ensemble
def set_max_concurrency(self, concurrency):
self.concurrency_ = concurrency
self.bserver_.set_max_concurrency(concurrency)
def set_num_threads(self, threads):
self.worker_num_ = threads
self.bserver_.set_num_threads(threads)
def set_max_body_size(self, body_size):
self.bserver_.set_max_body_size(body_size)
if body_size >= self.body_size_:
self.body_size_ = body_size
else:
print(
"max_body_size is less than default value, will use default value in service."
)
def set_port(self, port):
self.gport_ = port
def set_reload_interval(self, interval):
self.bserver_.set_reload_interval(interval)
def set_op_sequence(self, op_seq):
self.bserver_.set_op_sequence(op_seq)
def load_model_config(self, model_config_path):
if not isinstance(model_config_path, str):
raise Exception(
"MultiLangServer only supports multi-model temporarily")
self.bserver_.load_model_config(model_config_path)
self.model_config_path_ = model_config_path
def set_op_graph(self, op_graph):
self.bserver_.set_op_graph(op_graph)
def set_memory_optimize(self, flag=False):
self.bserver_.set_memory_optimize(flag)
def set_ir_optimize(self, flag=False):
self.bserver_.set_ir_optimize(flag)
def set_op_sequence(self, op_seq):
self.bserver_.set_op_sequence(op_seq)
def use_mkl(self, flag):
self.bserver_.use_mkl(flag)
def load_model_config(self, server_config_paths, client_config_path=None):
self.bserver_.load_model_config(server_config_paths)
if client_config_path is None:
if isinstance(server_config_paths, dict):
self.is_multi_model_ = True
client_config_path = '{}/serving_server_conf.prototxt'.format(
list(server_config_paths.items())[0][1])
else:
client_config_path = '{}/serving_server_conf.prototxt'.format(
server_config_paths)
self.bclient_config_path_ = client_config_path
def prepare_server(self, workdir=None, port=9292, device="cpu"):
if not self._port_is_available(port):
raise SystemExit("Prot {} is already used".format(port))
default_port = 12000
self.port_list_ = []
for i in range(1000):
......@@ -568,7 +657,7 @@ class MultiLangServer(object):
break
self.bserver_.prepare_server(
workdir=workdir, port=self.port_list_[0], device=device)
self.gport_ = port
self.set_port(port)
def _launch_brpc_service(self, bserver):
bserver.run_server()
......@@ -583,12 +672,16 @@ class MultiLangServer(object):
p_bserver = Process(
target=self._launch_brpc_service, args=(self.bserver_, ))
p_bserver.start()
options = [('grpc.max_send_message_length', self.body_size_),
('grpc.max_receive_message_length', self.body_size_)]
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=self.worker_num_))
futures.ThreadPoolExecutor(max_workers=self.worker_num_),
options=options,
maximum_concurrent_rpcs=self.concurrency_)
multi_lang_general_model_service_pb2_grpc.add_MultiLangGeneralModelServiceServicer_to_server(
MultiLangServerService(self.model_config_path_,
["0.0.0.0:{}".format(self.port_list_[0])]),
server)
MultiLangServerServiceServicer(
self.bclient_config_path_, self.is_multi_model_,
["0.0.0.0:{}".format(self.port_list_[0])]), server)
server.add_insecure_port('[::]:{}'.format(self.gport_))
server.start()
p_bserver.join()
......
......@@ -53,6 +53,11 @@ def parse_args(): # pylint: disable=doc-string-missing
type=int,
default=512 * 1024 * 1024,
help="Limit sizes of messages")
parser.add_argument(
"--use_multilang",
default=False,
action="store_true",
help="Use Multi-language-service")
return parser.parse_args()
......@@ -67,6 +72,7 @@ def start_standard_model(): # pylint: disable=doc-string-missing
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_mkl = args.use_mkl
use_multilang = args.use_multilang
if model == "":
print("You must specify your serving model")
......@@ -83,7 +89,11 @@ def start_standard_model(): # pylint: disable=doc-string-missing
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = serving.Server()
server = None
if use_multilang:
server = serving.MultiLangServer()
else:
server = serving.Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
......
......@@ -85,9 +85,9 @@ class WebService(object):
fetch_map = self.client.predict(feed=feed, fetch=fetch)
for key in fetch_map:
fetch_map[key] = fetch_map[key].tolist()
fetch_map = self.postprocess(
result = self.postprocess(
feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
result = {"result": fetch_map}
result = {"result": result}
except ValueError:
result = {"result": "Request Value Error"}
return result
......
......@@ -68,6 +68,11 @@ def serve_args():
type=int,
default=512 * 1024 * 1024,
help="Limit sizes of messages")
parser.add_argument(
"--use_multilang",
default=False,
action="store_true",
help="Use Multi-language-service")
return parser.parse_args()
......@@ -410,7 +415,7 @@ class Server(object):
os.system("touch {}/fluid_time_file".format(workdir))
if not self.port_is_available(port):
raise SystemExit("Prot {} is already used".format(port))
raise SystemExit("Port {} is already used".format(port))
self.set_port(port)
self._prepare_resource(workdir)
......@@ -484,22 +489,29 @@ class Server(object):
os.system(command)
class MultiLangServerService(
multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelService):
def __init__(self, model_config_path, endpoints):
class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
MultiLangGeneralModelServiceServicer):
def __init__(self, model_config_path, is_multi_model, endpoints):
self.is_multi_model_ = is_multi_model
self.model_config_path_ = model_config_path
self.endpoints_ = endpoints
with open(self.model_config_path_) as f:
self.model_config_str_ = str(f.read())
self._parse_model_config(self.model_config_str_)
self._init_bclient(self.model_config_path_, self.endpoints_)
def _init_bclient(self, model_config_path, endpoints, timeout_ms=None):
from paddle_serving_client import Client
self._parse_model_config(model_config_path)
self.bclient_ = Client()
self.bclient_.load_client_config(
"{}/serving_server_conf.prototxt".format(model_config_path))
if timeout_ms is not None:
self.bclient_.set_rpc_timeout_ms(timeout_ms)
self.bclient_.load_client_config(model_config_path)
self.bclient_.connect(endpoints)
def _parse_model_config(self, model_config_path):
def _parse_model_config(self, model_config_str):
model_conf = m_config.GeneralModelConfig()
f = open("{}/serving_server_conf.prototxt".format(model_config_path),
'r')
model_conf = google.protobuf.text_format.Merge(
str(f.read()), model_conf)
model_conf = google.protobuf.text_format.Merge(model_config_str,
model_conf)
self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.feed_types_ = {}
self.feed_shapes_ = {}
......@@ -524,7 +536,7 @@ class MultiLangServerService(
else:
yield item
def _unpack_request(self, request):
def _unpack_inference_request(self, request):
feed_names = list(request.feed_var_names)
fetch_names = list(request.fetch_var_names)
is_python = request.is_python
......@@ -540,6 +552,8 @@ class MultiLangServerService(
data = np.frombuffer(var.data, dtype="int64")
elif v_type == 1:
data = np.frombuffer(var.data, dtype="float32")
elif v_type == 2:
data = np.frombuffer(var.data, dtype="int32")
else:
raise Exception("error type.")
else:
......@@ -547,6 +561,8 @@ class MultiLangServerService(
data = np.array(list(var.int64_data), dtype="int64")
elif v_type == 1: # float32
data = np.array(list(var.float_data), dtype="float32")
elif v_type == 2:
data = np.array(list(var.int32_data), dtype="int32")
else:
raise Exception("error type.")
data.shape = list(feed_inst.tensor_array[idx].shape)
......@@ -554,55 +570,129 @@ class MultiLangServerService(
feed_batch.append(feed_dict)
return feed_batch, fetch_names, is_python
def _pack_resp_package(self, result, fetch_names, is_python, tag):
resp = multi_lang_general_model_service_pb2.Response()
# Only one model is supported temporarily
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(result[name].reshape(-1).tolist())
elif v_type == 1: # float32
tensor.float_data.extend(result[name].reshape(-1).tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(result[name].shape))
if name in self.lod_tensor_set_:
tensor.lod.extend(result["{}.lod".format(name)].tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
resp.outputs.append(model_output)
def _pack_inference_response(self, ret, fetch_names, is_python):
resp = multi_lang_general_model_service_pb2.InferenceResponse()
if ret is None:
resp.err_code = 1
return resp
results, tag = ret
resp.tag = tag
resp.err_code = 0
if not self.is_multi_model_:
results = {'general_infer_0': results}
for model_name, model_result in results.items():
model_output = multi_lang_general_model_service_pb2.ModelOutput()
inst = multi_lang_general_model_service_pb2.FetchInst()
for idx, name in enumerate(fetch_names):
tensor = multi_lang_general_model_service_pb2.Tensor()
v_type = self.fetch_types_[name]
if is_python:
tensor.data = model_result[name].tobytes()
else:
if v_type == 0: # int64
tensor.int64_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 1: # float32
tensor.float_data.extend(model_result[name].reshape(-1)
.tolist())
elif v_type == 2: # int32
tensor.int32_data.extend(model_result[name].reshape(-1)
.tolist())
else:
raise Exception("error type.")
tensor.shape.extend(list(model_result[name].shape))
if name in self.lod_tensor_set_:
tensor.lod.extend(model_result["{}.lod".format(name)]
.tolist())
inst.tensor_array.append(tensor)
model_output.insts.append(inst)
model_output.engine_name = model_name
resp.outputs.append(model_output)
return resp
def SetTimeout(self, request, context):
# This process and the Inference process cannot run at the same time.
# For performance reasons, no thread lock is added for now.
timeout_ms = request.timeout_ms
self._init_bclient(self.model_config_path_, self.endpoints_, timeout_ms)
resp = multi_lang_general_model_service_pb2.SimpleResponse()
resp.err_code = 0
return resp
def inference(self, request, context):
feed_dict, fetch_names, is_python = self._unpack_request(request)
data, tag = self.bclient_.predict(
def Inference(self, request, context):
feed_dict, fetch_names, is_python = self._unpack_inference_request(
request)
ret = self.bclient_.predict(
feed=feed_dict, fetch=fetch_names, need_variant_tag=True)
return self._pack_resp_package(data, fetch_names, is_python, tag)
return self._pack_inference_response(ret, fetch_names, is_python)
def GetClientConfig(self, request, context):
resp = multi_lang_general_model_service_pb2.GetClientConfigResponse()
resp.client_config_str = self.model_config_str_
return resp
class MultiLangServer(object):
def __init__(self, worker_num=2):
def __init__(self):
self.bserver_ = Server()
self.worker_num_ = worker_num
self.worker_num_ = 4
self.body_size_ = 64 * 1024 * 1024
self.concurrency_ = 100000
self.is_multi_model_ = False # for model ensemble
def set_max_concurrency(self, concurrency):
self.concurrency_ = concurrency
self.bserver_.set_max_concurrency(concurrency)
def set_num_threads(self, threads):
self.worker_num_ = threads
self.bserver_.set_num_threads(threads)
def set_max_body_size(self, body_size):
self.bserver_.set_max_body_size(body_size)
if body_size >= self.body_size_:
self.body_size_ = body_size
else:
print(
"max_body_size is less than default value, will use default value in service."
)
def set_port(self, port):
self.gport_ = port
def set_reload_interval(self, interval):
self.bserver_.set_reload_interval(interval)
def set_op_sequence(self, op_seq):
self.bserver_.set_op_sequence(op_seq)
def load_model_config(self, model_config_path):
if not isinstance(model_config_path, str):
raise Exception(
"MultiLangServer only supports multi-model temporarily")
self.bserver_.load_model_config(model_config_path)
self.model_config_path_ = model_config_path
def set_op_graph(self, op_graph):
self.bserver_.set_op_graph(op_graph)
def set_memory_optimize(self, flag=False):
self.bserver_.set_memory_optimize(flag)
def set_ir_optimize(self, flag=False):
self.bserver_.set_ir_optimize(flag)
def set_gpuid(self, gpuid=0):
self.bserver_.set_gpuid(gpuid)
def load_model_config(self, server_config_paths, client_config_path=None):
self.bserver_.load_model_config(server_config_paths)
if client_config_path is None:
if isinstance(server_config_paths, dict):
self.is_multi_model_ = True
client_config_path = '{}/serving_server_conf.prototxt'.format(
list(server_config_paths.items())[0][1])
else:
client_config_path = '{}/serving_server_conf.prototxt'.format(
server_config_paths)
self.bclient_config_path_ = client_config_path
def prepare_server(self, workdir=None, port=9292, device="cpu"):
if not self._port_is_available(port):
raise SystemExit("Prot {} is already used".format(port))
default_port = 12000
self.port_list_ = []
for i in range(1000):
......@@ -612,7 +702,7 @@ class MultiLangServer(object):
break
self.bserver_.prepare_server(
workdir=workdir, port=self.port_list_[0], device=device)
self.gport_ = port
self.set_port(port)
def _launch_brpc_service(self, bserver):
bserver.run_server()
......@@ -627,12 +717,16 @@ class MultiLangServer(object):
p_bserver = Process(
target=self._launch_brpc_service, args=(self.bserver_, ))
p_bserver.start()
options = [('grpc.max_send_message_length', self.body_size_),
('grpc.max_receive_message_length', self.body_size_)]
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=self.worker_num_))
futures.ThreadPoolExecutor(max_workers=self.worker_num_),
options=options,
maximum_concurrent_rpcs=self.concurrency_)
multi_lang_general_model_service_pb2_grpc.add_MultiLangGeneralModelServiceServicer_to_server(
MultiLangServerService(self.model_config_path_,
["0.0.0.0:{}".format(self.port_list_[0])]),
server)
MultiLangServerServiceServicer(
self.bclient_config_path_, self.is_multi_model_,
["0.0.0.0:{}".format(self.port_list_[0])]), server)
server.add_insecure_port('[::]:{}'.format(self.gport_))
server.start()
p_bserver.join()
......
......@@ -37,6 +37,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
mem_optim = args.mem_optim
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_multilang = args.use_multilang
workdir = "{}_{}".format(args.workdir, gpuid)
if model == "":
......@@ -54,7 +55,10 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = serving.Server()
if use_multilang:
server = serving.MultiLangServer()
else:
server = serving.Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
......
......@@ -50,12 +50,12 @@ class WebService(object):
general_infer_op = op_maker.create('general_infer')
general_response_op = op_maker.create('general_response')
op_seq_maker = serving.OpSeqMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(general_infer_op)
op_seq_maker.add_op(general_response_op)
server = serving.Server()
server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
......@@ -171,7 +171,7 @@ class WebService(object):
processes=1)
def get_app_instance(self):
return app_instance
return self.app_instance
def preprocess(self, feed=[], fetch=[]):
return feed, fetch
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .operator import Op, RequestOp, ResponseOp
from .pipeline_server import PipelineServer
from .pipeline_client import PipelineClient
......@@ -12,3 +12,473 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import threading
import multiprocessing
from paddle_serving_client import MultiLangClient, Client
from concurrent import futures
import logging
import func_timeout
from numpy import *
from .proto import pipeline_service_pb2
from .channel import ThreadChannel, ProcessChannel, ChannelDataEcode, ChannelData, ChannelDataType
from .util import NameGenerator
_LOGGER = logging.getLogger(__name__)
_op_name_gen = NameGenerator("Op")
class Op(object):
def __init__(self,
name=None,
input_ops=[],
server_endpoints=[],
fetch_list=[],
client_config=None,
concurrency=1,
timeout=-1,
retry=1):
if name is None:
name = _op_name_gen.next()
self._is_run = False
self.name = name # to identify the type of OP, it must be globally unique
self.concurrency = concurrency # amount of concurrency
self.set_input_ops(input_ops)
self._server_endpoints = server_endpoints
self.with_serving = False
if len(self._server_endpoints) != 0:
self.with_serving = True
self._client_config = client_config
self._fetch_names = fetch_list
self._timeout = timeout
self._retry = max(1, retry)
self._input = None
self._outputs = []
self._profiler = None
def init_profiler(self, profiler):
self._profiler = profiler
def _profiler_record(self, string):
if self._profiler is None:
return
self._profiler.record(string)
def init_client(self, client_type, client_config, server_endpoints,
fetch_names):
if self.with_serving == False:
_LOGGER.debug("{} no client".format(self.name))
return
_LOGGER.debug("{} client_config: {}".format(self.name, client_config))
_LOGGER.debug("{} fetch_names: {}".format(self.name, fetch_names))
if client_type == 'brpc':
self._client = Client()
self._client.load_client_config(client_config)
elif client_type == 'grpc':
self._client = MultiLangClient()
else:
raise ValueError("unknow client type: {}".format(client_type))
self._client.connect(server_endpoints)
self._fetch_names = fetch_names
def _get_input_channel(self):
return self._input
def get_input_ops(self):
return self._input_ops
def set_input_ops(self, ops):
if not isinstance(ops, list):
ops = [] if ops is None else [ops]
self._input_ops = []
for op in ops:
if not isinstance(op, Op):
raise TypeError(
self._log('input op must be Op type, not {}'.format(
type(op))))
self._input_ops.append(op)
def add_input_channel(self, channel):
if not isinstance(channel, (ThreadChannel, ProcessChannel)):
raise TypeError(
self._log('input channel must be Channel type, not {}'.format(
type(channel))))
channel.add_consumer(self.name)
self._input = channel
def _get_output_channels(self):
return self._outputs
def add_output_channel(self, channel):
if not isinstance(channel, (ThreadChannel, ProcessChannel)):
raise TypeError(
self._log('output channel must be Channel type, not {}'.format(
type(channel))))
channel.add_producer(self.name)
self._outputs.append(channel)
def preprocess(self, input_dicts):
# multiple previous Op
if len(input_dicts) != 1:
raise NotImplementedError(
'this Op has multiple previous inputs. Please override this func.'
)
(_, input_dict), = input_dicts.items()
return input_dict
def process(self, feed_dict):
err, err_info = ChannelData.check_npdata(feed_dict)
if err != 0:
raise NotImplementedError(
"{} Please override preprocess func.".format(err_info))
_LOGGER.debug(self._log('feed_dict: {}'.format(feed_dict)))
_LOGGER.debug(self._log('fetch: {}'.format(self._fetch_names)))
call_result = self._client.predict(
feed=feed_dict, fetch=self._fetch_names)
_LOGGER.debug(self._log("get call_result"))
return call_result
def postprocess(self, fetch_dict):
return fetch_dict
def stop(self):
self._is_run = False
def _parse_channeldata(self, channeldata_dict):
data_id, error_channeldata = None, None
parsed_data = {}
key = list(channeldata_dict.keys())[0]
data_id = channeldata_dict[key].id
for name, data in channeldata_dict.items():
if data.ecode != ChannelDataEcode.OK.value:
error_channeldata = data
break
parsed_data[name] = data.parse()
return data_id, error_channeldata, parsed_data
def _push_to_output_channels(self, data, channels, name=None):
if name is None:
name = self.name
for channel in channels:
channel.push(data, name)
def start_with_process(self, client_type):
proces = []
for concurrency_idx in range(self.concurrency):
p = multiprocessing.Process(
target=self._run,
args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), client_type))
p.start()
proces.append(p)
return proces
def start_with_thread(self, client_type):
threads = []
for concurrency_idx in range(self.concurrency):
t = threading.Thread(
target=self._run,
args=(concurrency_idx, self._get_input_channel(),
self._get_output_channels(), client_type))
t.start()
threads.append(t)
return threads
def load_user_resources(self):
pass
def _run_preprocess(self, parsed_data, data_id, log_func):
preped_data, error_channeldata = None, None
try:
preped_data = self.preprocess(parsed_data)
except NotImplementedError as e:
# preprocess function not implemented
error_info = log_func(e)
_LOGGER.error(error_info)
error_channeldata = ChannelData(
ecode=ChannelDataEcode.NOT_IMPLEMENTED.value,
error_info=error_info,
data_id=data_id)
except TypeError as e:
# Error type in channeldata.datatype
error_info = log_func(e)
_LOGGER.error(error_info)
error_channeldata = ChannelData(
ecode=ChannelDataEcode.TYPE_ERROR.value,
error_info=error_info,
data_id=data_id)
except Exception as e:
error_info = log_func(e)
_LOGGER.error(error_info)
error_channeldata = ChannelData(
ecode=ChannelDataEcode.UNKNOW.value,
error_info=error_info,
data_id=data_id)
return preped_data, error_channeldata
def _run_process(self, preped_data, data_id, log_func):
midped_data, error_channeldata = None, None
if self.with_serving:
ecode = ChannelDataEcode.OK.value
if self._timeout <= 0:
try:
midped_data = self.process(preped_data)
except Exception as e:
ecode = ChannelDataEcode.UNKNOW.value
error_info = log_func(e)
_LOGGER.error(error_info)
else:
for i in range(self._retry):
try:
midped_data = func_timeout.func_timeout(
self._timeout, self.process, args=(preped_data, ))
except func_timeout.FunctionTimedOut as e:
if i + 1 >= self._retry:
ecode = ChannelDataEcode.TIMEOUT.value
error_info = log_func(e)
_LOGGER.error(error_info)
else:
_LOGGER.warn(
log_func("timeout, retry({})".format(i + 1)))
except Exception as e:
ecode = ChannelDataEcode.UNKNOW.value
error_info = log_func(e)
_LOGGER.error(error_info)
break
else:
break
if ecode != ChannelDataEcode.OK.value:
error_channeldata = ChannelData(
ecode=ecode, error_info=error_info, data_id=data_id)
elif midped_data is None:
# op client return None
error_channeldata = ChannelData(
ecode=ChannelDataEcode.CLIENT_ERROR.value,
error_info=log_func(
"predict failed. pls check the server side."),
data_id=data_id)
else:
midped_data = preped_data
return midped_data, error_channeldata
def _run_postprocess(self, midped_data, data_id, log_func):
output_data, error_channeldata = None, None
try:
postped_data = self.postprocess(midped_data)
except Exception as e:
error_info = log_func(e)
_LOGGER.error(error_info)
error_channeldata = ChannelData(
ecode=ChannelDataEcode.UNKNOW.value,
error_info=error_info,
data_id=data_id)
return output_data, error_channeldata
if not isinstance(postped_data, dict):
error_info = log_func("output of postprocess funticon must be " \
"dict type, but get {}".format(type(postped_data)))
_LOGGER.error(error_info)
error_channeldata = ChannelData(
ecode=ChannelDataEcode.UNKNOW.value,
error_info=error_info,
data_id=data_id)
return output_data, error_channeldata
err, _ = ChannelData.check_npdata(postped_data)
if err == 0:
output_data = ChannelData(
ChannelDataType.CHANNEL_NPDATA.value,
npdata=postped_data,
data_id=data_id)
else:
output_data = ChannelData(
ChannelDataType.DICT.value,
dictdata=postped_data,
data_id=data_id)
return output_data, error_channeldata
def _run(self, concurrency_idx, input_channel, output_channels,
client_type):
def get_log_func(op_info_prefix):
def log_func(info_str):
return "{} {}".format(op_info_prefix, info_str)
return log_func
op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
log = get_log_func(op_info_prefix)
tid = threading.current_thread().ident
# create client based on client_type
self.init_client(client_type, self._client_config,
self._server_endpoints, self._fetch_names)
# load user resources
self.load_user_resources()
self._is_run = True
while self._is_run:
self._profiler_record("{}-get#{}_0".format(op_info_prefix, tid))
channeldata_dict = input_channel.front(self.name)
self._profiler_record("{}-get#{}_1".format(op_info_prefix, tid))
_LOGGER.debug(log("input_data: {}".format(channeldata_dict)))
data_id, error_channeldata, parsed_data = self._parse_channeldata(
channeldata_dict)
# error data in predecessor Op
if error_channeldata is not None:
self._push_to_output_channels(error_channeldata,
output_channels)
continue
# preprecess
self._profiler_record("{}-prep#{}_0".format(op_info_prefix, tid))
preped_data, error_channeldata = self._run_preprocess(parsed_data,
data_id, log)
self._profiler_record("{}-prep#{}_1".format(op_info_prefix, tid))
if error_channeldata is not None:
self._push_to_output_channels(error_channeldata,
output_channels)
continue
# process
self._profiler_record("{}-midp#{}_0".format(op_info_prefix, tid))
midped_data, error_channeldata = self._run_process(preped_data,
data_id, log)
self._profiler_record("{}-midp#{}_1".format(op_info_prefix, tid))
if error_channeldata is not None:
self._push_to_output_channels(error_channeldata,
output_channels)
continue
# postprocess
self._profiler_record("{}-postp#{}_0".format(op_info_prefix, tid))
output_data, error_channeldata = self._run_postprocess(midped_data,
data_id, log)
self._profiler_record("{}-postp#{}_1".format(op_info_prefix, tid))
if error_channeldata is not None:
self._push_to_output_channels(error_channeldata,
output_channels)
continue
# push data to channel (if run succ)
self._profiler_record("{}-push#{}_0".format(op_info_prefix, tid))
self._push_to_output_channels(output_data, output_channels)
self._profiler_record("{}-push#{}_1".format(op_info_prefix, tid))
def _log(self, info):
return "{} {}".format(self.name, info)
class RequestOp(Op):
""" RequestOp do not run preprocess, process, postprocess. """
def __init__(self, concurrency=1):
# PipelineService.name = "#G"
super(RequestOp, self).__init__(
name="#G", input_ops=[], concurrency=concurrency)
# load user resources
self.load_user_resources()
def unpack_request_package(self, request):
dictdata = {}
for idx, key in enumerate(request.key):
data = request.value[idx]
try:
data = eval(data)
except Exception as e:
pass
dictdata[key] = data
return dictdata
class ResponseOp(Op):
""" ResponseOp do not run preprocess, process, postprocess. """
def __init__(self, input_ops, concurrency=1):
super(ResponseOp, self).__init__(
name="#R", input_ops=input_ops, concurrency=concurrency)
# load user resources
self.load_user_resources()
def pack_response_package(self, channeldata):
resp = pipeline_service_pb2.Response()
resp.ecode = channeldata.ecode
if resp.ecode == ChannelDataEcode.OK.value:
if channeldata.datatype == ChannelDataType.CHANNEL_NPDATA.value:
feed = channeldata.parse()
# ndarray to string:
# https://stackoverflow.com/questions/30167538/convert-a-numpy-ndarray-to-stringor-bytes-and-convert-it-back-to-numpy-ndarray
for name, var in feed.items():
resp.value.append(var.__repr__())
resp.key.append(name)
elif channeldata.datatype == ChannelDataType.DICT.value:
feed = channeldata.parse()
for name, var in feed.items():
if not isinstance(var, str):
resp.ecode = ChannelDataEcode.TYPE_ERROR.value
resp.error_info = self._log(
"fetch var type must be str({}).".format(
type(var)))
break
resp.value.append(var)
resp.key.append(name)
else:
resp.ecode = ChannelDataEcode.TYPE_ERROR.value
resp.error_info = self._log(
"Error type({}) in datatype.".format(channeldata.datatype))
_LOGGER.error(resp.error_info)
else:
resp.error_info = channeldata.error_info
return resp
class VirtualOp(Op):
''' For connecting two channels. '''
def __init__(self, name, concurrency=1):
super(VirtualOp, self).__init__(
name=name, input_ops=None, concurrency=concurrency)
self._virtual_pred_ops = []
def add_virtual_pred_op(self, op):
self._virtual_pred_ops.append(op)
def add_output_channel(self, channel):
if not isinstance(channel, (ThreadChannel, ProcessChannel)):
raise TypeError(
self._log('output channel must be Channel type, not {}'.format(
type(channel))))
for op in self._virtual_pred_ops:
channel.add_producer(op.name)
self._outputs.append(channel)
def _run(self, concurrency_idx, input_channel, output_channels,
client_type):
def get_log_func(op_info_prefix):
def log_func(info_str):
return "{} {}".format(op_info_prefix, info_str)
return log_func
op_info_prefix = "[{}|{}]".format(self.name, concurrency_idx)
log = get_log_func(op_info_prefix)
tid = threading.current_thread().ident
self._is_run = True
while self._is_run:
self._profiler_record("{}-get#{}_0".format(op_info_prefix, tid))
channeldata_dict = input_channel.front(self.name)
self._profiler_record("{}-get#{}_1".format(op_info_prefix, tid))
self._profiler_record("{}-push#{}_0".format(op_info_prefix, tid))
for name, data in channeldata_dict.items():
self._push_to_output_channels(
data, channels=output_channels, name=name)
self._profiler_record("{}-push#{}_1".format(op_info_prefix, tid))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import grpc
import numpy as np
from numpy import *
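# NOTE: the star import deliberately keeps numpy names such as `array` in the
# module namespace, so that the eval() in _unpack_response_package() can
# rebuild ndarrays from the repr() strings returned by the server.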
import logging
import functools
from .proto import pipeline_service_pb2
from .proto import pipeline_service_pb2_grpc
_LOGGER = logging.getLogger(__name__)
class PipelineClient(object):
def __init__(self):
self._channel = None
def connect(self, endpoint):
self._channel = grpc.insecure_channel(endpoint)
self._stub = pipeline_service_pb2_grpc.PipelineServiceStub(
self._channel)
def _pack_request_package(self, feed_dict):
req = pipeline_service_pb2.Request()
for key, value in feed_dict.items():
req.key.append(key)
if isinstance(value, np.ndarray):
req.value.append(value.__repr__())
elif isinstance(value, str):
req.value.append(value)
elif isinstance(value, list):
req.value.append(np.array(value).__repr__())
else:
raise TypeError("only str and np.ndarray type is supported: {}".
format(type(value)))
return req
def _unpack_response_package(self, resp, fetch):
if resp.ecode != 0:
return {"ecode": resp.ecode, "error_info": resp.error_info}
fetch_map = {"ecode": resp.ecode}
for idx, key in enumerate(resp.key):
if key not in fetch:
continue
data = resp.value[idx]
try:
data = eval(data)
except Exception as e:
pass
fetch_map[key] = data
return fetch_map
def predict(self, feed_dict, fetch, asyn=False):
if not isinstance(feed_dict, dict):
raise TypeError(
"feed must be dict type with format: {name: value}.")
if not isinstance(fetch, list):
raise TypeError("fetch must be list type with format: [name].")
req = self._pack_request_package(feed_dict)
if not asyn:
resp = self._stub.inference(req)
            return self._unpack_response_package(resp, fetch)
else:
call_future = self._stub.inference.future(req)
return PipelinePredictFuture(
call_future,
functools.partial(
self._unpack_response_package, fetch=fetch))
class PipelinePredictFuture(object):
def __init__(self, call_future, callback_func):
self.call_future_ = call_future
self.callback_func_ = callback_func
def result(self):
resp = self.call_future_.result()
return self.callback_func_(resp)
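# --- Illustrative usage sketch (not part of the original file) ---
# Minimal client calls against a running pipeline server. The endpoint
# "127.0.0.1:8080" and the feed/fetch names "words"/"prediction" are
# placeholders and must match the DAG actually deployed.
if __name__ == "__main__":
    client = PipelineClient()
    client.connect("127.0.0.1:8080")
    feed = {"words": np.array([1.0, 2.0, 3.0])}

    # synchronous call: blocks until the pipeline returns
    ret = client.predict(feed_dict=feed, fetch=["prediction"])
    print(ret)  # {"ecode": 0, "prediction": ...} on success

    # asynchronous call: returns a PipelinePredictFuture immediately
    future = client.predict(feed_dict=feed, fetch=["prediction"], asyn=True)
    print(future.result())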
......@@ -12,3 +12,440 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import threading
import multiprocessing
import multiprocessing.queues
import sys
if sys.version_info.major == 2:
import Queue
elif sys.version_info.major == 3:
import queue as Queue
else:
raise Exception("Error Python version")
import os
from paddle_serving_client import MultiLangClient, Client
from concurrent import futures
import numpy as np
import grpc
import logging
import random
import time
import func_timeout
import enum
import collections
import copy
import socket
from contextlib import closing
import yaml
from .proto import pipeline_service_pb2
from .proto import pipeline_service_pb2_grpc
from .operator import Op, RequestOp, ResponseOp, VirtualOp
from .channel import ThreadChannel, ProcessChannel, ChannelData, ChannelDataEcode, ChannelDataType
from .profiler import TimeProfiler
from .util import NameGenerator
_LOGGER = logging.getLogger(__name__)
_profiler = TimeProfiler()
class PipelineService(pipeline_service_pb2_grpc.PipelineServiceServicer):
def __init__(self, in_channel, out_channel, unpack_func, pack_func,
retry=2):
super(PipelineService, self).__init__()
self.name = "#G"
self.set_in_channel(in_channel)
self.set_out_channel(out_channel)
_LOGGER.debug(self._log(in_channel.debug()))
_LOGGER.debug(self._log(out_channel.debug()))
        # TODO:
        #   use separate locks for different clients
        #   use different locks for server and client
self._id_lock = threading.Lock()
self._cv = threading.Condition()
self._globel_resp_dict = {}
self._id_counter = 0
self._reset_max_id = 1000000000000000000
self._retry = retry
self._is_run = True
self._pack_func = pack_func
self._unpack_func = unpack_func
self._recive_func = threading.Thread(
target=PipelineService._recive_out_channel_func, args=(self, ))
self._recive_func.start()
def _log(self, info_str):
return "[{}] {}".format(self.name, info_str)
def set_in_channel(self, in_channel):
if not isinstance(in_channel, (ThreadChannel, ProcessChannel)):
raise TypeError(
                self._log('in_channel must be Channel type, but got {}'.format(
type(in_channel))))
in_channel.add_producer(self.name)
self._in_channel = in_channel
def set_out_channel(self, out_channel):
if not isinstance(out_channel, (ThreadChannel, ProcessChannel)):
raise TypeError(
                self._log('out_channel must be Channel type, but got {}'.format(
type(out_channel))))
out_channel.add_consumer(self.name)
self._out_channel = out_channel
def stop(self):
self._is_run = False
def _recive_out_channel_func(self):
while self._is_run:
channeldata_dict = self._out_channel.front(self.name)
if len(channeldata_dict) != 1:
raise Exception("out_channel cannot have multiple input ops")
(_, channeldata), = channeldata_dict.items()
if not isinstance(channeldata, ChannelData):
raise TypeError(
                    self._log('data must be ChannelData type, but got {}'.
format(type(channeldata))))
with self._cv:
data_id = channeldata.id
self._globel_resp_dict[data_id] = channeldata
self._cv.notify_all()
def _get_next_id(self):
with self._id_lock:
if self._id_counter >= self._reset_max_id:
self._id_counter -= self._reset_max_id
self._id_counter += 1
return self._id_counter - 1
def _get_data_in_globel_resp_dict(self, data_id):
resp = None
with self._cv:
while data_id not in self._globel_resp_dict:
self._cv.wait()
resp = self._globel_resp_dict.pop(data_id)
self._cv.notify_all()
return resp
def _pack_data_for_infer(self, request):
        _LOGGER.debug(self._log('start inference'))
data_id = self._get_next_id()
dictdata = None
try:
dictdata = self._unpack_func(request)
except Exception as e:
return ChannelData(
ecode=ChannelDataEcode.RPC_PACKAGE_ERROR.value,
error_info="rpc package error: {}".format(e),
data_id=data_id), data_id
else:
return ChannelData(
datatype=ChannelDataType.DICT.value,
dictdata=dictdata,
data_id=data_id), data_id
def _pack_data_for_resp(self, channeldata):
_LOGGER.debug(self._log('get channeldata'))
return self._pack_func(channeldata)
def inference(self, request, context):
_profiler.record("{}-prepack_0".format(self.name))
data, data_id = self._pack_data_for_infer(request)
_profiler.record("{}-prepack_1".format(self.name))
resp_channeldata = None
for i in range(self._retry):
_LOGGER.debug(self._log('push data'))
_profiler.record("{}-push_0".format(self.name))
self._in_channel.push(data, self.name)
_profiler.record("{}-push_1".format(self.name))
_LOGGER.debug(self._log('wait for infer'))
_profiler.record("{}-fetch_0".format(self.name))
resp_channeldata = self._get_data_in_globel_resp_dict(data_id)
_profiler.record("{}-fetch_1".format(self.name))
if resp_channeldata.ecode == ChannelDataEcode.OK.value:
break
if i + 1 < self._retry:
_LOGGER.warn("retry({}): {}".format(
i + 1, resp_channeldata.error_info))
_profiler.record("{}-postpack_0".format(self.name))
resp = self._pack_data_for_resp(resp_channeldata)
_profiler.record("{}-postpack_1".format(self.name))
_profiler.print_profile()
return resp
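# Request lifecycle through PipelineService (summary of the code above):
# inference() assigns a data id, unpacks the Request into ChannelData, pushes
# it into the input channel, and blocks in _get_data_in_globel_resp_dict()
# until the background _recive_out_channel_func thread has moved the matching
# result from the output channel into _globel_resp_dict; failed requests are
# retried up to `retry` times before the response is packed and returned.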
class PipelineServer(object):
def __init__(self):
self._channels = []
self._actual_ops = []
self._port = None
self._worker_num = None
self._in_channel = None
self._out_channel = None
self._response_op = None
self._pack_func = None
self._unpack_func = None
def add_channel(self, channel):
self._channels.append(channel)
def gen_desc(self):
        _LOGGER.info('gen_desc: desc generation for PaaS is not implemented yet')
pass
def set_response_op(self, response_op):
if not isinstance(response_op, Op):
raise Exception("response_op must be Op type.")
if len(response_op.get_input_ops()) != 1:
raise Exception("response_op can only have one previous op.")
self._response_op = response_op
def _topo_sort(self, response_op):
if response_op is None:
raise Exception("response_op has not been set.")
def get_use_ops(root):
# root: response_op
unique_names = set()
use_ops = set()
succ_ops_of_use_op = {} # {op_name: succ_ops}
que = Queue.Queue()
que.put(root)
#use_ops.add(root)
#unique_names.add(root.name)
while que.qsize() != 0:
op = que.get()
for pred_op in op.get_input_ops():
if pred_op.name not in succ_ops_of_use_op:
succ_ops_of_use_op[pred_op.name] = []
if op != root:
succ_ops_of_use_op[pred_op.name].append(op)
if pred_op not in use_ops:
que.put(pred_op)
use_ops.add(pred_op)
                    # check that the Op name is globally unique
if pred_op.name in unique_names:
raise Exception("the name of Op must be unique: {}".
format(pred_op.name))
unique_names.add(pred_op.name)
return use_ops, succ_ops_of_use_op
use_ops, out_degree_ops = get_use_ops(response_op)
if len(use_ops) <= 1:
raise Exception(
"Besides RequestOp and ResponseOp, there should be at least one Op in DAG."
)
name2op = {op.name: op for op in use_ops}
out_degree_num = {
name: len(ops)
for name, ops in out_degree_ops.items()
}
        que_idx = 0  # index of the active queue (two queues are used alternately)
ques = [Queue.Queue() for _ in range(2)]
zero_indegree_num = 0
for op in use_ops:
if len(op.get_input_ops()) == 0:
zero_indegree_num += 1
if zero_indegree_num != 1:
raise Exception("DAG contains multiple input Ops")
last_op = response_op.get_input_ops()[0]
ques[que_idx].put(last_op)
# topo sort to get dag_views
dag_views = []
sorted_op_num = 0
while True:
que = ques[que_idx]
next_que = ques[(que_idx + 1) % 2]
dag_view = []
while que.qsize() != 0:
op = que.get()
dag_view.append(op)
sorted_op_num += 1
for pred_op in op.get_input_ops():
out_degree_num[pred_op.name] -= 1
if out_degree_num[pred_op.name] == 0:
next_que.put(pred_op)
dag_views.append(dag_view)
if next_que.qsize() == 0:
break
que_idx = (que_idx + 1) % 2
if sorted_op_num < len(use_ops):
raise Exception("not legal DAG")
# create channels and virtual ops
def gen_channel(name_gen):
channel = None
if self._use_multithread:
channel = ThreadChannel(name=name_gen.next())
else:
channel = ProcessChannel(self._manager, name=name_gen.next())
return channel
def gen_virtual_op(name_gen):
return VirtualOp(name=name_gen.next())
virtual_op_name_gen = NameGenerator("vir")
channel_name_gen = NameGenerator("chl")
virtual_ops = []
channels = []
input_channel = None
actual_view = None
dag_views = list(reversed(dag_views))
for v_idx, view in enumerate(dag_views):
if v_idx + 1 >= len(dag_views):
break
next_view = dag_views[v_idx + 1]
if actual_view is None:
actual_view = view
actual_next_view = []
pred_op_of_next_view_op = {}
for op in actual_view:
# find actual succ op in next view and create virtual op
for succ_op in out_degree_ops[op.name]:
if succ_op in next_view:
if succ_op not in actual_next_view:
actual_next_view.append(succ_op)
if succ_op.name not in pred_op_of_next_view_op:
pred_op_of_next_view_op[succ_op.name] = []
pred_op_of_next_view_op[succ_op.name].append(op)
else:
# create virtual op
virtual_op = gen_virtual_op(virtual_op_name_gen)
virtual_ops.append(virtual_op)
out_degree_ops[virtual_op.name] = [succ_op]
actual_next_view.append(virtual_op)
pred_op_of_next_view_op[virtual_op.name] = [op]
virtual_op.add_virtual_pred_op(op)
actual_view = actual_next_view
# create channel
processed_op = set()
for o_idx, op in enumerate(actual_next_view):
if op.name in processed_op:
continue
channel = gen_channel(channel_name_gen)
channels.append(channel)
_LOGGER.debug("{} => {}".format(channel.name, op.name))
op.add_input_channel(channel)
pred_ops = pred_op_of_next_view_op[op.name]
if v_idx == 0:
input_channel = channel
else:
                    # if pred_op is a virtual op, its ancestor ops are registered as the channel's producers
for pred_op in pred_ops:
_LOGGER.debug("{} => {}".format(pred_op.name,
channel.name))
pred_op.add_output_channel(channel)
processed_op.add(op.name)
                # ops in this view that share the same predecessors reuse this channel
for other_op in actual_next_view[o_idx + 1:]:
if other_op.name in processed_op:
continue
other_pred_ops = pred_op_of_next_view_op[other_op.name]
if len(other_pred_ops) != len(pred_ops):
continue
same_flag = True
for pred_op in pred_ops:
if pred_op not in other_pred_ops:
same_flag = False
break
if same_flag:
_LOGGER.debug("{} => {}".format(channel.name,
other_op.name))
other_op.add_input_channel(channel)
processed_op.add(other_op.name)
output_channel = gen_channel(channel_name_gen)
channels.append(output_channel)
last_op.add_output_channel(output_channel)
pack_func, unpack_func = None, None
pack_func = self._response_op.pack_response_package
self._actual_ops = virtual_ops
for op in use_ops:
if len(op.get_input_ops()) == 0:
unpack_func = op.unpack_request_package
continue
self._actual_ops.append(op)
self._channels = channels
for c in channels:
_LOGGER.debug(c.debug())
return input_channel, output_channel, pack_func, unpack_func
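    # _topo_sort summary: starting from the ResponseOp's only predecessor, the
    # DAG is sliced into "views" in reverse topological order; VirtualOps are
    # inserted when an edge skips a view, and ops in the same view that share
    # the same predecessors are wired to a single shared channel.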
def _port_is_available(self, port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex(('0.0.0.0', port))
return result != 0
def prepare_server(self, yml_file):
with open(yml_file) as f:
            yml_config = yaml.load(f.read(), Loader=yaml.SafeLoader)
self._port = yml_config.get('port', 8080)
if not self._port_is_available(self._port):
raise SystemExit("Prot {} is already used".format(self._port))
self._worker_num = yml_config.get('worker_num', 2)
self._retry = yml_config.get('retry', 1)
self._client_type = yml_config.get('client_type', 'brpc')
self._use_multithread = yml_config.get('use_multithread', True)
profile = yml_config.get('profile', False)
if not self._use_multithread:
self._manager = multiprocessing.Manager()
if profile:
raise Exception(
"profile cannot be used in multiprocess version temporarily")
_profiler.enable(profile)
input_channel, output_channel, self._pack_func, self._unpack_func = self._topo_sort(
self._response_op)
self._in_channel = input_channel
self._out_channel = output_channel
for op in self._actual_ops:
if op.with_serving:
self.prepare_serving(op)
self.gen_desc()
def _run_ops(self):
threads_or_proces = []
for op in self._actual_ops:
op.init_profiler(_profiler)
if self._use_multithread:
threads_or_proces.extend(
op.start_with_thread(self._client_type))
else:
threads_or_proces.extend(
op.start_with_process(self._client_type))
return threads_or_proces
def _stop_all(self, service):
service.stop()
for op in self._actual_ops:
op.stop()
for chl in self._channels:
chl.stop()
def run_server(self):
op_threads_or_proces = self._run_ops()
service = PipelineService(self._in_channel, self._out_channel,
self._unpack_func, self._pack_func,
self._retry)
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=self._worker_num))
pipeline_service_pb2_grpc.add_PipelineServiceServicer_to_server(service,
server)
server.add_insecure_port('[::]:{}'.format(self._port))
server.start()
server.wait_for_termination()
        self._stop_all(service)  # reached after the gRPC server terminates
for x in op_threads_or_proces:
x.join()
def prepare_serving(self, op):
# run a server (not in PyServing)
_LOGGER.info("run a server (not in PyServing)")
......@@ -12,3 +12,54 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import os
import sys
import logging
if sys.version_info.major == 2:
import Queue
elif sys.version_info.major == 3:
import queue as Queue
else:
raise Exception("Error Python version")
import time
_LOGGER = logging.getLogger(__name__)
class TimeProfiler(object):
def __init__(self):
self._pid = os.getpid()
self._print_head = 'PROFILE\tpid:{}\t'.format(self._pid)
self._time_record = Queue.Queue()
self._enable = False
def enable(self, enable):
self._enable = enable
def record(self, name_with_tag):
if self._enable is False:
return
name_with_tag = name_with_tag.split("_")
tag = name_with_tag[-1]
name = '_'.join(name_with_tag[:-1])
self._time_record.put((name, tag, int(round(time.time() * 1000000))))
def print_profile(self):
if self._enable is False:
return
print_str = self._print_head
tmp = {}
while not self._time_record.empty():
name, tag, timestamp = self._time_record.get()
if name in tmp:
ptag, ptimestamp = tmp.pop(name)
print_str += "{}_{}:{} ".format(name, ptag, ptimestamp)
print_str += "{}_{}:{} ".format(name, tag, timestamp)
else:
tmp[name] = (tag, timestamp)
print_str += "\n"
sys.stderr.write(print_str)
for name, item in tmp.items():
tag, timestamp = item
self._time_record.put((name, tag, timestamp))
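# --- Illustrative self-check (not part of the original file) ---
# Records one start/end pair and prints it; the name "demo-infer" is a
# placeholder. By convention the suffix after the last '_' is the tag
# ("0" = start, "1" = end) and everything before it is the event name.
if __name__ == "__main__":
    profiler = TimeProfiler()
    profiler.enable(True)
    profiler.record("demo-infer_0")
    time.sleep(0.01)
    profiler.record("demo-infer_1")
    # writes "PROFILE\tpid:<pid>\tdemo-infer_0:<ts> demo-infer_1:<ts>" to stderr
    profiler.print_profile()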
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package baidu.paddle_serving.pipeline_serving;
message Request {
repeated string key = 1;
repeated string value = 2;
};
message Response {
repeated string key = 1;
repeated string value = 2;
required int32 ecode = 3;
optional string error_info = 4;
};
service PipelineService {
rpc inference(Request) returns (Response) {}
};
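// Illustrative encoding (not part of the original file): a feed named "words"
// carrying a numpy array is sent as
//   key:   "words"
//   value: "array([1., 2., 3.])"   // repr() of the ndarray; the server eval()s it back
// A successful Response mirrors the fetch variables in key/value and sets
// ecode = 0; on failure ecode != 0 and error_info carries the message.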
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2015 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs protoc with the gRPC plugin to generate messages and gRPC stubs."""
from grpc_tools import protoc
protoc.main((
'',
'-I.',
'--python_out=.',
'--grpc_python_out=.',
'pipeline_service.proto', ))
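# The same stubs can also be generated from the command line (grpcio-tools is
# pinned in the pipeline requirements):
#
#   python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. \
#       pipeline_service.proto
#
# Either way the output is pipeline_service_pb2.py and
# pipeline_service_pb2_grpc.py, the modules imported by the pipeline client
# and server.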
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
class NameGenerator(object):
def __init__(self, prefix):
self._idx = -1
self._prefix = prefix
def next(self):
self._idx += 1
return "{}{}".format(self._prefix, self._idx)
numpy>=1.12, <=1.16.4 ; python_version<"3.5"
grpcio-tools>=1.28.1
grpcio>=1.28.1
func-timeout>=4.3.5
......@@ -65,11 +65,14 @@ REQUIRED_PACKAGES = [
if not find_package("paddlepaddle") and not find_package("paddlepaddle-gpu"):
REQUIRED_PACKAGES.append("paddlepaddle")
packages=['paddle_serving_client',
'paddle_serving_client.proto',
'paddle_serving_client.io',
'paddle_serving_client.metric',
'paddle_serving_client.utils',]
'paddle_serving_client.metric',
'paddle_serving_client.utils',
'paddle_serving_client.pipeline',
'paddle_serving_client.pipeline.proto']
package_data={'paddle_serving_client': ['serving_client.so','lib/*'],}
package_dir={'paddle_serving_client':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client',
......@@ -77,10 +80,14 @@ package_dir={'paddle_serving_client':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/proto',
'paddle_serving_client.io':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/io',
'paddle_serving_client.metric':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/metric',
'paddle_serving_client.utils':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/utils',}
'paddle_serving_client.metric':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/metric',
'paddle_serving_client.utils':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/utils',
'paddle_serving_client.pipeline':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/pipeline',
'paddle_serving_client.pipeline.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/pipeline/proto'}
setup(
name='paddle-serving-client',
......
......@@ -42,12 +42,18 @@ REQUIRED_PACKAGES = [
]
packages=['paddle_serving_server',
'paddle_serving_server.proto']
'paddle_serving_server.proto',
'paddle_serving_server.pipeline',
'paddle_serving_server.pipeline.proto']
package_dir={'paddle_serving_server':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server',
'paddle_serving_server.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto'}
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/proto',
'paddle_serving_server.pipeline':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/pipeline',
'paddle_serving_server.pipeline.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server/pipeline/proto'}
setup(
name='paddle-serving-server',
......
......@@ -43,12 +43,18 @@ REQUIRED_PACKAGES = [
packages=['paddle_serving_server_gpu',
'paddle_serving_server_gpu.proto']
'paddle_serving_server_gpu.proto',
'paddle_serving_server_gpu.pipeline',
'paddle_serving_server_gpu.pipeline.proto']
package_dir={'paddle_serving_server_gpu':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu',
'paddle_serving_server_gpu.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto'}
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/proto',
'paddle_serving_server_gpu.pipeline':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline',
'paddle_serving_server_gpu.pipeline.proto':
'${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_server_gpu/pipeline/proto'}
setup(
name='paddle-serving-server-gpu',
......
......@@ -7,8 +7,9 @@ RUN yum -y install wget && \
yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false && \
yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && \
yum -y install python3 python3-devel && \
yum clean all && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
yum clean all
RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && rm get-pip.py && \
localedef -c -i en_US -f UTF-8 en_US.UTF-8 && \
echo "export LANG=en_US.utf8" >> /root/.bashrc
FROM centos:7.3.1611
RUN yum -y install wget >/dev/null \
&& yum -y install gcc gcc-c++ make glibc-static which >/dev/null \
&& yum -y install git openssl-devel curl-devel bzip2-devel python-devel >/dev/null \
&& wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
RUN yum -y install wget \
&& yum -y install gcc gcc-c++ make glibc-static which \
&& yum -y install git openssl-devel curl-devel bzip2-devel python-devel
RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
&& tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \
&& mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \
&& echo 'export PATH=/usr/local/cmake3.2.0/bin:$PATH' >> /root/.bashrc \
&& rm cmake-3.2.0-Linux-x86_64.tar.gz \
&& wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
&& rm cmake-3.2.0-Linux-x86_64.tar.gz
RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
&& tar xzf go1.14.linux-amd64.tar.gz \
&& mv go /usr/local/go \
&& echo 'export GOROOT=/usr/local/go' >> /root/.bashrc \
&& echo 'export PATH=/usr/local/go/bin:$PATH' >> /root/.bashrc \
&& rm go1.14.linux-amd64.tar.gz \
&& yum -y install python-devel sqlite-devel >/dev/null \
&& rm go1.14.linux-amd64.tar.gz
RUN yum -y install python-devel sqlite-devel \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
&& python get-pip.py >/dev/null \
&& pip install google protobuf setuptools wheel flask >/dev/null \
&& rm get-pip.py \
&& yum install -y python3 python3-devel \
&& rm get-pip.py
RUN yum install -y python3 python3-devel \
&& pip3 install google protobuf setuptools wheel flask \
&& yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
&& yum clean all \
&& localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
&& yum clean all
RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
&& echo "export LANG=en_US.utf8" >> /root/.bashrc
......@@ -8,10 +8,12 @@ RUN yum -y install wget && \
yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false && \
yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && \
yum -y install python3 python3-devel && \
yum clean all && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && rm get-pip.py && \
ln -s /usr/local/cuda-9.0/lib64/libcublas.so.9.0 /usr/local/cuda-9.0/lib64/libcublas.so && \
yum clean all
RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
python get-pip.py && rm get-pip.py
RUN ln -s /usr/local/cuda-9.0/lib64/libcublas.so.9.0 /usr/local/cuda-9.0/lib64/libcublas.so && \
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> /root/.bashrc && \
ln -s /usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudnn.so.7 /usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudnn.so && \
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-9.0/targets/x86_64-linux/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \
......
This diff is collapsed.
This diff is collapsed.