Merge remote-tracking branch 'upstream/develop' into trt

67304e33 · MRXLT · 503534a2 · 13ce817d · 67304e33 · 67304e33
118 changed file
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
 <p>
 <p align="center">
    <br>
    <a href="https://travis-ci.com/PaddlePaddle/Serving">
@@ -29,7 +30,7 @@ We consider deploying deep learning inference service online to be a user-facing
 <h2 align="center">Installation</h2>
-We **highly recommend** you to **run Paddle Serving in Docker**, please visit [Run in Docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md)
+We **highly recommend** you to **run Paddle Serving in Docker**, please visit [Run in Docker](https://github.com/PaddlePaddle/Serving/blob/develop/doc/RUN_IN_DOCKER.md). See the [document](doc/DOCKER_IMAGES.md) for more docker images.
 ```
 # Run CPU Docker
 docker pull hub.baidubce.com/paddlepaddle/serving:latest
@@ -38,8 +39,8 @@ docker exec -it test bash
 ```
 ```
 # Run GPU Docker
-nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
-nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
 nvidia-docker exec -it test bash
 ```
@@ -58,6 +59,15 @@ Packages of Paddle Serving support Centos 6/7 and Ubuntu 16/18, or you can use H
 <h2 align="center"> Pre-built services with Paddle Serving</h2>
+<h3 align="center">Latest release</h4>
+<p align="center">
+    <a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/ocr">Optical Character Recognition</a>
+    <br>
+    <a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/faster_rcnn_model">Object Detection</a>
+    <br>
+    <a href="https://github.com/PaddlePaddle/Serving/tree/develop/python/examples/deeplabv3">Image Segmentation</a>
+<p>
 <h3 align="center">Chinese Word Segmentation</h4>
 ``` shell
@@ -184,11 +194,6 @@ Here, `client.predict` function has two arguments. `feed` is a `python dict` wit
 <h2 align="center">Community</h2>
-### User Group in China
-<p align="center"><img width="200" height="300" margin="500" src="./doc/qq.jpeg"/>&#8194;&#8194;&#8194;&#8194;&#8194<img width="200" height="300"  src="doc/wechat.jpeg"/></p>
-<p align="center">PaddleServing交流QQ群&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;&#8194;PaddleServing微信群</p>    
 ### Slack

--- a/README_CN.md
+++ b/README_CN.md
@@ -7,6 +7,7 @@
 <p>
 <p align="center">
    <br>
    <a href="https://travis-ci.com/PaddlePaddle/Serving">
@@ -31,7 +32,7 @@ Paddle Serving 旨在帮助深度学习开发者轻易部署在线预测服务
 <h2 align="center">安装</h2>
-**强烈建议**您在**Docker内构建**Paddle Serving，请查看[如何在Docker中运行PaddleServing](doc/RUN_IN_DOCKER_CN.md)
+**强烈建议**您在**Docker内构建**Paddle Serving，请查看[如何在Docker中运行PaddleServing](doc/RUN_IN_DOCKER_CN.md)。更多镜像请查看[Docker镜像列表](doc/DOCKER_IMAGES_CN.md)。
 ```
 # 启动 CPU Docker
@@ -41,8 +42,8 @@ docker exec -it test bash
 ```
 ```
 # 启动 GPU Docker
-nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
-nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
 nvidia-docker exec -it test bash
 ```
 ```shell

--- a/core/configure/proto/multi_lang_general_model_service.proto
+++ b/core/configure/proto/multi_lang_general_model_service.proto
@@ -14,6 +14,10 @@
 syntax = "proto2";
+option java_multiple_files = true;
+option java_package = "io.paddle.serving.grpc";
+option java_outer_classname = "ServingProto";
 message Tensor {
  optional bytes data = 1;
  repeated int32 int_data = 2;

--- a/core/cube/cube-api/src/cube_cli.cpp
+++ b/core/cube/cube-api/src/cube_cli.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include <gflags/gflags.h>
+#include <algorithm>
 #include <atomic>
 #include <fstream>
 #include <thread>  //NOLINT
@@ -31,8 +32,9 @@ DEFINE_bool(print_output, false, "print output flag");
 DEFINE_int32(thread_num, 1, "thread num");
 std::atomic<int> g_concurrency(0);
-std::vector<uint64_t> time_list;
+std::vector<std::vector<uint64_t>> time_list;
 std::vector<uint64_t> request_list;
+int turns = 1000;
 namespace {
 inline uint64_t time_diff(const struct timeval& start_time,
@@ -93,14 +95,15 @@ int run(int argc, char** argv, int thread_id) {
  uint64_t file_size = key_list.size();
  uint64_t index = 0;
  uint64_t request = 0;
  while (g_concurrency.load() >= FLAGS_thread_num) {
  }
  g_concurrency++;
+  time_list[thread_id].resize(turns);
-  while (index < file_size) {
+  while (request < turns) {
    // uint64_t key = strtoul(buffer, NULL, 10);
+    if (index >= file_size) {
+      index = 0;
+    }
    keys.push_back(key_list[index]);
    index += 1;
    int ret = 0;
@@ -121,47 +124,12 @@ int run(int argc, char** argv, int thread_id) {
      }
      ++seek_counter;
      uint64_t seek_cost = time_diff(seek_start, seek_end);
-      seek_cost_total += seek_cost;
+      time_list[thread_id][request - 1] = seek_cost;
-      if (seek_cost > seek_cost_max) {
-        seek_cost_max = seek_cost;
-      }
-      if (seek_cost < seek_cost_min) {
-        seek_cost_min = seek_cost;
-      }
      keys.clear();
      values.clear();
    }
  }
-  /*
-    if (keys.size() > 0) {
-      int ret = 0;
-      values.resize(keys.size());
-      TIME_FLAG(seek_start);
-      ret = cube->seek(FLAGS_dict, keys, &values);
-      TIME_FLAG(seek_end);
-      if (ret != 0) {
-        LOG(WARNING) << "cube seek failed";
-      } else if (FLAGS_print_output) {
-        for (size_t i = 0; i < keys.size(); ++i) {
-          fprintf(stdout,
-                  "key:%lu value:%s\n",
-                  keys[i],
-                  string_to_hex(values[i].buff).c_str());
-        }
-      }
-      ++seek_counter;
-      uint64_t seek_cost = time_diff(seek_start, seek_end);
-      seek_cost_total += seek_cost;
-      if (seek_cost > seek_cost_max) {
-        seek_cost_max = seek_cost;
-      }
-      if (seek_cost < seek_cost_min) {
-        seek_cost_min = seek_cost;
-      }
-    }
-  */
  g_concurrency--;
  // fclose(key_file);
@@ -171,12 +139,6 @@ int run(int argc, char** argv, int thread_id) {
    LOG(WARNING) << "destroy cube api failed err=" << ret;
  }
-  uint64_t seek_cost_avg = seek_cost_total / seek_counter;
-  LOG(INFO) << "seek cost avg = " << seek_cost_avg;
-  LOG(INFO) << "seek cost max = " << seek_cost_max;
-  LOG(INFO) << "seek cost min = " << seek_cost_min;
-  time_list[thread_id] = seek_cost_avg;
  request_list[thread_id] = request;
  return 0;
@@ -188,6 +150,7 @@ int run_m(int argc, char** argv) {
  request_list.resize(thread_num);
  time_list.resize(thread_num);
  std::vector<std::thread*> thread_pool;
+  TIME_FLAG(main_start);
  for (int i = 0; i < thread_num; i++) {
    thread_pool.push_back(new std::thread(run, argc, argv, i));
  }
@@ -195,27 +158,42 @@ int run_m(int argc, char** argv) {
    thread_pool[i]->join();
    delete thread_pool[i];
  }
+  TIME_FLAG(main_end);
  uint64_t sum_time = 0;
  uint64_t max_time = 0;
  uint64_t min_time = 1000000;
-  uint64_t request_num = 0;
+  std::vector<uint64_t> all_time_list;
  for (int i = 0; i < thread_num; i++) {
-    sum_time += time_list[i];
+    for (int j = 0; j < request_list[i]; j++) {
-    if (time_list[i] > max_time) {
+      sum_time += time_list[i][j];
-      max_time = time_list[i];
+      if (time_list[i][j] > max_time) {
-    }
+        max_time = time_list[i][j];
-    if (time_list[i] < min_time) {
+      }
-      min_time = time_list[i];
+      if (time_list[i][j] < min_time) {
-    }
+        min_time = time_list[i][j];
-    request_num += request_list[i];
+      }
-  }
+      all_time_list.push_back(time_list[i][j]);
-  uint64_t mean_time = sum_time / thread_num;
+    }
-  LOG(INFO) << thread_num << " thread seek cost"
+  }
-            << " avg = " << std::to_string(mean_time)
+  std::sort(all_time_list.begin(), all_time_list.end());
-            << " max = " << std::to_string(max_time)
+  uint64_t mean_time = sum_time / (thread_num * turns);
-            << " min = " << std::to_string(min_time);
+  uint64_t main_time = time_diff(main_start, main_end);
-  LOG(INFO) << " total_request = " << std::to_string(request_num) << " speed = "
+  uint64_t request_num = turns * thread_num;
-            << std::to_string(1000000 * thread_num / mean_time)  // mean_time us
+  LOG(INFO)
+      << "\n"
+      << thread_num << " thread seek cost"
+      << "\navg: " << std::to_string(mean_time) << "\n50 percent: "
+      << std::to_string(all_time_list[static_cast<int>(0.5 * request_num)])
+      << "\n80 percent: "
+      << std::to_string(all_time_list[static_cast<int>(0.8 * request_num)])
+      << "\n90 percent: "
+      << std::to_string(all_time_list[static_cast<int>(0.9 * request_num)])
+      << "\n99 percent: "
+      << std::to_string(all_time_list[static_cast<int>(0.99 * request_num)])
+      << "\n99.9 percent: "
+      << std::to_string(all_time_list[static_cast<int>(0.999 * request_num)])
+      << "\ntotal_request: " << std::to_string(request_num) << "\nspeed: "
+      << std::to_string(turns * 1000000 / main_time)  // mean_time us
      << " query per second";
  return 0;
 }

--- a/core/general-server/op/general_dist_kv_infer_op.cpp
+++ b/core/general-server/op/general_dist_kv_infer_op.cpp
@@ -90,6 +90,9 @@ int GeneralDistKVInferOp::inference() {
              keys.begin() + key_idx);
    key_idx += dataptr_size_pairs[i].second;
  }
+  Timer timeline;
+  int64_t cube_start = timeline.TimeStampUS();
+  timeline.Start();
  rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
  std::vector<std::string> table_names = cube->get_table_names();
  if (table_names.size() == 0) {
@@ -97,7 +100,7 @@ int GeneralDistKVInferOp::inference() {
    return -1;
  }
  int ret = cube->seek(table_names[0], keys, &values);
+  int64_t cube_end = timeline.TimeStampUS();
  if (values.size() != keys.size() || values[0].buff.size() == 0) {
    LOG(ERROR) << "cube value return null";
  }
@@ -153,9 +156,7 @@ int GeneralDistKVInferOp::inference() {
  VLOG(2) << "infer batch size: " << batch_size;
-  Timer timeline;
  int64_t start = timeline.TimeStampUS();
-  timeline.Start();
  if (InferManager::instance().infer(
          engine_name().c_str(), &infer_in, out, batch_size)) {
@@ -165,6 +166,8 @@ int GeneralDistKVInferOp::inference() {
  int64_t end = timeline.TimeStampUS();
  CopyBlobInfo(input_blob, output_blob);
+  AddBlobInfo(output_blob, cube_start);
+  AddBlobInfo(output_blob, cube_end);
  AddBlobInfo(output_blob, start);
  AddBlobInfo(output_blob, end);
  return 0;

--- a/core/general-server/op/general_response_op.cpp
+++ b/core/general-server/op/general_response_op.cpp
@@ -114,72 +114,50 @@ int GeneralResponseOp::inference() {
      for (int j = 0; j < in->at(idx).shape.size(); ++j) {
        cap *= in->at(idx).shape[j];
      }
-      if (in->at(idx).dtype == paddle::PaddleDType::INT64) {
+      FetchInst *fetch_p = output->mutable_insts(0);
+      auto dtype = in->at(idx).dtype;
+      if (dtype == paddle::PaddleDType::INT64) {
        VLOG(2) << "Prepare int64 var [" << model_config->_fetch_name[idx]
                << "].";
        int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
-        if (model_config->_is_lod_fetch[idx]) {
+        // from
-          FetchInst *fetch_p = output->mutable_insts(0);
+        // https://stackoverflow.com/questions/15499641/copy-a-stdvector-to-a-repeated-field-from-protobuf-with-memcpy
-          for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
+        // `Swap` method is faster than `{}` method.
-            fetch_p->mutable_tensor_array(var_idx)->add_lod(
+        google::protobuf::RepeatedField<int64_t> tmp_data(data_ptr,
-                in->at(idx).lod[0][j]);
+                                                          data_ptr + cap);
-          }
+        fetch_p->mutable_tensor_array(var_idx)->mutable_int64_data()->Swap(
-          for (int j = 0; j < cap; ++j) {
+            &tmp_data);
-            fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[j]);
+      } else if (dtype == paddle::PaddleDType::FLOAT32) {
-          }
-        } else {
-          FetchInst *fetch_p = output->mutable_insts(0);
-          for (int j = 0; j < cap; ++j) {
-            fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[j]);
-          }
-        }
-        VLOG(2) << "fetch var [" << model_config->_fetch_name[idx] << "] ready";
-        var_idx++;
-      } else if (in->at(idx).dtype == paddle::PaddleDType::FLOAT32) {
        VLOG(2) << "Prepare float var [" << model_config->_fetch_name[idx]
                << "].";
        float *data_ptr = static_cast<float *>(in->at(idx).data.data());
-        if (model_config->_is_lod_fetch[idx]) {
+        google::protobuf::RepeatedField<float> tmp_data(data_ptr,
-          FetchInst *fetch_p = output->mutable_insts(0);
+                                                        data_ptr + cap);
-          for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
+        fetch_p->mutable_tensor_array(var_idx)->mutable_float_data()->Swap(
-            fetch_p->mutable_tensor_array(var_idx)->add_lod(
+            &tmp_data);
-                in->at(idx).lod[0][j]);
+      } else if (dtype == paddle::PaddleDType::INT32) {
-          }
-          for (int j = 0; j < cap; ++j) {
-            fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[j]);
-          }
-        } else {
-          FetchInst *fetch_p = output->mutable_insts(0);
-          for (int j = 0; j < cap; ++j) {
-            fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[j]);
-          }
-        }
-        VLOG(2) << "fetch var [" << model_config->_fetch_name[idx] << "] ready";
-        var_idx++;
-      } else if (in->at(idx).dtype == paddle::PaddleDType::INT32) {
        VLOG(2) << "Prepare int32 var [" << model_config->_fetch_name[idx]
                << "].";
        int32_t *data_ptr = static_cast<int32_t *>(in->at(idx).data.data());
+        google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
+                                                          data_ptr + cap);
+        fetch_p->mutable_tensor_array(var_idx)->mutable_int_data()->Swap(
+            &tmp_data);
+      }
      if (model_config->_is_lod_fetch[idx]) {
-          FetchInst *fetch_p = output->mutable_insts(0);
        for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
          fetch_p->mutable_tensor_array(var_idx)->add_lod(
              in->at(idx).lod[0][j]);
        }
-          for (int j = 0; j < cap; ++j) {
-            fetch_p->mutable_tensor_array(var_idx)->add_int_data(data_ptr[j]);
-          }
-        } else {
-          FetchInst *fetch_p = output->mutable_insts(0);
-          for (int j = 0; j < cap; ++j) {
-            fetch_p->mutable_tensor_array(var_idx)->add_int_data(data_ptr[j]);
-          }
      }
      VLOG(2) << "fetch var [" << model_config->_fetch_name[idx] << "] ready";
      var_idx++;
    }
  }
-  }
  if (req->profile_server()) {
    int64_t end = timeline.TimeStampUS();

--- a/doc/COMPILE.md
+++ b/doc/COMPILE.md
@@ -11,10 +11,7 @@
 - CMake：3.2.2 and later
 - Python：2.7.2 and later / 3.6 and later
-It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you: 
+It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you, see [this document](DOCKER_IMAGES.md).
- CPU: `hub.baidubce.com/paddlepaddle/serving:latest-devel`，dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel)
- GPU: `hub.baidubce.com/paddlepaddle/serving:latest-gpu-devel`，dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel)
 This document will take Python2 as an example to show how to compile Paddle Serving. If you want to compile with Python3, just adjust the Python options of cmake:
@@ -29,6 +26,9 @@ git clone https://github.com/PaddlePaddle/Serving
 cd Serving && git submodule update --init --recursive
 ```
 ## PYTHONROOT Setting
 ```shell
@@ -38,6 +38,18 @@ export PYTHONROOT=/usr/
 In the default centos7 image we provide, the Python path is `/usr/bin/python`. If you want to use our centos6 image, you need to set it to `export PYTHONROOT=/usr/local/python2.7/`.
+## Install Python dependencies
+```shell
+pip install -r python/requirements.txt
+```
+If Python3 is used, replace `pip` with `pip3`.
 ## Compile Server
 ### Integrated CPU version paddle inference library
@@ -62,6 +74,8 @@ execute `make install` to put targets under directory `./output`
 **Attention：** After the compilation is successful, you need to set the path of `SERVING_BIN`. See [Note](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md#Note) for details.
 ## Compile Client
 ``` shell
@@ -72,6 +86,8 @@ make -j10
 execute `make install` to put targets under directory `./output`
 ## Compile the App
 ```bash
@@ -80,15 +96,20 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PY
 make
 ```
 ## Install wheel package
 Regardless of the client, server or App part, after compiling, install the whl package under `python/dist/`.
 ## Note
 When running the python server, it will check the `SERVING_BIN` environment variable. If you want to use your own compiled binary file, set the environment variable to the path of the corresponding binary file, usually`export SERVING_BIN=${BUILD_DIR}/core/general-server/serving`.
 ## CMake Option Description
 | Compile Options  |                    Description             | Default |

--- a/doc/COMPILE_CN.md
+++ b/doc/COMPILE_CN.md
@@ -11,10 +11,7 @@
 - CMake：3.2.2及以上
 - Python：2.7.2及以上 / 3.6及以上
-推荐使用Docker编译，我们已经为您准备好了Paddle Serving编译环境：
+推荐使用Docker编译，我们已经为您准备好了Paddle Serving编译环境，详见[该文档](DOCKER_IMAGES_CN.md)。
- CPU: `hub.baidubce.com/paddlepaddle/serving:latest-devel`，dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel)
- GPU: `hub.baidubce.com/paddlepaddle/serving:latest-gpu-devel`，dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel)
 本文档将以Python2为例介绍如何编译Paddle Serving。如果您想用Python3进行编译，只需要调整cmake的Python相关选项即可：
@@ -29,6 +26,9 @@ git clone https://github.com/PaddlePaddle/Serving
 cd Serving && git submodule update --init --recursive
 ```
 ## PYTHONROOT设置
 ```shell
@@ -38,6 +38,18 @@ export PYTHONROOT=/usr/
 我们提供默认Centos7的Python路径为`/usr/bin/python`，如果您要使用我们的Centos6镜像，需要将其设置为`export PYTHONROOT=/usr/local/python2.7/`。
+## 安装Python依赖
+```shell
+pip install -r python/requirements.txt
+```
+如果使用 Python3，请以 `pip3` 替换 `pip`。
 ## 编译Server部分
 ### 集成CPU版本Paddle Inference Library
@@ -62,6 +74,8 @@ make -j10
 **注意：** 编译成功后，需要设置`SERVING_BIN`路径，详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。
 ## 编译Client部分
 ``` shell
@@ -72,6 +86,8 @@ make -j10
 执行`make install`可以把目标产出放在`./output`目录下。
 ## 编译App部分
 ```bash
@@ -80,14 +96,20 @@ cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PY
 make
 ```
 ## 安装wheel包
 无论是Client端，Server端还是App部分，编译完成后，安装`python/dist/`下的whl包即可。
 ## 注意事项
 运行python端Server时，会检查`SERVING_BIN`环境变量，如果想使用自己编译的二进制文件，请将设置该环境变量为对应二进制文件的路径，通常是`export SERVING_BIN=${BUILD_DIR}/core/general-server/serving`。
 ## CMake选项说明
 |     编译选项     |                    说明                    | 默认 |

--- a/doc/CUBE_QUANT.md
+++ b/doc/CUBE_QUANT.md
@@ -42,7 +42,7 @@ cd python/examples/criteo_ctr_with_cube
 python local_train.py
 cp ../../../build_server/core/predictor/seq_generator seq_generator
 cp ../../../build_server/output/bin/cube* ./cube/
-sh cube_prepare_quant.sh &
+sh cube_quant_prepare.sh &
 python test_server_quant.py ctr_serving_model_kv &
 python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
 ```

--- a/doc/CUBE_QUANT_CN.md
+++ b/doc/CUBE_QUANT_CN.md
@@ -42,7 +42,7 @@ cd python/examples/criteo_ctr_with_cube
 python local_train.py
 cp ../../../build_server/core/predictor/seq_generator seq_generator
 cp ../../../build_server/output/bin/cube* ./cube/
-sh cube_prepare_quant.sh &
+sh cube_quant_prepare.sh &
 python test_server_quant.py ctr_serving_model_kv &
 python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
 ```

--- a/doc/DOCKER_IMAGES.md
+++ b/doc/DOCKER_IMAGES.md
+# Docker Images
+([简体中文](DOCKER_IMAGES_CN.md)|English)
+This document maintains a list of docker images provided by Paddle Serving.
+## Get docker image
+You can get images in two ways:
+1. Pull image directly from `hub.baidubce.com ` or `docker.io` through TAG:
+   ```shell
+   docker pull hub.baidubce.com/paddlepaddle/serving:<TAG> # hub.baidubce.com
+   docker pull paddlepaddle/serving:<TAG> # hub.docker.com
+   ```
+2. Building image based on dockerfile
+   Create a new folder and copy Dockerfile to this folder, and run the following command:
+   ```shell
+   docker build -t <image-name>:<images-tag> .
+   ```
+## Image description
+Runtime images cannot be used for compilation.
+|                         Description                          |   OS    |             TAG              |                          Dockerfile                          |
+| :----------------------------------------------------------: | :-----: | :--------------------------: | :----------------------------------------------------------: |
+|                         CPU runtime                          | CentOS7 |            latest            |              [Dockerfile](../tools/Dockerfile)               |
+|                       CPU development                        | CentOS7 |         latest-devel         |        [Dockerfile.devel](../tools/Dockerfile.devel)         |
+|                 GPU (cuda9.0-cudnn7) runtime                 | CentOS7 |    latest-cuda9.0-cudnn7     | [Dockerfile.cuda9.0-cudnn7](../tools/Dockerfile.cuda9.0-cudnn7) |
+|               GPU (cuda9.0-cudnn7) development               | CentOS7 | latest-cuda9.0-cudnn7-devel  | [Dockerfile.cuda9.0-cudnn7.devel](../tools/Dockerfile.cuda9.0-cudnn7.devel) |
+|                GPU (cuda10.0-cudnn7) runtime                 | CentOS7 |    latest-cuda10.0-cudnn7    | [Dockerfile.cuda10.0-cudnn7](../tools/Dockerfile.cuda10.0-cudnn7) |
+|              GPU (cuda10.0-cudnn7) development               | CentOS7 | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) |
+|     CPU development (Used to compile packages on Ubuntu)     | CentOS6 |            <None>            | [Dockerfile.centos6.devel](../tools/Dockerfile.centos6.devel) |
+| GPU (cuda9.0-cudnn7) development (Used to compile packages on Ubuntu) | CentOS6 |            <None>            | [Dockerfile.centos6.cuda9.0-cudnn7.devel](../tools/Dockerfile.centos6.cuda9.0-cudnn7.devel) |
--- a/doc/DOCKER_IMAGES_CN.md
+++ b/doc/DOCKER_IMAGES_CN.md
+# Docker 镜像
+(简体中文|[English](DOCKER_IMAGES.md))
+该文档维护了 Paddle Serving 提供的镜像列表。
+## 获取镜像
+您可以通过两种方式获取镜像。
+1. 通过 TAG 直接从 `hub.baidubce.com ` 或 `docker.io` 拉取镜像：
+   ```shell
+   docker pull hub.baidubce.com/paddlepaddle/serving:<TAG> # hub.baidubce.com
+   docker pull paddlepaddle/serving:<TAG> # hub.docker.com
+   ```
+2. 基于 Dockerfile 构建镜像
+   建立新目录，复制对应 Dockerfile 内容到该目录下 Dockerfile 文件。执行
+   ```shell
+   docker build -t <image-name>:<images-tag> .
+   ```
+## 镜像说明
+运行时镜像不能用于开发编译。
+| 镜像说明                                           | 操作系统 | TAG                          | Dockerfile                                                   |
+| -------------------------------------------------- | -------- | ---------------------------- | ------------------------------------------------------------ |
+| CPU 运行镜像                                       | CentOS7  | latest                       | [Dockerfile](../tools/Dockerfile)                            |
+| CPU 开发镜像                                       | CentOS7  | latest-devel                 | [Dockerfile.devel](../tools/Dockerfile.devel)                |
+| GPU (cuda9.0-cudnn7) 运行镜像                      | CentOS7  | latest-cuda9.0-cudnn7        | [Dockerfile.cuda9.0-cudnn7](../tools/Dockerfile.cuda9.0-cudnn7) |
+| GPU (cuda9.0-cudnn7) 开发镜像                      | CentOS7  | latest-cuda9.0-cudnn7-devel  | [Dockerfile.cuda9.0-cudnn7.devel](../tools/Dockerfile.cuda9.0-cudnn7.devel) |
+| GPU (cuda10.0-cudnn7) 运行镜像                     | CentOS7  | latest-cuda10.0-cudnn7       | [Dockerfile.cuda10.0-cudnn7](../tools/Dockerfile.cuda10.0-cudnn7) |
+| GPU (cuda10.0-cudnn7) 开发镜像                     | CentOS7  | latest-cuda10.0-cudnn7-devel | [Dockerfile.cuda10.0-cudnn7.devel](../tools/Dockerfile.cuda10.0-cudnn7.devel) |
+| CPU 开发镜像 (用于编译 Ubuntu 包)                  | CentOS6  | <无>                         | [Dockerfile.centos6.devel](../tools/Dockerfile.centos6.devel) |
+| GPU (cuda9.0-cudnn7) 开发镜像 (用于编译 Ubuntu 包) | CentOS6  | <无>                         | [Dockerfile.centos6.cuda9.0-cudnn7.devel](../tools/Dockerfile.centos6.cuda9.0-cudnn7.devel) |
--- a/doc/INFERNCE_TO_SERVING.md
+++ b/doc/INFERNCE_TO_SERVING.md
--- a/doc/INFERNCE_TO_SERVING_CN.md
+++ b/doc/INFERNCE_TO_SERVING_CN.md
--- a/doc/JAVA_SDK.md
+++ b/doc/JAVA_SDK.md
+# Paddle Serving Client Java SDK
+([简体中文](JAVA_SDK_CN.md)|English)
+Paddle Serving provides Java SDK，which supports predict on the Client side with Java language. This document shows how to use the Java SDK.
+## Getting started
+### Prerequisites
+```
+- Java 8 or higher
+- Apache Maven
+```
+The following table shows compatibilities between Paddle Serving Server and Java SDK.
+| Paddle Serving Server version | Java SDK version |
+| :---------------------------: | :--------------: |
+|             0.3.2             |      0.0.1       |
+### Install Java SDK
+You can download jar and install it to the local Maven repository:
+```shell
+wget https://paddle-serving.bj.bcebos.com/jar/paddle-serving-sdk-java-0.0.1.jar
+mvn install:install-file -Dfile=$PWD/paddle-serving-sdk-java-0.0.1.jar -DgroupId=io.paddle.serving.client -DartifactId=paddle-serving-sdk-java -Dversion=0.0.1 -Dpackaging=jar
+```
+Or compile from the source code and install it to the local Maven repository:
+```shell
+cd Serving/java
+mvn compile
+mvn install
+```
+### Maven configure
+```text
+ <dependency>
+     <groupId>io.paddle.serving.client</groupId>
+     <artifactId>paddle-serving-sdk-java</artifactId>
+     <version>0.0.1</version>
+ </dependency>
+```
+## Example
+Here we will show how to use Java SDK for Boston house price prediction. Please refer to [examples](../java/examples) folder for more examples.
+### Get model
+```shell
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
+tar -xzf uci_housing.tar.gz
+```
+### Start Python Server
+```shell
+python -m paddle_serving_server.serve --model uci_housing_model --port 9393 --use_multilang 
+```
+#### Client side code example
+```java
+import io.paddle.serving.client.*;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.factory.Nd4j;
+import java.util.*;
+public class PaddleServingClientExample {
+    public static void main( String[] args ) {
+        float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
+            0.0582f, -0.0727f, -0.1583f, -0.0584f,
+            0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
+        INDArray npdata = Nd4j.createFromArray(data);
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("x", npdata);
+            }};
+        List<String> fetch = Arrays.asList("price");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return ;
+        }
+        Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
+        if (fetch_map == null) {
+            System.out.println("predict failed.");
+            return ;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return ;
+    }
+}
+```
--- a/doc/JAVA_SDK_CN.md
+++ b/doc/JAVA_SDK_CN.md
+# Paddle Serving Client Java SDK
+(简体中文|[English](JAVA_SDK.md))
+Paddle Serving 提供了 Java SDK，支持 Client 端用 Java 语言进行预测，本文档说明了如何使用 Java SDK。
+## 快速开始
+### 环境要求
+```
+- Java 8 or higher
+- Apache Maven
+```
+下表显示了 Paddle Serving Server 和 Java SDK 之间的兼容性
+| Paddle Serving Server version | Java SDK version |
+| :---------------------------: | :--------------: |
+|             0.3.2             |      0.0.1       |
+### 安装
+您可以直接下载 jar，安装到本地 Maven 库：
+```shell
+wget https://paddle-serving.bj.bcebos.com/jar/paddle-serving-sdk-java-0.0.1.jar
+mvn install:install-file -Dfile=$PWD/paddle-serving-sdk-java-0.0.1.jar -DgroupId=io.paddle.serving.client -DartifactId=paddle-serving-sdk-java -Dversion=0.0.1 -Dpackaging=jar
+```
+或者从源码进行编译，安装到本地 Maven 库：
+```shell
+cd Serving/java
+mvn compile
+mvn install
+```
+### Maven 配置
+```text
+ <dependency>
+     <groupId>io.paddle.serving.client</groupId>
+     <artifactId>paddle-serving-sdk-java</artifactId>
+     <version>0.0.1</version>
+ </dependency>
+```
+## 使用样例
+这里将展示如何使用 Java SDK 进行房价预测，更多例子详见 [examples](../java/examples) 文件夹。
+### 获取房价预测模型
+```shell
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
+tar -xzf uci_housing.tar.gz
+```
+### 启动 Python 端 Server
+```shell
+python -m paddle_serving_server.serve --model uci_housing_model --port 9393 --use_multilang 
+```
+### Client 端代码示例
+```java
+import io.paddle.serving.client.*;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.factory.Nd4j;
+import java.util.*;
+public class PaddleServingClientExample {
+    public static void main( String[] args ) {
+        float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
+            0.0582f, -0.0727f, -0.1583f, -0.0584f,
+            0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
+        INDArray npdata = Nd4j.createFromArray(data);
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("x", npdata);
+            }};
+        List<String> fetch = Arrays.asList("price");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return ;
+        }
+        Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
+        if (fetch_map == null) {
+            System.out.println("predict failed.");
+            return ;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return ;
+    }
+}
+```
--- a/doc/NEW_WEB_SERVICE.md
+++ b/doc/NEW_WEB_SERVICE.md
@@ -2,7 +2,7 @@
 ([简体中文](NEW_WEB_SERVICE_CN.md)|English)
-This document will take the image classification service based on the Imagenet data set as an example to introduce how to develop a new web service. The complete code can be visited at [here](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/imagenet/image_classification_service.py).
+This document will take the image classification service based on the Imagenet data set as an example to introduce how to develop a new web service. The complete code can be visited at [here](../python/examples/imagenet/resnet50_web_service.py).
 ## WebService base class

--- a/doc/NEW_WEB_SERVICE_CN.md
+++ b/doc/NEW_WEB_SERVICE_CN.md
@@ -2,7 +2,7 @@
 (简体中文|[English](NEW_WEB_SERVICE.md))
-本文档将以Imagenet图像分类服务为例，来介绍如何开发一个新的Web Service。您可以在[这里](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/imagenet/image_classification_service.py)查阅完整的代码。
+本文档将以Imagenet图像分类服务为例，来介绍如何开发一个新的Web Service。您可以在[这里](../python/examples/imagenet/resnet50_web_service.py)查阅完整的代码。
 ## WebService基类

--- a/doc/PIPELINE_SERVING.md
+++ b/doc/PIPELINE_SERVING.md
+# Pipeline Serving
+([简体中文](PIPELINE_SERVING_CN.md)|English)
+Paddle Serving is usually used for the deployment of single model, but the end-to-end deep learning model can not solve all the problems at present. Usually, it is necessary to use multiple deep learning models to solve practical problems.
+Paddle Serving provides a user-friendly programming framework for multi-model composite services, Pipeline Serving, which aims to reduce the threshold of programming, improve resource utilization (especially GPU), and improve the prediction efficiency.
+## Architecture Design
+The Server side is built based on gRPC and graph execution engine. The relationship between them is shown in the following figure.
+<center>
+<img src='pipeline_serving-image1.png' height = "250" align="middle"/>
+</center>
+### Graph Execution Engine
+The graph execution engine consists of OPs and Channels, and the connected OPs share one Channel.
+- Channel can be understood as a buffer queue. Each OP accepts only one Channel input and multiply Channel outputs (each output is the same); a Channel can contain outputs from multiple OPs, and data from the same Channel can be used as input for multiple OPs.
+- Users only need to define relationships between OPs. Graph engine will analyze the dependencies of the entire graph and declaring Channels at the compile time.
+- After Request data enters the graph execution engine service, the graph engine will generator an Request ID, and Reponse is returned through corresponding Request ID.
+- For cases where large data needs to be transferred between OPs, consider RAM DB external memory for global storage and data transfer by passing index keys in Channel.
+<center>
+<img src='pipeline_serving-image2.png' height = "300" align="middle"/>
+</center>
+### OP Design
+- The default function of a single OP is to access a single Paddle Serving Service based on the input Channel data and put the result into the output Channel.
+- OP supports user customization, including preprocess, process, postprocess functions that can be inherited and implemented by the user.
+- OP can set the number of concurrencies to increase the number of concurrencies processed.
+- OP can be started by a thread or process.
+### Channel Design
+- Channel is the data structure for sharing data between OPs, responsible for sharing data or sharing data status information.
+- Outputs from multiple OPs can be stored in the same Channel, and data from the same Channel can be used by multiple OPs.
+- The following illustration shows the design of Channel in the graph execution engine, using input buffer and output buffer to align data between multiple OP inputs and multiple OP outputs, with a queue in the middle to buffer.
+<center>
+<img src='pipeline_serving-image3.png' height = "500" align="middle"/>
+</center>
+### Extreme Case Consideration
+- Request timeout
+  The entire graph execution engine may time out at every step. The graph execution engine controls the time out by setting `timeout` value. Requests that time out at any step will return a timeout response.
+- Channel stores too much data
+  Channels may store too much data, causing copy time to be too high. Graph execution engines can store OP calculation results in external memory, such as high-speed memory KV systems.
+- Whether input buffers and output buffers in Channel will increase indefinitely
+  - It will not increase indefinitely. The input to the entire graph execution engine is placed inside a Channel's internal queue, directly acting as a traffic control buffer queue for the entire service.
+  - For input buffer, adjust the number of concurrencies of OP1 and OP2 according to the amount of computation, so that the number of input buffers from each input OP is relatively balanced.
+  - For output buffer, you can use a similar process as input buffer, which adjusts the concurrency of OP3 and OP4 to control the buffer length of output buffer.
+  - Note: The length of the input buffer depends on the speed at which each item in the internal queue is ready, and the length of the output buffer depends on the speed at which downstream OPs obtain data from the output buffer.
+## Detailed Design
+### User Interface Design
+#### 1. General OP Definition
+As the basic unit of graph execution engine, the general OP constructor is as follows:
+```python
+def __init__(name=None,
+             input_ops=[],
+             server_endpoints=[],
+             fetch_list=[],
+             client_config=None,
+             concurrency=1,
+             timeout=-1,
+             retry=1)
+```
+The meaning of each parameter is as follows:
+|    Parameter     |                           Meaning                            |
+| :--------------: | :----------------------------------------------------------: |
+|       name       | (str) String used to identify the OP type, which must be globally unique. |
+|    input_ops     |     (list) A list of all previous OPs of the current Op.     |
+| server_endpoints | (list) List of endpoints for remote Paddle Serving Service. If this parameter is not set, the OP will not access the remote Paddle Serving Service, that is, the process operation will not be performed. |
+|    fetch_list    | (list) List of fetch variable names for remote Paddle Serving Service. |
+|  client_config   | (str) The path of the client configuration file corresponding to the Paddle Serving Service. |
+|   concurrency    |             (int) The number of concurrent OPs.              |
+|     timeout      | (int) The timeout time of the process operation, in seconds. If the value is less than zero, no timeout is considered. |
+|      retry       | (int) Timeout number of retries. When the value is 1, no retries are made. |
+#### 2. General OP Secondary Development Interface
+|             Interface or Variable              |                           Explain                            |
+| :--------------------------------------------: | :----------------------------------------------------------: |
+|       def preprocess(self, input_dicts)        | Process the data obtained from the channel, and the processed data will be used as the input of the **process** function. |
+|          def process(self, feed_dict)          | The RPC prediction process is based on the Paddle Serving Client, and the processed data will be used as the input of the **postprocess** function. |
+| def postprocess(self, input_dicts, fetch_dict) | After processing the prediction results, the processed data will be put into the subsequent Channel to be obtained by the subsequent OP. |
+|               def init_op(self)                |      Used to load resources (such as word dictionary).       |
+|              self.concurrency_idx              | Concurrency index of current thread / process (different kinds of OP are calculated separately). |
+In a running cycle, OP will execute three operations: preprocess, process, and postprocess (when the `server_endpoints` parameter is not set, the process operation is not executed). Users can rewrite these three functions. The default implementation is as follows:
+```python
+def preprocess(self, input_dicts):
+  # multiple previous Op
+  if len(input_dicts) != 1:
+    raise NotImplementedError(
+      'this Op has multiple previous inputs. Please override this func.'
+    ）
+  (_, input_dict), = input_dicts.items()
+  return input_dict
+def process(self, feed_dict):
+  err, err_info = ChannelData.check_npdata(feed_dict)
+  if err != 0:
+    raise NotImplementedError(
+      "{} Please override preprocess func.".format(err_info))
+  call_result = self.client.predict(
+    feed=feed_dict, fetch=self._fetch_names)
+  return call_result
+def postprocess(self, input_dicts, fetch_dict):
+  return fetch_dict
+```
+The parameter of **preprocess** is the data `input_dicts` in the previous Channel. This variable is a dictionary with the name of the previous OP as key and the output of the corresponding OP as value.
+The parameter of **process** is the input variable `fetch_dict` (the return value of the preprocess function) of the Paddle Serving Client prediction interface. This variable is a dictionary with feed_name as the key and the data in the ndarray format as the value.
+The parameters of **postprocess** are `input_dicts` and `fetch_dict`. `input_dicts` is consistent with the parameter of preprocess, and `fetch_dict` is the return value of the process function (if process is not executed, this value is the return value of preprocess).
+Users can also rewrite the **init_op** function to load some custom resources (such as word dictionary). The default implementation is as follows:
+```python
+def init_op(self):
+  pass
+```
+It should be noted that in the threaded version of OP, each OP will only call this function once, so the loaded resources must be thread safe.
+#### 3. RequestOp Definition
+RequestOp is used to process RPC data received by Pipeline Server, and the processed data will be added to the graph execution engine. Its constructor is as follows:
+```python
+def __init__(self)
+```
+#### 4. RequestOp Secondary Development Interface
+|           Interface or Variable           |                           Explain                            |
+| :---------------------------------------: | :----------------------------------------------------------: |
+|             def init_op(self)             | It is used to load resources (such as dictionaries), and is consistent with general OP. |
+| def unpack_request_package(self, request) |                  Process received RPC data.                  |
+The default implementation of **unpack_request_package** is to make the key and value in RPC request into a dictionary:
+```python
+def unpack_request_package(self, request):
+  dictdata = {}
+  for idx, key in enumerate(request.key):
+    data = request.value[idx]
+    try:
+      data = eval(data)
+    except Exception as e:
+      pass
+    dictdata[key] = data
+  return dictdata
+```
+The return value is required to be a dictionary type.
+#### 5. ResponseOp Definition
+ResponseOp is used to process the prediction results of the graph execution engine. The processed data will be used as the RPC return value of Pipeline Server. Its constructor is as follows:
+```python
+def __init__(self, input_ops)
+```
+`input_ops` is the last OP of graph execution engine. Users can construct different DAGs by setting different `input_ops` without modifying the topology of OPs.
+#### 6. ResponseOp Secondary Development Interface
+|            Interface or Variable             |                           Explain                            |
+| :------------------------------------------: | :----------------------------------------------------------: |
+|              def init_op(self)               | It is used to load resources (such as dictionaries), and is consistent with general OP. |
+| def pack_response_package(self, channeldata) | Process the prediction results of the graph execution engine as the return of RPC. |
+The default implementation of **pack_response_package** is to convert the dictionary of prediction results into key and value in RPC response:
+```python
+def pack_response_package(self, channeldata):
+  resp = pipeline_service_pb2.Response()
+  resp.ecode = channeldata.ecode
+  if resp.ecode == ChannelDataEcode.OK.value:
+    if channeldata.datatype == ChannelDataType.CHANNEL_NPDATA.value:
+      feed = channeldata.parse()
+      np.set_printoptions(threshold=np.nan)
+      for name, var in feed.items():
+        resp.value.append(var.__repr__())
+        resp.key.append(name)
+    elif channeldata.datatype == ChannelDataType.DICT.value:
+      feed = channeldata.parse()
+      for name, var in feed.items():
+        if not isinstance(var, str):
+          resp.ecode = ChannelDataEcode.TYPE_ERROR.value
+          resp.error_info = self._log(
+            "fetch var type must be str({}).".format(type(var)))
+          break
+        resp.value.append(var)
+        resp.key.append(name)
+    else:
+      resp.ecode = ChannelDataEcode.TYPE_ERROR.value
+      resp.error_info = self._log(
+        "Error type({}) in datatype.".format(channeldata.datatype))
+  else:
+    resp.error_info = channeldata.error_info
+  return resp
+```
+#### 7. PipelineServer Definition
+The definition of PipelineServer is relatively simple, as follows:
+```python
+server = PipelineServer()
+server.set_response_op(response_op)
+server.prepare_server(config_yml_path)
+server.run_server()
+```
+Where `response_op` is the responseop mentioned above, PipelineServer will initialize Channels according to the topology relationship of each OP and build the calculation graph. `config_yml_path` is the configuration file of PipelineServer. The example file is as follows:
+```yaml
+port: 18080  # gRPC port
+worker_num: 1  # gRPC thread pool size (the number of processes in the process version servicer). The default is 1
+build_dag_each_worker: false  # Whether to use process server or not. The default is false
+dag:
+    is_thread_op: true  # Whether to use the thread version of OP. The default is true
+    client_type: brpc  # Use brpc or grpc client. The default is brpc
+    retry: 1  # The number of times DAG executor retries after failure. The default value is 1, that is, no retrying
+    use_profile: false  # Whether to print the log on the server side. The default is false
+```
+## Example
+Here, we build a simple imdb model enable example to show how to use Pipeline Serving. The relevant code can be found in the `python/examples/pipeline/imdb_model_ensemble` folder. The Server-side structure in the example is shown in the following figure:
+<center>
+<img src='pipeline_serving-image4.png' height = "200" align="middle"/>
+</center>
+### Get the model file and start the Paddle Serving Service
+```shell
+cd python/examples/pipeline/imdb_model_ensemble
+sh get_data.sh
+python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
+python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
+```
+### Start PipelineServer
+Run the following code
+```python
+from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
+from paddle_serving_server.pipeline import PipelineServer
+from paddle_serving_server.pipeline.proto import pipeline_service_pb2
+from paddle_serving_server.pipeline.channel import ChannelDataEcode
+import numpy as np
+import logging
+from paddle_serving_app.reader import IMDBDataset
+logging.basicConfig(level=logging.DEBUG)
+_LOGGER = logging.getLogger()
+class ImdbRequestOp(RequestOp):
+    def init_op(self):
+        self.imdb_dataset = IMDBDataset()
+        self.imdb_dataset.load_resource('imdb.vocab')
+    def unpack_request_package(self, request):
+        dictdata = {}
+        for idx, key in enumerate(request.key):
+            if key != "words":
+                continue
+            words = request.value[idx]
+            word_ids, _ = self.imdb_dataset.get_words_and_label(words)
+            dictdata[key] = np.array(word_ids)
+        return dictdata
+class CombineOp(Op):
+    def preprocess(self, input_data):
+        combined_prediction = 0
+        for op_name, data in input_data.items():
+            _LOGGER.info("{}: {}".format(op_name, data["prediction"]))
+            combined_prediction += data["prediction"]
+        data = {"prediction": combined_prediction / 2}
+        return data
+read_op = ImdbRequestOp()
+bow_op = Op(name="bow",
+            input_ops=[read_op],
+            server_endpoints=["127.0.0.1:9393"],
+            fetch_list=["prediction"],
+            client_config="imdb_bow_client_conf/serving_client_conf.prototxt",
+            concurrency=1,
+            timeout=-1,
+            retry=1)
+cnn_op = Op(name="cnn",
+            input_ops=[read_op],
+            server_endpoints=["127.0.0.1:9292"],
+            fetch_list=["prediction"],
+            client_config="imdb_cnn_client_conf/serving_client_conf.prototxt",
+            concurrency=1,
+            timeout=-1,
+            retry=1)
+combine_op = CombineOp(
+    name="combine",
+    input_ops=[bow_op, cnn_op],
+    concurrency=5,
+    timeout=-1,
+    retry=1)
+# use default ResponseOp implementation
+response_op = ResponseOp(input_ops=[combine_op])
+server = PipelineServer()
+server.set_response_op(response_op)
+server.prepare_server('config.yml')
+server.run_server()
+```
+### Perform prediction through PipelineClient
+```python
+from paddle_serving_client.pipeline import PipelineClient
+import numpy as np
+client = PipelineClient()
+client.connect(['127.0.0.1:18080'])
+words = 'i am very sad | 0'
+futures = []
+for i in range(3):
+    futures.append(
+        client.predict(
+            feed_dict={"words": words},
+            fetch=["prediction"],
+            asyn=True))
+for f in futures:
+    res = f.result()
+    if res["ecode"] != 0:
+        print(res)
+        exit(1)
+```
+## How to optimize through the timeline tool
+In order to better optimize the performance, PipelineServing provides a timeline tool to monitor the time of each stage of the whole service.
+### Output profile information on server side
+The server is controlled by the `use_profile` field in yaml:
+```yaml
+dag:
+    use_profile: true
+```
+After the function is enabled, the server will print the corresponding log information to the standard output in the process of prediction. In order to show the time consumption of each stage more intuitively, scripts are provided for further analysis and processing of log files.
+The output of the server is first saved to a file. Taking profile as an example, the script converts the time monitoring information in the log into JSON format and saves it to the trace file. The trace file can be visualized through the tracing function of Chrome browser.
+```shell
+python timeline_trace.py profile trace
+```
+Specific operation: open Chrome browser, input in the address bar `chrome://tracing/` , jump to the tracing page, click the load button, open the saved trace file, and then visualize the time information of each stage of the prediction service.
+### Output profile information on client side
+The profile function can be enabled by setting `profile=True` in the `predict` interface on the client side.
+After the function is enabled, the client will print the log information corresponding to the prediction to the standard output during the prediction process, and the subsequent analysis and processing are the same as that of the server.
--- a/doc/PIPELINE_SERVING_CN.md
+++ b/doc/PIPELINE_SERVING_CN.md
+# Pipeline Serving
+(简体中文|[English](PIPELINE_SERVING.md))
+Paddle Serving 通常用于单模型的一键部署，但端到端的深度学习模型当前还不能解决所有问题，多个深度学习模型配合起来使用还是解决现实问题的常规手段。
+Paddle Serving 提供了用户友好的多模型组合服务编程框架，Pipeline Serving，旨在降低编程门槛，提高资源使用率（尤其是GPU设备），提升整体的预估效率。
+## 整体架构设计
+Server端基于 gRPC 和图执行引擎构建，两者的关系如下图所示。
+<center>
+<img src='pipeline_serving-image1.png' height = "250" align="middle"/>
+</center>
+### 图执行引擎
+图执行引擎由 OP 和 Channel 构成，相连接的 OP 之间会共享一个 Channel。
+- Channel 可以理解为一个缓冲队列。每个 OP 只接受一个 Channel 的输入和多个 Channel 的输出（每个输出相同）；一个 Channel 可以包含来自多个 OP 的输出，同一个 Channel 的数据可以作为多个 OP 的输入Channel
+- 用户只需要定义 OP 间的关系，在编译期图引擎负责分析整个图的依赖关系，并声明Channel
+- Request 进入图执行引擎服务后会产生一个 Request Id，Reponse 会通过 Request Id 进行对应的返回
+- 对于 OP 之间需要传输过大数据的情况，可以考虑 RAM DB 外存进行全局存储，通过在 Channel 中传递索引的 Key 来进行数据传输
+<center>
+<img src='pipeline_serving-image2.png' height = "300" align="middle"/>
+</center>
+### OP的设计
+- 单个OP默认的功能是根据输入的 Channel 数据，访问一个 Paddle Serving 的单模型服务，并将结果存在输出的 Channel
+- 单个 OP 可以支持用户自定义，包括 preprocess，process，postprocess 三个函数都可以由用户继承和实现
+- 单个 OP 可以控制并发数，从而增加处理并发数
+- OP 可以由线程或进程启动
+### Channel的设计
+- Channel 是 OP 之间共享数据的数据结构，负责共享数据或者共享数据状态信息
+- Channel 可以支持多个OP的输出存储在同一个 Channel，同一个 Channel 中的数据可以被多个 OP 使用
+- 下图为图执行引擎中 Channel 的设计，采用 input buffer 和 output buffer 进行多 OP 输入或多 OP 输出的数据对齐，中间采用一个 Queue 进行缓冲
+<center>
+<img src='pipeline_serving-image3.png' height = "500" align="middle"/>
+</center>
+### 极端情况的考虑
+- 请求超时的处理
+  整个图执行引擎每一步都有可能发生超时，图执行引擎里面通过设置 timeout 值来控制，任何环节超时的请求都会返回超时响应。
+- Channel 存储的数据过大
+  Channel 中可能会存储过大的数据，导致拷贝等耗时过高，图执行引擎里面可以通过将 OP 计算结果数据存储到外存，如高速的内存 KV 系统
+- Channel 设计中的 input buffer 和 output buffer 是否会无限增加
+  - 不会。整个图执行引擎的输入会放到一个 Channel 的 internal queue 里面，直接作为整个服务的流量控制缓冲队列
+  - 对于 input buffer，根据计算量的情况调整 OP1 和 OP2 的并发数，使得 input buffer 来自各个输入 OP 的数量相对平衡
+  - 对于 output buffer，可以采用和 input buffer 类似的处理方法，即调整 OP3 和 OP4 的并发数，使得 output buffer 的缓冲长度得到控制
+  - 注：input buffer 的长度取决于 internal queue 中每个 item 完全 ready 的速度，output buffer 的长度取决于下游 OP 从 output buffer 获取数据的速度
+## 详细设计
+### 用户接口设计
+#### 1. 普通 OP 定义
+普通 OP 作为图执行引擎中的基本单元，其构造函数如下：
+```python
+def __init__(name=None,
+             input_ops=[],
+             server_endpoints=[],
+             fetch_list=[],
+             client_config=None,
+             concurrency=1,
+             timeout=-1,
+             retry=1)
+```
+各参数含义如下
+|      参数名      |                             含义                             |
+| :--------------: | :----------------------------------------------------------: |
+|       name       |    （str）用于标识 OP 类型的字符串，该字段必须全局唯一。     |
+|    input_ops     |            （list）当前 OP 的所有前继 OP 的列表。            |
+| server_endpoints | （list）远程 Paddle Serving Service 的 endpoints 列表。如果不设置该参数，则不访问远程 Paddle Serving Service，即 不会执行 process 操作。 |
+|    fetch_list    |     （list）远程 Paddle Serving Service 的 fetch 列表。      |
+|  client_config   | （str）Paddle Serving Service 对应的 Client 端配置文件路径。 |
+|   concurrency    |                     （int）OP 的并发数。                     |
+|     timeout      | （int）process 操作的超时时间，单位为秒。若该值小于零，则视作不超时。 |
+|      retry       |       （int）超时重试次数。当该值为 1 时，不进行重试。       |
+#### 2. 普通 OP二次开发接口
+|                   变量或接口                   |                             说明                             |
+| :--------------------------------------------: | :----------------------------------------------------------: |
+|       def preprocess(self, input_dicts)        | 对从 Channel 中获取的数据进行处理，处理完的数据将作为 **process** 函数的输入。 |
+|          def process(self, feed_dict)          | 基于 Paddle Serving Client 进行 RPC 预测，处理完的数据将作为 **postprocess** 函数的输入。 |
+| def postprocess(self, input_dicts, fetch_dict) | 处理预测结果，处理完的数据将被放入后继 Channel 中，以被后继 OP 获取。 |
+|               def init_op(self)                |                  用于加载资源（如字典等）。                  |
+|              self.concurrency_idx              |   当前线程（进程）的并发数索引（不同种类的 OP 单独计算）。   |
+OP 在一个运行周期中会依次执行 preprocess，process，postprocess 三个操作（当不设置 `server_endpoints` 参数时，不执行 process 操作），用户可以对这三个函数进行重写，默认实现如下：
+```python
+def preprocess(self, input_dicts):
+  # multiple previous Op
+  if len(input_dicts) != 1:
+    raise NotImplementedError(
+      'this Op has multiple previous inputs. Please override this func.'
+    ）
+  (_, input_dict), = input_dicts.items()
+  return input_dict
+def process(self, feed_dict):
+  err, err_info = ChannelData.check_npdata(feed_dict)
+  if err != 0:
+    raise NotImplementedError(
+      "{} Please override preprocess func.".format(err_info))
+  call_result = self.client.predict(
+    feed=feed_dict, fetch=self._fetch_names)
+  return call_result
+def postprocess(self, input_dicts, fetch_dict):
+  return fetch_dict
+```
+**preprocess** 的参数是前继 Channel 中的数据 `input_dicts`，该变量是一个以前继 OP 的 name 为 Key，对应 OP 的输出为 Value 的字典。
+**process** 的参数是 Paddle Serving Client 预测接口的输入变量 `fetch_dict`（preprocess 函数的返回值），该变量是一个以 feed_name 为 Key，对应 ndarray 格式的数据为 Value 的字典。
+**postprocess** 的参数是 `input_dicts` 和 `fetch_dict`，`input_dicts` 与 preprocess 的参数一致，`fetch_dict` 是 process 函数的返回值（如果没有执行 process ，则该值为 preprocess 的返回值）。
+用户还可以对 **init_op** 函数进行重写，已加载自定义的一些资源（比如字典等），默认实现如下：
+```python
+def init_op(self):
+  pass
+```
+需要注意的是，在线程版 OP 中，每个 OP 只会调用一次该函数，故加载的资源必须要求是线程安全的。
+#### 3. RequestOp 定义
+RequestOp 用于处理 Pipeline Server 接收到的 RPC 数据，处理后的数据将会被加入到图执行引擎中。其构造函数如下：
+```python
+def __init__(self)
+```
+#### 4. RequestOp 二次开发接口
+|                变量或接口                 |                    说明                    |
+| :---------------------------------------: | :----------------------------------------: |
+|             def init_op(self)             | 用于加载资源（如字典等），与普通 OP 一致。 |
+| def unpack_request_package(self, request) |          处理接收到的 RPC 数据。           |
+**unpack_request_package** 的默认实现是将 RPC request 中的 key 和 value 做成字典：
+```python
+def unpack_request_package(self, request):
+  dictdata = {}
+  for idx, key in enumerate(request.key):
+    data = request.value[idx]
+    try:
+      data = eval(data)
+    except Exception as e:
+      pass
+    dictdata[key] = data
+  return dictdata
+```
+要求返回值是一个字典类型。
+#### 5. ResponseOp 定义
+ResponseOp 用于处理图执行引擎的预测结果，处理后的数据将会作为 Pipeline Server 的RPC 返回值，其构造函数如下：
+```python
+def __init__(self, input_ops)
+```
+其中，`input_ops` 是图执行引擎的最后一个 OP，用户可以通过设置不同的 `input_ops` 以在不修改 OP 的拓扑关系下构造不同的 DAG。
+#### 6. ResponseOp 二次开发接口
+|                  变量或接口                  |                    说明                     |
+| :------------------------------------------: | :-----------------------------------------: |
+|              def init_op(self)               | 用于加载资源（如字典等），与普通 OP 一致。  |
+| def pack_response_package(self, channeldata) | 处理图执行引擎的预测结果，作为 RPC 的返回。 |
+**pack_response_package** 的默认实现是将预测结果的字典转化为 RPC response 中的 key 和 value：
+```python
+def pack_response_package(self, channeldata):
+  resp = pipeline_service_pb2.Response()
+  resp.ecode = channeldata.ecode
+  if resp.ecode == ChannelDataEcode.OK.value:
+    if channeldata.datatype == ChannelDataType.CHANNEL_NPDATA.value:
+      feed = channeldata.parse()
+      np.set_printoptions(threshold=np.nan)
+      for name, var in feed.items():
+        resp.value.append(var.__repr__())
+        resp.key.append(name)
+    elif channeldata.datatype == ChannelDataType.DICT.value:
+      feed = channeldata.parse()
+      for name, var in feed.items():
+        if not isinstance(var, str):
+          resp.ecode = ChannelDataEcode.TYPE_ERROR.value
+          resp.error_info = self._log(
+            "fetch var type must be str({}).".format(type(var)))
+          break
+        resp.value.append(var)
+        resp.key.append(name)
+    else:
+      resp.ecode = ChannelDataEcode.TYPE_ERROR.value
+      resp.error_info = self._log(
+        "Error type({}) in datatype.".format(channeldata.datatype))
+  else:
+    resp.error_info = channeldata.error_info
+  return resp
+```
+#### 7. PipelineServer定义
+PipelineServer 的定义比较简单，如下所示：
+```python
+server = PipelineServer()
+server.set_response_op(response_op)
+server.prepare_server(config_yml_path)
+server.run_server()
+```
+其中，`response_op` 为上面提到的 ResponseOp，PipelineServer 将会根据各个 OP 的拓扑关系初始化 Channel 并构建计算图。`config_yml_path` 为 PipelineServer 的配置文件，示例文件如下：
+```yaml
+port: 18080  # gRPC端口号
+worker_num: 1  # gRPC线程池大小（进程版 Servicer 中为进程数），默认为 1
+build_dag_each_worker: false  # 是否使用进程版 Servicer，默认为 false
+dag:
+    is_thread_op: true  # 是否使用线程版Op，默认为 true
+    client_type: brpc  # 使用 brpc 或 grpc client，默认为 brpc
+    retry: 1  # DAG Executor 在失败后重试次数，默认为 1，即不重试
+    use_profile: false  # 是否在 Server 端打印日志，默认为 false
+```
+## 例子
+这里通过搭建简单的 imdb model ensemble 例子来展示如何使用 Pipeline Serving，相关代码在 `python/examples/pipeline/imdb_model_ensemble` 文件夹下可以找到，例子中的 Server 端结构如下图所示：
+<center>
+<img src='pipeline_serving-image4.png' height = "200" align="middle"/>
+</center>
+### 获取模型文件并启动 Paddle Serving Service
+```shell
+cd python/examples/pipeline/imdb_model_ensemble
+sh get_data.sh
+python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 &> cnn.log &
+python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 &> bow.log &
+```
+### 启动 PipelineServer
+运行下面代码
+```python
+from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
+from paddle_serving_server.pipeline import PipelineServer
+from paddle_serving_server.pipeline.proto import pipeline_service_pb2
+from paddle_serving_server.pipeline.channel import ChannelDataEcode
+import numpy as np
+import logging
+from paddle_serving_app.reader import IMDBDataset
+logging.basicConfig(level=logging.DEBUG)
+_LOGGER = logging.getLogger()
+class ImdbRequestOp(RequestOp):
+    def init_op(self):
+        self.imdb_dataset = IMDBDataset()
+        self.imdb_dataset.load_resource('imdb.vocab')
+    def unpack_request_package(self, request):
+        dictdata = {}
+        for idx, key in enumerate(request.key):
+            if key != "words":
+                continue
+            words = request.value[idx]
+            word_ids, _ = self.imdb_dataset.get_words_and_label(words)
+            dictdata[key] = np.array(word_ids)
+        return dictdata
+class CombineOp(Op):
+    def preprocess(self, input_data):
+        combined_prediction = 0
+        for op_name, data in input_data.items():
+            _LOGGER.info("{}: {}".format(op_name, data["prediction"]))
+            combined_prediction += data["prediction"]
+        data = {"prediction": combined_prediction / 2}
+        return data
+read_op = ImdbRequestOp()
+bow_op = Op(name="bow",
+            input_ops=[read_op],
+            server_endpoints=["127.0.0.1:9393"],
+            fetch_list=["prediction"],
+            client_config="imdb_bow_client_conf/serving_client_conf.prototxt",
+            concurrency=1,
+            timeout=-1,
+            retry=1)
+cnn_op = Op(name="cnn",
+            input_ops=[read_op],
+            server_endpoints=["127.0.0.1:9292"],
+            fetch_list=["prediction"],
+            client_config="imdb_cnn_client_conf/serving_client_conf.prototxt",
+            concurrency=1,
+            timeout=-1,
+            retry=1)
+combine_op = CombineOp(
+    name="combine",
+    input_ops=[bow_op, cnn_op],
+    concurrency=5,
+    timeout=-1,
+    retry=1)
+# use default ResponseOp implementation
+response_op = ResponseOp(input_ops=[combine_op])
+server = PipelineServer()
+server.set_response_op(response_op)
+server.prepare_server('config.yml')
+server.run_server()
+```
+### 通过 PipelineClient 执行预测
+```python
+from paddle_serving_client.pipeline import PipelineClient
+import numpy as np
+client = PipelineClient()
+client.connect(['127.0.0.1:18080'])
+words = 'i am very sad | 0'
+futures = []
+for i in range(3):
+    futures.append(
+        client.predict(
+            feed_dict={"words": words},
+            fetch=["prediction"],
+            asyn=True))
+for f in futures:
+    res = f.result()
+    if res["ecode"] != 0:
+        print(res)
+        exit(1)
+```
+## 如何通过 Timeline 工具进行优化
+为了更好地对性能进行优化，PipelineServing 提供了 Timeline 工具，对整个服务的各个阶段时间进行打点。
+### 在 Server 端输出 Profile 信息
+Server 端用 yaml 中的 `use_profile` 字段进行控制：
+```yaml
+dag:
+    use_profile: true
+```
+开启该功能后，Server 端在预测的过程中会将对应的日志信息打印到标准输出，为了更直观地展现各阶段的耗时，提供脚本对日志文件做进一步的分析处理。
+使用时先将 Server 的输出保存到文件，以 profile 为例，脚本将日志中的时间打点信息转换成 json 格式保存到trace 文件，trace 文件可以通过 chrome 浏览器的 tracing 功能进行可视化。
+```shell
+python timeline_trace.py profile trace
+```
+具体操作：打开 chrome 浏览器，在地址栏输入 chrome://tracing/ ，跳转至 tracing 页面，点击 load 按钮，打开保存的 trace 文件，即可将预测服务的各阶段时间信息可视化。
+### 在 Client 端输出 Profile 信息
+Client 端在 `predict` 接口设置 `profile=True`，即可开启 Profile 功能。
+开启该功能后，Client 端在预测的过程中会将该次预测对应的日志信息打印到标准输出，后续分析处理同 Server。
--- a/doc/RUN_IN_DOCKER.md
+++ b/doc/RUN_IN_DOCKER.md
@@ -12,21 +12,12 @@ This document takes Python2 as an example to show how to run Paddle Serving in d
 ### Get docker image
-You can get images in two ways:
+Refer to [this document](DOCKER_IMAGES.md) for a docker image:
-1. Pull image directly
+```shell
+docker pull hub.baidubce.com/paddlepaddle/serving:latest
-   ```bash
+```
-   docker pull hub.baidubce.com/paddlepaddle/serving:latest
-   ```
-2. Building image based on dockerfile
-   Create a new folder and copy [Dockerfile](../tools/Dockerfile) to this folder, and run the following command:
-   ```bash
-   docker build -t hub.baidubce.com/paddlepaddle/serving:latest .
-   ```
 ### Create container
@@ -104,26 +95,16 @@ The GPU version is basically the same as the CPU version, with only some differe
 ### Get docker image
-You can also get images in two ways:
+Refer to [this document](DOCKER_IMAGES.md) for a docker image, the following is an example of an `cuda9.0-cudnn7` image:
-1. Pull image directly
-   ```bash
+```shell
-   nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
-   ```
+```
-2. Building image based on dockerfile
-   Create a new folder and copy [Dockerfile.gpu](../tools/Dockerfile.gpu) to this folder, and run the following command:
-   ```bash
-   nvidia-docker build -t hub.baidubce.com/paddlepaddle/serving:latest-gpu .
-   ```
 ### Create container
 ```bash
-nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
 nvidia-docker exec -it test bash
 ```
@@ -200,4 +181,4 @@ tar -xzf uci_housing.tar.gz
 ## Attention
-The images provided by this document are all runtime images, which do not support compilation. If you want to compile from source, refer to [COMPILE](COMPILE.md).
+Runtime images cannot be used for compilation. If you want to compile from source, refer to [COMPILE](COMPILE.md).
--- a/doc/RUN_IN_DOCKER_CN.md
+++ b/doc/RUN_IN_DOCKER_CN.md
@@ -12,21 +12,12 @@ Docker（GPU版本需要在GPU机器上安装nvidia-docker）
 ### 获取镜像
-可以通过两种方式获取镜像。
+参考[该文档](DOCKER_IMAGES_CN.md)获取镜像：
-1. 直接拉取镜像
+```shell
+docker pull hub.baidubce.com/paddlepaddle/serving:latest
-   ```bash
+```
-   docker pull hub.baidubce.com/paddlepaddle/serving:latest
-   ```
-2. 基于Dockerfile构建镜像
-   建立新目录，复制[Dockerfile](../tools/Dockerfile)内容到该目录下Dockerfile文件。执行
-   ```bash
-   docker build -t hub.baidubce.com/paddlepaddle/serving:latest .
-   ```
 ### 创建容器并进入
@@ -102,26 +93,16 @@ GPU版本与CPU版本基本一致，只有部分接口命名的差别（GPU版
 ### 获取镜像
-可以通过两种方式获取镜像。
+参考[该文档](DOCKER_IMAGES_CN.md)获取镜像，这里以 `cuda9.0-cudnn7` 的镜像为例：
-1. 直接拉取镜像
-   ```bash
+```shell
-   nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker pull hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
-   ```
+```
-2. 基于Dockerfile构建镜像
-   建立新目录，复制[Dockerfile.gpu](../tools/Dockerfile.gpu)内容到该目录下Dockerfile文件。执行
-   ```bash
-   nvidia-docker build -t hub.baidubce.com/paddlepaddle/serving:latest-gpu .
-   ```
 ### 创建容器并进入
 ```bash
-nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-gpu
+nvidia-docker run -p 9292:9292 --name test -dit hub.baidubce.com/paddlepaddle/serving:latest-cuda9.0-cudnn7
 nvidia-docker exec -it test bash
 ```
@@ -195,4 +176,4 @@ tar -xzf uci_housing.tar.gz
 ## 注意事项
-该文档提供的镜像均为运行镜像，不支持开发编译。如果想要从源码编译，请查看[如何编译PaddleServing](COMPILE.md)。
+运行时镜像不能用于开发编译。如果想要从源码编译，请查看[如何编译PaddleServing](COMPILE.md)。
--- a/doc/pipeline_serving-image1.png
+++ b/doc/pipeline_serving-image1.png
--- a/doc/pipeline_serving-image2.png
+++ b/doc/pipeline_serving-image2.png
--- a/doc/pipeline_serving-image3.png
+++ b/doc/pipeline_serving-image3.png
--- a/doc/pipeline_serving-image4.png
+++ b/doc/pipeline_serving-image4.png
--- a/java/examples/pom.xml
+++ b/java/examples/pom.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>io.paddle.serving.client</groupId>
+    <artifactId>paddle-serving-sdk-java-examples</artifactId>
+    <version>0.0.1</version>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+                <version>3.8.1</version>
+            </plugin>
+        	<plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifest>
+                            <addClasspath>true</addClasspath>
+                            <mainClass>my.fully.qualified.class.Main</mainClass>
+                        </manifest>
+                    </archive>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-my-jar-with-dependencies</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <nd4j.backend>nd4j-native</nd4j.backend>
+        <nd4j.version>1.0.0-beta7</nd4j.version>
+        <datavec.version>1.0.0-beta7</datavec.version>
+        <paddle.serving.client.version>0.0.1</paddle.serving.client.version>
+        <maven.compiler.source>1.7</maven.compiler.source>
+        <maven.compiler.target>1.7</maven.compiler.target>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>io.paddle.serving.client</groupId>
+            <artifactId>paddle-serving-sdk-java</artifactId>
+            <version>${paddle.serving.client.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.30</version>
+        </dependency>
+        <dependency>
+            <groupId>org.nd4j</groupId>
+            <artifactId>${nd4j.backend}</artifactId>
+            <version>${nd4j.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.11</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.datavec</groupId>
+            <artifactId>datavec-data-image</artifactId>
+            <version>${datavec.version}</version>
+        </dependency>
+    </dependencies>
+</project>
--- a/java/examples/src/main/java/PaddleServingClientExample.java
+++ b/java/examples/src/main/java/PaddleServingClientExample.java
+import io.paddle.serving.client.*;
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import org.nd4j.linalg.api.iter.NdIndexIterator;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.datavec.image.loader.NativeImageLoader;
+import org.nd4j.linalg.api.ops.CustomOp;
+import org.nd4j.linalg.api.ops.DynamicCustomOp;
+import org.nd4j.linalg.factory.Nd4j;
+import java.util.*;
+public class PaddleServingClientExample {
+    boolean fit_a_line() {
+        float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
+            0.0582f, -0.0727f, -0.1583f, -0.0584f,
+            0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
+        INDArray npdata = Nd4j.createFromArray(data);
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("x", npdata);
+            }};
+        List<String> fetch = Arrays.asList("price");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
+        if (fetch_map == null) {
+            return false;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return true;
+    }
+    boolean yolov4(String filename) {
+        // https://deeplearning4j.konduit.ai/
+        int height = 608;
+        int width = 608;
+        int channels = 3;
+        NativeImageLoader loader = new NativeImageLoader(height, width, channels);
+        INDArray BGRimage = null;
+        try {
+            BGRimage = loader.asMatrix(new File(filename));
+        } catch (java.io.IOException e) {
+            System.out.println("load image fail.");
+            return false;
+        }   
+        // shape: (channels, height, width)
+        BGRimage = BGRimage.reshape(channels, height, width);
+        INDArray RGBimage = Nd4j.create(BGRimage.shape());
+        // BGR2RGB
+        CustomOp op = DynamicCustomOp.builder("reverse")
+            .addInputs(BGRimage)
+            .addOutputs(RGBimage)
+            .addIntegerArguments(0)
+            .build();
+        Nd4j.getExecutioner().exec(op);
+        // Div(255.0)
+        INDArray image = RGBimage.divi(255.0);
+        INDArray im_size = Nd4j.createFromArray(new int[]{height, width});
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("image", image);
+                put("im_size", im_size);
+            }};
+        List<String> fetch = Arrays.asList("save_infer_model/scale_0.tmp_0");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        succ = client.setRpcTimeoutMs(20000); // cpu
+        if (succ != true) {
+            System.out.println("set timeout failed.");
+            return false;
+        }
+        Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
+        if (fetch_map == null) {
+            return false;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return true;
+    }
+    boolean batch_predict() {
+        float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
+            0.0582f, -0.0727f, -0.1583f, -0.0584f,
+            0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
+        INDArray npdata = Nd4j.createFromArray(data);
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("x", npdata);
+            }};
+        List<HashMap<String, INDArray>> feed_batch
+            = new ArrayList<HashMap<String, INDArray>>() {{
+                add(feed_data);
+                add(feed_data);
+            }};
+        List<String> fetch = Arrays.asList("price");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        Map<String, INDArray> fetch_map = client.predict(feed_batch, fetch);
+        if (fetch_map == null) {
+            return false;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return true;
+    }
+    boolean asyn_predict() {
+        float[] data = {0.0137f, -0.1136f, 0.2553f, -0.0692f,
+            0.0582f, -0.0727f, -0.1583f, -0.0584f,
+            0.6283f, 0.4919f, 0.1856f, 0.0795f, -0.0332f};
+        INDArray npdata = Nd4j.createFromArray(data);
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("x", npdata);
+            }};
+        List<String> fetch = Arrays.asList("price");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        PredictFuture future = client.asyn_predict(feed_data, fetch);
+        Map<String, INDArray> fetch_map = future.get();
+        if (fetch_map == null) {
+            System.out.println("Get future reslut failed");
+            return false;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return true;
+    }
+    boolean model_ensemble() {
+        long[] data = {8, 233, 52, 601};
+        INDArray npdata = Nd4j.createFromArray(data);
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("words", npdata);
+            }};
+        List<String> fetch = Arrays.asList("prediction");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        Map<String, HashMap<String, INDArray>> fetch_map
+            = client.ensemble_predict(feed_data, fetch);
+        if (fetch_map == null) {
+            return false;
+        }
+        for (Map.Entry<String, HashMap<String, INDArray>> entry : fetch_map.entrySet()) {
+            System.out.println("Model = " + entry.getKey());
+            HashMap<String, INDArray> tt = entry.getValue();
+            for (Map.Entry<String, INDArray> e : tt.entrySet()) {
+                System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+            }
+        }
+        return true;
+    }
+    boolean bert() {
+        float[] input_mask = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+        long[] position_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+        long[] input_ids = {101, 6843, 3241, 749, 8024, 7662, 2533, 1391, 2533, 2523, 7676, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+        long[] segment_ids = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("input_mask", Nd4j.createFromArray(input_mask));
+                put("position_ids", Nd4j.createFromArray(position_ids));
+                put("input_ids", Nd4j.createFromArray(input_ids));
+                put("segment_ids", Nd4j.createFromArray(segment_ids));
+            }};
+        List<String> fetch = Arrays.asList("pooled_output");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
+        if (fetch_map == null) {
+            return false;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return true;
+    }
+    boolean cube_local() {
+        long[] embedding_14 = {250644};
+        long[] embedding_2 = {890346};
+        long[] embedding_10 = {3939};
+        long[] embedding_17 = {421122};
+        long[] embedding_23 = {664215};
+        long[] embedding_6 = {704846};
+        float[] dense_input = {0.0f, 0.006633499170812604f, 0.03f, 0.0f,
+            0.145078125f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+        long[] embedding_24 = {269955};
+        long[] embedding_12 = {295309};
+        long[] embedding_7 = {437731};
+        long[] embedding_3 = {990128};
+        long[] embedding_1 = {7753};
+        long[] embedding_4 = {286835};
+        long[] embedding_8 = {27346};
+        long[] embedding_9 = {636474};
+        long[] embedding_18 = {880474};
+        long[] embedding_16 = {681378};
+        long[] embedding_22 = {410878};
+        long[] embedding_13 = {255651};
+        long[] embedding_5 = {25207};
+        long[] embedding_11 = {10891};
+        long[] embedding_20 = {238459};
+        long[] embedding_21 = {26235};
+        long[] embedding_15 = {691460};
+        long[] embedding_25 = {544187};
+        long[] embedding_19 = {537425};
+        long[] embedding_0 = {737395};
+        HashMap<String, INDArray> feed_data
+            = new HashMap<String, INDArray>() {{
+                put("embedding_14.tmp_0", Nd4j.createFromArray(embedding_14));
+                put("embedding_2.tmp_0", Nd4j.createFromArray(embedding_2));
+                put("embedding_10.tmp_0", Nd4j.createFromArray(embedding_10));
+                put("embedding_17.tmp_0", Nd4j.createFromArray(embedding_17));
+                put("embedding_23.tmp_0", Nd4j.createFromArray(embedding_23));
+                put("embedding_6.tmp_0", Nd4j.createFromArray(embedding_6));
+                put("dense_input", Nd4j.createFromArray(dense_input));
+                put("embedding_24.tmp_0", Nd4j.createFromArray(embedding_24));
+                put("embedding_12.tmp_0", Nd4j.createFromArray(embedding_12));
+                put("embedding_7.tmp_0", Nd4j.createFromArray(embedding_7));
+                put("embedding_3.tmp_0", Nd4j.createFromArray(embedding_3));
+                put("embedding_1.tmp_0", Nd4j.createFromArray(embedding_1));
+                put("embedding_4.tmp_0", Nd4j.createFromArray(embedding_4));
+                put("embedding_8.tmp_0", Nd4j.createFromArray(embedding_8));
+                put("embedding_9.tmp_0", Nd4j.createFromArray(embedding_9));
+                put("embedding_18.tmp_0", Nd4j.createFromArray(embedding_18));
+                put("embedding_16.tmp_0", Nd4j.createFromArray(embedding_16));
+                put("embedding_22.tmp_0", Nd4j.createFromArray(embedding_22));
+                put("embedding_13.tmp_0", Nd4j.createFromArray(embedding_13));
+                put("embedding_5.tmp_0", Nd4j.createFromArray(embedding_5));
+                put("embedding_11.tmp_0", Nd4j.createFromArray(embedding_11));
+                put("embedding_20.tmp_0", Nd4j.createFromArray(embedding_20));
+                put("embedding_21.tmp_0", Nd4j.createFromArray(embedding_21));
+                put("embedding_15.tmp_0", Nd4j.createFromArray(embedding_15));
+                put("embedding_25.tmp_0", Nd4j.createFromArray(embedding_25));
+                put("embedding_19.tmp_0", Nd4j.createFromArray(embedding_19));
+                put("embedding_0.tmp_0", Nd4j.createFromArray(embedding_0));
+            }};
+        List<String> fetch = Arrays.asList("prob");
+        Client client = new Client();
+        String target = "localhost:9393";
+        boolean succ = client.connect(target);
+        if (succ != true) {
+            System.out.println("connect failed.");
+            return false;
+        }
+        Map<String, INDArray> fetch_map = client.predict(feed_data, fetch);
+        if (fetch_map == null) {
+            return false;
+        }
+        for (Map.Entry<String, INDArray> e : fetch_map.entrySet()) {
+            System.out.println("Key = " + e.getKey() + ", Value = " + e.getValue());
+        }
+        return true;
+    }
+    public static void main( String[] args ) {
+        // DL4J（Deep Learning for Java）Document:
+        // https://www.bookstack.cn/read/deeplearning4j/bcb48e8eeb38b0c6.md
+        PaddleServingClientExample e = new PaddleServingClientExample();
+        boolean succ = false;
+        if (args.length < 1) {
+            System.out.println("Usage: java -cp <jar> PaddleServingClientExample <test-type>.");
+            System.out.println("<test-type>: fit_a_line bert model_ensemble asyn_predict batch_predict cube_local cube_quant yolov4");
+            return;
+        }
+        String testType = args[0];
+        System.out.format("[Example] %s\n", testType);
+        if ("fit_a_line".equals(testType)) {
+            succ = e.fit_a_line();
+        } else if ("bert".equals(testType)) {
+            succ = e.bert();
+        } else if ("model_ensemble".equals(testType)) {
+            succ = e.model_ensemble();
+        } else if ("asyn_predict".equals(testType)) {
+            succ = e.asyn_predict();
+        } else if ("batch_predict".equals(testType)) {
+            succ = e.batch_predict();
+        } else if ("cube_local".equals(testType)) {
+            succ = e.cube_local();
+        } else if ("cube_quant".equals(testType)) {
+            succ = e.cube_local();
+        } else if ("yolov4".equals(testType)) {
+            if (args.length < 2) {
+                System.out.println("Usage: java -cp <jar> PaddleServingClientExample yolov4 <image-filepath>.");
+                return;
+            }
+            succ = e.yolov4(args[1]);
+        } else {
+            System.out.format("test-type(%s) not match.\n", testType);
+            return;
+        }
+        if (succ == true) {
+            System.out.println("[Example] succ.");
+        } else {
+            System.out.println("[Example] fail.");
+        }
+    }
+}
--- a/java/examples/src/main/resources/000000570688.jpg
+++ b/java/examples/src/main/resources/000000570688.jpg
--- a/java/pom.xml
+++ b/java/pom.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>io.paddle.serving.client</groupId>
+    <artifactId>paddle-serving-sdk-java</artifactId>
+    <version>0.0.1</version>
+    <packaging>jar</packaging>
+    <name>paddle-serving-sdk-java</name>
+    <description>Java SDK for Paddle Sering Client.</description>
+    <url>https://github.com/PaddlePaddle/Serving</url>
+    <licenses>
+        <license>
+            <name>Apache License, Version 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+            <distribution>repo</distribution>
+        </license>
+    </licenses>
+    <developers>
+        <developer>
+            <name>PaddlePaddle Author</name>
+            <email>guru4elephant@gmail.com</email>
+            <organization>PaddlePaddle</organization>
+            <organizationUrl>https://github.com/PaddlePaddle/Serving</organizationUrl>
+        </developer>
+    </developers>
+    <scm>
+        <connection>scm:git:https://github.com/PaddlePaddle/Serving.git</connection>
+        <developerConnection>scm:git:https://github.com/PaddlePaddle/Serving.git</developerConnection>
+        <url>https://github.com/PaddlePaddle/Serving</url>
+    </scm>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <grpc.version>1.27.2</grpc.version>
+        <protobuf.version>3.11.0</protobuf.version>
+        <protoc.version>3.11.0</protoc.version>
+        <nd4j.backend>nd4j-native</nd4j.backend>
+        <nd4j.version>1.0.0-beta7</nd4j.version>
+        <maven.compiler.source>1.8</maven.compiler.source>
+        <maven.compiler.target>1.8</maven.compiler.target>
+    </properties>
+    <dependencyManagement>
+        <dependencies>
+            <dependency>
+                <groupId>io.grpc</groupId>
+                <artifactId>grpc-bom</artifactId>
+                <version>${grpc.version}</version>
+                <type>pom</type>
+                <scope>import</scope>
+            </dependency>
+        </dependencies>
+    </dependencyManagement>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-gpg-plugin</artifactId>
+            <version>1.6</version>
+        </dependency>
+        <dependency>
+            <groupId>io.grpc</groupId>
+            <artifactId>grpc-netty-shaded</artifactId>
+            <scope>runtime</scope>
+        </dependency>
+        <dependency>
+            <groupId>io.grpc</groupId>
+            <artifactId>grpc-protobuf</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>io.grpc</groupId>
+            <artifactId>grpc-stub</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>javax.annotation</groupId>
+            <artifactId>javax.annotation-api</artifactId>
+            <version>1.2</version>
+            <scope>provided</scope> <!-- not needed at runtime -->
+        </dependency>
+        <dependency>
+            <groupId>io.grpc</groupId>
+            <artifactId>grpc-testing</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.google.protobuf</groupId>
+            <artifactId>protobuf-java-util</artifactId>
+            <version>${protobuf.version}</version>
+            <scope>runtime</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.google.errorprone</groupId>
+            <artifactId>error_prone_annotations</artifactId>
+            <version>2.3.4</version> <!-- prefer to use 2.3.3 or later -->
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter</artifactId>
+            <version>5.5.2</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+            <version>1.6</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-collections4</artifactId>
+            <version>4.4</version>
+        </dependency>
+        <dependency>
+            <groupId>org.json</groupId>
+            <artifactId>json</artifactId>
+            <version>20190722</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.30</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-slf4j-impl</artifactId>
+            <version>2.12.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.nd4j</groupId>
+            <artifactId>${nd4j.backend}</artifactId>
+            <version>${nd4j.version}</version>
+        </dependency>
+    </dependencies>
+    <profiles>
+        <profile>
+            <id>release</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-source-plugin</artifactId>
+                        <version>3.1.0</version>
+                        <executions>
+                            <execution>
+                                <id>attach-sources</id>
+                                <goals>
+                                    <goal>jar-no-fork</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-javadoc-plugin</artifactId>
+                        <version>3.1.1</version>
+                        <configuration>
+                            <javadocExecutable>${java.home}/bin/javadoc</javadocExecutable>
+                        </configuration>
+                        <executions>
+                            <execution>
+                                <id>attach-javadocs</id>
+                                <goals>
+                                    <goal>jar</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-gpg-plugin</artifactId>
+                        <version>1.6</version>
+                        <executions>
+                            <execution>
+                                <id>sign-artifacts</id>
+                                <phase>verify</phase>
+                                <goals>
+                                    <goal>sign</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
+    <build>
+        <extensions>
+            <extension>
+                <groupId>kr.motd.maven</groupId>
+                <artifactId>os-maven-plugin</artifactId>
+                <version>1.6.2</version>
+            </extension>
+        </extensions>
+        <plugins>
+            <plugin>
+                <groupId>org.sonatype.plugins</groupId>
+                <artifactId>nexus-staging-maven-plugin</artifactId>
+                <version>1.6.8</version>
+                <extensions>true</extensions>
+                <configuration>
+                    <serverId>ossrh</serverId>
+                    <nexusUrl>https://oss.sonatype.org/</nexusUrl>
+                    <autoReleaseAfterClose>true</autoReleaseAfterClose>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-release-plugin</artifactId>
+                <version>2.5.3</version>
+                <configuration>
+                    <autoVersionSubmodules>true</autoVersionSubmodules>
+                    <useReleaseProfile>false</useReleaseProfile>
+                    <releaseProfiles>release</releaseProfiles>
+                    <goals>deploy</goals>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.xolstice.maven.plugins</groupId>
+                <artifactId>protobuf-maven-plugin</artifactId>
+                <version>0.6.1</version>
+                <configuration>
+                    <protocArtifact>com.google.protobuf:protoc:${protoc.version}:exe:${os.detected.classifier}
+                    </protocArtifact>
+                    <pluginId>grpc-java</pluginId>
+                    <pluginArtifact>io.grpc:protoc-gen-grpc-java:${grpc.version}:exe:${os.detected.classifier}
+                    </pluginArtifact>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>compile</goal>
+                            <goal>compile-custom</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-enforcer-plugin</artifactId>
+                <version>3.0.0-M2</version>
+                <executions>
+                    <execution>
+                        <id>enforce</id>
+                        <configuration>
+                            <rules>
+                                <requireUpperBoundDeps/>
+                            </rules>
+                        </configuration>
+                        <goals>
+                            <goal>enforce</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
--- a/java/src/main/java/io/paddle/serving/client/Client.java
+++ b/java/src/main/java/io/paddle/serving/client/Client.java
+package io.paddle.serving.client;
+import java.util.*;
+import java.util.function.Function;
+import java.lang.management.ManagementFactory;
+import java.lang.management.RuntimeMXBean;
+import io.grpc.ManagedChannel;
+import io.grpc.ManagedChannelBuilder;
+import io.grpc.StatusRuntimeException;
+import com.google.protobuf.ByteString;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.iter.NdIndexIterator;
+import org.nd4j.linalg.factory.Nd4j;
+import io.paddle.serving.grpc.*;
+import io.paddle.serving.configure.*;
+import io.paddle.serving.client.PredictFuture;
+class Profiler {
+    int pid_;
+    String print_head_ = null;
+    List<String> time_record_ = null;
+    boolean enable_ = false;
+    Profiler() {
+        RuntimeMXBean runtimeMXBean = ManagementFactory.getRuntimeMXBean();
+        pid_ = Integer.valueOf(runtimeMXBean.getName().split("@")[0]).intValue();
+        print_head_ = "\nPROFILE\tpid:" + pid_ + "\t";
+        time_record_ = new ArrayList<String>();
+        time_record_.add(print_head_);
+    }
+    void record(String name) {
+        if (enable_) {
+            long ctime = System.currentTimeMillis() * 1000;
+            time_record_.add(name + ":" + String.valueOf(ctime) + " ");
+        }
+    }
+    void printProfile() {
+        if (enable_) {
+            String profile_str = String.join("", time_record_);
+            time_record_ = new ArrayList<String>();
+            time_record_.add(print_head_);
+        }
+    }
+    void enable(boolean flag) {
+        enable_ = flag;
+    }
+}
+public class Client {
+    private ManagedChannel channel_;
+    private MultiLangGeneralModelServiceGrpc.MultiLangGeneralModelServiceBlockingStub blockingStub_;
+    private MultiLangGeneralModelServiceGrpc.MultiLangGeneralModelServiceFutureStub futureStub_;
+    private double rpcTimeoutS_;
+    private List<String> feedNames_;
+    private Map<String, Integer> feedTypes_;
+    private Map<String, List<Integer>> feedShapes_;
+    private List<String> fetchNames_;
+    private Map<String, Integer> fetchTypes_;
+    private Set<String> lodTensorSet_;
+    private Map<String, Integer> feedTensorLen_;
+    private Profiler profiler_;
+    public Client() {
+        channel_ = null;
+        blockingStub_ = null;
+        futureStub_ = null;
+        rpcTimeoutS_ = 2;
+        feedNames_ = null;
+        feedTypes_ = null;
+        feedShapes_ = null;
+        fetchNames_ = null;
+        fetchTypes_ = null;
+        lodTensorSet_ = null;
+        feedTensorLen_ = null;
+        profiler_ = new Profiler();
+        boolean is_profile = false;
+        String FLAGS_profile_client = System.getenv("FLAGS_profile_client");
+        if (FLAGS_profile_client != null && FLAGS_profile_client.equals("1")) {
+            is_profile = true;
+        }
+        profiler_.enable(is_profile);
+    }
+    public boolean setRpcTimeoutMs(int rpc_timeout) {
+        if (futureStub_ == null || blockingStub_ == null) {
+            System.out.println("set timeout must be set after connect.");
+            return false;
+        }
+        rpcTimeoutS_ = rpc_timeout / 1000.0;
+        SetTimeoutRequest timeout_req = SetTimeoutRequest.newBuilder()
+            .setTimeoutMs(rpc_timeout)
+            .build();
+        SimpleResponse resp;
+        try {
+            resp = blockingStub_.setTimeout(timeout_req);
+        } catch (StatusRuntimeException e) {
+            System.out.format("Set RPC timeout failed: %s\n", e.toString());
+            return false;
+        }
+        return resp.getErrCode() == 0;
+    }
+    public boolean connect(String target) {
+        // TODO: target must be NameResolver-compliant URI
+        // https://grpc.github.io/grpc-java/javadoc/io/grpc/ManagedChannelBuilder.html
+        try {
+            channel_ = ManagedChannelBuilder.forTarget(target)
+                .defaultLoadBalancingPolicy("round_robin")
+                .maxInboundMessageSize(Integer.MAX_VALUE)
+                .usePlaintext()
+                .build();
+            blockingStub_ = MultiLangGeneralModelServiceGrpc.newBlockingStub(channel_);
+            futureStub_ = MultiLangGeneralModelServiceGrpc.newFutureStub(channel_);
+        } catch (Exception e) {
+            System.out.format("Connect failed: %s\n", e.toString());
+            return false;
+        }
+        GetClientConfigRequest get_client_config_req = GetClientConfigRequest.newBuilder().build();
+        GetClientConfigResponse resp;
+        try {
+            resp = blockingStub_.getClientConfig(get_client_config_req);
+        } catch (Exception e) {
+            System.out.format("Get Client config failed: %s\n", e.toString());
+            return false;
+        }
+        String model_config_str = resp.getClientConfigStr();
+        _parseModelConfig(model_config_str);
+        return true;
+    }
+    private void _parseModelConfig(String model_config_str) {
+        GeneralModelConfig.Builder model_conf_builder = GeneralModelConfig.newBuilder();
+        try {
+            com.google.protobuf.TextFormat.getParser().merge(model_config_str, model_conf_builder);
+        } catch (com.google.protobuf.TextFormat.ParseException e) {
+            System.out.format("Parse client config failed: %s\n", e.toString());
+        }
+        GeneralModelConfig model_conf = model_conf_builder.build();
+        feedNames_ = new ArrayList<String>();
+        fetchNames_ = new ArrayList<String>();
+        feedTypes_ = new HashMap<String, Integer>();
+        feedShapes_ = new HashMap<String, List<Integer>>();
+        fetchTypes_ = new HashMap<String, Integer>();
+        lodTensorSet_ = new HashSet<String>();
+        feedTensorLen_ = new HashMap<String, Integer>();
+        List<FeedVar> feed_var_list = model_conf.getFeedVarList();
+        for (FeedVar feed_var : feed_var_list) {
+            feedNames_.add(feed_var.getAliasName());
+        }
+        List<FetchVar> fetch_var_list = model_conf.getFetchVarList();
+        for (FetchVar fetch_var : fetch_var_list) {
+            fetchNames_.add(fetch_var.getAliasName());
+        }
+        for (int i = 0; i < feed_var_list.size(); ++i) {
+            FeedVar feed_var = feed_var_list.get(i);
+            String var_name = feed_var.getAliasName();
+            feedTypes_.put(var_name, feed_var.getFeedType());
+            feedShapes_.put(var_name, feed_var.getShapeList());
+            if (feed_var.getIsLodTensor()) {
+                lodTensorSet_.add(var_name);
+            } else {
+                int counter = 1;
+                for (int dim : feedShapes_.get(var_name)) {
+                    counter *= dim;
+                }
+                feedTensorLen_.put(var_name, counter);
+            }
+        }
+        for (int i = 0; i < fetch_var_list.size(); i++) {
+            FetchVar fetch_var = fetch_var_list.get(i);
+            String var_name = fetch_var.getAliasName();
+            fetchTypes_.put(var_name, fetch_var.getFetchType());
+            if (fetch_var.getIsLodTensor()) {
+                lodTensorSet_.add(var_name);
+            }
+        }
+    }
+    private InferenceRequest _packInferenceRequest(
+            List<HashMap<String, INDArray>> feed_batch,
+            Iterable<String> fetch) throws IllegalArgumentException {
+        List<String> feed_var_names = new ArrayList<String>();
+        feed_var_names.addAll(feed_batch.get(0).keySet());
+        InferenceRequest.Builder req_builder = InferenceRequest.newBuilder()
+            .addAllFeedVarNames(feed_var_names)
+            .addAllFetchVarNames(fetch)
+            .setIsPython(false);
+        for (HashMap<String, INDArray> feed_data: feed_batch) {
+            FeedInst.Builder inst_builder = FeedInst.newBuilder();
+            for (String name: feed_var_names) {
+                Tensor.Builder tensor_builder = Tensor.newBuilder();
+                INDArray variable = feed_data.get(name);
+                long[] flattened_shape = {-1};
+                INDArray flattened_list = variable.reshape(flattened_shape);
+                int v_type = feedTypes_.get(name);
+                NdIndexIterator iter = new NdIndexIterator(flattened_list.shape());
+                if (v_type == 0) { // int64
+                    while (iter.hasNext()) {
+                        long[] next_index = iter.next();
+                        long x = flattened_list.getLong(next_index);
+                        tensor_builder.addInt64Data(x);
+                    }
+                } else if (v_type == 1) { // float32
+                    while (iter.hasNext()) {
+                        long[] next_index = iter.next();
+                        float x = flattened_list.getFloat(next_index);
+                        tensor_builder.addFloatData(x);
+                    }
+                } else if (v_type == 2) { // int32
+                    while (iter.hasNext()) {
+                        long[] next_index = iter.next();
+                        // the interface of INDArray is strange:
+                        // https://deeplearning4j.org/api/latest/org/nd4j/linalg/api/ndarray/INDArray.html
+                        int[] int_next_index = new int[next_index.length];
+                        for(int i = 0; i < next_index.length; i++) {
+                            int_next_index[i] = (int)next_index[i];
+                        }
+                        int x = flattened_list.getInt(int_next_index);
+                        tensor_builder.addIntData(x);
+                    }
+                } else {
+                    throw new IllegalArgumentException("error tensor value type.");
+                }
+                tensor_builder.addAllShape(feedShapes_.get(name));
+                inst_builder.addTensorArray(tensor_builder.build());
+            }
+            req_builder.addInsts(inst_builder.build());
+        }
+        return req_builder.build();
+    }
+    private Map<String, HashMap<String, INDArray>>
+        _unpackInferenceResponse(
+            InferenceResponse resp,
+            Iterable<String> fetch,
+            Boolean need_variant_tag) throws IllegalArgumentException {
+        return Client._staticUnpackInferenceResponse(
+                resp, fetch, fetchTypes_, lodTensorSet_, need_variant_tag);
+    }
+    private static Map<String, HashMap<String, INDArray>>
+        _staticUnpackInferenceResponse(
+            InferenceResponse resp,
+            Iterable<String> fetch,
+            Map<String, Integer> fetchTypes,
+            Set<String> lodTensorSet,
+            Boolean need_variant_tag) throws IllegalArgumentException {
+        if (resp.getErrCode() != 0) {
+            return null;
+        }
+        String tag = resp.getTag();
+        HashMap<String, HashMap<String, INDArray>> multi_result_map
+            = new HashMap<String, HashMap<String, INDArray>>();
+        for (ModelOutput model_result: resp.getOutputsList()) {
+            String engine_name = model_result.getEngineName();
+            FetchInst inst = model_result.getInsts(0);
+            HashMap<String, INDArray> result_map
+                = new HashMap<String, INDArray>();
+            int index = 0;
+            for (String name: fetch) {
+                Tensor variable = inst.getTensorArray(index);
+                int v_type = fetchTypes.get(name);
+                INDArray data = null; 
+                if (v_type == 0) { // int64
+                    List<Long> list = variable.getInt64DataList();
+                    long[] array = new long[list.size()];
+                    for (int i = 0; i < list.size(); i++) {
+                        array[i] = list.get(i);
+                    }
+                    data = Nd4j.createFromArray(array);
+                } else if (v_type == 1) { // float32
+                    List<Float> list = variable.getFloatDataList();
+                    float[] array = new float[list.size()];
+                    for (int i = 0; i < list.size(); i++) {
+                        array[i] = list.get(i);
+                    }
+                    data = Nd4j.createFromArray(array);
+                } else if (v_type == 2) { // int32
+                    List<Integer> list = variable.getIntDataList();
+                    int[] array = new int[list.size()];
+                    for (int i = 0; i < list.size(); i++) {
+                        array[i] = list.get(i);
+                    }
+                    data = Nd4j.createFromArray(array);
+                } else {
+                    throw new IllegalArgumentException("error tensor value type.");
+                }
+                // shape
+                List<Integer> shape_lsit = variable.getShapeList();
+                int[] shape_array = new int[shape_lsit.size()];
+                for (int i = 0; i < shape_lsit.size(); ++i) {
+                    shape_array[i] = shape_lsit.get(i);
+                }
+                data = data.reshape(shape_array);
+                // put data to result_map
+                result_map.put(name, data);
+                // lod
+                if (lodTensorSet.contains(name)) {
+                    List<Integer> list = variable.getLodList();
+                    int[] array = new int[list.size()];
+                    for (int i = 0; i < list.size(); i++) {
+                        array[i] = list.get(i);
+                    }
+                    result_map.put(name + ".lod", Nd4j.createFromArray(array));
+                }
+                index += 1;
+            }
+            multi_result_map.put(engine_name, result_map);
+        }
+        // TODO: tag(ABtest not support now)
+        return multi_result_map;
+    }
+    public Map<String, INDArray> predict(
+            HashMap<String, INDArray> feed,
+            Iterable<String> fetch) {
+        return predict(feed, fetch, false);
+    }
+    public Map<String, HashMap<String, INDArray>> ensemble_predict(
+            HashMap<String, INDArray> feed,
+            Iterable<String> fetch) {
+        return ensemble_predict(feed, fetch, false);
+    }
+    public PredictFuture asyn_predict(
+            HashMap<String, INDArray> feed,
+            Iterable<String> fetch) {
+        return asyn_predict(feed, fetch, false);
+    }
+    public Map<String, INDArray> predict(
+            HashMap<String, INDArray> feed,
+            Iterable<String> fetch,
+            Boolean need_variant_tag) {
+        List<HashMap<String, INDArray>> feed_batch
+            = new ArrayList<HashMap<String, INDArray>>();
+        feed_batch.add(feed);
+        return predict(feed_batch, fetch, need_variant_tag);
+    }
+    public Map<String, HashMap<String, INDArray>> ensemble_predict(
+            HashMap<String, INDArray> feed,
+            Iterable<String> fetch,
+            Boolean need_variant_tag) {
+        List<HashMap<String, INDArray>> feed_batch
+            = new ArrayList<HashMap<String, INDArray>>();
+        feed_batch.add(feed);
+        return ensemble_predict(feed_batch, fetch, need_variant_tag);
+    }
+    public PredictFuture asyn_predict(
+            HashMap<String, INDArray> feed,
+            Iterable<String> fetch,
+            Boolean need_variant_tag) {
+        List<HashMap<String, INDArray>> feed_batch
+            = new ArrayList<HashMap<String, INDArray>>();
+        feed_batch.add(feed);
+        return asyn_predict(feed_batch, fetch, need_variant_tag);
+    }
+    public Map<String, INDArray> predict(
+            List<HashMap<String, INDArray>> feed_batch,
+            Iterable<String> fetch) {
+        return predict(feed_batch, fetch, false);
+    }
+    public Map<String, HashMap<String, INDArray>> ensemble_predict(
+            List<HashMap<String, INDArray>> feed_batch,
+            Iterable<String> fetch) {
+        return ensemble_predict(feed_batch, fetch, false);
+    }
+    public PredictFuture asyn_predict(
+            List<HashMap<String, INDArray>> feed_batch,
+            Iterable<String> fetch) {
+        return asyn_predict(feed_batch, fetch, false);
+    }
+    public Map<String, INDArray> predict(
+            List<HashMap<String, INDArray>> feed_batch,
+            Iterable<String> fetch,
+            Boolean need_variant_tag) {
+        try {
+            profiler_.record("java_prepro_0");
+            InferenceRequest req = _packInferenceRequest(feed_batch, fetch);
+            profiler_.record("java_prepro_1");
+            profiler_.record("java_client_infer_0");
+            InferenceResponse resp = blockingStub_.inference(req);
+            profiler_.record("java_client_infer_1");
+            profiler_.record("java_postpro_0");
+            Map<String, HashMap<String, INDArray>> ensemble_result
+                = _unpackInferenceResponse(resp, fetch, need_variant_tag);
+            List<Map.Entry<String, HashMap<String, INDArray>>> list
+                = new ArrayList<Map.Entry<String, HashMap<String, INDArray>>>(
+                    ensemble_result.entrySet());
+            if (list.size() != 1) {
+                System.out.format("predict failed: please use ensemble_predict impl.\n");
+                return null;
+            }
+            profiler_.record("java_postpro_1");
+            profiler_.printProfile();
+            return list.get(0).getValue();
+        } catch (StatusRuntimeException e) {
+            System.out.format("predict failed: %s\n", e.toString());
+            return null;
+        }
+    }
+    public Map<String, HashMap<String, INDArray>> ensemble_predict(
+            List<HashMap<String, INDArray>> feed_batch,
+            Iterable<String> fetch,
+            Boolean need_variant_tag) {
+        try {
+            profiler_.record("java_prepro_0");
+            InferenceRequest req = _packInferenceRequest(feed_batch, fetch);
+            profiler_.record("java_prepro_1");
+            profiler_.record("java_client_infer_0");
+            InferenceResponse resp = blockingStub_.inference(req);
+            profiler_.record("java_client_infer_1");
+            profiler_.record("java_postpro_0");
+            Map<String, HashMap<String, INDArray>> ensemble_result 
+               = _unpackInferenceResponse(resp, fetch, need_variant_tag);
+            profiler_.record("java_postpro_1");
+            profiler_.printProfile();
+            return ensemble_result;
+        } catch (StatusRuntimeException e) {
+            System.out.format("predict failed: %s\n", e.toString());
+            return null;
+        }
+    }
+    public PredictFuture asyn_predict(
+            List<HashMap<String, INDArray>> feed_batch,
+            Iterable<String> fetch,
+            Boolean need_variant_tag) {
+        InferenceRequest req = _packInferenceRequest(feed_batch, fetch);
+        ListenableFuture<InferenceResponse> future = futureStub_.inference(req);
+        PredictFuture predict_future = new PredictFuture(future, 
+            (InferenceResponse resp) -> {
+                return Client._staticUnpackInferenceResponse(
+                resp, fetch, fetchTypes_, lodTensorSet_, need_variant_tag);
+            }
+        );
+        return predict_future;
+    }
+}
--- a/java/src/main/java/io/paddle/serving/client/PredictFuture.java
+++ b/java/src/main/java/io/paddle/serving/client/PredictFuture.java
+package io.paddle.serving.client;
+import java.util.*;
+import java.util.function.Function;
+import io.grpc.StatusRuntimeException;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import io.paddle.serving.client.Client;
+import io.paddle.serving.grpc.*;
+public class PredictFuture {
+    private ListenableFuture<InferenceResponse> callFuture_;
+    private Function<InferenceResponse, 
+                     Map<String, HashMap<String, INDArray>>> callBackFunc_;
+    PredictFuture(ListenableFuture<InferenceResponse> call_future,
+            Function<InferenceResponse, 
+                     Map<String, HashMap<String, INDArray>>> call_back_func) {
+        callFuture_ = call_future;
+        callBackFunc_ = call_back_func;
+    }
+    public Map<String, INDArray> get() {
+        InferenceResponse resp = null;
+        try {
+            resp = callFuture_.get();
+        } catch (Exception e) {
+            System.out.format("predict failed: %s\n", e.toString());
+            return null;
+        }
+        Map<String, HashMap<String, INDArray>> ensemble_result
+            = callBackFunc_.apply(resp);
+        List<Map.Entry<String, HashMap<String, INDArray>>> list
+            = new ArrayList<Map.Entry<String, HashMap<String, INDArray>>>(
+                    ensemble_result.entrySet());
+        if (list.size() != 1) {
+            System.out.format("predict failed: please use get_ensemble impl.\n");
+            return null;
+        }
+        return list.get(0).getValue();
+    }
+    public Map<String, HashMap<String, INDArray>> ensemble_get() {
+        InferenceResponse resp = null;
+        try {
+            resp = callFuture_.get();
+        } catch (Exception e) {
+            System.out.format("predict failed: %s\n", e.toString());
+            return null;
+        }
+        return callBackFunc_.apply(resp);
+    }
+}
--- a/java/src/main/proto/general_model_config.proto
+++ b/java/src/main/proto/general_model_config.proto
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+syntax = "proto2";
+option java_multiple_files = true;
+option java_package = "io.paddle.serving.configure";
+option java_outer_classname = "ConfigureProto";
+package paddle.serving.configure;
+message FeedVar {
+  optional string name = 1;
+  optional string alias_name = 2;
+  optional bool is_lod_tensor = 3 [ default = false ];
+  optional int32 feed_type = 4 [ default = 0 ];
+  repeated int32 shape = 5;
+}
+message FetchVar {
+  optional string name = 1;
+  optional string alias_name = 2;
+  optional bool is_lod_tensor = 3 [ default = false ];
+  optional int32 fetch_type = 4 [ default = 0 ];
+  repeated int32 shape = 5;
+}
+message GeneralModelConfig {
+  repeated FeedVar feed_var = 1;
+  repeated FetchVar fetch_var = 2;
+};
--- a/java/src/main/proto/multi_lang_general_model_service.proto
+++ b/java/src/main/proto/multi_lang_general_model_service.proto
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+syntax = "proto2";
+option java_multiple_files = true;
+option java_package = "io.paddle.serving.grpc";
+option java_outer_classname = "ServingProto";
+message Tensor {
+  optional bytes data = 1;
+  repeated int32 int_data = 2;
+  repeated int64 int64_data = 3;
+  repeated float float_data = 4;
+  optional int32 elem_type = 5;
+  repeated int32 shape = 6;
+  repeated int32 lod = 7; // only for fetch tensor currently
+};
+message FeedInst { repeated Tensor tensor_array = 1; };
+message FetchInst { repeated Tensor tensor_array = 1; };
+message InferenceRequest {
+  repeated FeedInst insts = 1;
+  repeated string feed_var_names = 2;
+  repeated string fetch_var_names = 3;
+  required bool is_python = 4 [ default = false ];
+};
+message InferenceResponse {
+  repeated ModelOutput outputs = 1;
+  optional string tag = 2;
+  required int32 err_code = 3;
+};
+message ModelOutput {
+  repeated FetchInst insts = 1;
+  optional string engine_name = 2;
+}
+message SetTimeoutRequest { required int32 timeout_ms = 1; }
+message SimpleResponse { required int32 err_code = 1; }
+message GetClientConfigRequest {}
+message GetClientConfigResponse { required string client_config_str = 1; }
+service MultiLangGeneralModelService {
+  rpc Inference(InferenceRequest) returns (InferenceResponse) {}
+  rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {}
+  rpc GetClientConfig(GetClientConfigRequest)
+      returns (GetClientConfigResponse) {}
+};
--- a/java/src/main/resources/log4j2.xml
+++ b/java/src/main/resources/log4j2.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<Configuration status="INFO">
+    <Appenders>
+        <Console name="Console" target="SYSTEM_OUT">
+            <PatternLayout pattern="%highlight{%d{yyyy-MM-dd HH:mm:ss} %C %M %n%p: %m%n}{STYLE=Logback}"/>
+        </Console>
+    </Appenders>
+    <Loggers>
+        <Root level="INFO">
+            <AppenderRef ref="Console"/>
+        </Root>
+    </Loggers>
+</Configuration>
--- a/python/examples/bert/benchmark.sh
+++ b/python/examples/bert/benchmark.sh
-rm profile_log
+rm profile_log*
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 export FLAGS_profile_server=1
 export FLAGS_profile_client=1
 export FLAGS_serving_latency=1
-python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim False --ir_optim True 2> elog > stdlog &
-hostname=`echo $(hostname)|awk -F '.baidu.com' '{print $1}'`
-sleep 5
 gpu_id=0
+#save cpu and gpu utilization log
+if [ -d utilization ];then
+    rm -rf utilization
+else
+    mkdir utilization
+fi
+#start server
+$PYTHONROOT/bin/python3 -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim  --ir_optim >  elog  2>&1 &
+sleep 5
 #warm up
-python3 benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
-for thread_num in 4 8 16
+for thread_num in 1 4 8 16
 do
-for batch_size in 1 4 16 64 256
+for batch_size in 1 4 16 64
 do
    job_bt=`date '+%Y%m%d%H%M%S'`
-    nvidia-smi --id=$gpu_id --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
    gpu_memory_pid=$!
-    python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+    $PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
    kill ${gpu_memory_pid}
+    kill `ps -ef|grep used_memory|awk '{print $2}'`
    echo "model_name:" $1
    echo "thread_num:" $thread_num
    echo "batch_size:" $batch_size
    echo "=================Done===================="
    echo "model_name:$1" >> profile_log_$1
    echo "batch_size:$batch_size" >> profile_log_$1
+    $PYTHONROOT/bin/python3 cpu_utilization.py >> profile_log_$1
    job_et=`date '+%Y%m%d%H%M%S'`
-    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY_USE:", max}' gpu_use.log >> profile_log_$1
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1
-    monquery -n ${hostname} -i GPU_AVERAGE_UTILIZATION -s $job_bt -e $job_et -d 10 > gpu_log_file_${job_bt}
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
-    monquery -n ${hostname} -i CPU_USER -s $job_bt -e $job_et  -d 10 > cpu_log_file_${job_bt}
+    rm -rf gpu_use.log gpu_utilization.log
-    cpu_num=$(cat /proc/cpuinfo | grep processor | wc -l)
+    $PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
-    gpu_num=$(nvidia-smi -L|wc -l)
-    python ../util/show_profile.py profile $thread_num >> profile_log_$1
    tail -n 8 profile >> profile_log_$1
    echo "" >> profile_log_$1
 done
 done
+#Divided log
+awk 'BEGIN{RS="\n\n"}{i++}{print > "bert_log_"i}' profile_log_$1
+mkdir bert_log && mv bert_log_* bert_log
 ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/blazeface/README.md
+++ b/python/examples/blazeface/README.md
+# Blazeface 
+## Get Model
+```
+python -m paddle_serving_app.package --get_model blazeface
+tar -xzvf blazeface.tar.gz
+```
+## RPC Service
+### Start Service
+```
+python -m paddle_serving_server.serve --model serving_server --port 9494
+```
+### Client Prediction
+```
+python test_client.py serving_client/serving_client_conf.prototxt test.jpg
+```
+the result is in `output` folder, including a json file and image file with bounding boxes.
--- a/python/examples/ocr/test_ocr_rec_client.py
+++ b/python/examples/ocr/test_ocr_rec_client.py
@@ -13,19 +13,26 @@
 # limitations under the License.
 from paddle_serving_client import Client
-from paddle_serving_app.reader import OCRReader
+from paddle_serving_app.reader import *
-import cv2
+import sys
+import numpy as np
+preprocess = Sequential([
+    File2Image(),
+    Normalize([104, 117, 123], [127.502231, 127.502231, 127.502231], False)
+])
+postprocess = BlazeFacePostprocess("label_list.txt", "output")
 client = Client()
-client.load_client_config("ocr_rec_client/serving_client_conf.prototxt")
-client.connect(["127.0.0.1:9292"])
-image_file_list = ["./test_rec.jpg"]
+client.load_client_config(sys.argv[1])
-img = cv2.imread(image_file_list[0])
+client.connect(['127.0.0.1:9494'])
-ocr_reader = OCRReader()
-feed = {"image": ocr_reader.preprocess([img])}
+im_0 = preprocess(sys.argv[2])
-fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
+tmp = Transpose((2, 0, 1))
-fetch_map = client.predict(feed=feed, fetch=fetch)
+im = tmp(im_0)
-rec_res = ocr_reader.postprocess(fetch_map)
+fetch_map = client.predict(
-print(image_file_list[0])
+    feed={"image": im}, fetch=["detection_output_0.tmp_0"])
-print(rec_res[0][0])
+fetch_map["image"] = sys.argv[2]
+fetch_map["im_shape"] = im_0.shape
+postprocess(fetch_map)
--- a/python/examples/criteo_ctr_with_cube/README.md
+++ b/python/examples/criteo_ctr_with_cube/README.md
@@ -27,7 +27,7 @@ mv cube_app/cube* ./cube/
 sh cube_prepare.sh &
 ```
-Here, the sparse parameter is loaded by cube sparse parameter indexing service Cube，for more details please read [Cube: Sparse Parameter Indexing Service (Local Mode)](../../../doc/CUBE_LOCAL.md)
+Here, the sparse parameter is loaded by cube sparse parameter indexing service Cube.
 ### Start RPC Predictor, the number of serving thread is 4（configurable in test_server.py）
@@ -45,7 +45,7 @@ python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
 CPU ：Intel(R) Xeon(R) CPU 6148 @ 2.40GHz 
-Model ：[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/ctr_criteo_with_cube/network_conf.py)
+Model ：[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
 server core/thread num ： 4/8

--- a/python/examples/criteo_ctr_with_cube/README_CN.md
+++ b/python/examples/criteo_ctr_with_cube/README_CN.md
@@ -25,7 +25,7 @@ mv cube_app/cube* ./cube/
 sh cube_prepare.sh &
 ```
-此处，模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中，关于稀疏参数索引服务Cube的介绍，请阅读[稀疏参数索引服务Cube单机版使用指南](../../../doc/CUBE_LOCAL_CN.md)
+此处，模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中。
 ### 启动RPC预测服务，服务端线程数为4（可在test_server.py配置）
@@ -43,7 +43,7 @@ python test_client.py ctr_client_conf/serving_client_conf.prototxt ./raw_data
 设备 ：Intel(R) Xeon(R) CPU 6148 @ 2.40GHz 
-模型 ：[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/ctr_criteo_with_cube/network_conf.py)
+模型 ：[Criteo CTR](https://github.com/PaddlePaddle/Serving/blob/develop/python/examples/criteo_ctr_with_cube/network_conf.py)
 server core/thread num ： 4/8

--- a/python/examples/criteo_ctr_with_cube/benchmark.py
+++ b/python/examples/criteo_ctr_with_cube/benchmark.py
@@ -24,6 +24,7 @@ from paddle_serving_client.utils import MultiThreadRunner
 from paddle_serving_client.utils import benchmark_args
 from paddle_serving_client.metric import auc
+py_version = sys.version_info[0]
 args = benchmark_args()
@@ -49,7 +50,10 @@ def single_func(idx, resource):
            if args.batch_size > 0:
                feed_batch = []
                for bi in range(args.batch_size):
+                    if py_version == 2:
                        data = reader().next()
+                    else:
+                        data = reader().__next__()
                    feed_dict = {}
                    feed_dict['dense_input'] = data[0][0]
                    for i in range(1, 27):
@@ -71,14 +75,17 @@ if __name__ == '__main__':
    multi_thread_runner = MultiThreadRunner()
    endpoint_list = ["127.0.0.1:9292"]
    #result = single_func(0, {"endpoint": endpoint_list})
+    start = time.time()
    result = multi_thread_runner.run(single_func, args.thread,
                                     {"endpoint": endpoint_list})
-    print(result)
+    end = time.time()
+    total_cost = end - start
    avg_cost = 0
    qps = 0
    for i in range(args.thread):
        avg_cost += result[0][i * 2 + 0]
        qps += result[0][i * 2 + 1]
    avg_cost = avg_cost / args.thread
+    print("total cost: {}".format(total_cost))
    print("average total cost {} s.".format(avg_cost))
    print("qps {} ins/s".format(qps))
--- a/python/examples/criteo_ctr_with_cube/benchmark.sh
+++ b/python/examples/criteo_ctr_with_cube/benchmark.sh
 rm profile_log
 export FLAGS_profile_client=1
 export FLAGS_profile_server=1
-for thread_num in 1 2 4 8 16
+wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
+tar xf ctr_cube_unittest.tar.gz
+mv models/ctr_client_conf ./
+mv models/ctr_serving_model_kv ./
+mv models/data ./cube/
+wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
+tar xf cube_app.tar.gz
+mv cube_app/cube* ./cube/
+sh cube_prepare.sh &
+python test_server.py ctr_serving_model_kv > serving_log 2>&1 &
+for thread_num in 1 4 16
 do
-for batch_size in 1 4 16 64 256
+for batch_size in 1 4 16 64
 do
    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model serving_client_conf/serving_client_conf.prototxt --request rpc > profile 2>&1
    echo "batch size : $batch_size"
@@ -11,6 +25,8 @@ do
    echo "========================================"
    echo "batch size : $batch_size" >> profile_log
    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
-    tail -n 2 profile >> profile_log
+    tail -n 3 profile >> profile_log
 done
 done
+ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/criteo_ctr_with_cube/benchmark_cube.sh
+++ b/python/examples/criteo_ctr_with_cube/benchmark_cube.sh
+rm profile_log
+#wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz --no-check-certificate
+#tar xf ctr_cube_unittest.tar.gz
+mv models/ctr_client_conf ./
+mv models/ctr_serving_model_kv ./
+mv models/data ./cube/
+#wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz --no-check-certificate
+#tar xf cube_app.tar.gz
+mv cube_app/cube* ./cube/
+sh cube_prepare.sh &
+cp ../../../build_server/core/cube/cube-api/cube-cli .
+python gen_key.py
+for thread_num in 1 4 16 32
+do
+for batch_size in 1000
+do
+    ./cube-cli -config_file ./cube/conf/cube.conf -keys key -dict test_dict -thread_num $thread_num --batch $batch_size > profile 2>&1
+    echo "batch size : $batch_size"
+    echo "thread num : $thread_num"
+    echo "========================================"
+    echo "batch size : $batch_size" >> profile_log
+    echo "thread num : $thread_num" >> profile_log
+    tail -n 8 profile >> profile_log
+done
+done
+ps -ef|grep 'cube'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/criteo_ctr_with_cube/gen_key.py
+++ b/python/examples/criteo_ctr_with_cube/gen_key.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import random
+with open("key", "w") as f:
+    for i in range(1000000):
+        f.write("{}\n".format(random.randint(0, 999999)))
--- a/python/examples/criteo_ctr_with_cube/test_client.py
+++ b/python/examples/criteo_ctr_with_cube/test_client.py
@@ -20,6 +20,8 @@ import criteo as criteo
 import time
 from paddle_serving_client.metric import auc
+py_version = sys.version_info[0]
 client = Client()
 client.load_client_config(sys.argv[1])
 client.connect(["127.0.0.1:9292"])
@@ -34,7 +36,10 @@ label_list = []
 prob_list = []
 start = time.time()
 for ei in range(10000):
+    if py_version == 2:
        data = reader().next()
+    else:
+        data = reader().__next__()
    feed_dict = {}
    feed_dict['dense_input'] = data[0][0]
    for i in range(1, 27):

--- a/python/examples/criteo_ctr_with_cube/test_server.py
+++ b/python/examples/criteo_ctr_with_cube/test_server.py
@@ -33,5 +33,9 @@ server = Server()
 server.set_op_sequence(op_seq_maker.get_op_sequence())
 server.set_num_threads(4)
 server.load_model_config(sys.argv[1])
-server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.prepare_server(
+    workdir="work_dir1",
+    port=9292,
+    device="cpu",
+    cube_conf="./cube/conf/cube.conf")
 server.run_server()
--- a/python/examples/criteo_ctr_with_cube/test_server_gpu.py
+++ b/python/examples/criteo_ctr_with_cube/test_server_gpu.py
@@ -33,5 +33,9 @@ server = Server()
 server.set_op_sequence(op_seq_maker.get_op_sequence())
 server.set_num_threads(4)
 server.load_model_config(sys.argv[1])
-server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.prepare_server(
+    workdir="work_dir1",
+    port=9292,
+    device="cpu",
+    cube_conf="./cube/conf/cube.conf")
 server.run_server()
--- a/python/examples/criteo_ctr_with_cube/test_server_quant.py
+++ b/python/examples/criteo_ctr_with_cube/test_server_quant.py
@@ -33,5 +33,9 @@ server = Server()
 server.set_op_sequence(op_seq_maker.get_op_sequence())
 server.set_num_threads(4)
 server.load_model_config(sys.argv[1])
-server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.prepare_server(
+    workdir="work_dir1",
+    port=9292,
+    device="cpu",
+    cube_conf="./cube/conf/cube.conf")
 server.run_server()
--- a/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server.py
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server.py
@@ -33,5 +33,9 @@ server = Server()
 server.set_op_sequence(op_seq_maker.get_op_sequence())
 server.set_num_threads(4)
 server.load_model_config(sys.argv[1], sys.argv[2])
-server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.prepare_server(
+    workdir="work_dir1",
+    port=9292,
+    device="cpu",
+    cube_conf="./cube/conf/cube.conf")
 server.run_server()
--- a/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_gpu.py
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_gpu.py
@@ -33,5 +33,9 @@ server = Server()
 server.set_op_sequence(op_seq_maker.get_op_sequence())
 server.set_num_threads(4)
 server.load_model_config(sys.argv[1], sys.argv[2])
-server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.prepare_server(
+    workdir="work_dir1",
+    port=9292,
+    device="cpu",
+    cube_conf="./cube/conf/cube.conf")
 server.run_server()
--- a/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_quant.py
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_quant.py
@@ -33,5 +33,9 @@ server = Server()
 server.set_op_sequence(op_seq_maker.get_op_sequence())
 server.set_num_threads(4)
 server.load_model_config(sys.argv[1], sys.argv[2])
-server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.prepare_server(
+    workdir="work_dir1",
+    port=9292,
+    device="cpu",
+    cube_conf="./cube/conf/cube.conf")
 server.run_server()
--- a/python/examples/grpc_impl_example/imdb/get_data.sh
+++ b/python/examples/grpc_impl_example/imdb/get_data.sh
+wget --no-check-certificate https://fleet.bj.bcebos.com/text_classification_data.tar.gz
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/imdb-demo/imdb_model.tar.gz
+tar -zxvf text_classification_data.tar.gz
+tar -zxvf imdb_model.tar.gz
--- a/python/examples/grpc_impl_example/imdb/imdb_reader.py
+++ b/python/examples/grpc_impl_example/imdb/imdb_reader.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+import sys
+import os
+import paddle
+import re
+import paddle.fluid.incubate.data_generator as dg
+py_version = sys.version_info[0]
+class IMDBDataset(dg.MultiSlotDataGenerator):
+    def load_resource(self, dictfile):
+        self._vocab = {}
+        wid = 0
+        if py_version == 2:
+            with open(dictfile) as f:
+                for line in f:
+                    self._vocab[line.strip()] = wid
+                    wid += 1
+        else:
+            with open(dictfile, encoding="utf-8") as f:
+                for line in f:
+                    self._vocab[line.strip()] = wid
+                    wid += 1
+        self._unk_id = len(self._vocab)
+        self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))')
+        self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0])
+    def get_words_only(self, line):
+        sent = line.lower().replace("<br />", " ").strip()
+        words = [x for x in self._pattern.split(sent) if x and x != " "]
+        feas = [
+            self._vocab[x] if x in self._vocab else self._unk_id for x in words
+        ]
+        return feas
+    def get_words_and_label(self, line):
+        send = '|'.join(line.split('|')[:-1]).lower().replace("<br />",
+                                                              " ").strip()
+        label = [int(line.split('|')[-1])]
+        words = [x for x in self._pattern.split(send) if x and x != " "]
+        feas = [
+            self._vocab[x] if x in self._vocab else self._unk_id for x in words
+        ]
+        return feas, label
+    def infer_reader(self, infer_filelist, batch, buf_size):
+        def local_iter():
+            for fname in infer_filelist:
+                with open(fname, "r") as fin:
+                    for line in fin:
+                        feas, label = self.get_words_and_label(line)
+                        yield feas, label
+        import paddle
+        batch_iter = paddle.batch(
+            paddle.reader.shuffle(
+                local_iter, buf_size=buf_size),
+            batch_size=batch)
+        return batch_iter
+    def generate_sample(self, line):
+        def memory_iter():
+            for i in range(1000):
+                yield self.return_value
+        def data_iter():
+            feas, label = self.get_words_and_label(line)
+            yield ("words", feas), ("label", label)
+        return data_iter
+if __name__ == "__main__":
+    imdb = IMDBDataset()
+    imdb.load_resource("imdb.vocab")
+    imdb.run_from_stdin()
--- a/python/examples/imdb/test_multilang_ensemble_client.py
+++ b/python/examples/imdb/test_multilang_ensemble_client.py
@@ -34,4 +34,6 @@ for i in range(3):
    fetch = ["prediction"]
    fetch_maps = client.predict(feed=feed, fetch=fetch)
    for model, fetch_map in fetch_maps.items():
+        if model == "serving_status_code":
+            continue
        print("step: {}, model: {}, res: {}".format(i, model, fetch_map))
--- a/python/examples/imdb/test_multilang_ensemble_server.py
+++ b/python/examples/imdb/test_multilang_ensemble_server.py
--- a/python/examples/grpc_impl_example/yolov4/000000570688.jpg
+++ b/python/examples/grpc_impl_example/yolov4/000000570688.jpg
--- a/python/examples/grpc_impl_example/yolov4/README.md
+++ b/python/examples/grpc_impl_example/yolov4/README.md
+# Yolov4 Detection Service
+([简体中文](README_CN.md)|English)
+## Get Model
+```
+python -m paddle_serving_app.package --get_model yolov4
+tar -xzvf yolov4.tar.gz
+```
+## Start RPC Service
+```
+python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
+```
+## Prediction
+```
+python test_client.py 000000570688.jpg
+```
+After the prediction is completed, a json file to save the prediction result and a picture with the detection result box will be generated in the `./outpu folder.
--- a/python/examples/grpc_impl_example/yolov4/README_CN.md
+++ b/python/examples/grpc_impl_example/yolov4/README_CN.md
+# Yolov4 检测服务
+(简体中文|[English](README.md))
+## 获取模型
+```
+python -m paddle_serving_app.package --get_model yolov4
+tar -xzvf yolov4.tar.gz
+```
+## 启动RPC服务
+```
+python -m paddle_serving_server_gpu.serve --model yolov4_model --port 9393 --gpu_ids 0 --use_multilang
+```
+## 预测
+```
+python test_client.py 000000570688.jpg
+```
+预测完成会在`./output`文件夹下生成保存预测结果的json文件以及标出检测结果框的图片。
--- a/python/examples/grpc_impl_example/yolov4/label_list.txt
+++ b/python/examples/grpc_impl_example/yolov4/label_list.txt
+person
+bicycle
+car
+motorcycle
+airplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+backpack
+umbrella
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+couch
+potted plant
+bed
+dining table
+toilet
+tv
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
--- a/python/examples/grpc_impl_example/yolov4/test_client.py
+++ b/python/examples/grpc_impl_example/yolov4/test_client.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import numpy as np
+from paddle_serving_client import MultiLangClient as Client
+from paddle_serving_app.reader import *
+import cv2
+preprocess = Sequential([
+    File2Image(), BGR2RGB(), Resize(
+        (608, 608), interpolation=cv2.INTER_LINEAR), Div(255.0), Transpose(
+            (2, 0, 1))
+])
+postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608])
+client = Client()
+client.connect(['127.0.0.1:9393'])
+# client.set_rpc_timeout_ms(10000)
+im = preprocess(sys.argv[1])
+fetch_map = client.predict(
+    feed={
+        "image": im,
+        "im_size": np.array(list(im.shape[1:])),
+    },
+    fetch=["save_infer_model/scale_0.tmp_0"])
+fetch_map.pop("serving_status_code")
+fetch_map["image"] = sys.argv[1]
+postprocess(fetch_map)
--- a/python/examples/imagenet/benchmark.py
+++ b/python/examples/imagenet/benchmark.py
@@ -24,7 +24,7 @@ import json
 import base64
 from paddle_serving_client import Client
 from paddle_serving_client.utils import MultiThreadRunner
-from paddle_serving_client.utils import benchmark_args
+from paddle_serving_client.utils import benchmark_args, show_latency
 from paddle_serving_app.reader import Sequential, File2Image, Resize
 from paddle_serving_app.reader import CenterCrop, RGB2BGR, Transpose, Div, Normalize
@@ -38,7 +38,11 @@ seq_preprocess = Sequential([
 def single_func(idx, resource):
    file_list = []
-    turns = 10
+    turns = resource["turns"]
+    latency_flags = False
+    if os.getenv("FLAGS_serving_latency"):
+        latency_flags = True
+        latency_list = []
    for file_name in os.listdir("./image_data/n01440764"):
        file_list.append(file_name)
    img_list = []
@@ -56,6 +60,7 @@ def single_func(idx, resource):
        start = time.time()
        for i in range(turns):
            if args.batch_size >= 1:
+                l_start = time.time()
                feed_batch = []
                i_start = time.time()
                for bi in range(args.batch_size):
@@ -69,6 +74,9 @@ def single_func(idx, resource):
                                 int(round(i_end * 1000000))))
                result = client.predict(feed=feed_batch, fetch=fetch)
+                l_end = time.time()
+                if latency_flags:
+                    latency_list.append(l_end * 1000 - l_start * 1000)
            else:
                print("unsupport batch size {}".format(args.batch_size))
@@ -88,6 +96,8 @@ def single_func(idx, resource):
            r = requests.post(
                server, data=req, headers={"Content-Type": "application/json"})
    end = time.time()
+    if latency_flags:
+        return [[end - start], latency_list]
    return [[end - start]]
@@ -96,11 +106,21 @@ if __name__ == '__main__':
    endpoint_list = [
        "127.0.0.1:9292", "127.0.0.1:9293", "127.0.0.1:9294", "127.0.0.1:9295"
    ]
-    result = multi_thread_runner.run(single_func, args.thread,
+    turns = 100
-                                     {"endpoint": endpoint_list})
+    start = time.time()
+    result = multi_thread_runner.run(
+        single_func, args.thread, {"endpoint": endpoint_list,
+                                   "turns": turns})
    #result = single_func(0, {"endpoint": endpoint_list})
+    end = time.time()
+    total_cost = end - start
    avg_cost = 0
    for i in range(args.thread):
        avg_cost += result[0][i]
    avg_cost = avg_cost / args.thread
-    print("average total cost {} s.".format(avg_cost))
+    print("total cost: {}s".format(end - start))
+    print("each thread cost: {}s.".format(avg_cost))
+    print("qps: {}samples/s".format(args.batch_size * args.thread * turns /
+                                    total_cost))
+    if os.getenv("FLAGS_serving_latency"):
+        show_latency(result[1])
--- a/python/examples/imagenet/benchmark.sh
+++ b/python/examples/imagenet/benchmark.sh
-rm profile_log
+rm profile_log*
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 export FLAGS_profile_server=1
 export FLAGS_profile_client=1
-python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 2> elog > stdlog &
+python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim  2> elog > stdlog &
 sleep 5
+gpu_id=0
+#save cpu and gpu utilization log
+if [ -d utilization ];then
+    rm -rf utilization
+else
+    mkdir utilization
+fi
 #warm up
-$PYTHONROOT/bin/python benchmark.py --thread 8 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
-for thread_num in 4 8 16
+for thread_num in 1 4 8 16
 do
 for batch_size in 1 4 16 64
 do
+    job_bt=`date '+%Y%m%d%H%M%S'`
+    nvidia-smi --id=0 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
+    nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
+    gpu_memory_pid=$!
    $PYTHONROOT/bin/python benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+    kill ${gpu_memory_pid}
+    kill `ps -ef|grep used_memory|awk '{print $2}'`
    echo "model name :" $1
    echo "thread num :" $thread_num
    echo "batch size :" $batch_size
    echo "=================Done===================="
    echo "model name :$1" >> profile_log
    echo "batch size :$batch_size" >> profile_log
+    job_et=`date '+%Y%m%d%H%M%S'`
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$1
+    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$1
+    rm -rf gpu_use.log gpu_utilization.log
    $PYTHONROOT/bin/python ../util/show_profile.py profile $thread_num >> profile_log
    tail -n 8 profile >> profile_log
+    echo "" >> profile_log_$1
 done
 done
+#Divided log
+awk 'BEGIN{RS="\n\n"}{i++}{print > "ResNet_log_"i}' profile_log_$1
+mkdir $1_log && mv ResNet_log_* $1_log
 ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/imagenet/benchmark_batch.py.lprof
+++ b/python/examples/imagenet/benchmark_batch.py.lprof
--- a/python/examples/imagenet/test_image_reader.py
+++ b/python/examples/imagenet/test_image_reader.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_app.reader.image_reader import String2Image, Base64ToImage, Sequential
+import base64
+def test_String2Image():
+    with open("./daisy.jpg") as f:
+        img_str = f.read()
+    seq = Sequential([String2Image()])
+    img = seq(img_str)
+    assert (img.shape == (563, 500, 3))
+def test_Base64ToImage():
+    with open("./daisy.jpg") as f:
+        img_str = f.read()
+    seq = Sequential([Base64ToImage()])
+    img = seq(base64.b64encode(img_str))
+    assert (img.shape == (563, 500, 3))
+if __name__ == "__main__":
+    test_String2Image()
+    test_Base64ToImage()
--- a/python/examples/imdb/benchmark.sh
+++ b/python/examples/imdb/benchmark.sh
-rm profile_log
+rm profile_log*
-export CUDA_VISIBLE_DEVICES=0,1,2,3
 export FLAGS_profile_server=1
 export FLAGS_profile_client=1
 export FLAGS_serving_latency=1
-python -m paddle_serving_server_gpu.serve --model $1 --port 9292 --thread 4 --gpu_ids 0,1,2,3 --mem_optim --ir_optim  2> elog > stdlog &
+$PYTHONROOT/bin/python3 -m paddle_serving_server.serve --model $1 --port 9292 --thread 4 --mem_optim --ir_optim  2> elog > stdlog &
 hostname=`echo $(hostname)|awk -F '.baidu.com' '{print $1}'`
+#save cpu and gpu utilization log
+if [ -d utilization ];then
+    rm -rf utilization
+else
+    mkdir utilization
+fi
 sleep 5
-for thread_num in 4 8 16
+#warm up
+$PYTHONROOT/bin/python3 benchmark.py --thread 4 --batch_size 1 --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+echo -e "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
+for thread_num in 1 4 8 16
 do
-for batch_size in 1 4 16 64 256
+for batch_size in 1 4 16 64
 do
    job_bt=`date '+%Y%m%d%H%M%S'`
-    python benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
+    $PYTHONROOT/bin/python3 benchmark.py --thread $thread_num --batch_size $batch_size --model $2/serving_client_conf.prototxt --request rpc > profile 2>&1
    echo "model_name:" $1
    echo "thread_num:" $thread_num
    echo "batch_size:" $batch_size
@@ -21,15 +30,14 @@ do
    echo "model_name:$1" >> profile_log_$1
    echo "batch_size:$batch_size" >> profile_log_$1
    job_et=`date '+%Y%m%d%H%M%S'`
-    awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY_USE:", max}' gpu_use.log >> profile_log_$1
+    $PYTHONROOT/bin/python3 ../util/show_profile.py profile $thread_num >> profile_log_$1
-    monquery -n ${hostname} -i GPU_AVERAGE_UTILIZATION -s $job_bt -e $job_et -d 10 > gpu_log_file_${job_bt}
+    $PYTHONROOT/bin/python3 cpu_utilization.py >> profile_log_$1
-    monquery -n ${hostname} -i CPU_USER -s $job_bt -e $job_et  -d 10 > cpu_log_file_${job_bt}
-    cpu_num=$(cat /proc/cpuinfo | grep processor | wc -l)
-    gpu_num=$(nvidia-smi -L|wc -l)
-    python ../util/show_profile.py profile $thread_num >> profile_log_$1
    tail -n 8 profile >> profile_log_$1
    echo "" >> profile_log_$1
 done
 done
+#Divided log
+awk 'BEGIN{RS="\n\n"}{i++}{print > "imdb_log_"i}' profile_log_$1
+mkdir $1_log && mv imdb_log_* $1_log
 ps -ef|grep 'serving'|grep -v grep|cut -c 9-15 | xargs kill -9
--- a/python/examples/ocr/README.md
+++ b/python/examples/ocr/README.md
 # OCR 
+(English|[简体中文](./README_CN.md))
 ## Get Model
 ```
 python -m paddle_serving_app.package --get_model ocr_rec
 tar -xzvf ocr_rec.tar.gz
+python -m paddle_serving_app.package --get_model ocr_det
+tar -xzvf ocr_det.tar.gz
+```
+## Get Dataset (Optional)
+```
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/ocr/test_imgs.tar
+tar xf test_imgs.tar
 ```
-## RPC Service
+## Web Service
 ### Start Service
 ```
-python -m paddle_serving_server.serve --model ocr_rec_model --port 9292
+python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0
+python ocr_web_server.py
 ```
 ### Client Prediction
+```
+python ocr_web_client.py
+```
+If you want a faster web service, please try Web Debugger Service
+## Web Debugger Service
+```
+python ocr_debugger_server.py
+```
+## Web Debugger Client Prediction
+```
+python ocr_web_client.py
+```
+## Benchmark
+CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz * 40
+GPU: Nvidia Tesla V100 * 1
+Dataset: RCTW 500 sample images
+| engine                       | client read image(ms) | client-server tras time(ms) | server read image（ms） | det pre(ms) | det infer(ms) | det post(ms) | rec pre(ms) | rec infer(ms) | rec post(ms) | server-client trans time(ms) | server side time consumption(ms) | server side overhead(ms) | total time（ms) |
+|------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------|
+| Serving web service          | 8.69         | 13.41                      | 109.97           | 2.82               | 87.76            | 4.29               | 3.98               | 78.51            | 3.66               | 4.12                     | 181.02             | 136.49       | 317.51        |
+| Serving Debugger web service |  8.73        | 16.42                      | 115.27           | 2.93               | 20.63            | 3.97               | 4.48               | 13.84            | 3.60               | 6.91                     | 49.45              | 147.33       | 196.78        |
+## Appendix: Det or Rec only
+if you are going to detect images not recognize it or directly recognize the words from images. We also provide Det and Rec server for you.
+### Det Server 
+```
+python det_web_server.py 
+#or
+python det_debugger_server.py
+```
+### Det Client
+```
+# also use ocr_web_client.py
+python ocr_web_client.py
+```
+### Rec Server
+```
+python rec_web_server.py 
+#or
+python rec_debugger_server.py
+```
+### Rec Client
 ```
-python test_ocr_rec_client.py
+python rec_web_client.py
 ```
--- a/python/examples/ocr/README_CN.md
+++ b/python/examples/ocr/README_CN.md
+# OCR 服务
+([English](./README.md)|简体中文)
+## 获取模型
+```
+python -m paddle_serving_app.package --get_model ocr_rec
+tar -xzvf ocr_rec.tar.gz
+python -m paddle_serving_app.package --get_model ocr_det
+tar -xzvf ocr_det.tar.gz
+```
+## 获取数据集（可选）
+```
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/ocr/test_imgs.tar
+tar xf test_imgs.tar
+```
+### 客户端预测
+```
+python ocr_rpc_client.py
+```
+## Web Service服务
+### 启动服务
+```
+python -m paddle_serving_server_gpu.serve --model ocr_det_model --port 9293 --gpu_id 0
+python ocr_web_server.py
+```
+### 启动客户端
+```
+python ocr_web_client.py
+```
+如果用户需要更快的执行速度，请尝试Debugger版Web服务
+## 启动Debugger版Web服务
+```
+python ocr_debugger_server.py
+```
+## 启动客户端
+```
+python ocr_web_client.py
+```
+## 性能指标
+CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz * 40
+GPU: Nvidia Tesla V100单卡
+数据集：RCTW 500张测试数据集
+| engine                       | 客户端读图(ms) | 客户端发送请求到服务端(ms) | 服务端读图（ms） | 检测预处理耗时(ms) | 检测模型耗时(ms) | 检测后处理耗时(ms) | 识别预处理耗时(ms) | 识别模型耗时(ms) | 识别后处理耗时(ms) | 服务端回传客户端时间(ms) | 服务端整体耗时(ms) | 空跑耗时(ms) | 整体耗时（ms) |
+|------------------------------|----------------|----------------------------|------------------|--------------------|------------------|--------------------|--------------------|------------------|--------------------|--------------------------|--------------------|--------------|---------------|
+| Serving web service          | 8.69         | 13.41                      | 109.97           | 2.82               | 87.76            | 4.29               | 3.98               | 78.51            | 3.66               | 4.12                     | 181.02             | 136.49      | 317.51        |
+| Serving Debugger web service | 8.73         | 16.42                      | 115.27           | 2.93               | 20.63            | 3.97               | 4.48               | 13.84            | 3.60               | 6.91                     | 49.45              | 147.33      | 196.78        |
+## 附录： 检测/识别单服务启动
+如果您想单独启动检测或者识别服务，我们也提供了启动单服务的代码
+### 启动检测服务
+```
+python det_web_server.py 
+#or
+python det_debugger_server.py
+```
+### 检测服务客户端
+```
+# also use ocr_web_client.py
+python ocr_web_client.py
+```
+### 启动识别服务
+```
+python rec_web_server.py 
+#or
+python rec_debugger_server.py
+```
+### 识别服务客户端
+```
+python rec_web_client.py
+```
--- a/python/examples/ocr/det_debugger_server.py
+++ b/python/examples/ocr/det_debugger_server.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_client import Client
+import cv2
+import sys
+import numpy as np
+import os
+from paddle_serving_client import Client
+from paddle_serving_app.reader import Sequential, ResizeByFactor
+from paddle_serving_app.reader import Div, Normalize, Transpose
+from paddle_serving_app.reader import DBPostProcess, FilterBoxes
+from paddle_serving_server_gpu.web_service import WebService
+import time
+import re
+import base64
+class OCRService(WebService):
+    def init_det(self):
+        self.det_preprocess = Sequential([
+            ResizeByFactor(32, 960), Div(255),
+            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
+                (2, 0, 1))
+        ])
+        self.filter_func = FilterBoxes(10, 10)
+        self.post_func = DBPostProcess({
+            "thresh": 0.3,
+            "box_thresh": 0.5,
+            "max_candidates": 1000,
+            "unclip_ratio": 1.5,
+            "min_size": 3
+        })
+    def preprocess(self, feed=[], fetch=[]):
+        data = base64.b64decode(feed[0]["image"].encode('utf8'))
+        data = np.fromstring(data, np.uint8)
+        im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+        self.ori_h, self.ori_w, _ = im.shape
+        det_img = self.det_preprocess(im)
+        _, self.new_h, self.new_w = det_img.shape
+        return {"image": det_img[np.newaxis, :].copy()}, ["concat_1.tmp_0"]
+    def postprocess(self, feed={}, fetch=[], fetch_map=None):
+        det_out = fetch_map["concat_1.tmp_0"]
+        ratio_list = [
+            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
+        ]
+        dt_boxes_list = self.post_func(det_out, [ratio_list])
+        dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
+        return {"dt_boxes": dt_boxes.tolist()}
+ocr_service = OCRService(name="ocr")
+ocr_service.load_model_config("ocr_det_model")
+ocr_service.set_gpus("0")
+ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0)
+ocr_service.init_det()
+ocr_service.run_debugger_service()
+ocr_service.run_web_service()
--- a/python/examples/ocr/det_web_server.py
+++ b/python/examples/ocr/det_web_server.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_client import Client
+import cv2
+import sys
+import numpy as np
+import os
+from paddle_serving_client import Client
+from paddle_serving_app.reader import Sequential, ResizeByFactor
+from paddle_serving_app.reader import Div, Normalize, Transpose
+from paddle_serving_app.reader import DBPostProcess, FilterBoxes
+from paddle_serving_server_gpu.web_service import WebService
+import time
+import re
+import base64
+class OCRService(WebService):
+    def init_det(self):
+        self.det_preprocess = Sequential([
+            ResizeByFactor(32, 960), Div(255),
+            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
+                (2, 0, 1))
+        ])
+        self.filter_func = FilterBoxes(10, 10)
+        self.post_func = DBPostProcess({
+            "thresh": 0.3,
+            "box_thresh": 0.5,
+            "max_candidates": 1000,
+            "unclip_ratio": 1.5,
+            "min_size": 3
+        })
+    def preprocess(self, feed=[], fetch=[]):
+        data = base64.b64decode(feed[0]["image"].encode('utf8'))
+        data = np.fromstring(data, np.uint8)
+        im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+        self.ori_h, self.ori_w, _ = im.shape
+        det_img = self.det_preprocess(im)
+        _, self.new_h, self.new_w = det_img.shape
+        print(det_img)
+        return {"image": det_img}, ["concat_1.tmp_0"]
+    def postprocess(self, feed={}, fetch=[], fetch_map=None):
+        det_out = fetch_map["concat_1.tmp_0"]
+        ratio_list = [
+            float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
+        ]
+        dt_boxes_list = self.post_func(det_out, [ratio_list])
+        dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
+        return {"dt_boxes": dt_boxes.tolist()}
+ocr_service = OCRService(name="ocr")
+ocr_service.load_model_config("ocr_det_model")
+ocr_service.set_gpus("0")
+ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0)
+ocr_service.init_det()
+ocr_service.run_rpc_service()
+ocr_service.run_web_service()
--- a/python/examples/ocr/imgs/1.jpg
+++ b/python/examples/ocr/imgs/1.jpg
--- a/python/examples/ocr/ocr_debugger_server.py
+++ b/python/examples/ocr/ocr_debugger_server.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_client import Client
+from paddle_serving_app.reader import OCRReader
+import cv2
+import sys
+import numpy as np
+import os
+from paddle_serving_client import Client
+from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
+from paddle_serving_app.reader import Div, Normalize, Transpose
+from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
+from paddle_serving_server_gpu.web_service import WebService
+from paddle_serving_app.local_predict import Debugger
+import time
+import re
+import base64
+class OCRService(WebService):
+    def init_det_debugger(self, det_model_config):
+        self.det_preprocess = Sequential([
+            ResizeByFactor(32, 960), Div(255),
+            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
+                (2, 0, 1))
+        ])
+        self.det_client = Debugger()
+        self.det_client.load_model_config(
+            det_model_config, gpu=True, profile=False)
+        self.ocr_reader = OCRReader()
+    def preprocess(self, feed=[], fetch=[]):
+        data = base64.b64decode(feed[0]["image"].encode('utf8'))
+        data = np.fromstring(data, np.uint8)
+        im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+        ori_h, ori_w, _ = im.shape
+        det_img = self.det_preprocess(im)
+        _, new_h, new_w = det_img.shape
+        det_img = det_img[np.newaxis, :]
+        det_img = det_img.copy()
+        det_out = self.det_client.predict(
+            feed={"image": det_img}, fetch=["concat_1.tmp_0"])
+        filter_func = FilterBoxes(10, 10)
+        post_func = DBPostProcess({
+            "thresh": 0.3,
+            "box_thresh": 0.5,
+            "max_candidates": 1000,
+            "unclip_ratio": 1.5,
+            "min_size": 3
+        })
+        sorted_boxes = SortedBoxes()
+        ratio_list = [float(new_h) / ori_h, float(new_w) / ori_w]
+        dt_boxes_list = post_func(det_out["concat_1.tmp_0"], [ratio_list])
+        dt_boxes = filter_func(dt_boxes_list[0], [ori_h, ori_w])
+        dt_boxes = sorted_boxes(dt_boxes)
+        get_rotate_crop_image = GetRotateCropImage()
+        img_list = []
+        max_wh_ratio = 0
+        for i, dtbox in enumerate(dt_boxes):
+            boximg = get_rotate_crop_image(im, dt_boxes[i])
+            img_list.append(boximg)
+            h, w = boximg.shape[0:2]
+            wh_ratio = w * 1.0 / h
+            max_wh_ratio = max(max_wh_ratio, wh_ratio)
+        if len(img_list) == 0:
+            return [], []
+        _, w, h = self.ocr_reader.resize_norm_img(img_list[0],
+                                                  max_wh_ratio).shape
+        imgs = np.zeros((len(img_list), 3, w, h)).astype('float32')
+        for id, img in enumerate(img_list):
+            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
+            imgs[id] = norm_img
+        feed = {"image": imgs.copy()}
+        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
+        return feed, fetch
+    def postprocess(self, feed={}, fetch=[], fetch_map=None):
+        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
+        res_lst = []
+        for res in rec_res:
+            res_lst.append(res[0])
+        res = {"res": res_lst}
+        return res
+ocr_service = OCRService(name="ocr")
+ocr_service.load_model_config("ocr_rec_model")
+ocr_service.prepare_server(workdir="workdir", port=9292)
+ocr_service.init_det_debugger(det_model_config="ocr_det_model")
+ocr_service.run_debugger_service(gpu=True)
+ocr_service.run_web_service()
--- a/python/examples/ocr/ocr_web_client.py
+++ b/python/examples/ocr/ocr_web_client.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# -*- coding: utf-8 -*-
+import requests
+import json
+import cv2
+import base64
+import os, sys
+import time
+def cv2_to_base64(image):
+    #data = cv2.imencode('.jpg', image)[1]
+    return base64.b64encode(image).decode(
+        'utf8')  #data.tostring()).decode('utf8')
+headers = {"Content-type": "application/json"}
+url = "http://127.0.0.1:9292/ocr/prediction"
+test_img_dir = "imgs/"
+for img_file in os.listdir(test_img_dir):
+    with open(os.path.join(test_img_dir, img_file), 'rb') as file:
+        image_data1 = file.read()
+    image = cv2_to_base64(image_data1)
+    data = {"feed": [{"image": image}], "fetch": ["res"]}
+    r = requests.post(url=url, headers=headers, data=json.dumps(data))
+    print(r.json())
--- a/python/examples/ocr/ocr_web_server.py
+++ b/python/examples/ocr/ocr_web_server.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_client import Client
+from paddle_serving_app.reader import OCRReader
+import cv2
+import sys
+import numpy as np
+import os
+from paddle_serving_client import Client
+from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
+from paddle_serving_app.reader import Div, Normalize, Transpose
+from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
+from paddle_serving_server_gpu.web_service import WebService
+import time
+import re
+import base64
+class OCRService(WebService):
+    def init_det_client(self, det_port, det_client_config):
+        self.det_preprocess = Sequential([
+            ResizeByFactor(32, 960), Div(255),
+            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose(
+                (2, 0, 1))
+        ])
+        self.det_client = Client()
+        self.det_client.load_client_config(det_client_config)
+        self.det_client.connect(["127.0.0.1:{}".format(det_port)])
+        self.ocr_reader = OCRReader()
+    def preprocess(self, feed=[], fetch=[]):
+        data = base64.b64decode(feed[0]["image"].encode('utf8'))
+        data = np.fromstring(data, np.uint8)
+        im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+        ori_h, ori_w, _ = im.shape
+        det_img = self.det_preprocess(im)
+        det_out = self.det_client.predict(
+            feed={"image": det_img}, fetch=["concat_1.tmp_0"])
+        _, new_h, new_w = det_img.shape
+        filter_func = FilterBoxes(10, 10)
+        post_func = DBPostProcess({
+            "thresh": 0.3,
+            "box_thresh": 0.5,
+            "max_candidates": 1000,
+            "unclip_ratio": 1.5,
+            "min_size": 3
+        })
+        sorted_boxes = SortedBoxes()
+        ratio_list = [float(new_h) / ori_h, float(new_w) / ori_w]
+        dt_boxes_list = post_func(det_out["concat_1.tmp_0"], [ratio_list])
+        dt_boxes = filter_func(dt_boxes_list[0], [ori_h, ori_w])
+        dt_boxes = sorted_boxes(dt_boxes)
+        get_rotate_crop_image = GetRotateCropImage()
+        feed_list = []
+        img_list = []
+        max_wh_ratio = 0
+        for i, dtbox in enumerate(dt_boxes):
+            boximg = get_rotate_crop_image(im, dt_boxes[i])
+            img_list.append(boximg)
+            h, w = boximg.shape[0:2]
+            wh_ratio = w * 1.0 / h
+            max_wh_ratio = max(max_wh_ratio, wh_ratio)
+        for img in img_list:
+            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
+            feed = {"image": norm_img}
+            feed_list.append(feed)
+        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
+        return feed_list, fetch
+    def postprocess(self, feed={}, fetch=[], fetch_map=None):
+        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
+        res_lst = []
+        for res in rec_res:
+            res_lst.append(res[0])
+        res = {"res": res_lst}
+        return res
+ocr_service = OCRService(name="ocr")
+ocr_service.load_model_config("ocr_rec_model")
+ocr_service.set_gpus("0")
+ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0)
+ocr_service.init_det_client(
+    det_port=9293,
+    det_client_config="ocr_det_client/serving_client_conf.prototxt")
+ocr_service.run_rpc_service()
+ocr_service.run_web_service()
--- a/python/examples/ocr/rec_debugger_server.py
+++ b/python/examples/ocr/rec_debugger_server.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_client import Client
+from paddle_serving_app.reader import OCRReader
+import cv2
+import sys
+import numpy as np
+import os
+from paddle_serving_client import Client
+from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
+from paddle_serving_app.reader import Div, Normalize, Transpose
+from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
+from paddle_serving_server_gpu.web_service import WebService
+import time
+import re
+import base64
+class OCRService(WebService):
+    def init_rec(self):
+        self.ocr_reader = OCRReader()
+    def preprocess(self, feed=[], fetch=[]):
+        img_list = []
+        for feed_data in feed:
+            data = base64.b64decode(feed_data["image"].encode('utf8'))
+            data = np.fromstring(data, np.uint8)
+            im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+            img_list.append(im)
+        max_wh_ratio = 0
+        for i, boximg in enumerate(img_list):
+            h, w = boximg.shape[0:2]
+            wh_ratio = w * 1.0 / h
+            max_wh_ratio = max(max_wh_ratio, wh_ratio)
+        _, w, h = self.ocr_reader.resize_norm_img(img_list[0],
+                                                  max_wh_ratio).shape
+        imgs = np.zeros((len(img_list), 3, w, h)).astype('float32')
+        for i, img in enumerate(img_list):
+            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
+            imgs[i] = norm_img
+        feed = {"image": imgs.copy()}
+        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
+        return feed, fetch
+    def postprocess(self, feed={}, fetch=[], fetch_map=None):
+        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
+        res_lst = []
+        for res in rec_res:
+            res_lst.append(res[0])
+        res = {"res": res_lst}
+        return res
+ocr_service = OCRService(name="ocr")
+ocr_service.load_model_config("ocr_rec_model")
+ocr_service.set_gpus("0")
+ocr_service.init_rec()
+ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0)
+ocr_service.run_debugger_service()
+ocr_service.run_web_service()
--- a/python/examples/ocr/rec_img/ch_doc3.jpg
+++ b/python/examples/ocr/rec_img/ch_doc3.jpg
--- a/python/examples/ocr/rec_web_client.py
+++ b/python/examples/ocr/rec_web_client.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# -*- coding: utf-8 -*-
+import requests
+import json
+import cv2
+import base64
+import os, sys
+import time
+def cv2_to_base64(image):
+    #data = cv2.imencode('.jpg', image)[1]
+    return base64.b64encode(image).decode(
+        'utf8')  #data.tostring()).decode('utf8')
+headers = {"Content-type": "application/json"}
+url = "http://127.0.0.1:9292/ocr/prediction"
+test_img_dir = "rec_img/"
+for img_file in os.listdir(test_img_dir):
+    with open(os.path.join(test_img_dir, img_file), 'rb') as file:
+        image_data1 = file.read()
+    image = cv2_to_base64(image_data1)
+    #data = {"feed": [{"image": image}], "fetch": ["res"]}
+    data = {"feed": [{"image": image}] * 3, "fetch": ["res"]}
+    r = requests.post(url=url, headers=headers, data=json.dumps(data))
+    print(r.json())
--- a/python/examples/ocr/rec_web_server.py
+++ b/python/examples/ocr/rec_web_server.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_client import Client
+from paddle_serving_app.reader import OCRReader
+import cv2
+import sys
+import numpy as np
+import os
+from paddle_serving_client import Client
+from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor
+from paddle_serving_app.reader import Div, Normalize, Transpose
+from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
+from paddle_serving_server_gpu.web_service import WebService
+import time
+import re
+import base64
+class OCRService(WebService):
+    def init_rec(self):
+        self.ocr_reader = OCRReader()
+    def preprocess(self, feed=[], fetch=[]):
+        # TODO: to handle batch rec images
+        img_list = []
+        for feed_data in feed:
+            data = base64.b64decode(feed_data["image"].encode('utf8'))
+            data = np.fromstring(data, np.uint8)
+            im = cv2.imdecode(data, cv2.IMREAD_COLOR)
+            img_list.append(im)
+        feed_list = []
+        max_wh_ratio = 0
+        for i, boximg in enumerate(img_list):
+            h, w = boximg.shape[0:2]
+            wh_ratio = w * 1.0 / h
+            max_wh_ratio = max(max_wh_ratio, wh_ratio)
+        for img in img_list:
+            norm_img = self.ocr_reader.resize_norm_img(img, max_wh_ratio)
+            feed = {"image": norm_img}
+            feed_list.append(feed)
+        fetch = ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]
+        return feed_list, fetch
+    def postprocess(self, feed={}, fetch=[], fetch_map=None):
+        rec_res = self.ocr_reader.postprocess(fetch_map, with_score=True)
+        res_lst = []
+        for res in rec_res:
+            res_lst.append(res[0])
+        res = {"res": res_lst}
+        return res
+ocr_service = OCRService(name="ocr")
+ocr_service.load_model_config("ocr_rec_model")
+ocr_service.set_gpus("0")
+ocr_service.init_rec()
+ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0)
+ocr_service.run_rpc_service()
+ocr_service.run_web_service()
--- a/python/examples/ocr/test_rec.jpg
+++ b/python/examples/ocr/test_rec.jpg
--- a/python/examples/pipeline/imdb_model_ensemble/analyse.py
+++ b/python/examples/pipeline/imdb_model_ensemble/analyse.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_server.pipeline import Analyst
+import json
+import logging
+import sys
+logging.basicConfig(level=logging.INFO)
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("Usage: python analyse.py <log_filename> <trace_filename>")
+        exit(1)
+    log_filename = sys.argv[1]
+    trace_filename = sys.argv[2]
+    analyst = Analyst(log_filename)
+    analyst.save_trace(trace_filename)
+    op_analyst = analyst.get_op_analyst()
+    op_concurrency = op_analyst.concurrency_analysis("analyse.yaml")
+    print(json.dumps(op_concurrency, indent=2, separators=(',', ':')))
--- a/python/examples/pipeline/imdb_model_ensemble/analyse.yaml
+++ b/python/examples/pipeline/imdb_model_ensemble/analyse.yaml
+bow:
+    midp: 0
+cnn:
+    midp: 1
--- a/python/examples/pipeline/imdb_model_ensemble/config.yml
+++ b/python/examples/pipeline/imdb_model_ensemble/config.yml
-use_multithread: true
+port: 18080
-client_type: brpc
+worker_num: 1
-retry: 1
+build_dag_each_worker: false
-profile: false
+dag:
-prot: 8080
+    is_thread_op: true
-worker_num: 2
+    client_type: brpc
+    retry: 1
+    use_profile: false
--- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py
+++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_client.py
@@ -13,18 +13,19 @@
 # limitations under the License.
 from paddle_serving_client.pipeline import PipelineClient
 import numpy as np
-from line_profiler import LineProfiler
 client = PipelineClient()
-client.connect('localhost:8080')
+client.connect(['127.0.0.1:18080'])
-lp = LineProfiler()
-lp_wrapper = lp(client.predict)
 words = 'i am very sad | 0'
-for i in range(1):
+futures = []
-    fetch_map = lp_wrapper(feed_dict={"words": words}, fetch=["prediction"])
+for i in range(100):
-    print(fetch_map)
+    futures.append(
+        client.predict(
+            feed_dict={"words": words}, fetch=["prediction"], asyn=True))
-#lp.print_stats()
+for f in futures:
+    res = f.result()
+    if res["ecode"] != 0:
+        print("predict failed: {}".format(res))
--- a/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
+++ b/python/examples/pipeline/imdb_model_ensemble/test_pipeline_server.py
@@ -21,16 +21,13 @@ import numpy as np
 import logging
 from paddle_serving_app.reader import IMDBDataset
-_LOGGER = logging.getLogger(__name__)
+logging.basicConfig(level=logging.DEBUG)
-logging.basicConfig(
+_LOGGER = logging.getLogger()
-    format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
-    datefmt='%Y-%m-%d %H:%M',
-    level=logging.DEBUG)
 class ImdbRequestOp(RequestOp):
-    def load_user_resources(self):
+    def init_op(self):
        self.imdb_dataset = IMDBDataset()
        self.imdb_dataset.load_resource('imdb.vocab')
@@ -91,7 +88,7 @@ cnn_op = Op(name="cnn",
 combine_op = CombineOp(
    name="combine",
    input_ops=[bow_op, cnn_op],
-    concurrency=1,
+    concurrency=5,
    timeout=-1,
    retry=1)

--- a/python/examples/util/timeline_trace.py
+++ b/python/examples/util/timeline_trace.py
@@ -43,6 +43,8 @@ if __name__ == "__main__":
        for line in f.readlines():
            line = line.strip().split("\t")
            if line[0] == "PROFILE":
+                if len(line) < 2:
+                    continue
                trace_list = prase(line[1], line[2], counter)
                counter += 1
                for trace in trace_list:

--- a/python/paddle_serving_app/local_predict.py
+++ b/python/paddle_serving_app/local_predict.py
@@ -70,9 +70,10 @@ class Debugger(object):
            config.enable_use_gpu(100, 0)
        if profile:
            config.enable_profile()
+        config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
        config.set_cpu_math_library_num_threads(cpu_num)
        config.switch_ir_optim(False)
+        config.switch_use_feed_fetch_ops(False)
        self.predictor = create_paddle_predictor(config)
    def predict(self, feed=None, fetch=None):
@@ -113,8 +114,8 @@ class Debugger(object):
                "Fetch names should not be empty or out of saved fetch list.")
            return {}
-        inputs = []
+        input_names = self.predictor.get_input_names()
-        for name in self.feed_names_:
+        for name in input_names:
            if isinstance(feed[name], list):
                feed[name] = np.array(feed[name]).reshape(self.feed_shapes_[
                    name])
@@ -122,11 +123,21 @@ class Debugger(object):
                feed[name] = feed[name].astype("int64")
            else:
                feed[name] = feed[name].astype("float32")
-            inputs.append(PaddleTensor(feed[name][np.newaxis, :]))
+            input_tensor = self.predictor.get_input_tensor(name)
+            input_tensor.copy_from_cpu(feed[name])
-        outputs = self.predictor.run(inputs)
+        output_tensors = []
+        output_names = self.predictor.get_output_names()
+        for output_name in output_names:
+            output_tensor = self.predictor.get_output_tensor(output_name)
+            output_tensors.append(output_tensor)
+        outputs = []
+        self.predictor.zero_copy_run()
+        for output_tensor in output_tensors:
+            output = output_tensor.copy_to_cpu()
+            outputs.append(output)
        fetch_map = {}
-        for name in fetch:
+        for i, name in enumerate(fetch):
-            fetch_map[name] = outputs[self.fetch_names_to_idx_[
+            fetch_map[name] = outputs[i]
-                name]].as_ndarray()
+            if len(output_tensors[i].lod()) > 0:
+                fetch_map[name + ".lod"] = output_tensors[i].lod()[0]
        return fetch_map
--- a/python/paddle_serving_app/models/model_list.py
+++ b/python/paddle_serving_app/models/model_list.py
@@ -24,14 +24,15 @@ class ServingModels(object):
            "SentimentAnalysis"] = ["senta_bilstm", "senta_bow", "senta_cnn"]
        self.model_dict["SemanticRepresentation"] = ["ernie"]
        self.model_dict["ChineseWordSegmentation"] = ["lac"]
-        self.model_dict["ObjectDetection"] = ["faster_rcnn", "yolov4"]
+        self.model_dict[
+            "ObjectDetection"] = ["faster_rcnn", "yolov4", "blazeface"]
        self.model_dict["ImageSegmentation"] = [
            "unet", "deeplabv3", "deeplabv3+cityscapes"
        ]
        self.model_dict["ImageClassification"] = [
            "resnet_v2_50_imagenet", "mobilenet_v2_imagenet"
        ]
-        self.model_dict["TextDetection"] = ["ocr_detection"]
+        self.model_dict["TextDetection"] = ["ocr_det"]
        self.model_dict["OCR"] = ["ocr_rec"]
        image_class_url = "https://paddle-serving.bj.bcebos.com/paddle_hub_models/image/ImageClassification/"

--- a/python/paddle_serving_app/reader/__init__.py
+++ b/python/paddle_serving_app/reader/__init__.py
@@ -15,7 +15,7 @@ from .chinese_bert_reader import ChineseBertReader
 from .image_reader import ImageReader, File2Image, URL2Image, Sequential, Normalize
 from .image_reader import CenterCrop, Resize, Transpose, Div, RGB2BGR, BGR2RGB, ResizeByFactor
 from .image_reader import RCNNPostprocess, SegPostprocess, PadStride
-from .image_reader import DBPostProcess, FilterBoxes
+from .image_reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes
 from .lac_reader import LACReader
 from .senta_reader import SentaReader
 from .imdb_reader import IMDBDataset

--- a/python/paddle_serving_app/reader/functional.py
+++ b/python/paddle_serving_app/reader/functional.py
@@ -29,6 +29,7 @@ def normalize(img, mean, std, channel_first):
    else:
        img_mean = np.array(mean).reshape((1, 1, 3))
        img_std = np.array(std).reshape((1, 1, 3))
+    img = np.array(img).astype("float32")
    img -= img_mean
    img /= img_std
    return img

--- a/python/paddle_serving_app/reader/image_reader.py
+++ b/python/paddle_serving_app/reader/image_reader.py
@@ -440,6 +440,30 @@ class RCNNPostprocess(object):
            self.label_file, self.output_dir)
+class BlazeFacePostprocess(RCNNPostprocess):
+    def clip_bbox(self, bbox, im_size=None):
+        h = 1. if im_size is None else im_size[0]
+        w = 1. if im_size is None else im_size[1]
+        xmin = max(min(bbox[0], w), 0.)
+        ymin = max(min(bbox[1], h), 0.)
+        xmax = max(min(bbox[2], w), 0.)
+        ymax = max(min(bbox[3], h), 0.)
+        return xmin, ymin, xmax, ymax
+    def _get_bbox_result(self, fetch_map, fetch_name, clsid2catid):
+        result = {}
+        is_bbox_normalized = True  #for blaze face, set true here
+        output = fetch_map[fetch_name]
+        lod = [fetch_map[fetch_name + '.lod']]
+        lengths = self._offset_to_lengths(lod)
+        np_data = np.array(output)
+        result['bbox'] = (np_data, lengths)
+        result['im_id'] = np.array([[0]])
+        result["im_shape"] = np.array(fetch_map["im_shape"]).astype(np.int32)
+        bbox_results = self._bbox2out([result], clsid2catid, is_bbox_normalized)
+        return bbox_results
 class Sequential(object):
    """
    Args:
@@ -493,6 +517,19 @@ class BGR2RGB(object):
        return self.__class__.__name__ + "()"
+class String2Image(object):
+    def __init__(self):
+        pass
+    def __call__(self, img_buffer):
+        data = np.fromstring(img_buffer, np.uint8)
+        img = cv2.imdecode(data, cv2.IMREAD_COLOR)
+        return img
+    def __repr__(self):
+        return self.__class__.__name__ + "()"
 class File2Image(object):
    def __init__(self):
        pass
@@ -537,7 +574,9 @@ class Base64ToImage(object):
        pass
    def __call__(self, img_base64):
-        img = base64.b64decode(img_base64)
+        sample = base64.b64decode(img_base64)
+        data = np.fromstring(sample, np.uint8)
+        img = cv2.imdecode(data, cv2.IMREAD_COLOR)
        return img
    def __repr__(self):
@@ -653,7 +692,7 @@ class Resize(object):
    Args:
        size (sequence or int): Desired output size. If size is a sequence like
-            (h, w), output size will be matched to this. If size is an int,
+            (w, h), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e, if height > width, then image will be rescaled to
            (size * height / width, size)
@@ -758,6 +797,59 @@ class Transpose(object):
        return format_string
+class SortedBoxes(object):
+    """
+    Sorted bounding boxes from Detection
+    """
+    def __init__(self):
+        pass
+    def __call__(self, dt_boxes):
+        num_boxes = dt_boxes.shape[0]
+        sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
+        _boxes = list(sorted_boxes)
+        for i in range(num_boxes - 1):
+            if abs(_boxes[i+1][0][1] - _boxes[i][0][1]) < 10 and \
+                (_boxes[i + 1][0][0] < _boxes[i][0][0]):
+                tmp = _boxes[i]
+                _boxes[i] = _boxes[i + 1]
+                _boxes[i + 1] = tmp
+        return _boxes
+class GetRotateCropImage(object):
+    """
+    Rotate and Crop image from OCR Det output
+    """
+    def __init__(self):
+        pass
+    def __call__(self, img, points):
+        img_height, img_width = img.shape[0:2]
+        left = int(np.min(points[:, 0]))
+        right = int(np.max(points[:, 0]))
+        top = int(np.min(points[:, 1]))
+        bottom = int(np.max(points[:, 1]))
+        img_crop = img[top:bottom, left:right, :].copy()
+        points[:, 0] = points[:, 0] - left
+        points[:, 1] = points[:, 1] - top
+        img_crop_width = int(np.linalg.norm(points[0] - points[1]))
+        img_crop_height = int(np.linalg.norm(points[0] - points[3]))
+        pts_std = np.float32([[0, 0], [img_crop_width, 0], \
+                      [img_crop_width, img_crop_height], [0, img_crop_height]])
+        M = cv2.getPerspectiveTransform(points, pts_std)
+        dst_img = cv2.warpPerspective(
+            img_crop,
+            M, (img_crop_width, img_crop_height),
+            borderMode=cv2.BORDER_REPLICATE)
+        dst_img_height, dst_img_width = dst_img.shape[0:2]
+        if dst_img_height * 1.0 / dst_img_width >= 1.5:
+            dst_img = np.rot90(dst_img)
+        return dst_img
 class ImageReader():
    def __init__(self,
                 image_shape=[3, 224, 224],

--- a/python/paddle_serving_app/reader/ocr_reader.py
+++ b/python/paddle_serving_app/reader/ocr_reader.py
@@ -120,29 +120,21 @@ class CharacterOps(object):
 class OCRReader(object):
-    def __init__(self):
+    def __init__(self,
-        args = self.parse_args()
+                 algorithm="CRNN",
-        image_shape = [int(v) for v in args.rec_image_shape.split(",")]
+                 image_shape=[3, 32, 320],
+                 char_type="ch",
+                 batch_num=1,
+                 char_dict_path="./ppocr_keys_v1.txt"):
        self.rec_image_shape = image_shape
-        self.character_type = args.rec_char_type
+        self.character_type = char_type
-        self.rec_batch_num = args.rec_batch_num
+        self.rec_batch_num = batch_num
        char_ops_params = {}
-        char_ops_params["character_type"] = args.rec_char_type
+        char_ops_params["character_type"] = char_type
-        char_ops_params["character_dict_path"] = args.rec_char_dict_path
+        char_ops_params["character_dict_path"] = char_dict_path
        char_ops_params['loss_type'] = 'ctc'
        self.char_ops = CharacterOps(char_ops_params)
-    def parse_args(self):
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--rec_algorithm", type=str, default='CRNN')
-        parser.add_argument("--rec_model_dir", type=str)
-        parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
-        parser.add_argument("--rec_char_type", type=str, default='ch')
-        parser.add_argument("--rec_batch_num", type=int, default=1)
-        parser.add_argument(
-            "--rec_char_dict_path", type=str, default="./ppocr_keys_v1.txt")
-        return parser.parse_args()
    def resize_norm_img(self, img, max_wh_ratio):
        imgC, imgH, imgW = self.rec_image_shape
        if self.character_type == "ch":
@@ -154,15 +146,14 @@ class OCRReader(object):
            resized_w = imgW
        else:
            resized_w = int(math.ceil(imgH * ratio))
+        resized_image = cv2.resize(img, (resized_w, imgH))
-        seq = Sequential([
+        resized_image = resized_image.astype('float32')
-            Resize(imgH, resized_w), Transpose((2, 0, 1)), Div(255),
+        resized_image = resized_image.transpose((2, 0, 1)) / 255
-            Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], True)
+        resized_image -= 0.5
-        ])
+        resized_image /= 0.5
-        resized_image = seq(img)
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-        padding_im[:, :, 0:resized_w] = resized_image
+        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im
    def preprocess(self, img_list):
@@ -182,22 +173,32 @@ class OCRReader(object):
        return norm_img_batch[0]
-    def postprocess(self, outputs):
+    def postprocess(self, outputs, with_score=False):
        rec_res = []
        rec_idx_lod = outputs["ctc_greedy_decoder_0.tmp_0.lod"]
-        predict_lod = outputs["softmax_0.tmp_0.lod"]
        rec_idx_batch = outputs["ctc_greedy_decoder_0.tmp_0"]
+        if with_score:
+            predict_lod = outputs["softmax_0.tmp_0.lod"]
        for rno in range(len(rec_idx_lod) - 1):
            beg = rec_idx_lod[rno]
            end = rec_idx_lod[rno + 1]
+            if isinstance(rec_idx_batch, list):
+                rec_idx_tmp = [x[0] for x in rec_idx_batch[beg:end]]
+            else:  #nd array
                rec_idx_tmp = rec_idx_batch[beg:end, 0]
            preds_text = self.char_ops.decode(rec_idx_tmp)
+            if with_score:
                beg = predict_lod[rno]
                end = predict_lod[rno + 1]
+                if isinstance(outputs["softmax_0.tmp_0"], list):
+                    outputs["softmax_0.tmp_0"] = np.array(outputs[
+                        "softmax_0.tmp_0"]).astype(np.float32)
                probs = outputs["softmax_0.tmp_0"][beg:end, :]
                ind = np.argmax(probs, axis=1)
                blank = probs.shape[1]
                valid_ind = np.where(ind != (blank - 1))[0]
                score = np.mean(probs[valid_ind, ind[valid_ind]])
                rec_res.append([preds_text, score])
+            else:
+                rec_res.append([preds_text])
        return rec_res
--- a/python/paddle_serving_app/version.py
+++ b/python/paddle_serving_app/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Paddle Serving App version string """
-serving_app_version = "0.1.1"
+serving_app_version = "0.1.2"
--- a/python/paddle_serving_client/__init__.py
+++ b/python/paddle_serving_client/__init__.py
@@ -399,6 +399,7 @@ class MultiLangClient(object):
        self.channel_ = None
        self.stub_ = None
        self.rpc_timeout_s_ = 2
+        self.profile_ = _Profiler()
    def add_variant(self, tag, cluster, variant_weight):
        # TODO
@@ -520,7 +521,7 @@ class MultiLangClient(object):
                            tensor.float_data.extend(
                                var.reshape(-1).astype('float32').tolist())
                        elif v_type == 2:
-                            tensor.int32_data.extend(
+                            tensor.int_data.extend(
                                var.reshape(-1).astype('int32').tolist())
                        else:
                            raise Exception("error tensor value type.")
@@ -530,7 +531,7 @@ class MultiLangClient(object):
                        elif v_type == 1:
                            tensor.float_data.extend(self._flatten_list(var))
                        elif v_type == 2:
-                            tensor.int32_data.extend(self._flatten_list(var))
+                            tensor.int_data.extend(self._flatten_list(var))
                        else:
                            raise Exception("error tensor value type.")
                    else:
@@ -582,6 +583,7 @@ class MultiLangClient(object):
            ret = list(multi_result_map.values())[0]
        else:
            ret = multi_result_map
        ret["serving_status_code"] = 0
        return ret if not need_variant_tag else [ret, tag]
@@ -601,18 +603,30 @@ class MultiLangClient(object):
                need_variant_tag=False,
                asyn=False,
                is_python=True):
-        req = self._pack_inference_request(feed, fetch, is_python=is_python)
        if not asyn:
            try:
+                self.profile_.record('py_prepro_0')
+                req = self._pack_inference_request(
+                    feed, fetch, is_python=is_python)
+                self.profile_.record('py_prepro_1')
+                self.profile_.record('py_client_infer_0')
                resp = self.stub_.Inference(req, timeout=self.rpc_timeout_s_)
-                return self._unpack_inference_response(
+                self.profile_.record('py_client_infer_1')
+                self.profile_.record('py_postpro_0')
+                ret = self._unpack_inference_response(
                    resp,
                    fetch,
                    is_python=is_python,
                    need_variant_tag=need_variant_tag)
+                self.profile_.record('py_postpro_1')
+                self.profile_.print_profile()
+                return ret
            except grpc.RpcError as e:
                return {"serving_status_code": e.code()}
        else:
+            req = self._pack_inference_request(feed, fetch, is_python=is_python)
            call_future = self.stub_.Inference.future(
                req, timeout=self.rpc_timeout_s_)
            return MultiLangPredictFuture(

--- a/python/paddle_serving_client/version.py
+++ b/python/paddle_serving_client/version.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Paddle Serving Client version string """
-serving_client_version = "0.3.1"
+serving_client_version = "0.3.2"
-serving_server_version = "0.3.1"
+serving_server_version = "0.3.2"
-module_proto_version = "0.3.1"
+module_proto_version = "0.3.2"
--- a/python/paddle_serving_server/__init__.py
+++ b/python/paddle_serving_server/__init__.py
@@ -25,6 +25,7 @@ from contextlib import closing
 import collections
 import fcntl
+import shutil
 import numpy as np
 import grpc
 from .proto import multi_lang_general_model_service_pb2
@@ -230,7 +231,7 @@ class Server(object):
            infer_service.workflows.extend(["workflow1"])
            self.infer_service_conf.services.extend([infer_service])
-    def _prepare_resource(self, workdir):
+    def _prepare_resource(self, workdir, cube_conf):
        self.workdir = workdir
        if self.resource_conf == None:
            with open("{}/{}".format(workdir, self.general_model_config_fn),
@@ -242,6 +243,11 @@ class Server(object):
                    if "dist_kv" in node.name:
                        self.resource_conf.cube_config_path = workdir
                        self.resource_conf.cube_config_file = self.cube_config_fn
+                        if cube_conf == None:
+                            raise ValueError(
+                                "Please set the path of cube.conf while use dist_kv op."
+                            )
+                        shutil.copy(cube_conf, workdir)
                        if "quant" in node.name:
                            self.resource_conf.cube_quant_bits = 8
            self.resource_conf.model_toolkit_path = workdir
@@ -366,7 +372,11 @@ class Server(object):
        os.chdir(self.cur_path)
        self.bin_path = self.server_path + "/serving"
-    def prepare_server(self, workdir=None, port=9292, device="cpu"):
+    def prepare_server(self,
+                       workdir=None,
+                       port=9292,
+                       device="cpu",
+                       cube_conf=None):
        if workdir == None:
            workdir = "./tmp"
            os.system("mkdir {}".format(workdir))
@@ -377,7 +387,7 @@ class Server(object):
        if not self.port_is_available(port):
            raise SystemExit("Port {} is already used".format(port))
        self.set_port(port)
-        self._prepare_resource(workdir)
+        self._prepare_resource(workdir, cube_conf)
        self._prepare_engine(self.model_config_paths, device)
        self._prepare_infer_service(port)
        self.workdir = workdir
@@ -514,7 +524,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                    elif v_type == 1:  # float32
                        data = np.array(list(var.float_data), dtype="float32")
                    elif v_type == 2:  # int32
-                        data = np.array(list(var.int32_data), dtype="int32")
+                        data = np.array(list(var.int_data), dtype="int32")
                    else:
                        raise Exception("error type.")
                data.shape = list(feed_inst.tensor_array[idx].shape)
@@ -530,6 +540,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
        results, tag = ret
        resp.tag = tag
        resp.err_code = 0
        if not self.is_multi_model_:
            results = {'general_infer_0': results}
        for model_name, model_result in results.items():
@@ -548,7 +559,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                        tensor.float_data.extend(model_result[name].reshape(-1)
                                                 .tolist())
                    elif v_type == 2:  # int32
-                        tensor.int32_data.extend(model_result[name].reshape(-1)
+                        tensor.int_data.extend(model_result[name].reshape(-1)
                                               .tolist())
                    else:
                        raise Exception("error type.")
@@ -645,7 +656,11 @@ class MultiLangServer(object):
                    server_config_paths)
        self.bclient_config_path_ = client_config_path
-    def prepare_server(self, workdir=None, port=9292, device="cpu"):
+    def prepare_server(self,
+                       workdir=None,
+                       port=9292,
+                       device="cpu",
+                       cube_conf=None):
        if not self._port_is_available(port):
            raise SystemExit("Prot {} is already used".format(port))
        default_port = 12000
@@ -656,7 +671,10 @@ class MultiLangServer(object):
                self.port_list_.append(default_port + i)
                break
        self.bserver_.prepare_server(
-            workdir=workdir, port=self.port_list_[0], device=device)
+            workdir=workdir,
+            port=self.port_list_[0],
+            device=device,
+            cube_conf=cube_conf)
        self.set_port(port)
    def _launch_brpc_service(self, bserver):

--- a/python/paddle_serving_server/version.py
+++ b/python/paddle_serving_server/version.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Paddle Serving Client version string """
-serving_client_version = "0.3.1"
+serving_client_version = "0.3.2"
-serving_server_version = "0.3.1"
+serving_server_version = "0.3.2"
-module_proto_version = "0.3.1"
+module_proto_version = "0.3.2"
--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -88,8 +88,8 @@ class WebService(object):
            result = self.postprocess(
                feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
            result = {"result": result}
-        except ValueError:
+        except ValueError as err:
-            result = {"result": "Request Value Error"}
+            result = {"result": err}
        return result
    def run_rpc_service(self):

--- a/python/paddle_serving_server_gpu/__init__.py
+++ b/python/paddle_serving_server_gpu/__init__.py
@@ -26,7 +26,7 @@ from contextlib import closing
 import argparse
 import collections
 import fcntl
+import shutil
 import numpy as np
 import grpc
 from .proto import multi_lang_general_model_service_pb2
@@ -285,7 +285,7 @@ class Server(object):
            infer_service.workflows.extend(["workflow1"])
            self.infer_service_conf.services.extend([infer_service])
-    def _prepare_resource(self, workdir):
+    def _prepare_resource(self, workdir, cube_conf):
        self.workdir = workdir
        if self.resource_conf == None:
            with open("{}/{}".format(workdir, self.general_model_config_fn),
@@ -297,6 +297,11 @@ class Server(object):
                    if "dist_kv" in node.name:
                        self.resource_conf.cube_config_path = workdir
                        self.resource_conf.cube_config_file = self.cube_config_fn
+                        if cube_conf == None:
+                            raise ValueError(
+                                "Please set the path of cube.conf while use dist_kv op."
+                            )
+                        shutil.copy(cube_conf, workdir)
            self.resource_conf.model_toolkit_path = workdir
            self.resource_conf.model_toolkit_file = self.model_toolkit_fn
            self.resource_conf.general_model_path = workdir
@@ -406,7 +411,11 @@ class Server(object):
        os.chdir(self.cur_path)
        self.bin_path = self.server_path + "/serving"
-    def prepare_server(self, workdir=None, port=9292, device="cpu"):
+    def prepare_server(self,
+                       workdir=None,
+                       port=9292,
+                       device="cpu",
+                       cube_conf=None):
        if workdir == None:
            workdir = "./tmp"
            os.system("mkdir {}".format(workdir))
@@ -418,7 +427,7 @@ class Server(object):
            raise SystemExit("Port {} is already used".format(port))
        self.set_port(port)
-        self._prepare_resource(workdir)
+        self._prepare_resource(workdir, cube_conf)
        self._prepare_engine(self.model_config_paths, device)
        self._prepare_infer_service(port)
        self.workdir = workdir
@@ -562,7 +571,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                    elif v_type == 1:  # float32
                        data = np.array(list(var.float_data), dtype="float32")
                    elif v_type == 2:
-                        data = np.array(list(var.int32_data), dtype="int32")
+                        data = np.array(list(var.int_data), dtype="int32")
                    else:
                        raise Exception("error type.")
                data.shape = list(feed_inst.tensor_array[idx].shape)
@@ -578,6 +587,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
        results, tag = ret
        resp.tag = tag
        resp.err_code = 0
        if not self.is_multi_model_:
            results = {'general_infer_0': results}
        for model_name, model_result in results.items():
@@ -596,7 +606,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                        tensor.float_data.extend(model_result[name].reshape(-1)
                                                 .tolist())
                    elif v_type == 2:  # int32
-                        tensor.int32_data.extend(model_result[name].reshape(-1)
+                        tensor.int_data.extend(model_result[name].reshape(-1)
                                               .tolist())
                    else:
                        raise Exception("error type.")
@@ -690,7 +700,11 @@ class MultiLangServer(object):
                    server_config_paths)
        self.bclient_config_path_ = client_config_path
-    def prepare_server(self, workdir=None, port=9292, device="cpu"):
+    def prepare_server(self,
+                       workdir=None,
+                       port=9292,
+                       device="cpu",
+                       cube_conf=None):
        if not self._port_is_available(port):
            raise SystemExit("Prot {} is already used".format(port))
        default_port = 12000
@@ -701,7 +715,10 @@ class MultiLangServer(object):
                self.port_list_.append(default_port + i)
                break
        self.bserver_.prepare_server(
-            workdir=workdir, port=self.port_list_[0], device=device)
+            workdir=workdir,
+            port=self.port_list_[0],
+            device=device,
+            cube_conf=cube_conf)
        self.set_port(port)
    def _launch_brpc_service(self, bserver):

--- a/python/paddle_serving_server_gpu/version.py
+++ b/python/paddle_serving_server_gpu/version.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Paddle Serving Client version string """
-serving_client_version = "0.3.1"
+serving_client_version = "0.3.2"
-serving_server_version = "0.3.1"
+serving_server_version = "0.3.2"
-module_proto_version = "0.3.1"
+module_proto_version = "0.3.2"
--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -127,14 +127,14 @@ class WebService(object):
                                          request.json["fetch"])
            if isinstance(feed, dict) and "fetch" in feed:
                del feed["fetch"]
+            if len(feed) == 0:
+                raise ValueError("empty input")
            fetch_map = self.client.predict(feed=feed, fetch=fetch)
-            for key in fetch_map:
-                fetch_map[key] = fetch_map[key].tolist()
            result = self.postprocess(
                feed=request.json["feed"], fetch=fetch, fetch_map=fetch_map)
            result = {"result": result}
-        except ValueError:
+        except ValueError as err:
-            result = {"result": "Request Value Error"}
+            result = {"result": err}
        return result
    def run_rpc_service(self):
@@ -164,6 +164,33 @@ class WebService(object):
        self.app_instance = app_instance
+    # TODO: maybe change another API name: maybe run_local_predictor?
+    def run_debugger_service(self, gpu=False):
+        import socket
+        localIP = socket.gethostbyname(socket.gethostname())
+        print("web service address:")
+        print("http://{}:{}/{}/prediction".format(localIP, self.port,
+                                                  self.name))
+        app_instance = Flask(__name__)
+        @app_instance.before_first_request
+        def init():
+            self._launch_local_predictor(gpu)
+        service_name = "/" + self.name + "/prediction"
+        @app_instance.route(service_name, methods=["POST"])
+        def run():
+            return self.get_prediction(request)
+        self.app_instance = app_instance
+    def _launch_local_predictor(self, gpu):
+        from paddle_serving_app.local_predict import Debugger
+        self.client = Debugger()
+        self.client.load_model_config(
+            "{}".format(self.model_config), gpu=gpu, profile=False)
    def run_web_service(self):
        self.app_instance.run(host="0.0.0.0",
                              port=self.port,

--- a/python/pipeline/__init__.py
+++ b/python/pipeline/__init__.py
@@ -15,3 +15,4 @@
 from operator import Op, RequestOp, ResponseOp
 from pipeline_server import PipelineServer
 from pipeline_client import PipelineClient
+from analyse import Analyst
--- a/python/pipeline/analyse.py
+++ b/python/pipeline/analyse.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+import json
+import copy
+import re
+import logging
+_LOGGER = logging.getLogger()
+class Analyst(object):
+    def __init__(self, profile_file):
+        self._profile_file = profile_file
+        self._trace = None
+        self.ave_call = None
+        self.ave_prepack = None
+        self.ave_postpack = None
+        self.op_analyst = None
+        self.start_time = None
+        self.end_time = None
+    def _prase_line(self, pid_str, time_str, counter):
+        pid = pid_str.split(":")[1]
+        event_list = time_str.split(" ")
+        trace_list = []
+        for event in event_list:
+            name, ts = event.split(":")
+            name_list = name.split("_")
+            ph = "B" if (name_list[-1] == "0") else "E"
+            if len(name_list) == 2:
+                name = name_list[0]
+            else:
+                name = "_".join(name_list[:-1])
+            name_list = name.split("#")
+            if len(name_list) > 1:
+                tid = name_list[-1]
+                name = "#".join(name_list[:-1])
+            else:
+                tid = 0
+            event_dict = {}
+            event_dict["name"] = name
+            event_dict["tid"] = tid
+            event_dict["pid"] = pid
+            event_dict["ts"] = ts
+            event_dict["ph"] = ph
+            trace_list.append(event_dict)
+        return trace_list
+    def get_trace(self):
+        if self._trace is not None:
+            return self._trace
+        all_list = []
+        counter = 0
+        with open(self._profile_file) as f:
+            for line in f.readlines():
+                line = line.strip().split("\t")
+                if line[0] == "PROFILE":
+                    trace_list = self._prase_line(line[1], line[2], counter)
+                    counter += 1
+                    for trace in trace_list:
+                        all_list.append(trace)
+        self._trace = all_list
+        return self._trace
+    def save_trace(self, trace_file):
+        self.get_trace()
+        trace = json.dumps(self._trace, indent=2, separators=(',', ':'))
+        with open(trace_file, "w") as f:
+            f.write(trace)
+    def print_profile(self):
+        self.get_profile()
+        print("graph engine call: {}".format(self.ave_call))
+        print("rpc prepack: {}".format(self.ave_prepack))
+        print("rpc postpack: {}".format(self.ave_postpack))
+        print("OP: {}".format(self.op_analyst))
+    def get_op_analyst(self):
+        self.get_profile()
+        return self.op_analyst
+    def get_profile(self):
+        if self.ave_call is not None and \
+                self.ave_prepack is not None and \
+                self.ave_postpack is not None and \
+                self.op_analyst is not None:
+            return (self.ave_call, self.ave_prepack, self.ave_postpack,
+                    self.op_analyst)
+        trace = self.get_trace()
+        time_dict = {}
+        time_list_dict = {}
+        start, end = None, None
+        for event in trace:
+            name = "{}#{}".format(event["name"], event["tid"])
+            event_t = int(event["ts"])
+            if name in time_dict:
+                ts = event_t - time_dict.pop(name)
+                ts = ts / 1e3  # ms
+                if name not in time_list_dict:
+                    time_list_dict[name] = []
+                time_list_dict[name].append(ts)
+            else:
+                time_dict[name] = event_t
+            if start is None:
+                start = event_t
+            elif start > event_t:
+                start = event_t
+            if end is None:
+                end = event_t
+            elif end < event_t:
+                end = event_t
+        self.start_time = start
+        self.end_time = end
+        op_analyst = OpAnalyst(start, end)
+        # reduce prepack_n, postpack_n, call_n
+        pat_prepack = re.compile(r"prepack_\d+#@G")
+        prepack_time_list = []
+        pat_postpack = re.compile(r"postpack_\d+#@G")
+        postpack_time_list = []
+        pat_call = re.compile(r"call_\d+#DAG")
+        call_time_list = []
+        for name in time_list_dict:
+            if pat_prepack.match(name):
+                prepack_time_list.extend(time_list_dict[name])
+            elif pat_postpack.match(name):
+                postpack_time_list.extend(time_list_dict[name])
+            elif pat_call.match(name):
+                call_time_list.extend(time_list_dict[name])
+            else:
+                op_analyst.add(name, time_list_dict[name])
+        self.ave_call = sum(call_time_list) * 1.0 / len(call_time_list)
+        self.ave_prepack = sum(prepack_time_list) * 1.0 / len(prepack_time_list)
+        self.ave_postpack = sum(postpack_time_list) * 1.0 / len(
+            postpack_time_list)
+        self.op_analyst = op_analyst
+        return (self.ave_call, self.ave_prepack, self.ave_postpack,
+                self.op_analyst)
+class OpAnalyst(object):
+    def __init__(self, start_time, end_time):
+        self.op_time_list_dict = {}
+        self._qps = None
+        self._close = False
+        self.start_time = start_time
+        self.end_time = end_time
+    def add(self, name_str, ts_list):
+        if self._close:
+            _LOGGER.error("OpAnalyst is closed.")
+            return
+        op_name, curr_idx, step = self._parse(name_str)
+        if op_name not in self.op_time_list_dict:
+            self.op_time_list_dict[op_name] = {}
+        if curr_idx not in self.op_time_list_dict[op_name]:
+            self.op_time_list_dict[op_name][curr_idx] = {}
+        if step not in self.op_time_list_dict[op_name][curr_idx]:
+            self.op_time_list_dict[op_name][curr_idx][step] = []
+        self.op_time_list_dict[op_name][curr_idx][step].extend(ts_list)
+    def _parse(self, name):
+        step, name_str = name.split("#")
+        name_str = name_str[1:-1]
+        op_name, curr_idx = name_str.split("|")
+        return op_name, curr_idx, step
+    def _reduce_profile(self):
+        """
+        Calculating the average time-consuming of multiple concurrent OPs.
+        """
+        if self._close:
+            return
+        for op_name in self.op_time_list_dict:
+            total_time = None
+            for curr_idx in self.op_time_list_dict[op_name]:
+                ave_dict = {}
+                for step in self.op_time_list_dict[op_name][curr_idx]:
+                    ave_dict[step] = sum(self.op_time_list_dict[op_name][
+                        curr_idx][step]) * 1.0 / len(self.op_time_list_dict[
+                            op_name][curr_idx][step])
+                if total_time is None:
+                    total_time = ave_dict
+                else:
+                    for step in ave_dict:
+                        total_time[step] += ave_dict[step]
+            for step in total_time:
+                total_time[step] = total_time[step] * 1.0 / len(
+                    self.op_time_list_dict[op_name])
+            self.op_time_list_dict[op_name] = total_time
+        self._close = True
+    def _get_qps(self):
+        """
+        Calculating QPS for each step based on the time
+        consumed in each step of OP.
+        """
+        if self._qps is not None:
+            return self._qps
+        self._reduce_profile()
+        self._qps = {}
+        for op_name, times in self.op_time_list_dict.items():
+            self._qps[op_name] = {
+                step: 1000.0 / ts
+                for step, ts in times.items()
+            }
+        return self._qps
+    def __str__(self):
+        self._reduce_profile()
+        return json.dumps(
+            self.op_time_list_dict, indent=2, separators=(', ', ':'))
+    def qps(self, op_name=None):
+        """
+        Get the average QPS of each step of each OP (in q/s)
+        """
+        self._get_qps()
+        if op_name is None:
+            return self._qps
+        else:
+            return self._qps[op_name]
+    def times(self, op_name=None):
+        """
+        Get the average time of each step of each OP (in ms)
+        """
+        self._reduce_profile()
+        if op_name is None:
+            return self.op_time_list_dict
+        else:
+            return self.op_time_list_dict[op_name]
+    def concurrency_analysis(self, op_config_yaml):
+        """
+        Through OP time consuming and op_config_yaml to
+        calculate the theoretical QPS, as well as the
+        number of concurrency required by each OPs.
+        It should be noted that since multiple models
+        will affect each other on one card, only the
+        case that each model is on a different card can
+        be calculated.
+        The format of the yaml file is as follows:
+        ```yaml
+        <op_name>:
+            <step(prep, midp or postp)>: <GPU id>
+        ```
+        For example:
+        ```yaml
+        cnn:
+            midp: 0
+        bow:
+            midp: 1
+        ```
+        """
+        import yaml
+        with open(op_config_yaml) as f:
+            op_config = yaml.load(f)
+        # check that each model is deployed on a different card
+        card_set = set()
+        # and finding the most time consuming part (GPU)
+        op_times = self.times()
+        most_time = 0
+        most_time_op_name = None
+        for op in op_config:
+            for step, cards in op_config[op].items():
+                if isinstance(cards, int):
+                    cards = [cards]
+                elif isinstance(cards, str):
+                    cards = [int(x) for x in cards.split(',')]
+                else:
+                    raise Exception("Error cards type.")
+                for card in cards:
+                    if card in card_set:
+                        raise Exception(
+                            "Analysis is failed because "
+                            "different services interact when different"
+                            " models are deployed on one card.")
+                    else:
+                        card_set.add(card)
+                times_each_card = op_times[op][step] / len(cards)
+                if most_time < times_each_card:
+                    most_time = times_each_card
+                    most_time_op_name = op
+        # calculate base qps
+        base_qps = 1.0 / most_time  # q/ms
+        _LOGGER.info("Most Time Consuming (GPU): {} ms (op: {})"
+                     .format(most_time, most_time_op_name))
+        _LOGGER.info("Theoretically Expected QPS: {} q/s".format(base_qps *
+                                                                 1000))
+        # reduce op times
+        op_times = {
+            op_name: sum(step_times.values())
+            for op_name, step_times in op_times.items()
+        }
+        # calculate op concurrency
+        op_concurrency = {
+            op_name: round(base_qps * times, 3)
+            for op_name, times in op_times.items()
+        }
+        return op_concurrency
--- a/python/pipeline/channel.py
+++ b/python/pipeline/channel.py
@@ -27,7 +27,7 @@ import logging
 import enum
 import copy
-_LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger()
 class ChannelDataEcode(enum.Enum):
@@ -37,7 +37,8 @@ class ChannelDataEcode(enum.Enum):
    TYPE_ERROR = 3
    RPC_PACKAGE_ERROR = 4
    CLIENT_ERROR = 5
-    UNKNOW = 6
+    CLOSED_ERROR = 6
+    UNKNOW = 7
 class ChannelDataType(enum.Enum):
@@ -53,7 +54,8 @@ class ChannelData(object):
                 dictdata=None,
                 data_id=None,
                 ecode=None,
-                 error_info=None):
+                 error_info=None,
+                 client_need_profile=False):
        '''
        There are several ways to use it:
@@ -87,12 +89,28 @@ class ChannelData(object):
        self.id = data_id
        self.ecode = ecode
        self.error_info = error_info
+        self.client_need_profile = client_need_profile
+        self.profile_data_set = set()
+    def add_profile(self, profile_set):
+        if self.client_need_profile is False:
+            self.client_need_profile = True
+        self.profile_data_set |= profile_set
    @staticmethod
    def check_dictdata(dictdata):
        ecode = ChannelDataEcode.OK.value
        error_info = None
-        if not isinstance(dictdata, dict):
+        if isinstance(dictdata, list):
+            # batch data
+            for sample in dictdata:
+                if not isinstance(sample, dict):
+                    ecode = ChannelDataEcode.TYPE_ERROR.value
+                    error_info = "the value of data must " \
+                            "be dict, but get {}.".format(type(sample))
+                    break
+        elif not isinstance(dictdata, dict):
+            # batch size = 1
            ecode = ChannelDataEcode.TYPE_ERROR.value
            error_info = "the value of data must " \
                        "be dict, but get {}.".format(type(dictdata))
@@ -102,12 +120,32 @@ class ChannelData(object):
    def check_npdata(npdata):
        ecode = ChannelDataEcode.OK.value
        error_info = None
+        if isinstance(npdata, list):
+            # batch data
+            for sample in npdata:
+                if not isinstance(sample, dict):
+                    ecode = ChannelDataEcode.TYPE_ERROR.value
+                    error_info = "the value of data must " \
+                            "be dict, but get {}.".format(type(sample))
+                    break
+                for _, value in sample.items():
+                    if not isinstance(value, np.ndarray):
+                        ecode = ChannelDataEcode.TYPE_ERROR.value
+                        error_info = "the value of data must " \
+                                "be np.ndarray, but get {}.".format(type(value))
+                        return ecode, error_info
+        elif isinstance(npdata, dict):
+            # batch_size = 1
            for _, value in npdata.items():
                if not isinstance(value, np.ndarray):
                    ecode = ChannelDataEcode.TYPE_ERROR.value
                    error_info = "the value of data must " \
                            "be np.ndarray, but get {}.".format(type(value))
                    break
+        else:
+            ecode = ChannelDataEcode.TYPE_ERROR.value
+            error_info = "the value of data must " \
+                    "be dict, but get {}.".format(type(npdata))
        return ecode, error_info
    def parse(self):
@@ -127,7 +165,7 @@ class ChannelData(object):
            ChannelDataType(self.datatype).name, self.ecode, self.id)
-class ProcessChannel(multiprocessing.queues.Queue):
+class ProcessChannel(object):
    """ 
    (Process version) The channel used for communication between Ops.
@@ -157,18 +195,17 @@ class ProcessChannel(multiprocessing.queues.Queue):
    """
    def __init__(self, manager, name=None, maxsize=0, timeout=None):
-        # https://stackoverflow.com/questions/39496554/cannot-subclass-multiprocessing-queue-in-python-3-5/
+        # For queue multiprocess: after putting an object on 
-        if sys.version_info.major == 2:
+        # an empty queue there may be an infinitessimal delay
-            super(ProcessChannel, self).__init__(maxsize=maxsize)
+        # before the queue's :meth:`~Queue.empty`
-        elif sys.version_info.major == 3:
+        # see more:
-            super(ProcessChannel, self).__init__(
+        # - https://bugs.python.org/issue18277
-                maxsize=maxsize, ctx=multiprocessing.get_context())
+        # - https://hg.python.org/cpython/rev/860fc6a2bd21
-        else:
+        self._que = manager.Queue(maxsize=maxsize)
-            raise Exception("Error Python version")
        self._maxsize = maxsize
        self._timeout = timeout
        self.name = name
-        self._stop = False
+        self._stop = manager.Value('i', 0)
        self._cv = multiprocessing.Condition()
@@ -224,15 +261,17 @@ class ProcessChannel(multiprocessing.queues.Queue):
                ))
        elif len(self._producers) == 1:
            with self._cv:
-                while self._stop is False:
+                while self._stop.value == 0:
                    try:
-                        self.put({op_name: channeldata}, timeout=0)
+                        self._que.put({op_name: channeldata}, timeout=0)
                        break
                    except Queue.Full:
                        self._cv.wait()
+                if self._stop.value == 1:
+                    raise ChannelStopError()
                _LOGGER.debug(
                    self._log("{} channel size: {}".format(op_name,
-                                                           self.qsize())))
+                                                           self._que.qsize())))
                self._cv.notify_all()
                _LOGGER.debug(self._log("{} notify all".format(op_name)))
            _LOGGER.debug(self._log("{} push data succ!".format(op_name)))
@@ -271,15 +310,17 @@ class ProcessChannel(multiprocessing.queues.Queue):
                    self._log("{} push data succ, but not push to queue.".
                              format(op_name)))
            else:
-                while self._stop is False:
+                while self._stop.value == 0:
                    try:
                        _LOGGER.debug(
                            self._log("{} push data succ: {}".format(
                                op_name, put_data.__str__())))
-                        self.put(put_data, timeout=0)
+                        self._que.put(put_data, timeout=0)
                        break
                    except Queue.Empty:
                        self._cv.wait()
+                if self._stop.value == 1:
+                    raise ChannelStopError()
                _LOGGER.debug(
                    self._log("multi | {} push data succ!".format(op_name)))
@@ -296,25 +337,21 @@ class ProcessChannel(multiprocessing.queues.Queue):
        elif len(self._consumer_cursors) == 1:
            resp = None
            with self._cv:
-                while self._stop is False and resp is None:
+                while self._stop.value == 0 and resp is None:
                    try:
                        _LOGGER.debug(
                            self._log("{} try to get(with channel empty: {})".
-                                      format(op_name, self.empty())))
+                                      format(op_name, self._que.empty())))
-                        # For queue multiprocess: after putting an object on 
+                        resp = self._que.get(timeout=0)
-                        # an empty queue there may be an infinitessimal delay
-                        # before the queue's :meth:`~Queue.empty`
-                        # see more:
-                        # - https://bugs.python.org/issue18277
-                        # - https://hg.python.org/cpython/rev/860fc6a2bd21
-                        resp = self.get(timeout=1e-3)
                        break
                    except Queue.Empty:
                        _LOGGER.debug(
                            self._log(
                                "{} wait for empty queue(with channel empty: {})".
-                                format(op_name, self.empty())))
+                                format(op_name, self._que.empty())))
                        self._cv.wait()
+                if self._stop.value == 1:
+                    raise ChannelStopError()
            _LOGGER.debug(
                self._log("{} get data succ: {}".format(op_name, resp.__str__(
                ))))
@@ -337,7 +374,7 @@ class ProcessChannel(multiprocessing.queues.Queue):
        with self._cv:
            # When the data required by the current Op is not in output_buf,
            # it is necessary to obtain a data from queue and add it to output_buf.
-            while self._stop is False and self._consumer_cursors[
+            while self._stop.value == 0 and self._consumer_cursors[
                    op_name] - self._base_cursor.value >= len(self._output_buf):
                _LOGGER.debug(
                    self._log(
@@ -347,22 +384,18 @@ class ProcessChannel(multiprocessing.queues.Queue):
                try:
                    _LOGGER.debug(
                        self._log("{} try to get(with channel size: {})".format(
-                            op_name, self.qsize())))
+                            op_name, self._que.qsize())))
-                    # For queue multiprocess: after putting an object on 
+                    channeldata = self._que.get(timeout=0)
-                    # an empty queue there may be an infinitessimal delay
-                    # before the queue's :meth:`~Queue.empty`
-                    # see more:
-                    # - https://bugs.python.org/issue18277
-                    # - https://hg.python.org/cpython/rev/860fc6a2bd21
-                    channeldata = self.get(timeout=1e-3)
                    self._output_buf.append(channeldata)
                    break
                except Queue.Empty:
                    _LOGGER.debug(
                        self._log(
                            "{} wait for empty queue(with channel size: {})".
-                            format(op_name, self.qsize())))
+                            format(op_name, self._que.qsize())))
                    self._cv.wait()
+            if self._stop.value == 1:
+                raise ChannelStopError()
            consumer_cursor = self._consumer_cursors[op_name]
            base_cursor = self._base_cursor.value
@@ -409,9 +442,9 @@ class ProcessChannel(multiprocessing.queues.Queue):
        return resp  # reference, read only
    def stop(self):
-        #TODO
+        _LOGGER.debug(self._log("stop."))
-        self.close()
+        self._stop.value = 1
-        self._stop = True
+        with self._cv:
            self._cv.notify_all()
@@ -511,6 +544,8 @@ class ThreadChannel(Queue.Queue):
                        break
                    except Queue.Full:
                        self._cv.wait()
+                if self._stop:
+                    raise ChannelStopError()
                self._cv.notify_all()
            _LOGGER.debug(self._log("{} push data succ!".format(op_name)))
            return True
@@ -549,6 +584,8 @@ class ThreadChannel(Queue.Queue):
                        break
                    except Queue.Empty:
                        self._cv.wait()
+                if self._stop:
+                    raise ChannelStopError()
                _LOGGER.debug(
                    self._log("multi | {} push data succ!".format(op_name)))
@@ -571,6 +608,8 @@ class ThreadChannel(Queue.Queue):
                        break
                    except Queue.Empty:
                        self._cv.wait()
+                if self._stop:
+                    raise ChannelStopError()
            _LOGGER.debug(
                self._log("{} get data succ: {}".format(op_name, resp.__str__(
                ))))
@@ -601,12 +640,14 @@ class ThreadChannel(Queue.Queue):
                    break
                except Queue.Empty:
                    self._cv.wait()
+            if self._stop:
+                raise ChannelStopError()
            consumer_cursor = self._consumer_cursors[op_name]
            base_cursor = self._base_cursor
            data_idx = consumer_cursor - base_cursor
-            resp = self._output_buf[data_idx]
-            _LOGGER.debug(self._log("{} get data: {}".format(op_name, resp)))
+            resp = None
            self._cursor_count[consumer_cursor] -= 1
            if consumer_cursor == base_cursor and self._cursor_count[
@@ -614,7 +655,7 @@ class ThreadChannel(Queue.Queue):
                # When all the different Ops get the data that data_idx points
                # to, pop the data from output_buf.
                self._cursor_count.pop(consumer_cursor)
-                self._output_buf.pop(0)
+                resp = self._output_buf.pop(0)
                self._base_cursor += 1
                # to avoid cursor overflow
                if self._base_cursor >= self._reset_max_cursor:
@@ -625,6 +666,9 @@ class ThreadChannel(Queue.Queue):
                        cursor - self._reset_max_cursor: count
                        for cursor, count in self._cursor_count.items()
                    }
+            else:
+                resp = copy.deepcopy(self._output_buf[data_idx])
+            _LOGGER.debug(self._log("{} get data: {}".format(op_name, resp)))
            self._consumer_cursors[op_name] += 1
            new_consumer_cursor = self._consumer_cursors[op_name]
@@ -635,11 +679,15 @@ class ThreadChannel(Queue.Queue):
            self._cv.notify_all()
        _LOGGER.debug(self._log("multi | {} get data succ!".format(op_name)))
-        # return resp  # reference, read only
+        return resp
-        return copy.deepcopy(resp)
    def stop(self):
-        #TODO
+        _LOGGER.debug(self._log("stop."))
-        self.close()
        self._stop = True
+        with self._cv:
            self._cv.notify_all()
+class ChannelStopError(RuntimeError):
+    def __init__(self):
+        pass
--- a/python/pipeline/dag.py
+++ b/python/pipeline/dag.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+import threading
+import multiprocessing
+import sys
+import copy
+if sys.version_info.major == 2:
+    import Queue
+elif sys.version_info.major == 3:
+    import queue as Queue
+else:
+    raise Exception("Error Python version")
+import os
+import logging
+from .operator import Op, RequestOp, ResponseOp, VirtualOp
+from .channel import (ThreadChannel, ProcessChannel, ChannelData,
+                      ChannelDataEcode, ChannelDataType, ChannelStopError)
+from .profiler import TimeProfiler
+from .util import NameGenerator
+_LOGGER = logging.getLogger()
+class DAGExecutor(object):
+    def __init__(self, response_op, dag_config, show_info):
+        self._retry = dag_config.get('retry', 1)
+        client_type = dag_config.get('client_type', 'brpc')
+        self._server_use_profile = dag_config.get('use_profile', False)
+        channel_size = dag_config.get('channel_size', 0)
+        self._is_thread_op = dag_config.get('is_thread_op', True)
+        if show_info and self._server_use_profile:
+            _LOGGER.info("================= PROFILER ================")
+            if self._is_thread_op:
+                _LOGGER.info("op: thread")
+                _LOGGER.info("profile mode: sync")
+            else:
+                _LOGGER.info("op: process")
+                _LOGGER.info("profile mode: asyn")
+            _LOGGER.info("-------------------------------------------")
+        self.name = "@G"
+        self._profiler = TimeProfiler()
+        self._profiler.enable(True)
+        self._dag = DAG(self.name, response_op, self._server_use_profile,
+                        self._is_thread_op, client_type, channel_size,
+                        show_info)
+        (in_channel, out_channel, pack_rpc_func,
+         unpack_rpc_func) = self._dag.build()
+        self._dag.start()
+        self._set_in_channel(in_channel)
+        self._set_out_channel(out_channel)
+        self._pack_rpc_func = pack_rpc_func
+        self._unpack_rpc_func = unpack_rpc_func
+        _LOGGER.debug(self._log(in_channel.debug()))
+        _LOGGER.debug(self._log(out_channel.debug()))
+        self._id_lock = threading.Lock()
+        self._id_counter = 0
+        self._reset_max_id = 1000000000000000000
+        self._cv_pool = {}
+        self._cv_for_cv_pool = threading.Condition()
+        self._fetch_buffer = None
+        self._recive_func = None
+        self._client_profile_key = "pipeline.profile"
+        self._client_profile_value = "1"
+    def start(self):
+        self._recive_func = threading.Thread(
+            target=DAGExecutor._recive_out_channel_func, args=(self, ))
+        self._recive_func.start()
+    def stop(self):
+        self._dag.stop()
+        self._dag.join()
+    def _get_next_data_id(self):
+        with self._id_lock:
+            if self._id_counter >= self._reset_max_id:
+                self._id_counter -= self._reset_max_id
+            self._id_counter += 1
+            return self._id_counter - 1
+    def _set_in_channel(self, in_channel):
+        if not isinstance(in_channel, (ThreadChannel, ProcessChannel)):
+            raise TypeError(
+                self._log('in_channel must be Channel type, but get {}'.format(
+                    type(in_channel))))
+        in_channel.add_producer(self.name)
+        self._in_channel = in_channel
+    def _set_out_channel(self, out_channel):
+        if not isinstance(out_channel, (ThreadChannel, ProcessChannel)):
+            raise TypeError(
+                self._log('out_channel must be Channel type, but get {}'.format(
+                    type(out_channel))))
+        out_channel.add_consumer(self.name)
+        self._out_channel = out_channel
+    def _recive_out_channel_func(self):
+        cv = None
+        while True:
+            try:
+                channeldata_dict = self._out_channel.front(self.name)
+            except ChannelStopError:
+                _LOGGER.debug(self._log("stop."))
+                with self._cv_for_cv_pool:
+                    for data_id, cv in self._cv_pool.items():
+                        closed_errror_data = ChannelData(
+                            ecode=ChannelDataEcode.CLOSED_ERROR.value,
+                            error_info="dag closed.",
+                            data_id=data_id)
+                        with cv:
+                            self._fetch_buffer = closed_errror_data
+                            cv.notify_all()
+                break
+            if len(channeldata_dict) != 1:
+                _LOGGER.error("out_channel cannot have multiple input ops")
+                os._exit(-1)
+            (_, channeldata), = channeldata_dict.items()
+            if not isinstance(channeldata, ChannelData):
+                raise TypeError(
+                    self._log('data must be ChannelData type, but get {}'.
+                              format(type(channeldata))))
+            data_id = channeldata.id
+            _LOGGER.debug("recive thread fetch data: {}".format(data_id))
+            with self._cv_for_cv_pool:
+                cv = self._cv_pool[data_id]
+            with cv:
+                self._fetch_buffer = channeldata
+                cv.notify_all()
+    def _get_channeldata_from_fetch_buffer(self, data_id):
+        resp = None
+        cv = threading.Condition()
+        with self._cv_for_cv_pool:
+            self._cv_pool[data_id] = cv
+        with cv:
+            cv.wait()
+            _LOGGER.debug("resp func get lock (data_id: {})".format(data_id))
+            resp = copy.deepcopy(self._fetch_buffer)
+        with self._cv_for_cv_pool:
+            self._cv_pool.pop(data_id)
+        return resp
+    def _pack_channeldata(self, rpc_request, data_id):
+        _LOGGER.debug(self._log('start inferce'))
+        dictdata = None
+        try:
+            dictdata = self._unpack_rpc_func(rpc_request)
+        except Exception as e:
+            return ChannelData(
+                ecode=ChannelDataEcode.RPC_PACKAGE_ERROR.value,
+                error_info="rpc package error: {}".format(e),
+                data_id=data_id)
+        else:
+            # because unpack_rpc_func is rewritten by user, we need
+            # to look for client_profile_key field in rpc_request
+            profile_value = None
+            for idx, key in enumerate(rpc_request.key):
+                if key == self._client_profile_key:
+                    profile_value = rpc_request.value[idx]
+                    break
+            return ChannelData(
+                datatype=ChannelDataType.DICT.value,
+                dictdata=dictdata,
+                data_id=data_id,
+                client_need_profile=(
+                    profile_value == self._client_profile_value))
+    def call(self, rpc_request):
+        data_id = self._get_next_data_id()
+        if not self._is_thread_op:
+            self._profiler.record("call_{}#DAG-{}_0".format(data_id, data_id))
+        else:
+            self._profiler.record("call_{}#DAG_0".format(data_id))
+        self._profiler.record("prepack_{}#{}_0".format(data_id, self.name))
+        req_channeldata = self._pack_channeldata(rpc_request, data_id)
+        self._profiler.record("prepack_{}#{}_1".format(data_id, self.name))
+        resp_channeldata = None
+        for i in range(self._retry):
+            _LOGGER.debug(self._log('push data'))
+            #self._profiler.record("push_{}#{}_0".format(data_id, self.name))
+            try:
+                self._in_channel.push(req_channeldata, self.name)
+            except ChannelStopError:
+                _LOGGER.debug(self._log("stop."))
+                return self._pack_for_rpc_resp(
+                    ChannelData(
+                        ecode=ChannelDataEcode.CLOSED_ERROR.value,
+                        error_info="dag closed.",
+                        data_id=data_id))
+            #self._profiler.record("push_{}#{}_1".format(data_id, self.name))
+            _LOGGER.debug(self._log('wait for infer'))
+            #self._profiler.record("fetch_{}#{}_0".format(data_id, self.name))
+            resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id)
+            #self._profiler.record("fetch_{}#{}_1".format(data_id, self.name))
+            if resp_channeldata.ecode == ChannelDataEcode.OK.value:
+                break
+            if i + 1 < self._retry:
+                _LOGGER.warn("retry({}): {}".format(
+                    i + 1, resp_channeldata.error_info))
+        self._profiler.record("postpack_{}#{}_0".format(data_id, self.name))
+        rpc_resp = self._pack_for_rpc_resp(resp_channeldata)
+        self._profiler.record("postpack_{}#{}_1".format(data_id, self.name))
+        if not self._is_thread_op:
+            self._profiler.record("call_{}#DAG-{}_1".format(data_id, data_id))
+        else:
+            self._profiler.record("call_{}#DAG_1".format(data_id))
+        #self._profiler.print_profile()
+        profile_str = self._profiler.gen_profile_str()
+        if self._server_use_profile:
+            sys.stderr.write(profile_str)
+        # add profile info into rpc_resp
+        profile_value = ""
+        if resp_channeldata.client_need_profile:
+            profile_set = resp_channeldata.profile_data_set
+            profile_set.add(profile_str)
+            profile_value = "".join(list(profile_set))
+        rpc_resp.key.append(self._client_profile_key)
+        rpc_resp.value.append(profile_value)
+        return rpc_resp
+    def _pack_for_rpc_resp(self, channeldata):
+        _LOGGER.debug(self._log('get channeldata'))
+        return self._pack_rpc_func(channeldata)
+    def _log(self, info_str):
+        return "[{}] {}".format(self.name, info_str)
+class DAG(object):
+    def __init__(self, request_name, response_op, use_profile, is_thread_op,
+                 client_type, channel_size, show_info):
+        self._request_name = request_name
+        self._response_op = response_op
+        self._use_profile = use_profile
+        self._is_thread_op = is_thread_op
+        self._channel_size = channel_size
+        self._client_type = client_type
+        self._show_info = show_info
+        if not self._is_thread_op:
+            self._manager = multiprocessing.Manager()
+    def get_use_ops(self, response_op):
+        unique_names = set()
+        used_ops = set()
+        succ_ops_of_use_op = {}  # {op_name: succ_ops}
+        que = Queue.Queue()
+        que.put(response_op)
+        while que.qsize() != 0:
+            op = que.get()
+            for pred_op in op.get_input_ops():
+                if pred_op.name not in succ_ops_of_use_op:
+                    succ_ops_of_use_op[pred_op.name] = []
+                if op != response_op:
+                    succ_ops_of_use_op[pred_op.name].append(op)
+                if pred_op not in used_ops:
+                    que.put(pred_op)
+                    used_ops.add(pred_op)
+                    # check the name of op is globally unique
+                    if pred_op.name in unique_names:
+                        raise Exception("the name of Op must be unique: {}".
+                                        format(pred_op.name))
+                    unique_names.add(pred_op.name)
+        return used_ops, succ_ops_of_use_op
+    def _gen_channel(self, name_gen):
+        channel = None
+        if self._is_thread_op:
+            channel = ThreadChannel(
+                name=name_gen.next(), maxsize=self._channel_size)
+        else:
+            channel = ProcessChannel(
+                self._manager, name=name_gen.next(), maxsize=self._channel_size)
+        return channel
+    def _gen_virtual_op(self, name_gen):
+        return VirtualOp(name=name_gen.next())
+    def _topo_sort(self, used_ops, response_op, out_degree_ops):
+        out_degree_num = {
+            name: len(ops)
+            for name, ops in out_degree_ops.items()
+        }
+        que_idx = 0  # scroll queue 
+        ques = [Queue.Queue() for _ in range(2)]
+        zero_indegree_num = 0
+        for op in used_ops:
+            if len(op.get_input_ops()) == 0:
+                zero_indegree_num += 1
+        if zero_indegree_num != 1:
+            raise Exception("DAG contains multiple input Ops")
+        last_op = response_op.get_input_ops()[0]
+        ques[que_idx].put(last_op)
+        # topo sort to get dag_views
+        dag_views = []
+        sorted_op_num = 0
+        while True:
+            que = ques[que_idx]
+            next_que = ques[(que_idx + 1) % 2]
+            dag_view = []
+            while que.qsize() != 0:
+                op = que.get()
+                dag_view.append(op)
+                sorted_op_num += 1
+                for pred_op in op.get_input_ops():
+                    out_degree_num[pred_op.name] -= 1
+                    if out_degree_num[pred_op.name] == 0:
+                        next_que.put(pred_op)
+            dag_views.append(dag_view)
+            if next_que.qsize() == 0:
+                break
+            que_idx = (que_idx + 1) % 2
+        if sorted_op_num < len(used_ops):
+            raise Exception("not legal DAG")
+        return dag_views, last_op
+    def _build_dag(self, response_op):
+        if response_op is None:
+            raise Exception("response_op has not been set.")
+        used_ops, out_degree_ops = self.get_use_ops(response_op)
+        if self._show_info:
+            _LOGGER.info("================= USED OP =================")
+            for op in used_ops:
+                if op.name != self._request_name:
+                    _LOGGER.info(op.name)
+            _LOGGER.info("-------------------------------------------")
+        if len(used_ops) <= 1:
+            raise Exception(
+                "Besides RequestOp and ResponseOp, there should be at least one Op in DAG."
+            )
+        dag_views, last_op = self._topo_sort(used_ops, response_op,
+                                             out_degree_ops)
+        dag_views = list(reversed(dag_views))
+        if self._show_info:
+            _LOGGER.info("================== DAG ====================")
+            for idx, view in enumerate(dag_views):
+                _LOGGER.info("(VIEW {})".format(idx))
+                for op in view:
+                    _LOGGER.info("  [{}]".format(op.name))
+                    for out_op in out_degree_ops[op.name]:
+                        _LOGGER.info("    - {}".format(out_op.name))
+            _LOGGER.info("-------------------------------------------")
+        # create channels and virtual ops
+        virtual_op_name_gen = NameGenerator("vir")
+        channel_name_gen = NameGenerator("chl")
+        virtual_ops = []
+        channels = []
+        input_channel = None
+        actual_view = None
+        for v_idx, view in enumerate(dag_views):
+            if v_idx + 1 >= len(dag_views):
+                break
+            next_view = dag_views[v_idx + 1]
+            if actual_view is None:
+                actual_view = view
+            actual_next_view = []
+            pred_op_of_next_view_op = {}
+            for op in actual_view:
+                # find actual succ op in next view and create virtual op
+                for succ_op in out_degree_ops[op.name]:
+                    if succ_op in next_view:
+                        if succ_op not in actual_next_view:
+                            actual_next_view.append(succ_op)
+                        if succ_op.name not in pred_op_of_next_view_op:
+                            pred_op_of_next_view_op[succ_op.name] = []
+                        pred_op_of_next_view_op[succ_op.name].append(op)
+                    else:
+                        # create virtual op
+                        virtual_op = self._gen_virtual_op(virtual_op_name_gen)
+                        virtual_ops.append(virtual_op)
+                        out_degree_ops[virtual_op.name] = [succ_op]
+                        actual_next_view.append(virtual_op)
+                        pred_op_of_next_view_op[virtual_op.name] = [op]
+                        virtual_op.add_virtual_pred_op(op)
+            actual_view = actual_next_view
+            # create channel
+            processed_op = set()
+            for o_idx, op in enumerate(actual_next_view):
+                if op.name in processed_op:
+                    continue
+                channel = self._gen_channel(channel_name_gen)
+                channels.append(channel)
+                _LOGGER.debug("{} => {}".format(channel.name, op.name))
+                op.add_input_channel(channel)
+                pred_ops = pred_op_of_next_view_op[op.name]
+                if v_idx == 0:
+                    input_channel = channel
+                else:
+                    # if pred_op is virtual op, it will use ancestors as producers to channel
+                    for pred_op in pred_ops:
+                        _LOGGER.debug("{} => {}".format(pred_op.name,
+                                                        channel.name))
+                        pred_op.add_output_channel(channel)
+                processed_op.add(op.name)
+                # find same input op to combine channel
+                for other_op in actual_next_view[o_idx + 1:]:
+                    if other_op.name in processed_op:
+                        continue
+                    other_pred_ops = pred_op_of_next_view_op[other_op.name]
+                    if len(other_pred_ops) != len(pred_ops):
+                        continue
+                    same_flag = True
+                    for pred_op in pred_ops:
+                        if pred_op not in other_pred_ops:
+                            same_flag = False
+                            break
+                    if same_flag:
+                        _LOGGER.debug("{} => {}".format(channel.name,
+                                                        other_op.name))
+                        other_op.add_input_channel(channel)
+                        processed_op.add(other_op.name)
+        output_channel = self._gen_channel(channel_name_gen)
+        channels.append(output_channel)
+        last_op.add_output_channel(output_channel)
+        pack_func, unpack_func = None, None
+        pack_func = response_op.pack_response_package
+        actual_ops = virtual_ops
+        for op in used_ops:
+            if len(op.get_input_ops()) == 0:
+                unpack_func = op.unpack_request_package
+                continue
+            actual_ops.append(op)
+        for c in channels:
+            _LOGGER.debug(c.debug())
+        return (actual_ops, channels, input_channel, output_channel, pack_func,
+                unpack_func)
+    def build(self):
+        (actual_ops, channels, input_channel, output_channel, pack_func,
+         unpack_func) = self._build_dag(self._response_op)
+        self._actual_ops = actual_ops
+        self._channels = channels
+        self._input_channel = input_channel
+        self._output_channel = output_channel
+        self._pack_func = pack_func
+        self._unpack_func = unpack_func
+        return self._input_channel, self._output_channel, self._pack_func, self._unpack_func
+    def start(self):
+        self._threads_or_proces = []
+        for op in self._actual_ops:
+            op.use_profiler(self._use_profile)
+            if self._is_thread_op:
+                self._threads_or_proces.extend(
+                    op.start_with_thread(self._client_type))
+            else:
+                self._threads_or_proces.extend(
+                    op.start_with_process(self._client_type))
+        # not join yet
+        return self._threads_or_proces
+    def join(self):
+        for x in self._threads_or_proces:
+            x.join()
+    def stop(self):
+        for chl in self._channels:
+            chl.stop()
+        for op in self._actual_ops:
+            op.clean_input_channel()
+            op.clean_output_channels()
--- a/python/pipeline/operator.py
+++ b/python/pipeline/operator.py
@@ -19,13 +19,18 @@ from paddle_serving_client import MultiLangClient, Client
 from concurrent import futures
 import logging
 import func_timeout
+import os
+import sys
+import numpy as np
 from numpy import *
 from .proto import pipeline_service_pb2
-from .channel import ThreadChannel, ProcessChannel, ChannelDataEcode, ChannelData, ChannelDataType
+from .channel import (ThreadChannel, ProcessChannel, ChannelDataEcode,
+                      ChannelData, ChannelDataType, ChannelStopError)
 from .util import NameGenerator
+from .profiler import TimeProfiler
-_LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger()
 _op_name_gen = NameGenerator("Op")
@@ -41,7 +46,6 @@ class Op(object):
                 retry=1):
        if name is None:
            name = _op_name_gen.next()
-        self._is_run = False
        self.name = name  # to identify the type of OP, it must be globally unique
        self.concurrency = concurrency  # amount of concurrency
        self.set_input_ops(input_ops)
@@ -57,10 +61,17 @@ class Op(object):
        self._retry = max(1, retry)
        self._input = None
        self._outputs = []
-        self._profiler = None
-    def init_profiler(self, profiler):
+        self._server_use_profile = False
-        self._profiler = profiler
+        # only for multithread
+        self._for_init_op_lock = threading.Lock()
+        self._for_close_op_lock = threading.Lock()
+        self._succ_init_op = False
+        self._succ_close_op = False
+    def use_profiler(self, use_profile):
+        self._server_use_profile = use_profile
    def _profiler_record(self, string):
        if self._profiler is None:
@@ -71,21 +82,19 @@ class Op(object):
                    fetch_names):
        if self.with_serving == False:
            _LOGGER.debug("{} no client".format(self.name))
-            return
+            return None
        _LOGGER.debug("{} client_config: {}".format(self.name, client_config))
        _LOGGER.debug("{} fetch_names: {}".format(self.name, fetch_names))
        if client_type == 'brpc':
-            self._client = Client()
+            client = Client()
-            self._client.load_client_config(client_config)
+            client.load_client_config(client_config)
        elif client_type == 'grpc':
-            self._client = MultiLangClient()
+            client = MultiLangClient()
        else:
            raise ValueError("unknow client type: {}".format(client_type))
-        self._client.connect(server_endpoints)
+        client.connect(server_endpoints)
        self._fetch_names = fetch_names
+        return client
-    def _get_input_channel(self):
-        return self._input
    def get_input_ops(self):
        return self._input_ops
@@ -109,8 +118,11 @@ class Op(object):
        channel.add_consumer(self.name)
        self._input = channel
-    def _get_output_channels(self):
+    def clean_input_channel(self):
-        return self._outputs
+        self._input = None
+    def _get_input_channel(self):
+        return self._input
    def add_output_channel(self, channel):
        if not isinstance(channel, (ThreadChannel, ProcessChannel)):
@@ -120,6 +132,12 @@ class Op(object):
        channel.add_producer(self.name)
        self._outputs.append(channel)
+    def clean_output_channels(self):
+        self._outputs = []
+    def _get_output_channels(self):
+        return self._outputs
    def preprocess(self, input_dicts):
        # multiple previous Op
        if len(input_dicts) != 1:
@@ -135,46 +153,63 @@ class Op(object):
        if err != 0:
            raise NotImplementedError(
                "{} Please override preprocess func.".format(err_info))
-        _LOGGER.debug(self._log('feed_dict: {}'.format(feed_dict)))
+        call_result = self.client.predict(
-        _LOGGER.debug(self._log('fetch: {}'.format(self._fetch_names)))
-        call_result = self._client.predict(
            feed=feed_dict, fetch=self._fetch_names)
        _LOGGER.debug(self._log("get call_result"))
        return call_result
-    def postprocess(self, fetch_dict):
+    def postprocess(self, input_dict, fetch_dict):
        return fetch_dict
-    def stop(self):
-        self._is_run = False
    def _parse_channeldata(self, channeldata_dict):
        data_id, error_channeldata = None, None
+        client_need_profile, profile_set = False, set()
        parsed_data = {}
        key = list(channeldata_dict.keys())[0]
        data_id = channeldata_dict[key].id
+        client_need_profile = channeldata_dict[key].client_need_profile
        for name, data in channeldata_dict.items():
            if data.ecode != ChannelDataEcode.OK.value:
                error_channeldata = data
                break
            parsed_data[name] = data.parse()
-        return data_id, error_channeldata, parsed_data
+            if client_need_profile:
+                profile_set |= data.profile_data_set
-    def _push_to_output_channels(self, data, channels, name=None):
+        return (data_id, error_channeldata, parsed_data, client_need_profile,
+                profile_set)
+    def _push_to_output_channels(self,
+                                 data,
+                                 channels,
+                                 name=None,
+                                 client_need_profile=False,
+                                 profile_set=None):
        if name is None:
            name = self.name
+        self._add_profile_into_channeldata(data, client_need_profile,
+                                           profile_set)
        for channel in channels:
            channel.push(data, name)
+    def _add_profile_into_channeldata(self, data, client_need_profile,
+                                      profile_set):
+        profile_str = self._profiler.gen_profile_str()
+        if self._server_use_profile:
+            sys.stderr.write(profile_str)
+        if client_need_profile and profile_set is not None:
+            profile_set.add(profile_str)
+            data.add_profile(profile_set)
    def start_with_process(self, client_type):
        proces = []
        for concurrency_idx in range(self.concurrency):
            p = multiprocessing.Process(
                target=self._run,
                args=(concurrency_idx, self._get_input_channel(),
-                      self._get_output_channels(), client_type))
+                      self._get_output_channels(), client_type, False))
            p.start()
            proces.append(p)
        return proces
@@ -185,12 +220,12 @@ class Op(object):
            t = threading.Thread(
                target=self._run,
                args=(concurrency_idx, self._get_input_channel(),
-                      self._get_output_channels(), client_type))
+                      self._get_output_channels(), client_type, True))
            t.start()
            threads.append(t)
        return threads
-    def load_user_resources(self):
+    def init_op(self):
        pass
    def _run_preprocess(self, parsed_data, data_id, log_func):
@@ -267,10 +302,10 @@ class Op(object):
            midped_data = preped_data
        return midped_data, error_channeldata
-    def _run_postprocess(self, midped_data, data_id, log_func):
+    def _run_postprocess(self, input_dict, midped_data, data_id, log_func):
        output_data, error_channeldata = None, None
        try:
-            postped_data = self.postprocess(midped_data)
+            postped_data = self.postprocess(input_dict, midped_data)
        except Exception as e:
            error_info = log_func(e)
            _LOGGER.error(error_info)
@@ -303,8 +338,8 @@ class Op(object):
                data_id=data_id)
        return output_data, error_channeldata
-    def _run(self, concurrency_idx, input_channel, output_channels,
+    def _run(self, concurrency_idx, input_channel, output_channels, client_type,
-             client_type):
+             is_thread_op):
        def get_log_func(op_info_prefix):
            def log_func(info_str):
                return "{} {}".format(op_info_prefix, info_str)
@@ -315,62 +350,130 @@ class Op(object):
        log = get_log_func(op_info_prefix)
        tid = threading.current_thread().ident
-        # create client based on client_type
+        # init op
-        self.init_client(client_type, self._client_config,
+        self.concurrency_idx = concurrency_idx
+        try:
+            if is_thread_op:
+                with self._for_init_op_lock:
+                    if not self._succ_init_op:
+                        # init profiler
+                        self._profiler = TimeProfiler()
+                        self._profiler.enable(True)
+                        # init client
+                        self.client = self.init_client(
+                            client_type, self._client_config,
                            self._server_endpoints, self._fetch_names)
+                        # user defined
+                        self.init_op()
+                        self._succ_init_op = True
+                        self._succ_close_op = False
+            else:
+                # init profiler
+                self._profiler = TimeProfiler()
+                self._profiler.enable(True)
+                # init client
+                self.client = self.init_client(client_type, self._client_config,
+                                               self._server_endpoints,
+                                               self._fetch_names)
+                # user defined
+                self.init_op()
+        except Exception as e:
+            _LOGGER.error(log(e))
+            os._exit(-1)
-        # load user resources
+        while True:
-        self.load_user_resources()
+            #self._profiler_record("get#{}_0".format(op_info_prefix))
+            try:
-        self._is_run = True
-        while self._is_run:
-            self._profiler_record("{}-get#{}_0".format(op_info_prefix, tid))
                channeldata_dict = input_channel.front(self.name)
-            self._profiler_record("{}-get#{}_1".format(op_info_prefix, tid))
+            except ChannelStopError:
+                _LOGGER.debug(log("stop."))
+                if is_thread_op:
+                    with self._for_close_op_lock:
+                        if not self._succ_close_op:
+                            self._profiler = None
+                            self.client = None
+                            self._succ_init_op = False
+                            self._succ_close_op = True
+                break
+            #self._profiler_record("get#{}_1".format(op_info_prefix))
            _LOGGER.debug(log("input_data: {}".format(channeldata_dict)))
-            data_id, error_channeldata, parsed_data = self._parse_channeldata(
+            (data_id, error_channeldata, parsed_data, client_need_profile,
-                channeldata_dict)
+             profile_set) = self._parse_channeldata(channeldata_dict)
            # error data in predecessor Op
            if error_channeldata is not None:
+                try:
+                    # error_channeldata with profile info
                    self._push_to_output_channels(error_channeldata,
                                                  output_channels)
+                except ChannelStopError:
+                    _LOGGER.debug(log("stop."))
+                    break
                continue
            # preprecess
-            self._profiler_record("{}-prep#{}_0".format(op_info_prefix, tid))
+            self._profiler_record("prep#{}_0".format(op_info_prefix))
            preped_data, error_channeldata = self._run_preprocess(parsed_data,
                                                                  data_id, log)
-            self._profiler_record("{}-prep#{}_1".format(op_info_prefix, tid))
+            self._profiler_record("prep#{}_1".format(op_info_prefix))
            if error_channeldata is not None:
-                self._push_to_output_channels(error_channeldata,
+                try:
-                                              output_channels)
+                    self._push_to_output_channels(
+                        error_channeldata,
+                        output_channels,
+                        client_need_profile=client_need_profile,
+                        profile_set=profile_set)
+                except ChannelStopError:
+                    _LOGGER.debug(log("stop."))
+                    break
                continue
            # process
-            self._profiler_record("{}-midp#{}_0".format(op_info_prefix, tid))
+            self._profiler_record("midp#{}_0".format(op_info_prefix))
            midped_data, error_channeldata = self._run_process(preped_data,
                                                               data_id, log)
-            self._profiler_record("{}-midp#{}_1".format(op_info_prefix, tid))
+            self._profiler_record("midp#{}_1".format(op_info_prefix))
            if error_channeldata is not None:
-                self._push_to_output_channels(error_channeldata,
+                try:
-                                              output_channels)
+                    self._push_to_output_channels(
+                        error_channeldata,
+                        output_channels,
+                        client_need_profile=client_need_profile,
+                        profile_set=profile_set)
+                except ChannelStopError:
+                    _LOGGER.debug(log("stop."))
+                    break
                continue
            # postprocess
-            self._profiler_record("{}-postp#{}_0".format(op_info_prefix, tid))
+            self._profiler_record("postp#{}_0".format(op_info_prefix))
-            output_data, error_channeldata = self._run_postprocess(midped_data,
+            output_data, error_channeldata = self._run_postprocess(
-                                                                   data_id, log)
+                parsed_data, midped_data, data_id, log)
-            self._profiler_record("{}-postp#{}_1".format(op_info_prefix, tid))
+            self._profiler_record("postp#{}_1".format(op_info_prefix))
            if error_channeldata is not None:
-                self._push_to_output_channels(error_channeldata,
+                try:
-                                              output_channels)
+                    self._push_to_output_channels(
+                        error_channeldata,
+                        output_channels,
+                        client_need_profile=client_need_profile,
+                        profile_set=profile_set)
+                except ChannelStopError:
+                    _LOGGER.debug(log("stop."))
+                    break
                continue
            # push data to channel (if run succ)
-            self._profiler_record("{}-push#{}_0".format(op_info_prefix, tid))
+            #self._profiler_record("push#{}_0".format(op_info_prefix))
-            self._push_to_output_channels(output_data, output_channels)
+            try:
-            self._profiler_record("{}-push#{}_1".format(op_info_prefix, tid))
+                self._push_to_output_channels(
+                    output_data,
+                    output_channels,
+                    client_need_profile=client_need_profile,
+                    profile_set=profile_set)
+            except ChannelStopError:
+                _LOGGER.debug(log("stop."))
+                break
+            #self._profiler_record("push#{}_1".format(op_info_prefix))
    def _log(self, info):
        return "{} {}".format(self.name, info)
@@ -379,12 +482,15 @@ class Op(object):
 class RequestOp(Op):
    """ RequestOp do not run preprocess, process, postprocess. """
-    def __init__(self, concurrency=1):
+    def __init__(self):
-        # PipelineService.name = "#G"
+        # PipelineService.name = "@G"
-        super(RequestOp, self).__init__(
+        super(RequestOp, self).__init__(name="@G", input_ops=[])
-            name="#G", input_ops=[], concurrency=concurrency)
+        # init op
-        # load user resources
+        try:
-        self.load_user_resources()
+            self.init_op()
+        except Exception as e:
+            _LOGGER.error(e)
+            os._exit(-1)
    def unpack_request_package(self, request):
        dictdata = {}
@@ -401,11 +507,14 @@ class RequestOp(Op):
 class ResponseOp(Op):
    """ ResponseOp do not run preprocess, process, postprocess. """
-    def __init__(self, input_ops, concurrency=1):
+    def __init__(self, input_ops):
-        super(ResponseOp, self).__init__(
+        super(ResponseOp, self).__init__(name="@R", input_ops=input_ops)
-            name="#R", input_ops=input_ops, concurrency=concurrency)
+        # init op
-        # load user resources
+        try:
-        self.load_user_resources()
+            self.init_op()
+        except Exception as e:
+            _LOGGER.error(e)
+            os._exit(-1)
    def pack_response_package(self, channeldata):
        resp = pipeline_service_pb2.Response()
@@ -415,6 +524,7 @@ class ResponseOp(Op):
                feed = channeldata.parse()
                # ndarray to string:
                # https://stackoverflow.com/questions/30167538/convert-a-numpy-ndarray-to-stringor-bytes-and-convert-it-back-to-numpy-ndarray
+                np.set_printoptions(threshold=np.nan)
                for name, var in feed.items():
                    resp.value.append(var.__repr__())
                    resp.key.append(name)
@@ -450,17 +560,26 @@ class VirtualOp(Op):
    def add_virtual_pred_op(self, op):
        self._virtual_pred_ops.append(op)
+    def _actual_pred_op_names(self, op):
+        if not isinstance(op, VirtualOp):
+            return [op.name]
+        names = []
+        for x in op._virtual_pred_ops:
+            names.extend(self._actual_pred_op_names(x))
+        return names
    def add_output_channel(self, channel):
        if not isinstance(channel, (ThreadChannel, ProcessChannel)):
            raise TypeError(
                self._log('output channel must be Channel type, not {}'.format(
                    type(channel))))
        for op in self._virtual_pred_ops:
-            channel.add_producer(op.name)
+            for op_name in self._actual_pred_op_names(op):
+                channel.add_producer(op_name)
        self._outputs.append(channel)
-    def _run(self, concurrency_idx, input_channel, output_channels,
+    def _run(self, concurrency_idx, input_channel, output_channels, client_type,
-             client_type):
+             is_thread_op):
        def get_log_func(op_info_prefix):
            def log_func(info_str):
                return "{} {}".format(op_info_prefix, info_str)
@@ -471,14 +590,17 @@ class VirtualOp(Op):
        log = get_log_func(op_info_prefix)
        tid = threading.current_thread().ident
-        self._is_run = True
+        while True:
-        while self._is_run:
+            try:
-            self._profiler_record("{}-get#{}_0".format(op_info_prefix, tid))
                channeldata_dict = input_channel.front(self.name)
-            self._profiler_record("{}-get#{}_1".format(op_info_prefix, tid))
+            except ChannelStopError:
+                _LOGGER.debug(log("stop."))
+                break
-            self._profiler_record("{}-push#{}_0".format(op_info_prefix, tid))
+            try:
                for name, data in channeldata_dict.items():
                    self._push_to_output_channels(
                        data, channels=output_channels, name=name)
-            self._profiler_record("{}-push#{}_1".format(op_info_prefix, tid))
+            except ChannelStopError:
+                _LOGGER.debug(log("stop."))
+                break
--- a/python/pipeline/pipeline_client.py
+++ b/python/pipeline/pipeline_client.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # pylint: disable=doc-string-missing
 import grpc
+import sys
 import numpy as np
 from numpy import *
 import logging
@@ -20,19 +21,25 @@ import functools
 from .proto import pipeline_service_pb2
 from .proto import pipeline_service_pb2_grpc
-_LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger()
 class PipelineClient(object):
    def __init__(self):
        self._channel = None
+        self._profile_key = "pipeline.profile"
+        self._profile_value = "1"
-    def connect(self, endpoint):
+    def connect(self, endpoints):
-        self._channel = grpc.insecure_channel(endpoint)
+        options = [('grpc.max_receive_message_length', 512 * 1024 * 1024),
+                   ('grpc.max_send_message_length', 512 * 1024 * 1024),
+                   ('grpc.lb_policy_name', 'round_robin')]
+        g_endpoint = 'ipv4:{}'.format(','.join(endpoints))
+        self._channel = grpc.insecure_channel(g_endpoint, options=options)
        self._stub = pipeline_service_pb2_grpc.PipelineServiceStub(
            self._channel)
-    def _pack_request_package(self, feed_dict):
+    def _pack_request_package(self, feed_dict, profile):
        req = pipeline_service_pb2.Request()
        for key, value in feed_dict.items():
            req.key.append(key)
@@ -45,6 +52,9 @@ class PipelineClient(object):
            else:
                raise TypeError("only str and np.ndarray type is supported: {}".
                                format(type(value)))
+        if profile:
+            req.key.append(self._profile_key)
+            req.value.append(self._profile_value)
        return req
    def _unpack_response_package(self, resp, fetch):
@@ -52,7 +62,11 @@ class PipelineClient(object):
            return {"ecode": resp.ecode, "error_info": resp.error_info}
        fetch_map = {"ecode": resp.ecode}
        for idx, key in enumerate(resp.key):
-            if key not in fetch:
+            if key == self._profile_key:
+                if resp.value[idx] != "":
+                    sys.stderr.write(resp.value[idx])
+                continue
+            if fetch is not None and key not in fetch:
                continue
            data = resp.value[idx]
            try:
@@ -62,16 +76,16 @@ class PipelineClient(object):
            fetch_map[key] = data
        return fetch_map
-    def predict(self, feed_dict, fetch, asyn=False):
+    def predict(self, feed_dict, fetch=None, asyn=False, profile=False):
        if not isinstance(feed_dict, dict):
            raise TypeError(
                "feed must be dict type with format: {name: value}.")
-        if not isinstance(fetch, list):
+        if fetch is not None and not isinstance(fetch, list):
            raise TypeError("fetch must be list type with format: [name].")
-        req = self._pack_request_package(feed_dict)
+        req = self._pack_request_package(feed_dict, profile)
        if not asyn:
            resp = self._stub.inference(req)
-            return self._unpack_response_package(resp)
+            return self._unpack_response_package(resp, fetch)
        else:
            call_future = self._stub.inference.future(req)
            return PipelinePredictFuture(

--- a/python/pipeline/pipeline_server.py
+++ b/python/pipeline/pipeline_server.py
@@ -12,370 +12,65 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=doc-string-missing
-import threading
-import multiprocessing
-import multiprocessing.queues
-import sys
-if sys.version_info.major == 2:
-    import Queue
-elif sys.version_info.major == 3:
-    import queue as Queue
-else:
-    raise Exception("Error Python version")
-import os
-from paddle_serving_client import MultiLangClient, Client
 from concurrent import futures
-import numpy as np
 import grpc
 import logging
-import random
-import time
-import func_timeout
-import enum
-import collections
-import copy
 import socket
+import contextlib
 from contextlib import closing
+import multiprocessing
 import yaml
-from .proto import pipeline_service_pb2
 from .proto import pipeline_service_pb2_grpc
-from .operator import Op, RequestOp, ResponseOp, VirtualOp
+from .operator import ResponseOp
-from .channel import ThreadChannel, ProcessChannel, ChannelData, ChannelDataEcode, ChannelDataType
+from .dag import DAGExecutor
-from .profiler import TimeProfiler
-from .util import NameGenerator
-_LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger()
-_profiler = TimeProfiler()
 class PipelineService(pipeline_service_pb2_grpc.PipelineServiceServicer):
-    def __init__(self, in_channel, out_channel, unpack_func, pack_func,
+    def __init__(self, response_op, dag_config, show_info):
-                 retry=2):
        super(PipelineService, self).__init__()
-        self.name = "#G"
+        # init dag executor
-        self.set_in_channel(in_channel)
+        self._dag_executor = DAGExecutor(
-        self.set_out_channel(out_channel)
+            response_op, dag_config, show_info=show_info)
-        _LOGGER.debug(self._log(in_channel.debug()))
+        self._dag_executor.start()
-        _LOGGER.debug(self._log(out_channel.debug()))
-        #TODO: 
-        #  multi-lock for different clients
-        #  diffenert lock for server and client
-        self._id_lock = threading.Lock()
-        self._cv = threading.Condition()
-        self._globel_resp_dict = {}
-        self._id_counter = 0
-        self._reset_max_id = 1000000000000000000
-        self._retry = retry
-        self._is_run = True
-        self._pack_func = pack_func
-        self._unpack_func = unpack_func
-        self._recive_func = threading.Thread(
-            target=PipelineService._recive_out_channel_func, args=(self, ))
-        self._recive_func.start()
-    def _log(self, info_str):
-        return "[{}] {}".format(self.name, info_str)
-    def set_in_channel(self, in_channel):
-        if not isinstance(in_channel, (ThreadChannel, ProcessChannel)):
-            raise TypeError(
-                self._log('in_channel must be Channel type, but get {}'.format(
-                    type(in_channel))))
-        in_channel.add_producer(self.name)
-        self._in_channel = in_channel
-    def set_out_channel(self, out_channel):
-        if not isinstance(out_channel, (ThreadChannel, ProcessChannel)):
-            raise TypeError(
-                self._log('out_channel must be Channel type, but get {}'.format(
-                    type(out_channel))))
-        out_channel.add_consumer(self.name)
-        self._out_channel = out_channel
-    def stop(self):
-        self._is_run = False
-    def _recive_out_channel_func(self):
-        while self._is_run:
-            channeldata_dict = self._out_channel.front(self.name)
-            if len(channeldata_dict) != 1:
-                raise Exception("out_channel cannot have multiple input ops")
-            (_, channeldata), = channeldata_dict.items()
-            if not isinstance(channeldata, ChannelData):
-                raise TypeError(
-                    self._log('data must be ChannelData type, but get {}'.
-                              format(type(channeldata))))
-            with self._cv:
-                data_id = channeldata.id
-                self._globel_resp_dict[data_id] = channeldata
-                self._cv.notify_all()
-    def _get_next_id(self):
-        with self._id_lock:
-            if self._id_counter >= self._reset_max_id:
-                self._id_counter -= self._reset_max_id
-            self._id_counter += 1
-            return self._id_counter - 1
-    def _get_data_in_globel_resp_dict(self, data_id):
-        resp = None
-        with self._cv:
-            while data_id not in self._globel_resp_dict:
-                self._cv.wait()
-            resp = self._globel_resp_dict.pop(data_id)
-            self._cv.notify_all()
-        return resp
-    def _pack_data_for_infer(self, request):
-        _LOGGER.debug(self._log('start inferce'))
-        data_id = self._get_next_id()
-        dictdata = None
-        try:
-            dictdata = self._unpack_func(request)
-        except Exception as e:
-            return ChannelData(
-                ecode=ChannelDataEcode.RPC_PACKAGE_ERROR.value,
-                error_info="rpc package error: {}".format(e),
-                data_id=data_id), data_id
-        else:
-            return ChannelData(
-                datatype=ChannelDataType.DICT.value,
-                dictdata=dictdata,
-                data_id=data_id), data_id
-    def _pack_data_for_resp(self, channeldata):
-        _LOGGER.debug(self._log('get channeldata'))
-        return self._pack_func(channeldata)
    def inference(self, request, context):
-        _profiler.record("{}-prepack_0".format(self.name))
+        resp = self._dag_executor.call(request)
-        data, data_id = self._pack_data_for_infer(request)
+        return resp
-        _profiler.record("{}-prepack_1".format(self.name))
-        resp_channeldata = None
-        for i in range(self._retry):
-            _LOGGER.debug(self._log('push data'))
-            _profiler.record("{}-push_0".format(self.name))
-            self._in_channel.push(data, self.name)
-            _profiler.record("{}-push_1".format(self.name))
-            _LOGGER.debug(self._log('wait for infer'))
+    def __del__(self):
-            _profiler.record("{}-fetch_0".format(self.name))
+        self._dag_executor.stop()
-            resp_channeldata = self._get_data_in_globel_resp_dict(data_id)
-            _profiler.record("{}-fetch_1".format(self.name))
-            if resp_channeldata.ecode == ChannelDataEcode.OK.value:
-                break
-            if i + 1 < self._retry:
-                _LOGGER.warn("retry({}): {}".format(
-                    i + 1, resp_channeldata.error_info))
-        _profiler.record("{}-postpack_0".format(self.name))
+@contextlib.contextmanager
-        resp = self._pack_data_for_resp(resp_channeldata)
+def _reserve_port(port):
-        _profiler.record("{}-postpack_1".format(self.name))
+    """Find and reserve a port for all subprocesses to use."""
-        _profiler.print_profile()
+    sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
-        return resp
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
+    if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
+        raise RuntimeError("Failed to set SO_REUSEPORT.")
+    sock.bind(('', port))
+    try:
+        yield sock.getsockname()[1]
+    finally:
+        sock.close()
 class PipelineServer(object):
    def __init__(self):
-        self._channels = []
-        self._actual_ops = []
        self._port = None
        self._worker_num = None
-        self._in_channel = None
-        self._out_channel = None
        self._response_op = None
-        self._pack_func = None
-        self._unpack_func = None
-    def add_channel(self, channel):
-        self._channels.append(channel)
-    def gen_desc(self):
-        _LOGGER.info('here will generate desc for PAAS')
-        pass
    def set_response_op(self, response_op):
-        if not isinstance(response_op, Op):
+        if not isinstance(response_op, ResponseOp):
-            raise Exception("response_op must be Op type.")
+            raise Exception("response_op must be ResponseOp type.")
        if len(response_op.get_input_ops()) != 1:
            raise Exception("response_op can only have one previous op.")
        self._response_op = response_op
-    def _topo_sort(self, response_op):
-        if response_op is None:
-            raise Exception("response_op has not been set.")
-        def get_use_ops(root):
-            # root: response_op
-            unique_names = set()
-            use_ops = set()
-            succ_ops_of_use_op = {}  # {op_name: succ_ops}
-            que = Queue.Queue()
-            que.put(root)
-            #use_ops.add(root)
-            #unique_names.add(root.name)
-            while que.qsize() != 0:
-                op = que.get()
-                for pred_op in op.get_input_ops():
-                    if pred_op.name not in succ_ops_of_use_op:
-                        succ_ops_of_use_op[pred_op.name] = []
-                    if op != root:
-                        succ_ops_of_use_op[pred_op.name].append(op)
-                    if pred_op not in use_ops:
-                        que.put(pred_op)
-                        use_ops.add(pred_op)
-                        # check the name of op is globally unique
-                        if pred_op.name in unique_names:
-                            raise Exception("the name of Op must be unique: {}".
-                                            format(pred_op.name))
-                        unique_names.add(pred_op.name)
-            return use_ops, succ_ops_of_use_op
-        use_ops, out_degree_ops = get_use_ops(response_op)
-        if len(use_ops) <= 1:
-            raise Exception(
-                "Besides RequestOp and ResponseOp, there should be at least one Op in DAG."
-            )
-        name2op = {op.name: op for op in use_ops}
-        out_degree_num = {
-            name: len(ops)
-            for name, ops in out_degree_ops.items()
-        }
-        que_idx = 0  # scroll queue 
-        ques = [Queue.Queue() for _ in range(2)]
-        zero_indegree_num = 0
-        for op in use_ops:
-            if len(op.get_input_ops()) == 0:
-                zero_indegree_num += 1
-        if zero_indegree_num != 1:
-            raise Exception("DAG contains multiple input Ops")
-        last_op = response_op.get_input_ops()[0]
-        ques[que_idx].put(last_op)
-        # topo sort to get dag_views
-        dag_views = []
-        sorted_op_num = 0
-        while True:
-            que = ques[que_idx]
-            next_que = ques[(que_idx + 1) % 2]
-            dag_view = []
-            while que.qsize() != 0:
-                op = que.get()
-                dag_view.append(op)
-                sorted_op_num += 1
-                for pred_op in op.get_input_ops():
-                    out_degree_num[pred_op.name] -= 1
-                    if out_degree_num[pred_op.name] == 0:
-                        next_que.put(pred_op)
-            dag_views.append(dag_view)
-            if next_que.qsize() == 0:
-                break
-            que_idx = (que_idx + 1) % 2
-        if sorted_op_num < len(use_ops):
-            raise Exception("not legal DAG")
-        # create channels and virtual ops
-        def gen_channel(name_gen):
-            channel = None
-            if self._use_multithread:
-                channel = ThreadChannel(name=name_gen.next())
-            else:
-                channel = ProcessChannel(self._manager, name=name_gen.next())
-            return channel
-        def gen_virtual_op(name_gen):
-            return VirtualOp(name=name_gen.next())
-        virtual_op_name_gen = NameGenerator("vir")
-        channel_name_gen = NameGenerator("chl")
-        virtual_ops = []
-        channels = []
-        input_channel = None
-        actual_view = None
-        dag_views = list(reversed(dag_views))
-        for v_idx, view in enumerate(dag_views):
-            if v_idx + 1 >= len(dag_views):
-                break
-            next_view = dag_views[v_idx + 1]
-            if actual_view is None:
-                actual_view = view
-            actual_next_view = []
-            pred_op_of_next_view_op = {}
-            for op in actual_view:
-                # find actual succ op in next view and create virtual op
-                for succ_op in out_degree_ops[op.name]:
-                    if succ_op in next_view:
-                        if succ_op not in actual_next_view:
-                            actual_next_view.append(succ_op)
-                        if succ_op.name not in pred_op_of_next_view_op:
-                            pred_op_of_next_view_op[succ_op.name] = []
-                        pred_op_of_next_view_op[succ_op.name].append(op)
-                    else:
-                        # create virtual op
-                        virtual_op = gen_virtual_op(virtual_op_name_gen)
-                        virtual_ops.append(virtual_op)
-                        out_degree_ops[virtual_op.name] = [succ_op]
-                        actual_next_view.append(virtual_op)
-                        pred_op_of_next_view_op[virtual_op.name] = [op]
-                        virtual_op.add_virtual_pred_op(op)
-            actual_view = actual_next_view
-            # create channel
-            processed_op = set()
-            for o_idx, op in enumerate(actual_next_view):
-                if op.name in processed_op:
-                    continue
-                channel = gen_channel(channel_name_gen)
-                channels.append(channel)
-                _LOGGER.debug("{} => {}".format(channel.name, op.name))
-                op.add_input_channel(channel)
-                pred_ops = pred_op_of_next_view_op[op.name]
-                if v_idx == 0:
-                    input_channel = channel
-                else:
-                    # if pred_op is virtual op, it will use ancestors as producers to channel
-                    for pred_op in pred_ops:
-                        _LOGGER.debug("{} => {}".format(pred_op.name,
-                                                        channel.name))
-                        pred_op.add_output_channel(channel)
-                processed_op.add(op.name)
-                # find same input op to combine channel
-                for other_op in actual_next_view[o_idx + 1:]:
-                    if other_op.name in processed_op:
-                        continue
-                    other_pred_ops = pred_op_of_next_view_op[other_op.name]
-                    if len(other_pred_ops) != len(pred_ops):
-                        continue
-                    same_flag = True
-                    for pred_op in pred_ops:
-                        if pred_op not in other_pred_ops:
-                            same_flag = False
-                            break
-                    if same_flag:
-                        _LOGGER.debug("{} => {}".format(channel.name,
-                                                        other_op.name))
-                        other_op.add_input_channel(channel)
-                        processed_op.add(other_op.name)
-        output_channel = gen_channel(channel_name_gen)
-        channels.append(output_channel)
-        last_op.add_output_channel(output_channel)
-        pack_func, unpack_func = None, None
-        pack_func = self._response_op.pack_response_package
-        self._actual_ops = virtual_ops
-        for op in use_ops:
-            if len(op.get_input_ops()) == 0:
-                unpack_func = op.unpack_request_package
-                continue
-            self._actual_ops.append(op)
-        self._channels = channels
-        for c in channels:
-            _LOGGER.debug(c.debug())
-        return input_channel, output_channel, pack_func, unpack_func
    def _port_is_available(self, port):
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
            sock.settimeout(2)
@@ -385,67 +80,59 @@ class PipelineServer(object):
    def prepare_server(self, yml_file):
        with open(yml_file) as f:
            yml_config = yaml.load(f.read())
-        self._port = yml_config.get('port', 8080)
+        self._port = yml_config.get('port')
+        if self._port is None:
+            raise SystemExit("Please set *port* in [{}] yaml file.".format(
+                yml_file))
        if not self._port_is_available(self._port):
            raise SystemExit("Prot {} is already used".format(self._port))
-        self._worker_num = yml_config.get('worker_num', 2)
+        self._worker_num = yml_config.get('worker_num', 1)
+        self._build_dag_each_worker = yml_config.get('build_dag_each_worker',
-        self._retry = yml_config.get('retry', 1)
+                                                     False)
-        self._client_type = yml_config.get('client_type', 'brpc')
+        _LOGGER.info("============= PIPELINE SERVER =============")
-        self._use_multithread = yml_config.get('use_multithread', True)
+        _LOGGER.info("port: {}".format(self._port))
-        profile = yml_config.get('profile', False)
+        _LOGGER.info("worker_num: {}".format(self._worker_num))
+        servicer_info = "build_dag_each_worker: {}".format(
-        if not self._use_multithread:
+            self._build_dag_each_worker)
-            self._manager = multiprocessing.Manager()
+        if self._build_dag_each_worker is True:
-            if profile:
+            servicer_info += " (Make sure that install grpcio whl with --no-binary flag)"
-                raise Exception(
+        _LOGGER.info(servicer_info)
-                    "profile cannot be used in multiprocess version temporarily")
+        _LOGGER.info("-------------------------------------------")
-        _profiler.enable(profile)
+        self._dag_config = yml_config.get("dag", {})
-        input_channel, output_channel, self._pack_func, self._unpack_func = self._topo_sort(
-            self._response_op)
-        self._in_channel = input_channel
-        self._out_channel = output_channel
-        for op in self._actual_ops:
-            if op.with_serving:
-                self.prepare_serving(op)
-        self.gen_desc()
-    def _run_ops(self):
-        threads_or_proces = []
-        for op in self._actual_ops:
-            op.init_profiler(_profiler)
-            if self._use_multithread:
-                threads_or_proces.extend(
-                    op.start_with_thread(self._client_type))
-            else:
-                threads_or_proces.extend(
-                    op.start_with_process(self._client_type))
-        return threads_or_proces
-    def _stop_all(self, service):
-        service.stop()
-        for op in self._actual_ops:
-            op.stop()
-        for chl in self._channels:
-            chl.stop()
    def run_server(self):
-        op_threads_or_proces = self._run_ops()
+        if self._build_dag_each_worker:
-        service = PipelineService(self._in_channel, self._out_channel,
+            with _reserve_port(self._port) as port:
-                                  self._unpack_func, self._pack_func,
+                bind_address = 'localhost:{}'.format(port)
-                                  self._retry)
+                workers = []
+                for i in range(self._worker_num):
+                    show_info = (i == 0)
+                    worker = multiprocessing.Process(
+                        target=self._run_server_func,
+                        args=(bind_address, self._response_op,
+                              self._dag_config))
+                    worker.start()
+                    workers.append(worker)
+                for worker in workers:
+                    worker.join()
+        else:
            server = grpc.server(
                futures.ThreadPoolExecutor(max_workers=self._worker_num))
-        pipeline_service_pb2_grpc.add_PipelineServiceServicer_to_server(service,
+            pipeline_service_pb2_grpc.add_PipelineServiceServicer_to_server(
+                PipelineService(self._response_op, self._dag_config, True),
                server)
            server.add_insecure_port('[::]:{}'.format(self._port))
            server.start()
            server.wait_for_termination()
-        self._stop_all()  # TODO
-        for x in op_threads_or_proces:
-            x.join()
-    def prepare_serving(self, op):
+    def _run_server_func(self, bind_address, response_op, dag_config):
-        # run a server (not in PyServing)
+        options = (('grpc.so_reuseport', 1), )
-        _LOGGER.info("run a server (not in PyServing)")
+        server = grpc.server(
+            futures.ThreadPoolExecutor(
+                max_workers=1, ), options=options)
+        pipeline_service_pb2_grpc.add_PipelineServiceServicer_to_server(
+            PipelineService(response_op, dag_config, False), server)
+        server.add_insecure_port(bind_address)
+        server.start()
+        server.wait_for_termination()
--- a/python/pipeline/profiler.py
+++ b/python/pipeline/profiler.py
@@ -23,8 +23,9 @@ elif sys.version_info.major == 3:
 else:
    raise Exception("Error Python version")
 import time
+import threading
-_LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger()
 class TimeProfiler(object):
@@ -33,6 +34,7 @@ class TimeProfiler(object):
        self._print_head = 'PROFILE\tpid:{}\t'.format(self._pid)
        self._time_record = Queue.Queue()
        self._enable = False
+        self._lock = threading.Lock()
    def enable(self, enable):
        self._enable = enable
@@ -40,16 +42,24 @@ class TimeProfiler(object):
    def record(self, name_with_tag):
        if self._enable is False:
            return
+        timestamp = int(round(time.time() * 1000000))
        name_with_tag = name_with_tag.split("_")
        tag = name_with_tag[-1]
        name = '_'.join(name_with_tag[:-1])
-        self._time_record.put((name, tag, int(round(time.time() * 1000000))))
+        with self._lock:
+            self._time_record.put((name, tag, timestamp))
    def print_profile(self):
+        if self._enable is False:
+            return
+        sys.stderr.write(self.gen_profile_str())
+    def gen_profile_str(self):
        if self._enable is False:
            return
        print_str = self._print_head
        tmp = {}
+        with self._lock:
            while not self._time_record.empty():
                name, tag, timestamp = self._time_record.get()
                if name in tmp:
@@ -58,8 +68,8 @@ class TimeProfiler(object):
                    print_str += "{}_{}:{} ".format(name, tag, timestamp)
                else:
                    tmp[name] = (tag, timestamp)
-        print_str += "\n"
+            print_str = "\n{}\n".format(print_str)
-        sys.stderr.write(print_str)
            for name, item in tmp.items():
                tag, timestamp = item
                self._time_record.put((name, tag, timestamp))
+            return print_str
--- a/python/requirements.txt
+++ b/python/requirements.txt
 numpy>=1.12, <=1.16.4 ; python_version<"3.5"
+protobuf>=3.12.2
 grpcio-tools>=1.28.1
 grpcio>=1.28.1
 func-timeout>=4.3.5
+pyyaml>=1.3.0
--- a/python/setup.py.client.in
+++ b/python/setup.py.client.in
@@ -58,7 +58,7 @@ if '${PACK}' == 'ON':
 REQUIRED_PACKAGES = [
-    'six >= 1.10.0', 'protobuf >= 3.1.0', 'numpy >= 1.12', 'grpcio >= 1.28.1',
+    'six >= 1.10.0', 'protobuf >= 3.11.0', 'numpy >= 1.12', 'grpcio >= 1.28.1',
    'grpcio-tools >= 1.28.1'
 ]

--- a/python/setup.py.server.in
+++ b/python/setup.py.server.in
@@ -37,7 +37,7 @@ def python_version():
 max_version, mid_version, min_version = python_version()
 REQUIRED_PACKAGES = [
-    'six >= 1.10.0', 'protobuf >= 3.1.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1',
+    'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1',
    'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app'
 ]

--- a/python/setup.py.server_gpu.in
+++ b/python/setup.py.server_gpu.in
@@ -37,7 +37,7 @@ def python_version():
 max_version, mid_version, min_version = python_version()
 REQUIRED_PACKAGES = [
-    'six >= 1.10.0', 'protobuf >= 3.1.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1',
+    'six >= 1.10.0', 'protobuf >= 3.11.0', 'grpcio >= 1.28.1', 'grpcio-tools >= 1.28.1',
    'paddle_serving_client', 'flask >= 1.1.1', 'paddle_serving_app'
 ]

--- a/tools/Dockerfile.centos6.gpu.devel
+++ b/tools/Dockerfile.centos6.gpu.devel
--- a/tools/Dockerfile.ci
+++ b/tools/Dockerfile.ci
 FROM centos:7.3.1611
 RUN yum -y install wget >/dev/null \
    && yum -y install gcc gcc-c++ make glibc-static which >/dev/null \
    && yum -y install git openssl-devel curl-devel bzip2-devel python-devel >/dev/null \
    && yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false \
    && yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \
-    && yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false \
+    && yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false
-    && wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
+RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
    && tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \
    && mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \
    && echo 'export PATH=/usr/local/cmake3.2.0/bin:$PATH' >> /root/.bashrc \
-    && rm cmake-3.2.0-Linux-x86_64.tar.gz \
+    && rm cmake-3.2.0-Linux-x86_64.tar.gz
-    && wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
+RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
    && tar xzf go1.14.linux-amd64.tar.gz \
    && mv go /usr/local/go \
    && echo 'export GOROOT=/usr/local/go' >> /root/.bashrc \
    && echo 'export PATH=/usr/local/go/bin:$PATH' >> /root/.bashrc \
-    && rm go1.14.linux-amd64.tar.gz \
+    && rm go1.14.linux-amd64.tar.gz
-    && yum -y install python-devel sqlite-devel >/dev/null \
+RUN yum -y install python-devel sqlite-devel >/dev/null \
    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
    && python get-pip.py >/dev/null \
    && pip install google protobuf setuptools wheel flask >/dev/null \
-    && rm get-pip.py \
+    && rm get-pip.py
-    && wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2 \
+RUN wget http://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.bz2 \
    && yum -y install bzip2 >/dev/null \
    && tar -jxf patchelf-0.10.tar.bz2 \
    && cd patchelf-0.10 \
    && ./configure --prefix=/usr \
    && make >/dev/null && make install >/dev/null \
    && cd .. \
-    && rm -rf patchelf-0.10* \
+    && rm -rf patchelf-0.10*
-    && yum install -y python3 python3-devel \
-    && pip3 install google protobuf setuptools wheel flask \
+RUN yum install -y python3 python3-devel \
-    && yum -y update >/dev/null \
+    && pip3 install google protobuf setuptools wheel flask
+RUN yum -y update >/dev/null \
    && yum -y install dnf >/dev/null \
    && yum -y install dnf-plugins-core >/dev/null \
    && dnf copr enable alonid/llvm-3.8.0 -y \
    && dnf install llvm-3.8.0 clang-3.8.0 compiler-rt-3.8.0 -y \
    && echo 'export PATH=/opt/llvm-3.8.0/bin:$PATH' >> /root/.bashrc
+RUN yum install -y java \
+    && wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo \
+    && yum install -y apache-maven
+RUN yum install -y lsof
--- a/tools/Dockerfile.cuda10.0-cudnn7
+++ b/tools/Dockerfile.cuda10.0-cudnn7
+FROM nvidia/cuda:10.0-cudnn7-devel-centos7 as builder
+FROM nvidia/cuda:10.0-cudnn7-runtime-centos7
+RUN yum -y install wget && \
+    yum -y install epel-release && yum -y install patchelf && \
+    yum -y install gcc gcc-c++ make python-devel && \
+    yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false && \
+    yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false && \
+    yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && \
+    yum -y install python3 python3-devel && \
+    yum clean all
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    python get-pip.py && rm get-pip.py
+RUN ln -s /usr/local/cuda-10.0/lib64/libcublas.so.10.0 /usr/local/cuda-10.0/lib64/libcublas.so && \
+    echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> /root/.bashrc && \
+    ln -s /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudnn.so.7 /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudnn.so && \
+    echo 'export LD_LIBRARY_PATH=/usr/local/cuda-10.0/targets/x86_64-linux/lib:$LD_LIBRARY_PATH' >> /root/.bashrc && \
+    echo "export LANG=en_US.utf8" >> /root/.bashrc && \
+    mkdir -p /usr/local/cuda/extras
+COPY --from=builder /usr/local/cuda/extras/CUPTI /usr/local/cuda/extras/CUPTI
--- a/tools/Dockerfile.cuda10.0-cudnn7.devel
+++ b/tools/Dockerfile.cuda10.0-cudnn7.devel
+FROM nvidia/cuda:10.0-cudnn7-runtime-centos7
+RUN yum -y install wget >/dev/null \
+    && yum -y install gcc gcc-c++ make glibc-static which  \
+    && yum -y install git openssl-devel curl-devel bzip2-devel python-devel \
+    && yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false \
+    && yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \
+    && yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false
+RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
+    && tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \
+    && mv cmake-3.2.0-Linux-x86_64 /usr/local/cmake3.2.0 \
+    && echo 'export PATH=/usr/local/cmake3.2.0/bin:$PATH' >> /root/.bashrc \
+    && rm cmake-3.2.0-Linux-x86_64.tar.gz 
+RUN wget https://dl.google.com/go/go1.14.linux-amd64.tar.gz >/dev/null \
+    && tar xzf go1.14.linux-amd64.tar.gz \
+    && mv go /usr/local/go \
+    && echo 'export GOROOT=/usr/local/go' >> /root/.bashrc \
+    && echo 'export PATH=/usr/local/go/bin:$PATH' >> /root/.bashrc \
+    && rm go1.14.linux-amd64.tar.gz 
+RUN yum -y install python-devel sqlite-devel  \
+    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py >/dev/null \
+    && python get-pip.py >/dev/null \
+    && pip install google protobuf setuptools wheel flask >/dev/null \
+    && rm get-pip.py 
+RUN yum install -y python3 python3-devel \
+    && pip3 install google protobuf setuptools wheel flask \
+    && yum -y install epel-release && yum -y install patchelf libXext libSM libXrender\
+    && yum clean all 
+RUN localedef -c -i en_US -f UTF-8 en_US.UTF-8 \
+    && echo "export LANG=en_US.utf8" >> /root/.bashrc
--- a/tools/Dockerfile.gpu
+++ b/tools/Dockerfile.gpu
@@ -3,7 +3,7 @@ FROM nvidia/cuda:9.0-cudnn7-devel-centos7 as builder
 FROM nvidia/cuda:9.0-cudnn7-runtime-centos7
 RUN yum -y install wget && \
    yum -y install epel-release && yum -y install patchelf && \
-    yum -y install gcc make python-devel && \
+    yum -y install gcc gcc-c++ make python-devel && \
    yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false && \
    yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false && \
    yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false && \

--- a/tools/Dockerfile.gpu.devel
+++ b/tools/Dockerfile.gpu.devel
 FROM nvidia/cuda:9.0-cudnn7-devel-centos7
 RUN yum -y install wget >/dev/null \
    && yum -y install gcc gcc-c++ make glibc-static which  \
-    && yum -y install git openssl-devel curl-devel bzip2-devel python-devel
+    && yum -y install git openssl-devel curl-devel bzip2-devel python-devel \
+    && yum -y install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false \
+    && yum -y install libXrender-0.9.10-1.el7.x86_64 --setopt=protected_multilib=false \
+    && yum -y install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false
 RUN wget https://cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.tar.gz >/dev/null \
    && tar xzf cmake-3.2.0-Linux-x86_64.tar.gz \

--- a/tools/serving_build.sh
+++ b/tools/serving_build.sh
@@ -61,7 +61,7 @@ function build_app() {
                  -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
                  -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
                  -DAPP=ON ..
-            rerun "make -j2 >/dev/null" 3 # due to some network reasons, compilation may fail
+            rerun "make -j10 >/dev/null" 3 # due to some network reasons, compilation may fail
            pip install -U python/dist/paddle_serving_app* >/dev/null
            ;;
        *)
@@ -84,7 +84,7 @@ function build_client() {
                  -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
                  -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
                  -DCLIENT=ON ..
-            rerun "make -j2 >/dev/null" 3 # due to some network reasons, compilation may fail
+            rerun "make -j10 >/dev/null" 3 # due to some network reasons, compilation may fail
            pip install -U python/dist/paddle_serving_client* >/dev/null
            ;;
        *)
@@ -108,7 +108,7 @@ function build_server() {
                  -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so \
                  -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
                  -DSERVER=ON ..
-            rerun "make -j2 >/dev/null" 3 # due to some network reasons, compilation may fail
+            rerun "make -j10 >/dev/null" 3 # due to some network reasons, compilation may fail
            check_cmd "make install -j2 >/dev/null"
            pip install -U python/dist/paddle_serving_server* >/dev/null
            ;;
@@ -118,7 +118,7 @@ function build_server() {
                  -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
                  -DSERVER=ON \
                  -DWITH_GPU=ON ..
-            rerun "make -j2 >/dev/null" 3 # due to some network reasons, compilation may fail
+            rerun "make -j10 >/dev/null" 3 # due to some network reasons, compilation may fail
            check_cmd "make install -j2 >/dev/null"
            pip install -U python/dist/paddle_serving_server* >/dev/null
            ;;
@@ -137,6 +137,15 @@ function kill_server_process() {
    sleep 1
 }
+function kill_process_by_port() {
+    if [ $# != 1 ]; then
+        echo "usage: kill_process_by_port <PID>"
+        exit 1
+    fi
+    local PID=$1
+    lsof -i:$PID | awk 'NR == 1 {next} {print $2}' | xargs kill
+}
 function python_test_fit_a_line() {
    # pwd: /Serving/python/examples
    cd fit_a_line # pwd: /Serving/python/examples/fit_a_line
@@ -182,26 +191,26 @@ function python_test_fit_a_line() {
            kill_server_process
            # test web
-            unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
+            #unsetproxy # maybe the proxy is used on iPipe, which makes web-test failed.
-            check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 2 --gpu_ids 0 --name uci > /dev/null &"
+            #check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 2 --gpu_ids 0 --name uci > /dev/null &"
-            sleep 5 # wait for the server to start
+            #sleep 5 # wait for the server to start
-            check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
+            #check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
            # check http code
-            http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
+            #http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
-            if [ ${http_code} -ne 200 ]; then
+            #if [ ${http_code} -ne 200 ]; then
-                echo "HTTP status code -ne 200"
+            #    echo "HTTP status code -ne 200"
-                exit 1
+            #    exit 1
-            fi
+            #fi
            # test web batch
-            check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
+            #check_cmd "curl -H \"Content-Type:application/json\" -X POST -d '{\"feed\":[{\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {\"x\": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], \"fetch\":[\"price\"]}' http://127.0.0.1:9393/uci/prediction"
            # check http code
-            http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
+            #http_code=`curl -H "Content-Type:application/json" -X POST -d '{"feed":[{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}, {"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}], "fetch":["price"]}' -s -w "%{http_code}" -o /dev/null http://127.0.0.1:9393/uci/prediction`
-            if [ ${http_code} -ne 200 ]; then
+            #if [ ${http_code} -ne 200 ]; then
-                echo "HTTP status code -ne 200"
+            #    echo "HTTP status code -ne 200"
-                exit 1
+            #    exit 1
-            fi
+            #fi
-            setproxy # recover proxy state
+            #setproxy # recover proxy state
-            kill_server_process
+            #kill_server_process
            ;;
        *)
            echo "error type"
@@ -229,10 +238,7 @@ function python_run_criteo_ctr_with_cube() {
            check_cmd "mv models/data ./cube/"
            check_cmd "mv models/ut_data ./"
            cp ../../../build-server-$TYPE/output/bin/cube* ./cube/
-            mkdir -p $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server/serving-cpu-avx-openblas-0.1.3/
-            yes | cp ../../../build-server-$TYPE/output/demo/serving/bin/serving $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server/serving-cpu-avx-openblas-0.1.3/
            sh cube_prepare.sh &
-            check_cmd "mkdir work_dir1 && cp cube/conf/cube.conf ./work_dir1/"
            python test_server.py ctr_serving_model_kv &
            sleep 5
            check_cmd "python test_client.py ctr_client_conf/serving_client_conf.prototxt ./ut_data >score"
@@ -257,10 +263,7 @@ function python_run_criteo_ctr_with_cube() {
            check_cmd "mv models/data ./cube/"
            check_cmd "mv models/ut_data ./"
            cp ../../../build-server-$TYPE/output/bin/cube* ./cube/
-            mkdir -p $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server_gpu/serving-gpu-0.1.3/
-            yes | cp ../../../build-server-$TYPE/output/demo/serving/bin/serving $PYTHONROOT/lib/python2.7/site-packages/paddle_serving_server_gpu/serving-gpu-0.1.3/
            sh cube_prepare.sh &
-            check_cmd "mkdir work_dir1 && cp cube/conf/cube.conf ./work_dir1/"
            python test_server_gpu.py ctr_serving_model_kv &
            sleep 5
            # for warm up
@@ -505,6 +508,64 @@ function python_test_lac() {
    cd ..
 }
+function java_run_test() {
+    # pwd: /Serving
+    local TYPE=$1
+    export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving
+    unsetproxy
+    case $TYPE in
+        CPU)
+            # compile java sdk
+            cd java # pwd: /Serving/java
+            mvn compile > /dev/null
+            mvn install > /dev/null
+            # compile java sdk example
+            cd examples # pwd: /Serving/java/examples
+            mvn compile > /dev/null
+            mvn install > /dev/null
+            # fit_a_line (general, asyn_predict, batch_predict)
+            cd ../../python/examples/grpc_impl_example/fit_a_line # pwd: /Serving/python/examples/grpc_impl_example/fit_a_line
+            sh get_data.sh
+            check_cmd "python -m paddle_serving_server.serve --model uci_housing_model --port 9393 --thread 4 --use_multilang > /dev/null &"
+            sleep 5 # wait for the server to start
+            cd ../../../java/examples # /Serving/java/examples
+            java -cp target/paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample fit_a_line
+            java -cp target/paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample asyn_predict
+            java -cp target/paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample batch_predict
+            kill_server_process
+            # imdb (model_ensemble)
+            cd ../../python/examples/grpc_impl_example/imdb # pwd: /Serving/python/examples/grpc_impl_example/imdb
+            sh get_data.sh > /dev/null
+            check_cmd "python test_multilang_ensemble_server.py > /dev/null &"
+            sleep 5 # wait for the server to start
+            cd ../../../java/examples # /Serving/java/examples
+            java -cp target/paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample model_ensemble
+            kill_server_process
+            # yolov4 (int32)
+            cd ../../python/examples/grpc_impl_example/yolov4 # pwd: /Serving/python/examples/grpc_impl_example/yolov4
+            python -m paddle_serving_app.package --get_model yolov4 > /dev/null
+            tar -xzf yolov4.tar.gz > /dev/null
+            check_cmd "python -m paddle_serving_server.serve --model yolov4_model --port 9393 --use_multilang --mem_optim > /dev/null &"
+            cd ../../../java/examples # /Serving/java/examples
+            java -cp target/paddle-serving-sdk-java-examples-0.0.1-jar-with-dependencies.jar PaddleServingClientExample yolov4 src/main/resources/000000570688.jpg
+            kill_server_process
+            cd ../../ # pwd: /Serving
+            ;;
+        GPU)
+            ;;
+        *)
+            echo "error type"
+            exit 1
+            ;;
+    esac
+    echo "java-sdk $TYPE part finished as expected."
+    setproxy
+    unset SERVING_BIN
+}
 function python_test_grpc_impl() {
    # pwd: /Serving/python/examples
    cd grpc_impl_example # pwd: /Serving/python/examples/grpc_impl_example
@@ -527,6 +588,7 @@ function python_test_grpc_impl() {
            check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
+            kill_process_by_port 9393
            check_cmd "python test_server.py uci_housing_model > /dev/null &"
            sleep 5 # wait for the server to start
@@ -537,13 +599,14 @@ function python_test_grpc_impl() {
            check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
+            kill_process_by_port 9393
            cd .. # pwd: /Serving/python/examples/grpc_impl_example
            # test load server config and client config in Server side
            cd criteo_ctr_with_cube # pwd: /Serving/python/examples/grpc_impl_example/criteo_ctr_with_cube
-            check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz"
+            check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz > /dev/null"
            check_cmd "tar xf ctr_cube_unittest.tar.gz"
            check_cmd "mv models/ctr_client_conf ./"
            check_cmd "mv models/ctr_serving_model_kv ./"
@@ -585,6 +648,7 @@ function python_test_grpc_impl() {
            check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
+            kill_process_by_port 9393
            check_cmd "python test_server_gpu.py uci_housing_model > /dev/null &"
            sleep 5 # wait for the server to start
@@ -595,7 +659,8 @@ function python_test_grpc_impl() {
            check_cmd "python test_batch_client.py > /dev/null"
            check_cmd "python test_timeout_client.py > /dev/null"
            kill_server_process
-            ps -ef | grep "test_server_gpu" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill
+            kill_process_by_port 9393
+            #ps -ef | grep "test_server_gpu" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill
            cd .. # pwd: /Serving/python/examples/grpc_impl_example
@@ -649,13 +714,7 @@ function python_test_yolov4(){
    cd yolov4
    case $TYPE in
        CPU)
-            python -m paddle_serving_app.package --get_model yolov4
+            echo "no implement for cpu type"
-            tar -xzvf yolov4.tar.gz
-            check_cmd "python -m paddle_serving_server.serve --model yolov4_model/ --port 9393 &"
-            sleep 5
-            check_cmd "python test_client.py 000000570688.jpg"
-            echo "yolov4 CPU RPC inference pass"
-            kill_server_process
            ;;
        GPU)
            python -m paddle_serving_app.package --get_model yolov4
@@ -676,6 +735,175 @@ function python_test_yolov4(){
    cd ..
 }
+function python_test_resnet50(){
+    #pwd:/ Serving/python/examples
+    local TYPE=$1
+    export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving
+    cd imagenet
+    case $TYPE in
+        CPU)
+            echo "no implement for cpu type"
+            ;;
+        GPU)
+            sh get_model.sh
+            check_cmd"python -m paddle_serving_server_gpu.serve --model ResNet50_vd_model --port 9696 --gpu_ids 0"
+            sleep 5
+            check_cmd"python resnet50_rpc_client.py ResNet50_vd_client_config/serving_client_conf.prototxt"
+            echo "resnet50 GPU RPC inference pass"
+            kill_server_process
+            ;;
+        *)
+            echo "error type"
+            exit 1
+            ;;
+    esac
+    echo "test resnet $TYPE finished as expected"
+    unset SERVING_BIN
+    cd ..
+}
+function python_test_pipeline(){
+    # pwd:/ Serving/python/examples
+    local TYPE=$1
+    export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving
+    unsetproxy
+    cd pipeline/imdb_model_ensemble
+    case $TYPE in
+        CPU)
+            # start paddle serving service (brpc)
+            sh get_data.sh
+            python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 --workdir test9292 &> cnn.log &
+            python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 --workdir test9393 &> bow.log &
+            sleep 5
+            # test: thread servicer & thread op
+            cat << EOF > config.yml
+port: 18080
+worker_num: 2
+build_dag_each_worker: false
+dag:
+    is_thread_op: true
+    client_type: brpc
+    retry: 1
+    use_profile: false
+EOF
+            python test_pipeline_server.py > /dev/null &
+            sleep 5
+            check_cmd "python test_pipeline_client.py"
+            ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill
+            kill_process_by_port 18080
+            # test: thread servicer & process op
+            cat << EOF > config.yml
+port: 18080
+worker_num: 2
+build_dag_each_worker: false
+dag:
+    is_thread_op: false
+    client_type: brpc
+    retry: 1
+    use_profile: false
+EOF
+            python test_pipeline_server.py > /dev/null &
+            sleep 5
+            check_cmd "python test_pipeline_client.py"
+            ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill
+            kill_process_by_port 18080
+            # test: process servicer & thread op
+            cat << EOF > config.yml
+port: 18080
+worker_num: 2
+build_dag_each_worker: true
+dag:
+    is_thread_op: flase
+    client_type: brpc
+    retry: 1
+    use_profile: false
+EOF
+            python test_pipeline_server.py > /dev/null &
+            sleep 5
+            check_cmd "python test_pipeline_client.py"
+            ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill
+            kill_process_by_port 18080
+            # test: process servicer & process op
+            cat << EOF > config.yml
+port: 18080
+worker_num: 2
+build_dag_each_worker: false
+dag:
+    is_thread_op: false
+    client_type: brpc
+    retry: 1
+    use_profile: false
+EOF
+            python test_pipeline_server.py > /dev/null &
+            sleep 5
+            check_cmd "python test_pipeline_client.py"
+            ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill
+            kill_process_by_port 18080
+            kill_server_process
+            kill_process_by_port 9292
+            kill_process_by_port 9393
+            # start paddle serving service (grpc)
+            python -m paddle_serving_server.serve --model imdb_cnn_model --port 9292 --use_multilang --workdir test9292 &> cnn.log &
+            python -m paddle_serving_server.serve --model imdb_bow_model --port 9393 --use_multilang --workdir test9393 &> bow.log &
+            sleep 5
+            cat << EOF > config.yml
+port: 18080
+worker_num: 2
+build_dag_each_worker: false
+dag:
+    is_thread_op: false
+    client_type: grpc
+    retry: 1
+    use_profile: false
+EOF
+            python test_pipeline_server.py > /dev/null &
+            sleep 5
+            check_cmd "python test_pipeline_client.py"
+            ps -ef | grep "pipeline_server" | grep -v grep | awk '{print $2}' | xargs kill
+            kill_process_by_port 18080
+            kill_server_process
+            kill_process_by_port 9292
+            kill_process_by_port 9393
+            ;;
+        GPU)
+            echo "pipeline ignore GPU test"
+            ;;
+        *)
+            echo "error type"
+            exit 1
+            ;;
+    esac
+    cd ../../
+    setproxy
+    unset SERVING_BIN
+}
+function python_app_api_test(){
+    #pwd:/ Serving/python/examples
+    #test image reader
+    local TYPE=$1
+    cd imagenet
+    case $TYPE in
+        CPU)
+            check_cmd "python test_image_reader.py"
+            ;;
+        GPU)
+            echo "no implement for cpu type"
+            ;;
+        *)
+            echo "error type"
+            exit 1
+            ;;
+    esac
+    echo "test app api finised as expected"
+    cd ..
+}
 function python_run_test() {
    # Using the compiled binary
@@ -690,6 +918,8 @@ function python_run_test() {
    python_test_multi_fetch $TYPE # pwd: /Serving/python/examples
    python_test_yolov4 $TYPE # pwd: /Serving/python/examples
    python_test_grpc_impl $TYPE # pwd: /Serving/python/examples
+    python_test_resnet50 $TYPE # pwd: /Serving/python/examples
+    python_test_pipeline $TYPE # pwd: /Serving/python/examples
    echo "test python $TYPE part finished as expected."
    cd ../.. # pwd: /Serving
 }
@@ -942,6 +1172,7 @@ function main() {
    build_client $TYPE # pwd: /Serving
    build_server $TYPE # pwd: /Serving
    build_app $TYPE # pwd: /Serving
+    java_run_test $TYPE # pwd: /Serving
    python_run_test $TYPE # pwd: /Serving
    monitor_test $TYPE # pwd: /Serving
    echo "serving $TYPE part finished as expected."