diff --git a/core/configure/proto/multi_lang_general_model_service.proto b/core/configure/proto/multi_lang_general_model_service.proto index 6e1764b23b3e6f7d9eb9a33925bcd83cfb1810bb..2a8a8bc1532c19aa02a1998aa751aa7ba9d41570 100644 --- a/core/configure/proto/multi_lang_general_model_service.proto +++ b/core/configure/proto/multi_lang_general_model_service.proto @@ -28,16 +28,17 @@ message FeedInst { repeated Tensor tensor_array = 1; }; message FetchInst { repeated Tensor tensor_array = 1; }; -message Request { +message InferenceRequest { repeated FeedInst insts = 1; repeated string feed_var_names = 2; repeated string fetch_var_names = 3; required bool is_python = 4 [ default = false ]; }; -message Response { +message InferenceResponse { repeated ModelOutput outputs = 1; optional string tag = 2; + required int32 err_code = 3; }; message ModelOutput { @@ -45,6 +46,17 @@ message ModelOutput { optional string engine_name = 2; } +message SetTimeoutRequest { required int32 timeout_ms = 1; } + +message SimpleResponse { required int32 err_code = 1; } + +message GetClientConfigRequest {} + +message GetClientConfigResponse { required string client_config_str = 1; } + service MultiLangGeneralModelService { - rpc inference(Request) returns (Response) {} + rpc Inference(InferenceRequest) returns (InferenceResponse) {} + rpc SetTimeout(SetTimeoutRequest) returns (SimpleResponse) {} + rpc GetClientConfig(GetClientConfigRequest) + returns (GetClientConfigResponse) {} }; diff --git a/core/predictor/tools/seq_generator.cpp b/core/predictor/tools/seq_generator.cpp index 135e25d6dd7ce44fa04f510f7d521b42998bc955..eb7e7ed7f9a609e0c21be9a2c3d686dd7d9a1abd 100644 --- a/core/predictor/tools/seq_generator.cpp +++ b/core/predictor/tools/seq_generator.cpp @@ -233,7 +233,7 @@ int compress_parameter_parallel(const char *file1, greedy_search( emb_table + k * emb_size, xmin, xmax, loss, emb_size, bits); // 得出 loss 最小的时候的 scale - float scale = (xmax - xmin) * (pow2bits - 1); + float scale = (xmax - xmin) / (pow2bits - 1); char *min_ptr = tensor_temp; char *max_ptr = tensor_temp + sizeof(float); memcpy(min_ptr, &xmin, sizeof(float)); diff --git a/doc/GRPC_IMPL_CN.md b/doc/GRPC_IMPL_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..7b10907caec98ae5754126a7ec54096cc4cd48af --- /dev/null +++ b/doc/GRPC_IMPL_CN.md @@ -0,0 +1,52 @@ +# gRPC接口 + +gRPC 接口实现形式类似 Web Service: + +![](grpc_impl.png) + +## 与bRPC接口对比 + +1. gRPC Server 端 `load_model_config` 函数添加 `client_config_path` 参数: + + ```python + def load_model_config(self, server_config_paths, client_config_path=None) + ``` + + 在一些例子中 bRPC Server 端与 bRPC Client 端的配置文件可能是不同的(如 cube local 例子中,Client 端的数据先交给 cube,经过 cube 处理后再交给预测库),所以 gRPC Server 端需要获取 gRPC Client 端的配置;同时为了取消 gRPC Client 端手动加载配置文件的过程,所以设计 gRPC Server 端同时加载两个配置文件。`client_config_path` 默认为 `/serving_server_conf.prototxt`。 + +2. gRPC Client 端取消 `load_client_config` 步骤: + + 在 `connect` 步骤通过 RPC 获取相应的 prototxt(从任意一个 endpoint 获取即可)。 + +3. gRPC Client 需要通过 RPC 方式设置 timeout 时间(调用形式与 bRPC Client保持一致) + + 因为 bRPC Client 在 `connect` 后无法更改 timeout 时间,所以当 gRPC Server 收到变更 timeout 的调用请求时会重新创建 bRPC Client 实例以变更 bRPC Client timeout时间,同时 gRPC Client 会设置 gRPC 的 deadline 时间。 + + **注意,设置 timeout 接口和 Inference 接口不能同时调用(非线程安全),出于性能考虑暂时不加锁。** + +4. 
gRPC Client 端 `predict` 函数添加 `asyn` 和 `is_python` 参数:
+
+   ```python
+   def predict(self, feed, fetch, need_variant_tag=False, asyn=False, is_python=True)
+   ```
+
+   其中,`asyn` 为异步调用选项。当 `asyn=True` 时为异步调用,返回 `MultiLangPredictFuture` 对象,通过 `MultiLangPredictFuture.result()` 阻塞获取预测值;当 `asyn=False` 时为同步调用。
+
+   `is_python` 为 proto 格式选项。当 `is_python=True` 时,基于 numpy bytes 格式进行数据传输,目前只适用于 Python;当 `is_python=False` 时,以普通数据格式传输,更加通用。使用 numpy bytes 格式传输耗时比普通数据格式小很多(详见 [#654](https://github.com/PaddlePaddle/Serving/pull/654))。
+
+5. 异常处理:当 gRPC Server 端的 bRPC Client 预测失败(返回 `None`)时,gRPC Client 端同样返回 None。其他 gRPC 异常会在 Client 内部捕获,并在返回的 fetch_map 中添加一个 "serving_status_code" 字段来区分是否预测正常(参考 timeout 样例)。
+
+6. 由于 gRPC 只支持 pick_first 和 round_robin 负载均衡策略,ABTEST 特性尚未支持。
+
+7. 经测试,gRPC 版本可以在 Windows、macOS 平台使用。
+
+8. 计划支持的客户端语言:
+
+   - [x] Python
+   - [ ] Java
+   - [ ] Go
+   - [ ] JavaScript
+
+## Python 端的一些例子
+
+详见 `python/examples/grpc_impl_example` 下的示例文件。
diff --git a/doc/grpc_impl.png b/doc/grpc_impl.png
new file mode 100644
index 0000000000000000000000000000000000000000..05b1a67e815efae5f4b7b81758444bff48cfe59d
Binary files /dev/null and b/doc/grpc_impl.png differ
diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/README_CN.md b/python/examples/grpc_impl_example/criteo_ctr_with_cube/README_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..07fc1acc18903256c49d77e2af8e9c2d74b21c16
--- /dev/null
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/README_CN.md
@@ -0,0 +1,40 @@
+## 带稀疏参数索引服务的CTR预测服务
+
+该样例是为了展示 gRPC Server 端 `load_model_config` 函数,在这个例子中,bRPC Server 端与 bRPC Client 端的配置文件是不同的(bRPC Client 端的数据先交给 cube,经过 cube 处理后再交给预测库)
+
+### 获取样例数据
+```
+sh get_data.sh
+```
+
+### 下载模型和稀疏参数序列文件
+```
+wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz
+tar xf ctr_cube_unittest.tar.gz
+mv models/ctr_client_conf ./
+mv models/ctr_serving_model_kv ./
+mv models/data ./cube/
+```
+执行脚本后会在当前目录有 ctr_serving_model_kv 和 ctr_client_conf 文件夹。
+
+### 启动稀疏参数索引服务
+```
+wget https://paddle-serving.bj.bcebos.com/others/cube_app.tar.gz
+tar xf cube_app.tar.gz
+mv cube_app/cube* ./cube/
+sh cube_prepare.sh &
+```
+
+此处,模型当中的稀疏参数会被存放在稀疏参数索引服务Cube当中,关于稀疏参数索引服务Cube的介绍,请阅读[稀疏参数索引服务Cube单机版使用指南](../../../doc/CUBE_LOCAL_CN.md)
+
+### 启动RPC预测服务,服务端线程数为4(可在test_server.py配置)
+
+```
+python test_server.py ctr_serving_model_kv ctr_client_conf/serving_client_conf.prototxt
+```
+
+### 执行预测
+
+```
+python test_client.py ./raw_data
+```
diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/args.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/args.py
new file mode 100755
index 0000000000000000000000000000000000000000..30124d4ebd9cd27cdb4580e654a8a47c55b178bf
--- /dev/null
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/args.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+import argparse
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
+    parser.add_argument(
+        '--train_data_path',
+        type=str,
+        default='./data/raw/train.txt',
+        help="The path of training dataset")
+    parser.add_argument(
+        '--sparse_only',
+        type=bool,
+        default=False,
+        help="Whether we use sparse features only")
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default='./data/raw/valid.txt',
+        help="The path of testing dataset")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=1000,
+        help="The size of mini-batch (default:1000)")
+    parser.add_argument(
+        '--embedding_size',
+        type=int,
+        default=10,
+        help="The size for embedding layer (default:10)")
+    parser.add_argument(
+        '--num_passes',
+        type=int,
+        default=10,
+        help="The number of passes to train (default: 10)")
+    parser.add_argument(
+        '--model_output_dir',
+        type=str,
+        default='models',
+        help='The path for model to store (default: models)')
+    parser.add_argument(
+        '--sparse_feature_dim',
+        type=int,
+        default=1000001,
+        help='sparse feature hashing space for index processing')
+    parser.add_argument(
+        '--is_local',
+        type=int,
+        default=1,
+        help='Local train or distributed train (default: 1)')
+    parser.add_argument(
+        '--cloud_train',
+        type=int,
+        default=0,
+        help='Local train or distributed train on paddlecloud (default: 0)')
+    parser.add_argument(
+        '--async_mode',
+        action='store_true',
+        default=False,
+        help='Whether start pserver in async mode to support ASGD')
+    parser.add_argument(
+        '--no_split_var',
+        action='store_true',
+        default=False,
+        help='Whether split variables into blocks when update_method is pserver')
+    parser.add_argument(
+        '--role',
+        type=str,
+        default='pserver', # trainer or pserver
+        help='The role of this node: pserver or trainer (default: pserver)')
+    parser.add_argument(
+        '--endpoints',
+        type=str,
+        default='127.0.0.1:6000',
+        help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
+    parser.add_argument(
+        '--current_endpoint',
+        type=str,
+        default='127.0.0.1:6000',
+        help='The endpoint of the current pserver (default: 127.0.0.1:6000)')
+    parser.add_argument(
+        '--trainer_id',
+        type=int,
+        default=0,
+        help='The id of the current trainer (default: 0)')
+    parser.add_argument(
+        '--trainers',
+        type=int,
+        default=1,
+        help='The number of trainers (default: 1)')
+    return parser.parse_args()
diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/clean.sh b/python/examples/grpc_impl_example/criteo_ctr_with_cube/clean.sh
new file mode 100755
index 0000000000000000000000000000000000000000..99a4819802178f1910c5fced7d4c5a39c3037e4a
--- /dev/null
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/clean.sh
@@ -0,0 +1,4 @@
+ps -ef | grep cube | awk {'print $2'} | xargs kill -9
+rm -rf cube/cube_data cube/data cube/log* cube/nohup* cube/output/ cube/donefile cube/input cube/monitor cube/cube-builder.INFO
+ps -ef | grep test | awk {'print $2'} | xargs kill -9
+ps -ef | grep serving | awk {'print $2'} | xargs kill -9
diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/criteo.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/criteo.py
new file mode 100755
index 0000000000000000000000000000000000000000..f37eb1d2c1d8df6975ec0c28923c6e17c0272290
--- /dev/null
+++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/criteo.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + + +class CriteoDataset(object): + def setup(self, sparse_feature_dim): + self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + self.cont_max_ = [ + 20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 + ] + self.cont_diff_ = [ + 20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 + ] + self.hash_dim_ = sparse_feature_dim + # here, training data are lines with line_index < train_idx_ + self.train_idx_ = 41256555 + self.continuous_range_ = range(1, 14) + self.categorical_range_ = range(14, 40) + + def _process_line(self, line): + features = line.rstrip('\n').split('\t') + dense_feature = [] + sparse_feature = [] + for idx in self.continuous_range_: + if features[idx] == '': + dense_feature.append(0.0) + else: + dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \ + self.cont_diff_[idx - 1]) + for idx in self.categorical_range_: + sparse_feature.append( + [hash(str(idx) + features[idx]) % self.hash_dim_]) + + return dense_feature, sparse_feature, [int(features[0])] + + def infer_reader(self, filelist, batch, buf_size): + def local_iter(): + for fname in filelist: + with open(fname.strip(), "r") as fin: + for line in fin: + dense_feature, sparse_feature, label = self._process_line( + line) + #yield dense_feature, sparse_feature, label + yield [dense_feature] + sparse_feature + [label] + + import paddle + batch_iter = paddle.batch( + paddle.reader.shuffle( + local_iter, buf_size=buf_size), + batch_size=batch) + return batch_iter + + def generate_sample(self, line): + def data_iter(): + dense_feature, sparse_feature, label = self._process_line(line) + feature_name = ["dense_input"] + for idx in self.categorical_range_: + feature_name.append("C" + str(idx - 13)) + feature_name.append("label") + yield zip(feature_name, [dense_feature] + sparse_feature + [label]) + + return data_iter + + +if __name__ == "__main__": + criteo_dataset = CriteoDataset() + criteo_dataset.setup(int(sys.argv[1])) + criteo_dataset.run_from_stdin() diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/criteo_reader.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/criteo_reader.py new file mode 100755 index 0000000000000000000000000000000000000000..2a80af78a9c2033bf246f703ca70a817ab786af3 --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/criteo_reader.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing + +import sys +import paddle.fluid.incubate.data_generator as dg + + +class CriteoDataset(dg.MultiSlotDataGenerator): + def setup(self, sparse_feature_dim): + self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + self.cont_max_ = [ + 20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 + ] + self.cont_diff_ = [ + 20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 + ] + self.hash_dim_ = sparse_feature_dim + # here, training data are lines with line_index < train_idx_ + self.train_idx_ = 41256555 + self.continuous_range_ = range(1, 14) + self.categorical_range_ = range(14, 40) + + def _process_line(self, line): + features = line.rstrip('\n').split('\t') + dense_feature = [] + sparse_feature = [] + for idx in self.continuous_range_: + if features[idx] == '': + dense_feature.append(0.0) + else: + dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \ + self.cont_diff_[idx - 1]) + for idx in self.categorical_range_: + sparse_feature.append( + [hash(str(idx) + features[idx]) % self.hash_dim_]) + + return dense_feature, sparse_feature, [int(features[0])] + + def infer_reader(self, filelist, batch, buf_size): + def local_iter(): + for fname in filelist: + with open(fname.strip(), "r") as fin: + for line in fin: + dense_feature, sparse_feature, label = self._process_line( + line) + #yield dense_feature, sparse_feature, label + yield [dense_feature] + sparse_feature + [label] + + import paddle + batch_iter = paddle.batch( + paddle.reader.shuffle( + local_iter, buf_size=buf_size), + batch_size=batch) + return batch_iter + + def generate_sample(self, line): + def data_iter(): + dense_feature, sparse_feature, label = self._process_line(line) + feature_name = ["dense_input"] + for idx in self.categorical_range_: + feature_name.append("C" + str(idx - 13)) + feature_name.append("label") + yield zip(feature_name, [dense_feature] + sparse_feature + [label]) + + return data_iter + + +if __name__ == "__main__": + criteo_dataset = CriteoDataset() + criteo_dataset.setup(int(sys.argv[1])) + criteo_dataset.run_from_stdin() diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/conf/cube.conf b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/conf/cube.conf new file mode 100755 index 0000000000000000000000000000000000000000..b70f6e34247e410f9b80054010338d3c8f452ec6 --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/conf/cube.conf @@ -0,0 +1,13 @@ +[{ + "dict_name": "test_dict", + "shard": 1, + "dup": 1, + "timeout": 200, + "retry": 3, + "backup_request": 100, + "type": "ipport_list", + "load_balancer": "rr", + "nodes": [{ + "ipport_list": "list://127.0.0.1:8027" + }] +}] diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/conf/gflags.conf b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/conf/gflags.conf new file mode 100755 index 0000000000000000000000000000000000000000..21c7bddebd8f22b91d0ba26a6121007f96a4380b --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/conf/gflags.conf @@ -0,0 +1,4 @@ +--port=8027 +--dict_split=1 +--in_mem=true +--log_dir=./log/ diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/keys b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/keys new file mode 100755 index 0000000000000000000000000000000000000000..f00c965d8307308469e537302baa73048488f162 --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube/keys @@ -0,0 +1,10 @@ +1 +2 +3 +4 +5 
+6 +7 +8 +9 +10 diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube_prepare.sh b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube_prepare.sh new file mode 100755 index 0000000000000000000000000000000000000000..1417254a54e2194ab3a0194f2ec970f480787acd --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube_prepare.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing +#! /bin/bash + +mkdir -p cube_model +mkdir -p cube/data +./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature +./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false +mv ./cube/data/0_0/test_dict_part0/* ./cube/data/ +cd cube && ./cube diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube_quant_prepare.sh b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube_quant_prepare.sh new file mode 100755 index 0000000000000000000000000000000000000000..0db6575ab307fb81cdd0336a20bb9a8ec30d446d --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/cube_quant_prepare.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing +#! 
/bin/bash + +mkdir -p cube_model +mkdir -p cube/data +./seq_generator ctr_serving_model/SparseFeatFactors ./cube_model/feature 8 +./cube/cube-builder -dict_name=test_dict -job_mode=base -last_version=0 -cur_version=0 -depend_version=0 -input_path=./cube_model -output_path=${PWD}/cube/data -shard_num=1 -only_build=false +mv ./cube/data/0_0/test_dict_part0/* ./cube/data/ +cd cube && ./cube diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/get_data.sh b/python/examples/grpc_impl_example/criteo_ctr_with_cube/get_data.sh new file mode 100755 index 0000000000000000000000000000000000000000..1f244b3a4aa81488bb493825576ba30c4b3bba22 --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/get_data.sh @@ -0,0 +1,2 @@ +wget --no-check-certificate https://paddle-serving.bj.bcebos.com/data/ctr_prediction/ctr_data.tar.gz +tar -zxvf ctr_data.tar.gz diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/local_train.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/local_train.py new file mode 100755 index 0000000000000000000000000000000000000000..d4a1bc930924e348048f7ac3e5c46381d9b6441b --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/local_train.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing + +from __future__ import print_function + +from args import parse_args +import os +import paddle.fluid as fluid +import sys +from network_conf import dnn_model + +dense_feature_dim = 13 + + +def train(): + args = parse_args() + sparse_only = args.sparse_only + if not os.path.isdir(args.model_output_dir): + os.mkdir(args.model_output_dir) + dense_input = fluid.layers.data( + name="dense_input", shape=[dense_feature_dim], dtype='float32') + sparse_input_ids = [ + fluid.layers.data( + name="C" + str(i), shape=[1], lod_level=1, dtype="int64") + for i in range(1, 27) + ] + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + #nn_input = None if sparse_only else dense_input + nn_input = dense_input + predict_y, loss, auc_var, batch_auc_var, infer_vars = dnn_model( + nn_input, sparse_input_ids, label, args.embedding_size, + args.sparse_feature_dim) + + optimizer = fluid.optimizer.SGD(learning_rate=1e-4) + optimizer.minimize(loss) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_use_var([dense_input] + sparse_input_ids + [label]) + + python_executable = "python" + pipe_command = "{} criteo_reader.py {}".format(python_executable, + args.sparse_feature_dim) + + dataset.set_pipe_command(pipe_command) + dataset.set_batch_size(128) + thread_num = 10 + dataset.set_thread(thread_num) + + whole_filelist = [ + "raw_data/part-%d" % x for x in range(len(os.listdir("raw_data"))) + ] + + print(whole_filelist) + dataset.set_filelist(whole_filelist[:100]) + dataset.load_into_memory() + fluid.layers.Print(auc_var) + epochs = 1 + for i in range(epochs): + exe.train_from_dataset( + program=fluid.default_main_program(), dataset=dataset, debug=True) + print("epoch {} finished".format(i)) + + import paddle_serving_client.io as server_io + feed_var_dict = {} + feed_var_dict['dense_input'] = dense_input + for i, sparse in enumerate(sparse_input_ids): + feed_var_dict["embedding_{}.tmp_0".format(i)] = sparse + fetch_var_dict = {"prob": predict_y} + + feed_kv_dict = {} + feed_kv_dict['dense_input'] = dense_input + for i, emb in enumerate(infer_vars): + feed_kv_dict["embedding_{}.tmp_0".format(i)] = emb + fetch_var_dict = {"prob": predict_y} + + server_io.save_model("ctr_serving_model", "ctr_client_conf", feed_var_dict, + fetch_var_dict, fluid.default_main_program()) + + server_io.save_model("ctr_serving_model_kv", "ctr_client_conf_kv", + feed_kv_dict, fetch_var_dict, + fluid.default_main_program()) + + +if __name__ == '__main__': + train() diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/network_conf.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/network_conf.py new file mode 100755 index 0000000000000000000000000000000000000000..2975533a72ad21d6dd5896446fd06c1f9bdfe8b4 --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/network_conf.py @@ -0,0 +1,77 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +import paddle.fluid as fluid +import math + + +def dnn_model(dense_input, sparse_inputs, label, embedding_size, + sparse_feature_dim): + def embedding_layer(input): + emb = fluid.layers.embedding( + input=input, + is_sparse=True, + is_distributed=False, + size=[sparse_feature_dim, embedding_size], + param_attr=fluid.ParamAttr( + name="SparseFeatFactors", + initializer=fluid.initializer.Uniform())) + x = fluid.layers.sequence_pool(input=emb, pool_type='sum') + return emb, x + + def mlp_input_tensor(emb_sums, dense_tensor): + #if isinstance(dense_tensor, fluid.Variable): + # return fluid.layers.concat(emb_sums, axis=1) + #else: + return fluid.layers.concat(emb_sums + [dense_tensor], axis=1) + + def mlp(mlp_input): + fc1 = fluid.layers.fc(input=mlp_input, + size=400, + act='relu', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(mlp_input.shape[1])))) + fc2 = fluid.layers.fc(input=fc1, + size=400, + act='relu', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(fc1.shape[1])))) + fc3 = fluid.layers.fc(input=fc2, + size=400, + act='relu', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(fc2.shape[1])))) + pre = fluid.layers.fc(input=fc3, + size=2, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(fc3.shape[1])))) + return pre + + emb_pair_sums = list(map(embedding_layer, sparse_inputs)) + emb_sums = [x[1] for x in emb_pair_sums] + infer_vars = [x[0] for x in emb_pair_sums] + mlp_in = mlp_input_tensor(emb_sums, dense_input) + predict = mlp(mlp_in) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.reduce_sum(cost) + accuracy = fluid.layers.accuracy(input=predict, label=label) + auc_var, batch_auc_var, auc_states = \ + fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20) + return predict, avg_cost, auc_var, batch_auc_var, infer_vars diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_client.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_client.py new file mode 100755 index 0000000000000000000000000000000000000000..f82c1a21c153594e0be192506af5318c24a4e99a --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_client.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing + +from paddle_serving_client import MultiLangClient as Client +import sys +import os +import criteo as criteo +import time +from paddle_serving_client.metric import auc +import grpc + +client = Client() +client.connect(["127.0.0.1:9292"]) + +batch = 1 +buf_size = 100 +dataset = criteo.CriteoDataset() +dataset.setup(1000001) +test_filelists = ["{}/part-0".format(sys.argv[1])] +reader = dataset.infer_reader(test_filelists, batch, buf_size) +label_list = [] +prob_list = [] +start = time.time() +for ei in range(10000): + data = reader().next() + feed_dict = {} + feed_dict['dense_input'] = data[0][0] + for i in range(1, 27): + feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][i] + fetch_map = client.predict(feed=feed_dict, fetch=["prob"]) + if fetch_map["serving_status_code"] == 0: + prob_list.append(fetch_map['prob'][0][1]) + label_list.append(data[0][-1][0]) + +print(auc(label_list, prob_list)) +end = time.time() +print(end - start) diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server.py new file mode 100755 index 0000000000000000000000000000000000000000..361d5a59becb7c110907f66d8b651e05e7eb418e --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +import os +import sys +from paddle_serving_server import OpMaker +from paddle_serving_server import OpSeqMaker +from paddle_serving_server import MultiLangServer as Server + +op_maker = OpMaker() +read_op = op_maker.create('general_reader') +general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer') +response_op = op_maker.create('general_response') + +op_seq_maker = OpSeqMaker() +op_seq_maker.add_op(read_op) +op_seq_maker.add_op(general_dist_kv_infer_op) +op_seq_maker.add_op(response_op) + +server = Server() +server.set_op_sequence(op_seq_maker.get_op_sequence()) +server.set_num_threads(4) +server.load_model_config(sys.argv[1], sys.argv[2]) +server.prepare_server(workdir="work_dir1", port=9292, device="cpu") +server.run_server() diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_gpu.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_gpu.py new file mode 100755 index 0000000000000000000000000000000000000000..38e1bf82118f6af7cfe7b467003332a5328b2979 --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_gpu.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +import os +import sys +from paddle_serving_server_gpu import OpMaker +from paddle_serving_server_gpu import OpSeqMaker +from paddle_serving_server_gpu import MultiLangServer as Server + +op_maker = OpMaker() +read_op = op_maker.create('general_reader') +general_dist_kv_infer_op = op_maker.create('general_dist_kv_infer') +response_op = op_maker.create('general_response') + +op_seq_maker = OpSeqMaker() +op_seq_maker.add_op(read_op) +op_seq_maker.add_op(general_dist_kv_infer_op) +op_seq_maker.add_op(response_op) + +server = Server() +server.set_op_sequence(op_seq_maker.get_op_sequence()) +server.set_num_threads(4) +server.load_model_config(sys.argv[1], sys.argv[2]) +server.prepare_server(workdir="work_dir1", port=9292, device="cpu") +server.run_server() diff --git a/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_quant.py b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_quant.py new file mode 100755 index 0000000000000000000000000000000000000000..feca75b077d737a614bdfd955b7bf0d82ed08529 --- /dev/null +++ b/python/examples/grpc_impl_example/criteo_ctr_with_cube/test_server_quant.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing
+
+import os
+import sys
+from paddle_serving_server import OpMaker
+from paddle_serving_server import OpSeqMaker
+from paddle_serving_server import MultiLangServer as Server
+
+op_maker = OpMaker()
+read_op = op_maker.create('general_reader')
+general_dist_kv_infer_op = op_maker.create('general_dist_kv_quant_infer')
+response_op = op_maker.create('general_response')
+
+op_seq_maker = OpSeqMaker()
+op_seq_maker.add_op(read_op)
+op_seq_maker.add_op(general_dist_kv_infer_op)
+op_seq_maker.add_op(response_op)
+
+server = Server()
+server.set_op_sequence(op_seq_maker.get_op_sequence())
+server.set_num_threads(4)
+server.load_model_config(sys.argv[1], sys.argv[2])
+server.prepare_server(workdir="work_dir1", port=9292, device="cpu")
+server.run_server()
diff --git a/python/examples/grpc_impl_example/fit_a_line/README_CN.md b/python/examples/grpc_impl_example/fit_a_line/README_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..93e0d1cf7262d620df18570401ed39db67f839ef
--- /dev/null
+++ b/python/examples/grpc_impl_example/fit_a_line/README_CN.md
@@ -0,0 +1,57 @@
+# 线性回归预测服务示例
+
+## 获取数据
+
+```shell
+sh get_data.sh
+```
+
+## 开启 gRPC 服务端
+
+``` shell
+python test_server.py uci_housing_model/
+```
+
+也可以通过下面的一行代码开启默认 gRPC 服务:
+
+```shell
+python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9393 --use_multilang
+```
+
+## 客户端预测
+
+### 同步预测
+
+``` shell
+python test_sync_client.py
+```
+
+### 异步预测
+
+``` shell
+python test_asyn_client.py
+```
+
+### Batch 预测
+
+``` shell
+python test_batch_client.py
+```
+
+### 通用 pb 预测
+
+``` shell
+python test_general_pb_client.py
+```
+
+### 预测超时
+
+``` shell
+python test_timeout_client.py
+```
+
+### Numpy 输入
+
+``` shell
+python test_numpy_input_client.py
+```
diff --git a/python/examples/grpc_impl_example/fit_a_line/get_data.sh b/python/examples/grpc_impl_example/fit_a_line/get_data.sh
new file mode 100644
index 0000000000000000000000000000000000000000..84a3966a0ef323cef4b146d8e9489c70a7a8ae35
--- /dev/null
+++ b/python/examples/grpc_impl_example/fit_a_line/get_data.sh
@@ -0,0 +1,2 @@
+wget --no-check-certificate https://paddle-serving.bj.bcebos.com/uci_housing.tar.gz
+tar -xzf uci_housing.tar.gz
diff --git a/python/examples/fit_a_line/test_multilang_client.py b/python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py
similarity index 58%
rename from python/examples/fit_a_line/test_multilang_client.py
rename to python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py
index f85814a4b24693c269c192b23f0f5ab1c7d566a6..b01a9372585bae42abca213fe8fb8a55505dfe57 100644
--- a/python/examples/fit_a_line/test_multilang_client.py
+++ b/python/examples/grpc_impl_example/fit_a_line/test_asyn_client.py
@@ -13,38 +13,39 @@
 # limitations under the License.
# pylint: disable=doc-string-missing -from paddle_serving_client import MultiLangClient +from paddle_serving_client import MultiLangClient as Client import functools -import sys import time import threading +import grpc -client = MultiLangClient() -client.load_client_config(sys.argv[1]) +client = Client() client.connect(["127.0.0.1:9393"]) -import paddle -test_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.test(), buf_size=500), - batch_size=1) - complete_task_count = [0] lock = threading.Lock() -def call_back(call_future, data): - fetch_map = call_future.result() - print("{} {}".format(fetch_map["price"][0], data[0][1][0])) - with lock: - complete_task_count[0] += 1 +def call_back(call_future): + try: + fetch_map = call_future.result() + print(fetch_map) + except grpc.RpcError as e: + print(e.code()) + finally: + with lock: + complete_task_count[0] += 1 +x = [ + 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, + 0.4919, 0.1856, 0.0795, -0.0332 +] task_count = 0 -for data in test_reader(): - future = client.predict(feed={"x": data[0][0]}, fetch=["price"], asyn=True) +for i in range(3): + future = client.predict(feed={"x": x}, fetch=["price"], asyn=True) task_count += 1 - future.add_done_callback(functools.partial(call_back, data=data)) + future.add_done_callback(functools.partial(call_back)) while complete_task_count[0] != task_count: time.sleep(0.1) diff --git a/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py b/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py new file mode 100644 index 0000000000000000000000000000000000000000..0630a0a960e5e40a7507454feb57418c8cfbdc68 --- /dev/null +++ b/python/examples/grpc_impl_example/fit_a_line/test_batch_client.py @@ -0,0 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing +from paddle_serving_client import MultiLangClient as Client + +client = Client() +client.connect(["127.0.0.1:9393"]) + +batch_size = 2 +x = [ + 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, + 0.4919, 0.1856, 0.0795, -0.0332 +] + +for i in range(3): + batch_feed = [{"x": x} for j in range(batch_size)] + fetch_map = client.predict(feed=batch_feed, fetch=["price"]) + if fetch_map["serving_status_code"] == 0: + print(fetch_map) + else: + print(fetch_map["serving_status_code"]) diff --git a/python/examples/grpc_impl_example/fit_a_line/test_general_pb_client.py b/python/examples/grpc_impl_example/fit_a_line/test_general_pb_client.py new file mode 100644 index 0000000000000000000000000000000000000000..b2744906b0dcd321f86a1b8117a78307e24578e5 --- /dev/null +++ b/python/examples/grpc_impl_example/fit_a_line/test_general_pb_client.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +from paddle_serving_client import MultiLangClient as Client + +client = Client() +client.connect(["127.0.0.1:9393"]) + +x = [ + 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, + 0.4919, 0.1856, 0.0795, -0.0332 +] +for i in range(3): + fetch_map = client.predict(feed={"x": x}, fetch=["price"], is_python=False) + if fetch_map["serving_status_code"] == 0: + print(fetch_map) + else: + print(fetch_map["serving_status_code"]) diff --git a/python/examples/grpc_impl_example/fit_a_line/test_numpy_input_client.py b/python/examples/grpc_impl_example/fit_a_line/test_numpy_input_client.py new file mode 100644 index 0000000000000000000000000000000000000000..e98c1e87bb48613e4226cf5378063aec7c5b4093 --- /dev/null +++ b/python/examples/grpc_impl_example/fit_a_line/test_numpy_input_client.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing + +from paddle_serving_client import MultiLangClient as Client +import numpy as np + +client = Client() +client.connect(["127.0.0.1:9393"]) + +x = [ + 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, + 0.4919, 0.1856, 0.0795, -0.0332 +] +for i in range(3): + fetch_map = client.predict(feed={"x": np.array(x)}, fetch=["price"]) + if fetch_map["serving_status_code"] == 0: + print(fetch_map) + else: + print(fetch_map["serving_status_code"]) diff --git a/python/examples/fit_a_line/test_multilang_server.py b/python/examples/grpc_impl_example/fit_a_line/test_server.py similarity index 94% rename from python/examples/fit_a_line/test_multilang_server.py rename to python/examples/grpc_impl_example/fit_a_line/test_server.py index 23eb938f0ee1bf6b195509816dea5221bbfa9218..6acc7bfe2e6d00621f32f1f7f437691fc15d20fc 100644 --- a/python/examples/fit_a_line/test_multilang_server.py +++ b/python/examples/grpc_impl_example/fit_a_line/test_server.py @@ -17,7 +17,7 @@ import os import sys from paddle_serving_server import OpMaker from paddle_serving_server import OpSeqMaker -from paddle_serving_server import MultiLangServer +from paddle_serving_server import MultiLangServer as Server op_maker = OpMaker() read_op = op_maker.create('general_reader') @@ -29,7 +29,7 @@ op_seq_maker.add_op(read_op) op_seq_maker.add_op(general_infer_op) op_seq_maker.add_op(response_op) -server = MultiLangServer() +server = Server() server.set_op_sequence(op_seq_maker.get_op_sequence()) server.load_model_config(sys.argv[1]) server.prepare_server(workdir="work_dir1", port=9393, device="cpu") diff --git a/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py b/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..1547ee445f4f8ceebe58e6f9e4f05b92520911eb --- /dev/null +++ b/python/examples/grpc_impl_example/fit_a_line/test_server_gpu.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing + +import os +import sys +from paddle_serving_server_gpu import OpMaker +from paddle_serving_server_gpu import OpSeqMaker +from paddle_serving_server_gpu import MultiLangServer as Server + +op_maker = OpMaker() +read_op = op_maker.create('general_reader') +general_infer_op = op_maker.create('general_infer') +response_op = op_maker.create('general_response') + +op_seq_maker = OpSeqMaker() +op_seq_maker.add_op(read_op) +op_seq_maker.add_op(general_infer_op) +op_seq_maker.add_op(response_op) + +server = Server() +server.set_op_sequence(op_seq_maker.get_op_sequence()) +server.load_model_config(sys.argv[1]) +server.set_gpuid(0) +server.prepare_server(workdir="work_dir1", port=9393, device="cpu") +server.run_server() diff --git a/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py b/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py new file mode 100644 index 0000000000000000000000000000000000000000..89530dc2f2a33ef44b2dbde52975634f4b4d8295 --- /dev/null +++ b/python/examples/grpc_impl_example/fit_a_line/test_sync_client.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +from paddle_serving_client import MultiLangClient as Client + +client = Client() +client.connect(["127.0.0.1:9393"]) + +x = [ + 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, + 0.4919, 0.1856, 0.0795, -0.0332 +] +for i in range(3): + fetch_map = client.predict(feed={"x": x}, fetch=["price"]) + if fetch_map["serving_status_code"] == 0: + print(fetch_map) + else: + print(fetch_map["serving_status_code"]) diff --git a/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py b/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py new file mode 100644 index 0000000000000000000000000000000000000000..f90fab38533aabf3daa7627ee0b79c56892444dd --- /dev/null +++ b/python/examples/grpc_impl_example/fit_a_line/test_timeout_client.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=doc-string-missing + +from paddle_serving_client import MultiLangClient as Client +import grpc + +client = Client() +client.connect(["127.0.0.1:9393"]) +client.set_rpc_timeout_ms(1) + +x = [ + 0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583, -0.0584, 0.6283, + 0.4919, 0.1856, 0.0795, -0.0332 +] +for i in range(3): + fetch_map = client.predict(feed={"x": x}, fetch=["price"]) + if fetch_map["serving_status_code"] == 0: + print(fetch_map) + elif fetch_map["serving_status_code"] == grpc.StatusCode.DEADLINE_EXCEEDED: + print('timeout') + else: + print(fetch_map["serving_status_code"]) diff --git a/python/examples/imdb/test_ensemble_client.py b/python/examples/imdb/test_ensemble_client.py index 6cafb3389fff5a25103bcb2b3a867b73b35b9e8e..eb1e29ddd6d5a02854e4859a35474306c1c4d073 100644 --- a/python/examples/imdb/test_ensemble_client.py +++ b/python/examples/imdb/test_ensemble_client.py @@ -32,11 +32,7 @@ for i in range(3): line = 'i am very sad | 0' word_ids, label = imdb_dataset.get_words_and_label(line) feed = {"words": word_ids} - fetch = ["acc", "cost", "prediction"] + fetch = ["prediction"] fetch_maps = client.predict(feed=feed, fetch=fetch) - if len(fetch_maps) == 1: - print("step: {}, res: {}".format(i, fetch_maps['prediction'][0][1])) - else: - for model, fetch_map in fetch_maps.items(): - print("step: {}, model: {}, res: {}".format(i, model, fetch_map[ - 'prediction'][0][1])) + for model, fetch_map in fetch_maps.items(): + print("step: {}, model: {}, res: {}".format(i, model, fetch_map)) diff --git a/python/examples/imdb/test_multilang_ensemble_client.py b/python/examples/imdb/test_multilang_ensemble_client.py new file mode 100644 index 0000000000000000000000000000000000000000..6686d4c8c38d6a17cb9c5701abf7d76773031772 --- /dev/null +++ b/python/examples/imdb/test_multilang_ensemble_client.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +from paddle_serving_client import MultiLangClient +from imdb_reader import IMDBDataset + +client = MultiLangClient() +# If you have more than one model, make sure that the input +# and output of more than one model are the same. +client.connect(["127.0.0.1:9393"]) + +# you can define any english sentence or dataset here +# This example reuses imdb reader in training, you +# can define your own data preprocessing easily. 
+imdb_dataset = IMDBDataset() +imdb_dataset.load_resource('imdb.vocab') + +for i in range(3): + line = 'i am very sad | 0' + word_ids, label = imdb_dataset.get_words_and_label(line) + feed = {"words": word_ids} + fetch = ["prediction"] + fetch_maps = client.predict(feed=feed, fetch=fetch) + for model, fetch_map in fetch_maps.items(): + print("step: {}, model: {}, res: {}".format(i, model, fetch_map)) diff --git a/python/examples/imdb/test_multilang_ensemble_server.py b/python/examples/imdb/test_multilang_ensemble_server.py new file mode 100644 index 0000000000000000000000000000000000000000..053aa06f0219de231415ba178135782334e56c1f --- /dev/null +++ b/python/examples/imdb/test_multilang_ensemble_server.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +from paddle_serving_server import OpMaker +from paddle_serving_server import OpGraphMaker +from paddle_serving_server import MultiLangServer + +op_maker = OpMaker() +read_op = op_maker.create('general_reader') +cnn_infer_op = op_maker.create( + 'general_infer', engine_name='cnn', inputs=[read_op]) +bow_infer_op = op_maker.create( + 'general_infer', engine_name='bow', inputs=[read_op]) +response_op = op_maker.create( + 'general_response', inputs=[cnn_infer_op, bow_infer_op]) + +op_graph_maker = OpGraphMaker() +op_graph_maker.add_op(read_op) +op_graph_maker.add_op(cnn_infer_op) +op_graph_maker.add_op(bow_infer_op) +op_graph_maker.add_op(response_op) + +server = MultiLangServer() +server.set_op_graph(op_graph_maker.get_op_graph()) +model_config = {cnn_infer_op: 'imdb_cnn_model', bow_infer_op: 'imdb_bow_model'} +server.load_model_config(model_config) +server.prepare_server(workdir="work_dir1", port=9393, device="cpu") +server.run_server() diff --git a/python/paddle_serving_client/__init__.py b/python/paddle_serving_client/__init__.py index 37f52c48b4c168d93f877a4a7cd4f1bd9afc8b1d..455bcf62cd039dde69736ec514892856eabd3088 100644 --- a/python/paddle_serving_client/__init__.py +++ b/python/paddle_serving_client/__init__.py @@ -397,22 +397,41 @@ class Client(object): class MultiLangClient(object): def __init__(self): self.channel_ = None + self.stub_ = None + self.rpc_timeout_s_ = 2 - def load_client_config(self, path): - if not isinstance(path, str): - raise Exception("GClient only supports multi-model temporarily") - self._parse_model_config(path) + def add_variant(self, tag, cluster, variant_weight): + # TODO + raise Exception("cannot support ABtest yet") - def connect(self, endpoint): + def set_rpc_timeout_ms(self, rpc_timeout): + if self.stub_ is None: + raise Exception("set timeout must be set after connect.") + if not isinstance(rpc_timeout, int): + # for bclient + raise ValueError("rpc_timeout must be int type.") + self.rpc_timeout_s_ = rpc_timeout / 1000.0 + timeout_req = multi_lang_general_model_service_pb2.SetTimeoutRequest() + timeout_req.timeout_ms = rpc_timeout + resp = self.stub_.SetTimeout(timeout_req) + return 
resp.err_code == 0 + + def connect(self, endpoints): # https://github.com/tensorflow/serving/issues/1382 options = [('grpc.max_receive_message_length', 512 * 1024 * 1024), ('grpc.max_send_message_length', 512 * 1024 * 1024), - ('grpc.max_receive_message_length', 512 * 1024 * 1024)] - - self.channel_ = grpc.insecure_channel( - endpoint[0], options=options) #TODO + ('grpc.lb_policy_name', 'round_robin')] + # TODO: weight round robin + g_endpoint = 'ipv4:{}'.format(','.join(endpoints)) + self.channel_ = grpc.insecure_channel(g_endpoint, options=options) self.stub_ = multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelServiceStub( self.channel_) + # get client model config + get_client_config_req = multi_lang_general_model_service_pb2.GetClientConfigRequest( + ) + resp = self.stub_.GetClientConfig(get_client_config_req) + model_config_str = resp.client_config_str + self._parse_model_config(model_config_str) def _flatten_list(self, nested_list): for item in nested_list: @@ -422,11 +441,10 @@ class MultiLangClient(object): else: yield item - def _parse_model_config(self, model_config_path): + def _parse_model_config(self, model_config_str): model_conf = m_config.GeneralModelConfig() - f = open(model_config_path, 'r') - model_conf = google.protobuf.text_format.Merge( - str(f.read()), model_conf) + model_conf = google.protobuf.text_format.Merge(model_config_str, + model_conf) self.feed_names_ = [var.alias_name for var in model_conf.feed_var] self.feed_types_ = {} self.feed_shapes_ = {} @@ -447,8 +465,8 @@ class MultiLangClient(object): if var.is_lod_tensor: self.lod_tensor_set_.add(var.alias_name) - def _pack_feed_data(self, feed, fetch, is_python): - req = multi_lang_general_model_service_pb2.Request() + def _pack_inference_request(self, feed, fetch, is_python): + req = multi_lang_general_model_service_pb2.InferenceRequest() req.fetch_var_names.extend(fetch) req.is_python = is_python feed_batch = None @@ -473,33 +491,50 @@ class MultiLangClient(object): data = np.array(var, dtype="int64") elif v_type == 1: # float32 data = np.array(var, dtype="float32") - elif v_type == 2: #int32 + elif v_type == 2: # int32 data = np.array(var, dtype="int32") else: - raise Exception("error type.") - else: + raise Exception("error tensor value type.") + elif isinstance(var, np.ndarray): data = var - if var.dtype == "float64": - data = data.astype("float32") + if v_type == 0: + if data.dtype != 'int64': + data = data.astype("int64") + elif v_type == 1: + if data.dtype != 'float32': + data = data.astype("float32") + elif v_type == 2: + if data.dtype != 'int32': + data = data.astype("int32") + else: + raise Exception("error tensor value type.") + else: + raise Exception("var must be list or ndarray.") tensor.data = data.tobytes() else: - if v_type == 0: # int64 - if isinstance(var, np.ndarray): - tensor.int64_data.extend(var.reshape(-1).tolist()) + if isinstance(var, np.ndarray): + if v_type == 0: # int64 + tensor.int64_data.extend( + var.reshape(-1).astype("int64").tolist()) + elif v_type == 1: + tensor.float_data.extend( + var.reshape(-1).astype('float32').tolist()) + elif v_type == 2: + tensor.int32_data.extend( + var.reshape(-1).astype('int32').tolist()) else: + raise Exception("error tensor value type.") + elif isinstance(var, list): + if v_type == 0: tensor.int64_data.extend(self._flatten_list(var)) - elif v_type == 1: # float32 - if isinstance(var, np.ndarray): - tensor.float_data.extend(var.reshape(-1).tolist()) - else: + elif v_type == 1: tensor.float_data.extend(self._flatten_list(var)) - elif 
v_type == 2: #int32 - if isinstance(car, np.array): - tensor.int_data.extend(var.reshape(-1).tolist()) + elif v_type == 2: + tensor.int32_data.extend(self._flatten_list(var)) else: - tensor.int_data.extend(self._flatten_list(var)) + raise Exception("error tensor value type.") else: - raise Exception("error type.") + raise Exception("var must be list or ndarray.") if isinstance(var, np.ndarray): tensor.shape.extend(list(var.shape)) else: @@ -508,40 +543,52 @@ class MultiLangClient(object): req.insts.append(inst) return req - def _unpack_resp(self, resp, fetch, is_python, need_variant_tag): - result_map = {} - inst = resp.outputs[0].insts[0] + def _unpack_inference_response(self, resp, fetch, is_python, + need_variant_tag): + if resp.err_code != 0: + return None tag = resp.tag - for i, name in enumerate(fetch): - var = inst.tensor_array[i] - v_type = self.fetch_types_[name] - if is_python: - if v_type == 0: # int64 - result_map[name] = np.frombuffer(var.data, dtype="int64") - elif v_type == 1: # float32 - result_map[name] = np.frombuffer(var.data, dtype="float32") - else: - raise Exception("error type.") - else: - if v_type == 0: # int64 - result_map[name] = np.array( - list(var.int64_data), dtype="int64") - elif v_type == 1: # float32 - result_map[name] = np.array( - list(var.float_data), dtype="float32") - elif v_type == 2: # int32 - result_map[name] = np.array( - list(var.int_data), dtype="int32") + multi_result_map = {} + for model_result in resp.outputs: + inst = model_result.insts[0] + result_map = {} + for i, name in enumerate(fetch): + var = inst.tensor_array[i] + v_type = self.fetch_types_[name] + if is_python: + if v_type == 0: # int64 + result_map[name] = np.frombuffer( + var.data, dtype="int64") + elif v_type == 1: # float32 + result_map[name] = np.frombuffer( + var.data, dtype="float32") + else: + raise Exception("error type.") else: - raise Exception("error type.") - result_map[name].shape = list(var.shape) - if name in self.lod_tensor_set_: - result_map["{}.lod".format(name)] = np.array(list(var.lod)) - return result_map if not need_variant_tag else [result_map, tag] + if v_type == 0: # int64 + result_map[name] = np.array( + list(var.int64_data), dtype="int64") + elif v_type == 1: # float32 + result_map[name] = np.array( + list(var.float_data), dtype="float32") + else: + raise Exception("error type.") + result_map[name].shape = list(var.shape) + if name in self.lod_tensor_set_: + result_map["{}.lod".format(name)] = np.array(list(var.lod)) + multi_result_map[model_result.engine_name] = result_map + ret = None + if len(resp.outputs) == 1: + ret = list(multi_result_map.values())[0] + else: + ret = multi_result_map + ret["serving_status_code"] = 0 + return ret if not need_variant_tag else [ret, tag] def _done_callback_func(self, fetch, is_python, need_variant_tag): def unpack_resp(resp): - return self._unpack_resp(resp, fetch, is_python, need_variant_tag) + return self._unpack_inference_response(resp, fetch, is_python, + need_variant_tag) return unpack_resp @@ -554,16 +601,20 @@ class MultiLangClient(object): need_variant_tag=False, asyn=False, is_python=True): - req = self._pack_feed_data(feed, fetch, is_python=is_python) + req = self._pack_inference_request(feed, fetch, is_python=is_python) if not asyn: - resp = self.stub_.inference(req) - return self._unpack_resp( - resp, - fetch, - is_python=is_python, - need_variant_tag=need_variant_tag) + try: + resp = self.stub_.Inference(req, timeout=self.rpc_timeout_s_) + return self._unpack_inference_response( + resp, + fetch, + 
is_python=is_python, + need_variant_tag=need_variant_tag) + except grpc.RpcError as e: + return {"serving_status_code": e.code()} else: - call_future = self.stub_.inference.future(req) + call_future = self.stub_.Inference.future( + req, timeout=self.rpc_timeout_s_) return MultiLangPredictFuture( call_future, self._done_callback_func( @@ -578,7 +629,10 @@ class MultiLangPredictFuture(object): self.callback_func_ = callback_func def result(self): - resp = self.call_future_.result() + try: + resp = self.call_future_.result() + except grpc.RpcError as e: + return {"serving_status_code": e.code()} return self.callback_func_(resp) def add_done_callback(self, fn): diff --git a/python/paddle_serving_server/__init__.py b/python/paddle_serving_server/__init__.py index 5d4c8554c0883965e08bce1fe927764d0a9626da..875e275c759d9fb1a9ccb6632816418a75a93aec 100644 --- a/python/paddle_serving_server/__init__.py +++ b/python/paddle_serving_server/__init__.py @@ -441,22 +441,29 @@ class Server(object): os.system(command) -class MultiLangServerService( - multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelService): - def __init__(self, model_config_path, endpoints): +class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. + MultiLangGeneralModelServiceServicer): + def __init__(self, model_config_path, is_multi_model, endpoints): + self.is_multi_model_ = is_multi_model + self.model_config_path_ = model_config_path + self.endpoints_ = endpoints + with open(self.model_config_path_) as f: + self.model_config_str_ = str(f.read()) + self._parse_model_config(self.model_config_str_) + self._init_bclient(self.model_config_path_, self.endpoints_) + + def _init_bclient(self, model_config_path, endpoints, timeout_ms=None): from paddle_serving_client import Client - self._parse_model_config(model_config_path) self.bclient_ = Client() - self.bclient_.load_client_config( - "{}/serving_server_conf.prototxt".format(model_config_path)) + if timeout_ms is not None: + self.bclient_.set_rpc_timeout_ms(timeout_ms) + self.bclient_.load_client_config(model_config_path) self.bclient_.connect(endpoints) - def _parse_model_config(self, model_config_path): + def _parse_model_config(self, model_config_str): model_conf = m_config.GeneralModelConfig() - f = open("{}/serving_server_conf.prototxt".format(model_config_path), - 'r') - model_conf = google.protobuf.text_format.Merge( - str(f.read()), model_conf) + model_conf = google.protobuf.text_format.Merge(model_config_str, + model_conf) self.feed_names_ = [var.alias_name for var in model_conf.feed_var] self.feed_types_ = {} self.feed_shapes_ = {} @@ -481,7 +488,7 @@ class MultiLangServerService( else: yield item - def _unpack_request(self, request): + def _unpack_inference_request(self, request): feed_names = list(request.feed_var_names) fetch_names = list(request.fetch_var_names) is_python = request.is_python @@ -493,10 +500,12 @@ class MultiLangServerService( v_type = self.feed_types_[name] data = None if is_python: - if v_type == 0: + if v_type == 0: # int64 data = np.frombuffer(var.data, dtype="int64") - elif v_type == 1: + elif v_type == 1: # float32 data = np.frombuffer(var.data, dtype="float32") + elif v_type == 2: # int32 + data = np.frombuffer(var.data, dtype="int32") else: raise Exception("error type.") else: @@ -504,6 +513,8 @@ class MultiLangServerService( data = np.array(list(var.int64_data), dtype="int64") elif v_type == 1: # float32 data = np.array(list(var.float_data), dtype="float32") + elif v_type == 2: # int32 + data = 
np.array(list(var.int32_data), dtype="int32") else: raise Exception("error type.") data.shape = list(feed_inst.tensor_array[idx].shape) @@ -511,55 +522,132 @@ class MultiLangServerService( feed_batch.append(feed_dict) return feed_batch, fetch_names, is_python - def _pack_resp_package(self, result, fetch_names, is_python, tag): - resp = multi_lang_general_model_service_pb2.Response() - # Only one model is supported temporarily - model_output = multi_lang_general_model_service_pb2.ModelOutput() - inst = multi_lang_general_model_service_pb2.FetchInst() - for idx, name in enumerate(fetch_names): - tensor = multi_lang_general_model_service_pb2.Tensor() - v_type = self.fetch_types_[name] - if is_python: - tensor.data = result[name].tobytes() - else: - if v_type == 0: # int64 - tensor.int64_data.extend(result[name].reshape(-1).tolist()) - elif v_type == 1: # float32 - tensor.float_data.extend(result[name].reshape(-1).tolist()) - else: - raise Exception("error type.") - tensor.shape.extend(list(result[name].shape)) - if name in self.lod_tensor_set_: - tensor.lod.extend(result["{}.lod".format(name)].tolist()) - inst.tensor_array.append(tensor) - model_output.insts.append(inst) - resp.outputs.append(model_output) + def _pack_inference_response(self, ret, fetch_names, is_python): + resp = multi_lang_general_model_service_pb2.InferenceResponse() + if ret is None: + resp.err_code = 1 + return resp + results, tag = ret resp.tag = tag + resp.err_code = 0 + if not self.is_multi_model_: + results = {'general_infer_0': results} + for model_name, model_result in results.items(): + model_output = multi_lang_general_model_service_pb2.ModelOutput() + inst = multi_lang_general_model_service_pb2.FetchInst() + for idx, name in enumerate(fetch_names): + tensor = multi_lang_general_model_service_pb2.Tensor() + v_type = self.fetch_types_[name] + if is_python: + tensor.data = model_result[name].tobytes() + else: + if v_type == 0: # int64 + tensor.int64_data.extend(model_result[name].reshape(-1) + .tolist()) + elif v_type == 1: # float32 + tensor.float_data.extend(model_result[name].reshape(-1) + .tolist()) + elif v_type == 2: # int32 + tensor.int32_data.extend(model_result[name].reshape(-1) + .tolist()) + else: + raise Exception("error type.") + tensor.shape.extend(list(model_result[name].shape)) + if name in self.lod_tensor_set_: + tensor.lod.extend(model_result["{}.lod".format(name)] + .tolist()) + inst.tensor_array.append(tensor) + model_output.insts.append(inst) + model_output.engine_name = model_name + resp.outputs.append(model_output) return resp - def inference(self, request, context): - feed_dict, fetch_names, is_python = self._unpack_request(request) - data, tag = self.bclient_.predict( + def SetTimeout(self, request, context): + # This porcess and Inference process cannot be operate at the same time. + # For performance reasons, do not add thread lock temporarily. 
+ timeout_ms = request.timeout_ms + self._init_bclient(self.model_config_path_, self.endpoints_, timeout_ms) + resp = multi_lang_general_model_service_pb2.SimpleResponse() + resp.err_code = 0 + return resp + + def Inference(self, request, context): + feed_dict, fetch_names, is_python = self._unpack_inference_request( + request) + ret = self.bclient_.predict( feed=feed_dict, fetch=fetch_names, need_variant_tag=True) - return self._pack_resp_package(data, fetch_names, is_python, tag) + return self._pack_inference_response(ret, fetch_names, is_python) + + def GetClientConfig(self, request, context): + resp = multi_lang_general_model_service_pb2.GetClientConfigResponse() + resp.client_config_str = self.model_config_str_ + return resp class MultiLangServer(object): - def __init__(self, worker_num=2): + def __init__(self): self.bserver_ = Server() - self.worker_num_ = worker_num + self.worker_num_ = 4 + self.body_size_ = 64 * 1024 * 1024 + self.concurrency_ = 100000 + self.is_multi_model_ = False # for model ensemble + + def set_max_concurrency(self, concurrency): + self.concurrency_ = concurrency + self.bserver_.set_max_concurrency(concurrency) + + def set_num_threads(self, threads): + self.worker_num_ = threads + self.bserver_.set_num_threads(threads) + + def set_max_body_size(self, body_size): + self.bserver_.set_max_body_size(body_size) + if body_size >= self.body_size_: + self.body_size_ = body_size + else: + print( + "max_body_size is less than default value, will use default value in service." + ) + + def set_port(self, port): + self.gport_ = port + + def set_reload_interval(self, interval): + self.bserver_.set_reload_interval(interval) def set_op_sequence(self, op_seq): self.bserver_.set_op_sequence(op_seq) - def load_model_config(self, model_config_path): - if not isinstance(model_config_path, str): - raise Exception( - "MultiLangServer only supports multi-model temporarily") - self.bserver_.load_model_config(model_config_path) - self.model_config_path_ = model_config_path + def set_op_graph(self, op_graph): + self.bserver_.set_op_graph(op_graph) + + def set_memory_optimize(self, flag=False): + self.bserver_.set_memory_optimize(flag) + + def set_ir_optimize(self, flag=False): + self.bserver_.set_ir_optimize(flag) + + def set_op_sequence(self, op_seq): + self.bserver_.set_op_sequence(op_seq) + + def use_mkl(self, flag): + self.bserver_.use_mkl(flag) + + def load_model_config(self, server_config_paths, client_config_path=None): + self.bserver_.load_model_config(server_config_paths) + if client_config_path is None: + if isinstance(server_config_paths, dict): + self.is_multi_model_ = True + client_config_path = '{}/serving_server_conf.prototxt'.format( + list(server_config_paths.items())[0][1]) + else: + client_config_path = '{}/serving_server_conf.prototxt'.format( + server_config_paths) + self.bclient_config_path_ = client_config_path def prepare_server(self, workdir=None, port=9292, device="cpu"): + if not self._port_is_available(port): + raise SystemExit("Prot {} is already used".format(port)) default_port = 12000 self.port_list_ = [] for i in range(1000): @@ -569,7 +657,7 @@ class MultiLangServer(object): break self.bserver_.prepare_server( workdir=workdir, port=self.port_list_[0], device=device) - self.gport_ = port + self.set_port(port) def _launch_brpc_service(self, bserver): bserver.run_server() @@ -584,12 +672,16 @@ class MultiLangServer(object): p_bserver = Process( target=self._launch_brpc_service, args=(self.bserver_, )) p_bserver.start() + options = 
[('grpc.max_send_message_length', self.body_size_), + ('grpc.max_receive_message_length', self.body_size_)] server = grpc.server( - futures.ThreadPoolExecutor(max_workers=self.worker_num_)) + futures.ThreadPoolExecutor(max_workers=self.worker_num_), + options=options, + maximum_concurrent_rpcs=self.concurrency_) multi_lang_general_model_service_pb2_grpc.add_MultiLangGeneralModelServiceServicer_to_server( - MultiLangServerService(self.model_config_path_, - ["0.0.0.0:{}".format(self.port_list_[0])]), - server) + MultiLangServerServiceServicer( + self.bclient_config_path_, self.is_multi_model_, + ["0.0.0.0:{}".format(self.port_list_[0])]), server) server.add_insecure_port('[::]:{}'.format(self.gport_)) server.start() p_bserver.join() diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index e75240dfafd436e5557a8f11396029e6be5868fe..009a6ce00af2290b64716e211429385d09189831 100644 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -53,6 +53,11 @@ def parse_args(): # pylint: disable=doc-string-missing type=int, default=512 * 1024 * 1024, help="Limit sizes of messages") + parser.add_argument( + "--use_multilang", + default=False, + action="store_true", + help="Use Multi-language-service") return parser.parse_args() @@ -67,6 +72,7 @@ def start_standard_model(): # pylint: disable=doc-string-missing ir_optim = args.ir_optim max_body_size = args.max_body_size use_mkl = args.use_mkl + use_multilang = args.use_multilang if model == "": print("You must specify your serving model") @@ -83,7 +89,11 @@ def start_standard_model(): # pylint: disable=doc-string-missing op_seq_maker.add_op(general_infer_op) op_seq_maker.add_op(general_response_op) - server = serving.Server() + server = None + if use_multilang: + server = serving.MultiLangServer() + else: + server = serving.Server() server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.set_memory_optimize(mem_optim) diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py index 5310a5eeccce2421e3680a2f1767c623f7bae441..26288f6ae65ce823a57ee201130d40ff6510c4a5 100644 --- a/python/paddle_serving_server_gpu/__init__.py +++ b/python/paddle_serving_server_gpu/__init__.py @@ -68,6 +68,11 @@ def serve_args(): type=int, default=512 * 1024 * 1024, help="Limit sizes of messages") + parser.add_argument( + "--use_multilang", + default=False, + action="store_true", + help="Use Multi-language-service") return parser.parse_args() @@ -484,22 +489,29 @@ class Server(object): os.system(command) -class MultiLangServerService( - multi_lang_general_model_service_pb2_grpc.MultiLangGeneralModelService): - def __init__(self, model_config_path, endpoints): +class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc. 
+ MultiLangGeneralModelServiceServicer): + def __init__(self, model_config_path, is_multi_model, endpoints): + self.is_multi_model_ = is_multi_model + self.model_config_path_ = model_config_path + self.endpoints_ = endpoints + with open(self.model_config_path_) as f: + self.model_config_str_ = str(f.read()) + self._parse_model_config(self.model_config_str_) + self._init_bclient(self.model_config_path_, self.endpoints_) + + def _init_bclient(self, model_config_path, endpoints, timeout_ms=None): from paddle_serving_client import Client - self._parse_model_config(model_config_path) self.bclient_ = Client() - self.bclient_.load_client_config( - "{}/serving_server_conf.prototxt".format(model_config_path)) + if timeout_ms is not None: + self.bclient_.set_rpc_timeout_ms(timeout_ms) + self.bclient_.load_client_config(model_config_path) self.bclient_.connect(endpoints) - def _parse_model_config(self, model_config_path): + def _parse_model_config(self, model_config_str): model_conf = m_config.GeneralModelConfig() - f = open("{}/serving_server_conf.prototxt".format(model_config_path), - 'r') - model_conf = google.protobuf.text_format.Merge( - str(f.read()), model_conf) + model_conf = google.protobuf.text_format.Merge(model_config_str, + model_conf) self.feed_names_ = [var.alias_name for var in model_conf.feed_var] self.feed_types_ = {} self.feed_shapes_ = {} @@ -524,7 +536,7 @@ class MultiLangServerService( else: yield item - def _unpack_request(self, request): + def _unpack_inference_request(self, request): feed_names = list(request.feed_var_names) fetch_names = list(request.fetch_var_names) is_python = request.is_python @@ -540,6 +552,8 @@ class MultiLangServerService( data = np.frombuffer(var.data, dtype="int64") elif v_type == 1: data = np.frombuffer(var.data, dtype="float32") + elif v_type == 2: + data = np.frombuffer(var.data, dtype="int32") else: raise Exception("error type.") else: @@ -547,6 +561,8 @@ class MultiLangServerService( data = np.array(list(var.int64_data), dtype="int64") elif v_type == 1: # float32 data = np.array(list(var.float_data), dtype="float32") + elif v_type == 2: + data = np.array(list(var.int32_data), dtype="int32") else: raise Exception("error type.") data.shape = list(feed_inst.tensor_array[idx].shape) @@ -554,55 +570,129 @@ class MultiLangServerService( feed_batch.append(feed_dict) return feed_batch, fetch_names, is_python - def _pack_resp_package(self, result, fetch_names, is_python, tag): - resp = multi_lang_general_model_service_pb2.Response() - # Only one model is supported temporarily - model_output = multi_lang_general_model_service_pb2.ModelOutput() - inst = multi_lang_general_model_service_pb2.FetchInst() - for idx, name in enumerate(fetch_names): - tensor = multi_lang_general_model_service_pb2.Tensor() - v_type = self.fetch_types_[name] - if is_python: - tensor.data = result[name].tobytes() - else: - if v_type == 0: # int64 - tensor.int64_data.extend(result[name].reshape(-1).tolist()) - elif v_type == 1: # float32 - tensor.float_data.extend(result[name].reshape(-1).tolist()) - else: - raise Exception("error type.") - tensor.shape.extend(list(result[name].shape)) - if name in self.lod_tensor_set_: - tensor.lod.extend(result["{}.lod".format(name)].tolist()) - inst.tensor_array.append(tensor) - model_output.insts.append(inst) - resp.outputs.append(model_output) + def _pack_inference_response(self, ret, fetch_names, is_python): + resp = multi_lang_general_model_service_pb2.InferenceResponse() + if ret is None: + resp.err_code = 1 + return resp + results, tag = 
ret resp.tag = tag + resp.err_code = 0 + if not self.is_multi_model_: + results = {'general_infer_0': results} + for model_name, model_result in results.items(): + model_output = multi_lang_general_model_service_pb2.ModelOutput() + inst = multi_lang_general_model_service_pb2.FetchInst() + for idx, name in enumerate(fetch_names): + tensor = multi_lang_general_model_service_pb2.Tensor() + v_type = self.fetch_types_[name] + if is_python: + tensor.data = model_result[name].tobytes() + else: + if v_type == 0: # int64 + tensor.int64_data.extend(model_result[name].reshape(-1) + .tolist()) + elif v_type == 1: # float32 + tensor.float_data.extend(model_result[name].reshape(-1) + .tolist()) + elif v_type == 2: # int32 + tensor.int32_data.extend(model_result[name].reshape(-1) + .tolist()) + else: + raise Exception("error type.") + tensor.shape.extend(list(model_result[name].shape)) + if name in self.lod_tensor_set_: + tensor.lod.extend(model_result["{}.lod".format(name)] + .tolist()) + inst.tensor_array.append(tensor) + model_output.insts.append(inst) + model_output.engine_name = model_name + resp.outputs.append(model_output) + return resp + + def SetTimeout(self, request, context): + # This porcess and Inference process cannot be operate at the same time. + # For performance reasons, do not add thread lock temporarily. + timeout_ms = request.timeout_ms + self._init_bclient(self.model_config_path_, self.endpoints_, timeout_ms) + resp = multi_lang_general_model_service_pb2.SimpleResponse() + resp.err_code = 0 return resp - def inference(self, request, context): - feed_dict, fetch_names, is_python = self._unpack_request(request) - data, tag = self.bclient_.predict( + def Inference(self, request, context): + feed_dict, fetch_names, is_python = self._unpack_inference_request( + request) + ret = self.bclient_.predict( feed=feed_dict, fetch=fetch_names, need_variant_tag=True) - return self._pack_resp_package(data, fetch_names, is_python, tag) + return self._pack_inference_response(ret, fetch_names, is_python) + + def GetClientConfig(self, request, context): + resp = multi_lang_general_model_service_pb2.GetClientConfigResponse() + resp.client_config_str = self.model_config_str_ + return resp class MultiLangServer(object): - def __init__(self, worker_num=2): + def __init__(self): self.bserver_ = Server() - self.worker_num_ = worker_num + self.worker_num_ = 4 + self.body_size_ = 64 * 1024 * 1024 + self.concurrency_ = 100000 + self.is_multi_model_ = False # for model ensemble + + def set_max_concurrency(self, concurrency): + self.concurrency_ = concurrency + self.bserver_.set_max_concurrency(concurrency) + + def set_num_threads(self, threads): + self.worker_num_ = threads + self.bserver_.set_num_threads(threads) + + def set_max_body_size(self, body_size): + self.bserver_.set_max_body_size(body_size) + if body_size >= self.body_size_: + self.body_size_ = body_size + else: + print( + "max_body_size is less than default value, will use default value in service." 
+ ) + + def set_port(self, port): + self.gport_ = port + + def set_reload_interval(self, interval): + self.bserver_.set_reload_interval(interval) def set_op_sequence(self, op_seq): self.bserver_.set_op_sequence(op_seq) - def load_model_config(self, model_config_path): - if not isinstance(model_config_path, str): - raise Exception( - "MultiLangServer only supports multi-model temporarily") - self.bserver_.load_model_config(model_config_path) - self.model_config_path_ = model_config_path + def set_op_graph(self, op_graph): + self.bserver_.set_op_graph(op_graph) + + def set_memory_optimize(self, flag=False): + self.bserver_.set_memory_optimize(flag) + + def set_ir_optimize(self, flag=False): + self.bserver_.set_ir_optimize(flag) + + def set_gpuid(self, gpuid=0): + self.bserver_.set_gpuid(gpuid) + + def load_model_config(self, server_config_paths, client_config_path=None): + self.bserver_.load_model_config(server_config_paths) + if client_config_path is None: + if isinstance(server_config_paths, dict): + self.is_multi_model_ = True + client_config_path = '{}/serving_server_conf.prototxt'.format( + list(server_config_paths.items())[0][1]) + else: + client_config_path = '{}/serving_server_conf.prototxt'.format( + server_config_paths) + self.bclient_config_path_ = client_config_path def prepare_server(self, workdir=None, port=9292, device="cpu"): + if not self._port_is_available(port): + raise SystemExit("Prot {} is already used".format(port)) default_port = 12000 self.port_list_ = [] for i in range(1000): @@ -612,7 +702,7 @@ class MultiLangServer(object): break self.bserver_.prepare_server( workdir=workdir, port=self.port_list_[0], device=device) - self.gport_ = port + self.set_port(port) def _launch_brpc_service(self, bserver): bserver.run_server() @@ -627,12 +717,16 @@ class MultiLangServer(object): p_bserver = Process( target=self._launch_brpc_service, args=(self.bserver_, )) p_bserver.start() + options = [('grpc.max_send_message_length', self.body_size_), + ('grpc.max_receive_message_length', self.body_size_)] server = grpc.server( - futures.ThreadPoolExecutor(max_workers=self.worker_num_)) + futures.ThreadPoolExecutor(max_workers=self.worker_num_), + options=options, + maximum_concurrent_rpcs=self.concurrency_) multi_lang_general_model_service_pb2_grpc.add_MultiLangGeneralModelServiceServicer_to_server( - MultiLangServerService(self.model_config_path_, - ["0.0.0.0:{}".format(self.port_list_[0])]), - server) + MultiLangServerServiceServicer( + self.bclient_config_path_, self.is_multi_model_, + ["0.0.0.0:{}".format(self.port_list_[0])]), server) server.add_insecure_port('[::]:{}'.format(self.gport_)) server.start() p_bserver.join() diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py index 309896a876bda5fc9b1baceb089242baa6d77dc5..e26b32c2699d09b714b2658cafad0ae8c5138071 100644 --- a/python/paddle_serving_server_gpu/serve.py +++ b/python/paddle_serving_server_gpu/serve.py @@ -37,6 +37,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss mem_optim = args.mem_optim ir_optim = args.ir_optim max_body_size = args.max_body_size + use_multilang = args.use_multilang workdir = "{}_{}".format(args.workdir, gpuid) if model == "": @@ -54,7 +55,10 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss op_seq_maker.add_op(general_infer_op) op_seq_maker.add_op(general_response_op) - server = serving.Server() + if use_multilang: + server = serving.MultiLangServer() + else: + server = serving.Server() 
server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.set_memory_optimize(mem_optim) diff --git a/tools/serving_build.sh b/tools/serving_build.sh index b70936e5522b52964518d017fece3ceb78d66b87..097123165988fb266f7c4a3a0da603ade6d98be1 100644 --- a/tools/serving_build.sh +++ b/tools/serving_build.sh @@ -134,6 +134,7 @@ function build_server() { function kill_server_process() { ps -ef | grep "serving" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill + sleep 1 } function python_test_fit_a_line() { @@ -246,6 +247,7 @@ function python_run_criteo_ctr_with_cube() { echo "criteo_ctr_with_cube inference auc test success" kill_server_process ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill + sleep 1 ;; GPU) check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz" @@ -261,6 +263,8 @@ function python_run_criteo_ctr_with_cube() { check_cmd "mkdir work_dir1 && cp cube/conf/cube.conf ./work_dir1/" python test_server_gpu.py ctr_serving_model_kv & sleep 5 + # for warm up + python test_client.py ctr_client_conf/serving_client_conf.prototxt ./ut_data > /dev/null || true check_cmd "python test_client.py ctr_client_conf/serving_client_conf.prototxt ./ut_data >score" tail -n 2 score | awk 'NR==1' AUC=$(tail -n 2 score | awk 'NR==1') @@ -273,6 +277,7 @@ function python_run_criteo_ctr_with_cube() { echo "criteo_ctr_with_cube inference auc test success" kill_server_process ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill + sleep 1 ;; *) echo "error type" @@ -484,6 +489,7 @@ function python_test_lac() { setproxy # recover proxy state kill_server_process ps -ef | grep "lac_web_service" | grep -v grep | awk '{print $2}' | xargs kill + sleep 1 echo "lac CPU HTTP inference pass" ;; GPU) @@ -499,6 +505,143 @@ function python_test_lac() { cd .. } +function python_test_grpc_impl() { + # pwd: /Serving/python/examples + cd grpc_impl_example # pwd: /Serving/python/examples/grpc_impl_example + local TYPE=$1 + export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving + unsetproxy + case $TYPE in + CPU) + # test general case + cd fit_a_line # pwd: /Serving/python/examples/grpc_impl_example/fit_a_line + sh get_data.sh + + # one line command start + check_cmd "python -m paddle_serving_server.serve --model uci_housing_model --port 9393 --thread 4 --use_multilang > /dev/null &" + sleep 5 # wait for the server to start + check_cmd "python test_sync_client.py > /dev/null" + check_cmd "python test_asyn_client.py > /dev/null" + check_cmd "python test_general_pb_client.py > /dev/null" + check_cmd "python test_numpy_input_client.py > /dev/null" + check_cmd "python test_batch_client.py > /dev/null" + check_cmd "python test_timeout_client.py > /dev/null" + kill_server_process + + check_cmd "python test_server.py uci_housing_model > /dev/null &" + sleep 5 # wait for the server to start + check_cmd "python test_sync_client.py > /dev/null" + check_cmd "python test_asyn_client.py > /dev/null" + check_cmd "python test_general_pb_client.py > /dev/null" + check_cmd "python test_numpy_input_client.py > /dev/null" + check_cmd "python test_batch_client.py > /dev/null" + check_cmd "python test_timeout_client.py > /dev/null" + kill_server_process + + cd .. 
# pwd: /Serving/python/examples/grpc_impl_example + + # test load server config and client config in Server side + cd criteo_ctr_with_cube # pwd: /Serving/python/examples/grpc_impl_example/criteo_ctr_with_cube + + check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz" + check_cmd "tar xf ctr_cube_unittest.tar.gz" + check_cmd "mv models/ctr_client_conf ./" + check_cmd "mv models/ctr_serving_model_kv ./" + check_cmd "mv models/data ./cube/" + check_cmd "mv models/ut_data ./" + cp ../../../../build-server-$TYPE/output/bin/cube* ./cube/ + sh cube_prepare.sh & + check_cmd "mkdir work_dir1 && cp cube/conf/cube.conf ./work_dir1/" + python test_server.py ctr_serving_model_kv ctr_client_conf/serving_client_conf.prototxt & + sleep 5 + check_cmd "python test_client.py ./ut_data >score" + tail -n 2 score | awk 'NR==1' + AUC=$(tail -n 2 score | awk 'NR==1') + VAR2="0.67" #TODO: temporarily relax the threshold to 0.67 + RES=$( echo "$AUC>$VAR2" | bc ) + if [[ $RES -eq 0 ]]; then + echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.67" + exit 1 + fi + echo "grpc impl test success" + kill_server_process + ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill + + cd .. # pwd: /Serving/python/examples/grpc_impl_example + ;; + GPU) + export CUDA_VISIBLE_DEVICES=0 + # test general case + cd fit_a_line # pwd: /Serving/python/examples/grpc_impl_example/fit_a_line + sh get_data.sh + + # one line command start + check_cmd "python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9393 --thread 4 --gpu_ids 0 --use_multilang > /dev/null &" + sleep 5 # wait for the server to start + check_cmd "python test_sync_client.py > /dev/null" + check_cmd "python test_asyn_client.py > /dev/null" + check_cmd "python test_general_pb_client.py > /dev/null" + check_cmd "python test_numpy_input_client.py > /dev/null" + check_cmd "python test_batch_client.py > /dev/null" + check_cmd "python test_timeout_client.py > /dev/null" + kill_server_process + + check_cmd "python test_server_gpu.py uci_housing_model > /dev/null &" + sleep 5 # wait for the server to start + check_cmd "python test_sync_client.py > /dev/null" + check_cmd "python test_asyn_client.py > /dev/null" + check_cmd "python test_general_pb_client.py > /dev/null" + check_cmd "python test_numpy_input_client.py > /dev/null" + check_cmd "python test_batch_client.py > /dev/null" + check_cmd "python test_timeout_client.py > /dev/null" + kill_server_process + ps -ef | grep "test_server_gpu" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill + + cd .. 
# pwd: /Serving/python/examples/grpc_impl_example + + # test load server config and client config in Server side + cd criteo_ctr_with_cube # pwd: /Serving/python/examples/grpc_impl_example/criteo_ctr_with_cube + + check_cmd "wget https://paddle-serving.bj.bcebos.com/unittest/ctr_cube_unittest.tar.gz" + check_cmd "tar xf ctr_cube_unittest.tar.gz" + check_cmd "mv models/ctr_client_conf ./" + check_cmd "mv models/ctr_serving_model_kv ./" + check_cmd "mv models/data ./cube/" + check_cmd "mv models/ut_data ./" + cp ../../../../build-server-$TYPE/output/bin/cube* ./cube/ + sh cube_prepare.sh & + check_cmd "mkdir work_dir1 && cp cube/conf/cube.conf ./work_dir1/" + python test_server_gpu.py ctr_serving_model_kv ctr_client_conf/serving_client_conf.prototxt & + sleep 5 + # for warm up + python test_client.py ./ut_data &> /dev/null || true + check_cmd "python test_client.py ./ut_data >score" + tail -n 2 score | awk 'NR==1' + AUC=$(tail -n 2 score | awk 'NR==1') + VAR2="0.67" #TODO: temporarily relax the threshold to 0.67 + RES=$( echo "$AUC>$VAR2" | bc ) + if [[ $RES -eq 0 ]]; then + echo "error with criteo_ctr_with_cube inference auc test, auc should > 0.67" + exit 1 + fi + echo "grpc impl test success" + kill_server_process + ps -ef | grep "test_server_gpu" | grep -v serving_build | grep -v grep | awk '{print $2}' | xargs kill + ps -ef | grep "cube" | grep -v grep | awk '{print $2}' | xargs kill + cd .. # pwd: /Serving/python/examples/grpc_impl_example + ;; + *) + echo "error type" + exit 1 + ;; + esac + echo "test grpc impl $TYPE part finished as expected." + setproxy + unset SERVING_BIN + cd .. # pwd: /Serving/python/examples +} + + function python_test_yolov4(){ #pwd:/ Serving/python/examples local TYPE=$1 @@ -546,6 +689,7 @@ function python_run_test() { python_test_multi_process $TYPE # pwd: /Serving/python/examples python_test_multi_fetch $TYPE # pwd: /Serving/python/examples python_test_yolov4 $TYPE # pwd: /Serving/python/examples + python_test_grpc_impl $TYPE # pwd: /Serving/python/examples echo "test python $TYPE part finished as expected." cd ../.. # pwd: /Serving } @@ -804,3 +948,4 @@ function main() { } main $@ +exit 0
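
---

For reference, below is a minimal client-side sketch of how the `MultiLangClient` API added in this patch is meant to be used, against a server started in multi-language mode (e.g. `python -m paddle_serving_server.serve --model uci_housing_model --port 9393 --thread 4 --use_multilang`, as in the `serving_build.sh` test above). The port, the feed variable name `"x"`, the fetch variable name `"price"`, and the sample input values are assumptions borrowed from the fit_a_line example, not taken from this patch; treat this as an illustrative sketch rather than the canonical usage.

```python
# Sketch only: assumes a --use_multilang server on 127.0.0.1:9393 serving the
# fit_a_line (uci_housing) model, with feed var "x" and fetch var "price".
import numpy as np
from paddle_serving_client import MultiLangClient

client = MultiLangClient()
# No load_client_config step: connect() fetches the client config from the
# server over RPC (GetClientConfig) and parses it internally.
client.connect(["127.0.0.1:9393"])
# The RPC timeout must be set after connect(); the value is in milliseconds,
# and the server rebuilds its internal bRPC client to apply it.
client.set_rpc_timeout_ms(100000)

# Illustrative 13-dim input for uci_housing (values are placeholders).
x = np.array([0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
              -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332], dtype="float32")

# Synchronous call: the returned fetch_map carries "serving_status_code",
# 0 on success or the grpc.StatusCode on RPC failure (e.g. deadline exceeded).
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
if fetch_map["serving_status_code"] == 0:
    print(fetch_map["price"])

# Asynchronous call: asyn=True returns a MultiLangPredictFuture;
# result() blocks until the response (or RPC error) arrives.
future = client.predict(feed={"x": x}, fetch=["price"], asyn=True)
fetch_map = future.result()
print(fetch_map)
```

When the server runs a multi-model op graph (as in the imdb ensemble example in this patch), the returned value is instead a dict keyed by engine name, with one fetch_map per model, plus the same `serving_status_code` entry.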