diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 749b71c134beccf0000aebf7efb63ad4d4f08c1d..61d043074a2ef65d78086e48beefab1388c8e7ae --- a/README.md +++ b/README.md @@ -176,8 +176,8 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p | Argument | Type | Default | Description | | ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- | | `thread` | int | `2` | Number of brpc service thread | -| `op_num` | int[]| `0` | Thread Number for each model in asynchronous mode | -| `op_max_batch` | int[]| `0` | Batch Number for each model in asynchronous mode | +| `runtime_thread_num` | int[]| `0` | Thread Number for each model in asynchronous mode | +| `batch_infer_size` | int[]| `32` | Batch Number for each model in asynchronous mode | | `gpu_ids` | str[]| `"-1"` | Gpu card id for each model | | `port` | int | `9292` | Exposed port of current service to users | | `model` | str[]| `""` | Path of paddle model directory to be served | @@ -197,8 +197,8 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p In asynchronous mode, each model will start n threads of the number you specify, and each thread contains a model instance. In other words, each model is equivalent to a thread pool containing N threads, and the task is taken from the task queue of the thread pool to execute. In asynchronous mode, each RPC server thread is only responsible for putting the request into the task queue of the model thread pool. After the task is executed, the completed task is removed from the task queue. In the above table, the number of RPC server threads is specified by --thread, and the default value is 2. - --op_num specifies the number of threads in the thread pool of each model. The default value is 0, indicating that asynchronous mode is not used. - --op_max_batch specifies the number of batches for each model. The default value is 32. It takes effect when --op_num is not 0. + --runtime_thread_num specifies the number of threads in the thread pool of each model. The default value is 0, indicating that asynchronous mode is not used. + --batch_infer_size specifies the number of batches for each model. The default value is 32. It takes effect when --runtime_thread_num is not 0. #### When you want a model to use multiple GPU cards. python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2 #### When you want 2 models. @@ -206,7 +206,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_m #### When you want 2 models, and want each of them use multiple GPU cards. python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 #### When a service contains two models, and each model needs to specify multiple GPU cards, and needs asynchronous mode, each model specifies different concurrency number. 
-python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8 +python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --runtime_thread_num 4 8 ```python diff --git a/README_CN.md b/README_CN.md old mode 100644 new mode 100755 index a30b04e30d2e5805b1b5fe700ae81a70b379eaae..f766c57365bdebb665b1154fcdbadd1e4b8599e0 --- a/README_CN.md +++ b/README_CN.md @@ -175,8 +175,8 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p | Argument | Type | Default | Description | | ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- | | `thread` | int | `2` | Number of brpc service thread | -| `op_num` | int[]| `0` | Thread Number for each model in asynchronous mode | -| `op_max_batch` | int[]| `32` | Batch Number for each model in asynchronous mode | +| `runtime_thread_num` | int[]| `0` | Thread Number for each model in asynchronous mode | +| `batch_infer_size` | int[]| `32` | Batch Number for each model in asynchronous mode | | `gpu_ids` | str[]| `"-1"` | Gpu card id for each model | | `port` | int | `9292` | Exposed port of current service to users | | `model` | str[]| `""` | Path of paddle model directory to be served | @@ -195,8 +195,8 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p 异步模式有助于提高Service服务的吞吐(QPS),但对于单次请求而言,时延会有少量增加。 异步模式中,每个模型会启动您指定个数的N个线程,每个线程中包含一个模型实例,换句话说每个模型相当于包含N个线程的线程池,从线程池的任务队列中取任务来执行。 异步模式中,各个RPC Server的线程只负责将Request请求放入模型线程池的任务队列中,等任务被执行完毕后,再从任务队列中取出已完成的任务。 - 上表中通过 --thread 10 指定的是RPC Server的线程数量,默认值为2,--op_num 指定的是各个模型的线程池中线程数N,默认值为0,表示不使用异步模式。 - --op_max_batch 指定的各个模型的batch数量,默认值为32,该参数只有当--op_num不为0时才生效。 + 上表中通过 --thread 10 指定的是RPC Server的线程数量,默认值为2,--runtime_thread_num 指定的是各个模型的线程池中线程数N,默认值为0,表示不使用异步模式。 + --batch_infer_size 指定的各个模型的batch数量,默认值为32,该参数只有当--runtime_thread_num不为0时才生效。 #### 当您的某个模型想使用多张GPU卡部署时. python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2 @@ -205,7 +205,7 @@ python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_m #### 当您的一个服务包含两个模型,且每个模型都需要指定多张GPU卡部署时. python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 #### 当您的一个服务包含两个模型,且每个模型都需要指定多张GPU卡,且需要异步模式每个模型指定不同的并发数时. 
-python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8 +python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --runtime_thread_num 4 8 diff --git a/python/examples/pipeline/PaddleDetection/faster_rcnn/config.yml b/python/examples/pipeline/PaddleDetection/faster_rcnn/config.yml index 0bcb6c288914acc852c82974eb7eacf560784255..891b4b997c2ebb98d6694464b5dbe0532c01145c 100644 --- a/python/examples/pipeline/PaddleDetection/faster_rcnn/config.yml +++ b/python/examples/pipeline/PaddleDetection/faster_rcnn/config.yml @@ -1,18 +1,29 @@ dag: + #op资源类型, True, 为线程模型;False,为进程模型 is_thread_op: false + #使用性能分析, True,生成Timeline性能数据,对性能有一定影响;False为不使用 tracer: interval_s: 30 +#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时,不自动生成http_port http_port: 18082 op: faster_rcnn: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 2 - local_service_conf: + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 client_type: local_predictor + # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu device_type: 1 + #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: '2' + #Fetch结果列表,以bert_seq128_model中fetch_var的alias_name为准, 如果没有设置则全部返回 fetch_list: - save_infer_model/scale_0.tmp_1 + #模型路径 model_config: serving_server/ +#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时,会自动将rpc_port设置为http_port+1 rpc_port: 9998 +#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程,每个进程内构建grpcSever和DAG +#当build_dag_each_worker=False时,框架会设置主线程grpc线程池的max_workers=worker_num worker_num: 20 diff --git a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/config.yml b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/config.yml index 0476d4ce6554af31e21d0c2ea0473e23de18523f..71e93f39c7979522e73058af7fa2969575b5129c 100644 --- a/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/config.yml +++ b/python/examples/pipeline/PaddleDetection/ppyolo_mbv3/config.yml @@ -1,18 +1,30 @@ dag: + #op资源类型, True, 为线程模型;False,为进程模型 is_thread_op: false + #使用性能分析, True,生成Timeline性能数据,对性能有一定影响;False为不使用 tracer: interval_s: 30 +#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时,不自动生成http_port http_port: 18082 op: ppyolo_mbv3: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 10 local_service_conf: + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 client_type: local_predictor + # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu device_type: 1 + #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: '2' + #Fetch结果列表,以bert_seq128_model中fetch_var的alias_name为准, 如果没有设置则全部返回 fetch_list: - save_infer_model/scale_0.tmp_1 + #模型路径 model_config: serving_server/ +#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时,会自动将rpc_port设置为http_port+1 rpc_port: 9998 +#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程,每个进程内构建grpcSever和DAG +#当build_dag_each_worker=False时,框架会设置主线程grpc线程池的max_workers=worker_num worker_num: 20 diff --git a/python/examples/pipeline/PaddleDetection/yolov3/config.yml b/python/examples/pipeline/PaddleDetection/yolov3/config.yml index 20653280736316c87d50786e76db5ba842040525..0f6d839edd3467e4dca203b9a21db850db3f4d5e 100644 --- a/python/examples/pipeline/PaddleDetection/yolov3/config.yml +++ b/python/examples/pipeline/PaddleDetection/yolov3/config.yml @@ -1,18 +1,29 @@ dag: + #op资源类型, True, 为线程模型;False,为进程模型 
is_thread_op: false + #使用性能分析, True,生成Timeline性能数据,对性能有一定影响;False为不使用 tracer: interval_s: 30 +#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时,不自动生成http_port http_port: 18082 op: yolov3: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 10 - local_service_conf: + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 client_type: local_predictor + # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu device_type: 1 + #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: '2' + #Fetch结果列表,以bert_seq128_model中fetch_var的alias_name为准, 如果没有设置则全部返回 fetch_list: - save_infer_model/scale_0.tmp_1 + #模型路径 model_config: serving_server/ +#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时,会自动将rpc_port设置为http_port+1 rpc_port: 9998 +#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程,每个进程内构建grpcSever和DAG +#当build_dag_each_worker=False时,框架会设置主线程grpc线程池的max_workers=worker_num worker_num: 20 diff --git a/python/examples/pipeline/bert/config.yml b/python/examples/pipeline/bert/config.yml index a2b39264dd78ccb8f2936c7bd603d1c3d57b2574..5f1226646bb1a14fee3460bc98e25321b6aaa27a 100644 --- a/python/examples/pipeline/bert/config.yml +++ b/python/examples/pipeline/bert/config.yml @@ -1,17 +1,32 @@ +#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程,每个进程内构建grpcSever和DAG +##当build_dag_each_worker=False时,框架会设置主线程grpc线程池的max_workers=worker_num worker_num: 20 +#build_dag_each_worker, False,框架在进程内创建一条DAG;True,框架会每个进程内创建多个独立的DAG +build_dag_each_worker: false + dag: + #op资源类型, True, 为线程模型;False,为进程模型 is_thread_op: false + #使用性能分析, True,生成Timeline性能数据,对性能有一定影响;False为不使用 tracer: interval_s: 10 +#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时,不自动生成http_port http_port: 18082 +#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时,会自动将rpc_port设置为http_port+1 rpc_port: 9998 op: bert: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 concurrency: 2 - + #当op配置没有server_endpoints时,从local_service_conf读取本地服务配置 local_service_conf: + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 client_type: local_predictor + # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu device_type: 1 + #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: '2' + #Fetch结果列表,以bert_seq128_model中fetch_var的alias_name为准, 如果没有设置则全部返回 fetch_list: + #bert模型路径 model_config: bert_seq128_model/ diff --git a/python/examples/pipeline/ocr/config.yml b/python/examples/pipeline/ocr/config.yml index 58e3ed54d5d286290ff4846364c2393af427bd9d..2767fa77ceaa975c4e20bedaaf13ffa0e2b35de3 100644 --- a/python/examples/pipeline/ocr/config.yml +++ b/python/examples/pipeline/ocr/config.yml @@ -38,6 +38,9 @@ op: #Fetch结果列表,以client_config中fetch_var的alias_name为准 fetch_list: ["concat_1.tmp_0"] + + # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu + device_type: 0 #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: "" @@ -71,6 +74,8 @@ op: #Fetch结果列表,以client_config中fetch_var的alias_name为准 fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"] + # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu + device_type: 0 #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: "" diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index f0a9d699600c3a1993514c70e4667b0d6c4e5a05..0447f5ecb5dd6ede7b53758a7601a82b21bbb1e9 100755 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ 
-109,7 +109,12 @@ def is_gpu_mode(unformatted_gpus): def serve_args(): parser = argparse.ArgumentParser("serve") - parser.add_argument("server", type=str, default="start",nargs="?", help="stop or start PaddleServing") + parser.add_argument( + "server", + type=str, + default="start", + nargs="?", + help="stop or start PaddleServing") parser.add_argument( "--thread", type=int, @@ -123,9 +128,13 @@ def serve_args(): parser.add_argument( "--gpu_ids", type=str, default="", nargs="+", help="gpu ids") parser.add_argument( - "--op_num", type=int, default=0, nargs="+", help="Number of each op") + "--runtime_thread_num", + type=int, + default=0, + nargs="+", + help="Number of each op") parser.add_argument( - "--op_max_batch", + "--batch_infer_size", type=int, default=32, nargs="+", @@ -251,11 +260,11 @@ def start_gpu_card_model(gpu_mode, port, args): # pylint: disable=doc-string-mi if args.gpu_multi_stream and device == "gpu": server.set_gpu_multi_stream() - if args.op_num: - server.set_op_num(args.op_num) + if args.runtime_thread_num: + server.set_runtime_thread_num(args.runtime_thread_num) - if args.op_max_batch: - server.set_op_max_batch(args.op_max_batch) + if args.batch_infer_size: + server.set_batch_infer_size(args.batch_infer_size) if args.use_lite: server.set_lite() @@ -370,7 +379,7 @@ class MainService(BaseHTTPRequestHandler): self.wfile.write(json.dumps(response).encode()) -def stop_serving(command : str, port : int = None): +def stop_serving(command: str, port: int=None): ''' Stop PaddleServing by port. @@ -400,7 +409,7 @@ def stop_serving(command : str, port : int = None): start_time = info["start_time"] if port is not None: if port in storedPort: - kill_stop_process_by_pid(command ,pid) + kill_stop_process_by_pid(command, pid) infoList.remove(info) if len(infoList): with open(filepath, "w") as fp: @@ -410,17 +419,18 @@ def stop_serving(command : str, port : int = None): return True else: if lastInfo == info: - raise ValueError( - "Please confirm the port [%s] you specified is correct." % - port) + raise ValueError( + "Please confirm the port [%s] you specified is correct." + % port) else: pass else: - kill_stop_process_by_pid(command ,pid) + kill_stop_process_by_pid(command, pid) if lastInfo == info: os.remove(filepath) return True + if __name__ == "__main__": # args.device is not used at all. # just keep the interface. 
@@ -436,7 +446,7 @@ if __name__ == "__main__": os._exit(0) else: os._exit(-1) - + for single_model_config in args.model: if os.path.isdir(single_model_config): pass diff --git a/python/paddle_serving_server/server.py b/python/paddle_serving_server/server.py index 0510579d7c4225d9bef81d880e01642ea93efd90..909cb8764bb7572af079e9a51c4f76dbd86441e0 100755 --- a/python/paddle_serving_server/server.py +++ b/python/paddle_serving_server/server.py @@ -82,8 +82,8 @@ class Server(object): self.mkl_flag = False self.device = "cpu" self.gpuid = [] - self.op_num = [0] - self.op_max_batch = [32] + self.runtime_thread_num = [0] + self.batch_infer_size = [32] self.use_trt = False self.gpu_multi_stream = False self.use_lite = False @@ -171,11 +171,11 @@ class Server(object): def set_gpuid(self, gpuid): self.gpuid = format_gpu_to_strlist(gpuid) - def set_op_num(self, op_num): - self.op_num = op_num + def set_runtime_thread_num(self, runtime_thread_num): + self.runtime_thread_num = runtime_thread_num - def set_op_max_batch(self, op_max_batch): - self.op_max_batch = op_max_batch + def set_batch_infer_size(self, batch_infer_size): + self.batch_infer_size = batch_infer_size def set_trt(self): self.use_trt = True @@ -205,15 +205,15 @@ class Server(object): else: self.gpuid = ["-1"] - if isinstance(self.op_num, int): - self.op_num = [self.op_num] - if len(self.op_num) == 0: - self.op_num.append(0) + if isinstance(self.runtime_thread_num, int): + self.runtime_thread_num = [self.runtime_thread_num] + if len(self.runtime_thread_num) == 0: + self.runtime_thread_num.append(0) - if isinstance(self.op_max_batch, int): - self.op_max_batch = [self.op_max_batch] - if len(self.op_max_batch) == 0: - self.op_max_batch.append(32) + if isinstance(self.batch_infer_size, int): + self.batch_infer_size = [self.batch_infer_size] + if len(self.batch_infer_size) == 0: + self.batch_infer_size.append(32) index = 0 @@ -224,9 +224,10 @@ class Server(object): engine.reloadable_meta = model_config_path + "/fluid_time_file" os.system("touch {}".format(engine.reloadable_meta)) engine.reloadable_type = "timestamp_ne" - engine.runtime_thread_num = self.op_num[index % len(self.op_num)] - engine.batch_infer_size = self.op_max_batch[index % - len(self.op_max_batch)] + engine.runtime_thread_num = self.runtime_thread_num[index % len( + self.runtime_thread_num)] + engine.batch_infer_size = self.batch_infer_size[index % len( + self.batch_infer_size)] engine.enable_overrun = False engine.allow_split_request = True diff --git a/python/paddle_serving_server/web_service.py b/python/paddle_serving_server/web_service.py index fd1810e3fac2caa7f1eeb9c4b921a704743dcca8..677d24a4e4a3101e5d0c2b33c8f50ba3f61421dc 100755 --- a/python/paddle_serving_server/web_service.py +++ b/python/paddle_serving_server/web_service.py @@ -133,8 +133,8 @@ class WebService(object): use_calib=False, use_trt=False, gpu_multi_stream=False, - op_num=None, - op_max_batch=None): + runtime_thread_num=None, + batch_infer_size=None): device = "cpu" server = Server() @@ -187,11 +187,11 @@ class WebService(object): if gpu_multi_stream and device == "gpu": server.set_gpu_multi_stream() - if op_num: - server.set_op_num(op_num) + if runtime_thread_num: + server.set_runtime_thread_num(runtime_thread_num) - if op_max_batch: - server.set_op_max_batch(op_max_batch) + if batch_infer_size: + server.set_batch_infer_size(batch_infer_size) if use_lite: server.set_lite() @@ -225,8 +225,8 @@ class WebService(object): use_calib=self.use_calib, use_trt=self.use_trt, gpu_multi_stream=self.gpu_multi_stream, - 
op_num=self.op_num, - op_max_batch=self.op_max_batch)) + runtime_thread_num=self.runtime_thread_num, + batch_infer_size=self.batch_infer_size)) def prepare_server(self, workdir, @@ -241,8 +241,8 @@ class WebService(object): mem_optim=True, use_trt=False, gpu_multi_stream=False, - op_num=None, - op_max_batch=None, + runtime_thread_num=None, + batch_infer_size=None, gpuid=None): print("This API will be deprecated later. Please do not use it") self.workdir = workdir @@ -259,9 +259,9 @@ class WebService(object): self.port_list = [] self.use_trt = use_trt self.gpu_multi_stream = gpu_multi_stream - self.op_num = op_num - self.op_max_batch = op_max_batch - + self.runtime_thread_num = runtime_thread_num + self.batch_infer_size = batch_infer_size + # record port and pid info for stopping process dump_pid_file([self.port], "web_service") # if gpuid != None, we will use gpuid first. diff --git a/tools/Dockerfile.cuda10.1-cudnn7.devel b/tools/Dockerfile.cuda10.1-cudnn7.devel index 24087af9490b8b5f4b7f57d70cb927c580da6066..1ed462ec4c1df845bc461577d97c3fee7d5852d6 100644 --- a/tools/Dockerfile.cuda10.1-cudnn7.devel +++ b/tools/Dockerfile.cuda10.1-cudnn7.devel @@ -83,7 +83,7 @@ RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3 && ln -sf /usr/local/ RUN rm -r /root/python_build # Install Go and glide -RUN wget -qO- https://dl.google.com/go/go1.14.linux-amd64.tar.gz | \ +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/go && \ mkdir /root/go/bin && \ diff --git a/tools/Dockerfile.cuda10.2-cudnn7.devel b/tools/Dockerfile.cuda10.2-cudnn7.devel index 6425a7a39ec1ca84a3f4d5ab305bcb6b413862bc..eee59b6e43ac18fc645dfb9c8399b33dff9f0e6d 100644 --- a/tools/Dockerfile.cuda10.2-cudnn7.devel +++ b/tools/Dockerfile.cuda10.2-cudnn7.devel @@ -83,7 +83,7 @@ RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3 && ln -sf /usr/local/ RUN rm -r /root/python_build # Install Go and glide -RUN wget -qO- https://dl.google.com/go/go1.14.linux-amd64.tar.gz | \ +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/go && \ mkdir /root/go/bin && \ diff --git a/tools/Dockerfile.cuda10.2-cudnn8.devel b/tools/Dockerfile.cuda10.2-cudnn8.devel index d07731343bb9bfd28f59dd4dcf240bcb26d302f5..5ba14c77c3ed3f479db5e05e9c9fbc8e6468dab6 100644 --- a/tools/Dockerfile.cuda10.2-cudnn8.devel +++ b/tools/Dockerfile.cuda10.2-cudnn8.devel @@ -83,7 +83,7 @@ RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3 && ln -sf /usr/local/ RUN rm -r /root/python_build # Install Go and glide -RUN wget -qO- https://dl.google.com/go/go1.14.linux-amd64.tar.gz | \ +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/go && \ mkdir /root/go/bin && \ diff --git a/tools/Dockerfile.cuda11.2-cudnn8.devel b/tools/Dockerfile.cuda11.2-cudnn8.devel new file mode 100644 index 0000000000000000000000000000000000000000..363096b1ddd48268275992941c740c9d8d34e868 --- /dev/null +++ b/tools/Dockerfile.cuda11.2-cudnn8.devel @@ -0,0 +1,147 @@ +# A image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment +# When you modify it, please be aware of cudnn-runtime version +FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu16.04 +MAINTAINER PaddlePaddle Authors + +# ENV variables +ARG WITH_GPU +ARG WITH_AVX + +ENV WITH_GPU=${WITH_GPU:-ON} +ENV WITH_AVX=${WITH_AVX:-ON} + +ENV HOME /root +# Add bash enhancements +COPY tools/dockerfiles/root/ 
/root/ + +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +RUN apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + patchelf git python-pip python-dev python-opencv openssh-server bison \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ + curl sed grep graphviz libjpeg-dev zlib1g-dev \ + python-matplotlib unzip \ + automake locales clang-format swig \ + liblapack-dev liblapacke-dev libcurl4-openssl-dev \ + net-tools libtool module-init-tools vim && \ + apt-get clean -y + +RUN ln -s /usr/lib/x86_64-linux-gnu/libssl.so /usr/lib/libssl.so.10 && \ + ln -s /usr/lib/x86_64-linux-gnu/libcrypto.so /usr/lib/libcrypto.so.10 + +RUN wget https://github.com/koalaman/shellcheck/releases/download/v0.7.1/shellcheck-v0.7.1.linux.x86_64.tar.xz -O shellcheck-v0.7.1.linux.x86_64.tar.xz && \ + tar -xf shellcheck-v0.7.1.linux.x86_64.tar.xz && cp shellcheck-v0.7.1/shellcheck /usr/bin/shellcheck && \ + rm -rf shellcheck-v0.7.1.linux.x86_64.tar.xz shellcheck-v0.7.1 + +# Downgrade gcc&&g++ +WORKDIR /usr/bin + COPY tools/dockerfiles/build_scripts /build_scripts + RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts + RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ + ENV PATH=/usr/local/gcc-8.2/bin:$PATH + +# install cmake +WORKDIR /home +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz + +RUN wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig && cd .. && rm -rf Python-3.6.0* + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig && cd .. && rm -rf Python-3.7.0* + +# Install Python3.8 +RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \ + tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig && cd .. 
&& rm -rf Python-3.8.0* + +ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} +RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3 && ln -sf /usr/local/bin/python3.6 /usr/bin/python3 && ln -sf /usr/local/bin/pip3.6 /usr/local/bin/pip3 && ln -sf /usr/local/bin/pip3.6 /usr/bin/pip3 + +RUN rm -r /root/python_build + +# Install Go and glide +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/go && \ + mkdir /root/go/bin && \ + mkdir /root/go/src && \ + echo "GOROOT=/usr/local/go" >> /root/.bashrc && \ + echo "GOPATH=/root/go" >> /root/.bashrc && \ + echo "PATH=/usr/local/go/bin:/root/go/bin:$PATH" >> /root/.bashrc +ENV GOROOT=/usr/local/go GOPATH=/root/go +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=/usr/local/go/bin:/root/go/bin:${PATH} + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two minor changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. + +# Downgrade TensorRT +COPY tools/dockerfiles/build_scripts /build_scripts +RUN bash /build_scripts/install_trt.sh cuda11.2 +RUN rm -rf /build_scripts + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN apt-get install libprotobuf-dev -y + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. +RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ + dpkg -i patchelf_0.10-2_amd64.deb + +# Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +CMD source ~/.bashrc + +# ccache 3.7.9 +RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ + tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ + ./configure -prefix=/usr/local/ccache-3.7.9 && \ + make -j8 && make install && \ + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + +RUN python3.8 -m pip install --upgrade pip==21.1.1 requests && \ + python3.7 -m pip install --upgrade pip==21.1.1 requests && \ + python3.6 -m pip install --upgrade pip==21.1.1 requests + +RUN wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && \ + tar xf centos_ssl.tar && rm -rf centos_ssl.tar && \ + mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && \ + ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && \ + ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && \ + ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && \ + ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so + +EXPOSE 22 diff --git a/tools/Dockerfile.devel b/tools/Dockerfile.devel index be31b2e9abd90f644eb0f94a6d672639e4b7f6c5..287759e8f82f3fc37200bb791a1bd6530ab6516e 100644 --- a/tools/Dockerfile.devel +++ b/tools/Dockerfile.devel @@ -83,7 +83,7 @@ RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3 && ln -sf /usr/local/ RUN rm -r /root/python_build # Install Go and glide -RUN wget -qO- https://dl.google.com/go/go1.14.linux-amd64.tar.gz | \ +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/go && \ mkdir /root/go/bin && \
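
For reference, a minimal sketch of how the renamed asynchronous-mode setters might be driven from the Python `Server` API. Only `set_runtime_thread_num`, `set_batch_infer_size`, and `set_gpuid` are confirmed by the diff above; the surrounding `load_model_config`/`prepare_server`/`run_server` calls follow the usual `paddle_serving_server` workflow, and the `uci_housing_model` directory is the example model used in the README.

```python
# Sketch only: configuring asynchronous mode through the renamed Server setters.
from paddle_serving_server.server import Server

server = Server()
server.load_model_config("uci_housing_model")  # path to a served model directory
server.set_gpuid(["0,1"])                      # one GPU-id string per model
server.set_runtime_thread_num([4])             # was set_op_num: thread-pool size per model (0 = sync mode)
server.set_batch_infer_size([16])              # was set_op_max_batch: max batch assembled per inference
server.prepare_server(workdir="workdir", port=9292, device="gpu")
server.run_server()
```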