diff --git a/core/predictor/framework/infer.cpp b/core/predictor/framework/infer.cpp
old mode 100755
new mode 100644
index fa77c50506c02c4350569349e4e6679dde173ffc..5149a4852570298d16183709f6c2d457e1cc524f
--- a/core/predictor/framework/infer.cpp
+++ b/core/predictor/framework/infer.cpp
@@ -391,7 +391,8 @@ int InferManager::proc_initialize(const char* path,
     return -1;
   }
   uint32_t engine_num = model_toolkit_conf.engines_size();
-  im::bsf::TaskExecutorVector::instance().resize(*engine_index_ptr+engine_num);
+  im::bsf::TaskExecutorVector::instance().resize(*engine_index_ptr +
+                                                 engine_num);
   for (uint32_t ei = 0; ei < engine_num; ++ei) {
     LOG(INFO) << "model_toolkit_conf.engines(" << ei
               << ").name: " << model_toolkit_conf.engines(ei).name();
diff --git a/python/paddle_serving_client/client.py b/python/paddle_serving_client/client.py
index 80d62826c4a5db842572e3144047fdd42bb06d23..eeecd9d6ebc7d489d4ce038383ecb78d8fa55464 100755
--- a/python/paddle_serving_client/client.py
+++ b/python/paddle_serving_client/client.py
@@ -79,7 +79,7 @@ class SDKConfig(object):
         self.tag_list = []
         self.cluster_list = []
         self.variant_weight_list = []
-        self.rpc_timeout_ms = 20000
+        self.rpc_timeout_ms = 200000
         self.load_balance_strategy = "la"
 
     def add_server_variant(self, tag, cluster, variant_weight):
@@ -142,7 +142,7 @@ class Client(object):
         self.profile_ = _Profiler()
         self.all_numpy_input = True
         self.has_numpy_input = False
-        self.rpc_timeout_ms = 20000
+        self.rpc_timeout_ms = 200000
         from .serving_client import PredictorRes
         self.predictorres_constructor = PredictorRes
diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py
index 68e1d19b7a2d587783fcc4d0b3b5226f616ac8a4..91e4dab3253d35ff14b121e9784ab2fd93044114 100755
--- a/python/paddle_serving_server/serve.py
+++ b/python/paddle_serving_server/serve.py
@@ -31,6 +31,67 @@ elif sys.version_info.major == 3:
     from http.server import BaseHTTPRequestHandler, HTTPServer
 
 
+def format_gpu_to_strlist(unformatted_gpus):
+    gpus_strlist = []
+    if isinstance(unformatted_gpus, int):
+        gpus_strlist = [str(unformatted_gpus)]
+    elif isinstance(unformatted_gpus, list):
+        if unformatted_gpus == [""]:
+            gpus_strlist = ["-1"]
+        elif len(unformatted_gpus) == 0:
+            gpus_strlist = ["-1"]
+        else:
+            gpus_strlist = [str(x) for x in unformatted_gpus]
+    elif isinstance(unformatted_gpus, str):
+        if unformatted_gpus == "":
+            gpus_strlist = ["-1"]
+        else:
+            gpus_strlist = [unformatted_gpus]
+    elif unformatted_gpus == None:
+        gpus_strlist = ["-1"]
+    else:
+        raise ValueError("invalid input of set_gpus: expected int, str, list or None")
+
+    # check against CUDA_VISIBLE_DEVICES
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        for op_gpus_str in gpus_strlist:
+            op_gpu_list = op_gpus_str.split(",")
+            # op_gpu_list == ["-1"] means this op uses the CPU,
+            # so skip the CUDA_VISIBLE_DEVICES check for it.
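+            # e.g. with CUDA_VISIBLE_DEVICES="0,1", only the ids "0" and
+            # "1" will pass the check below.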
+            if op_gpu_list == ["-1"]:
+                continue
+            for ids in op_gpu_list:
+                if ids not in env_gpus:
+                    print("gpu_ids is not in CUDA_VISIBLE_DEVICES.")
+                    exit(-1)
+
+    # check that every gpu id is valid and that a single op
+    # does not mix CPU ("-1") with GPU ids.
+    for op_gpus_str in gpus_strlist:
+        op_gpu_list = op_gpus_str.split(",")
+        use_gpu = False
+        use_cpu = False
+        for ids in op_gpu_list:
+            if int(ids) < -1:
+                raise ValueError("The input of gpuid is invalid, must be >= -1.")
+            if int(ids) >= 0:
+                use_gpu = True
+            if int(ids) == -1:
+                use_cpu = True
+        if use_gpu and use_cpu:
+            raise ValueError("You cannot use CPU and GPU in one model.")
+
+    return gpus_strlist
+
+
+def is_gpu_mode(unformatted_gpus):
+    gpus_strlist = format_gpu_to_strlist(unformatted_gpus)
+    for op_gpus_str in gpus_strlist:
+        op_gpu_list = op_gpus_str.split(",")
+        for ids in op_gpu_list:
+            if int(ids) >= 0:
+                return True
+
+    return False
+
+
 def serve_args():
     parser = argparse.ArgumentParser("serve")
     parser.add_argument(
@@ -38,7 +99,7 @@ def serve_args():
     parser.add_argument(
         "--port", type=int, default=9292, help="Port of the starting gpu")
     parser.add_argument(
-        "--device", type=str, default="gpu", help="Type of device")
+        "--device", type=str, default="cpu", help="Type of device")
     parser.add_argument(
         "--gpu_ids", type=str, default="", nargs="+", help="gpu ids")
     parser.add_argument(
@@ -118,9 +179,9 @@ def serve_args():
 
 def start_gpu_card_model(gpu_mode, port, args):  # pylint: disable=doc-string-missing
-    device = "gpu"
-    if gpu_mode == False:
-        device = "cpu"
+    device = "cpu"
+    if gpu_mode == True:
+        device = "gpu"
     thread_num = args.thread
     model = args.model
@@ -211,34 +272,15 @@ def start_gpu_card_model(gpu_mode, port, args):  # pylint: disable=doc-string-mi
 
 def start_multi_card(args, serving_port=None):  # pylint: disable=doc-string-missing
-    gpus = []
+
     if serving_port == None:
         serving_port = args.port
-    if args.gpu_ids == "":
-        gpus = []
-    else:
-        #check the gpu_id is valid or not.
-        gpus = args.gpu_ids
-        if isinstance(gpus, str):
-            gpus = [gpus]
-        if "CUDA_VISIBLE_DEVICES" in os.environ:
-            env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
-            for op_gpus_str in gpus:
-                op_gpu_list = op_gpus_str.split(",")
-                for ids in op_gpu_list:
-                    if ids not in env_gpus:
-                        print("gpu_ids is not in CUDA_VISIBLE_DEVICES.")
-                        exit(-1)
-
     if args.use_lite:
         print("run using paddle-lite.")
         start_gpu_card_model(False, serving_port, args)
-    elif len(gpus) <= 0:
-        print("gpu_ids not set, going to run cpu service.")
-        start_gpu_card_model(False, serving_port, args)
     else:
-        start_gpu_card_model(True, serving_port, args)
+        start_gpu_card_model(is_gpu_mode(args.gpu_ids), serving_port, args)
 
 
 class MainService(BaseHTTPRequestHandler):
@@ -320,7 +362,9 @@ class MainService(BaseHTTPRequestHandler):
 
 if __name__ == "__main__":
-
+    # args.device is not used at all; it is kept only for interface
+    # compatibility, so --device should not be recommended on the homepage.
     args = serve_args()
     for single_model_config in args.model:
         if os.path.isdir(single_model_config):
@@ -346,29 +390,10 @@ if __name__ == "__main__":
 
         web_service = WebService(name=args.name)
         web_service.load_model_config(args.model)
-        if args.gpu_ids == "":
-            gpus = []
-        else:
-            #check the gpu_id is valid or not.
-            gpus = args.gpu_ids
-            if isinstance(gpus, str):
-                gpus = [gpus]
-            if "CUDA_VISIBLE_DEVICES" in os.environ:
-                env_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
-                for op_gpus_str in gpus:
-                    op_gpu_list = op_gpus_str.split(",")
-                    for ids in op_gpu_list:
-                        if ids not in env_gpus:
-                            print("gpu_ids is not in CUDA_VISIBLE_DEVICES.")
-                            exit(-1)
-
-        if len(gpus) > 0:
-            web_service.set_gpus(gpus)
         workdir = "{}_{}".format(args.workdir, args.port)
         web_service.prepare_server(
             workdir=workdir,
             port=args.port,
-            device=args.device,
             use_lite=args.use_lite,
             use_xpu=args.use_xpu,
             ir_optim=args.ir_optim,
@@ -378,7 +403,8 @@ if __name__ == "__main__":
             use_trt=args.use_trt,
             gpu_multi_stream=args.gpu_multi_stream,
             op_num=args.op_num,
-            op_max_batch=args.op_max_batch)
+            op_max_batch=args.op_max_batch,
+            gpuid=args.gpu_ids)
 
         web_service.run_rpc_service()
 
         app_instance = Flask(__name__)
diff --git a/python/paddle_serving_server/server.py b/python/paddle_serving_server/server.py
index 2cc839d00e31199d743ae219998843560e10daaa..61f98b93c2f493f25a202876fdf8f9d2a1b71e25 100755
--- a/python/paddle_serving_server/server.py
+++ b/python/paddle_serving_server/server.py
@@ -17,6 +17,7 @@ import tarfile
 import socket
 import paddle_serving_server as paddle_serving_server
 from paddle_serving_server.rpc_service import MultiLangServerServiceServicer
+from paddle_serving_server.serve import format_gpu_to_strlist
 from .proto import server_configure_pb2 as server_sdk
 from .proto import general_model_config_pb2 as m_config
 from .proto import multi_lang_general_model_service_pb2_grpc
@@ -171,12 +172,7 @@ class Server(object):
         self.device = device
 
     def set_gpuid(self, gpuid):
-        if isinstance(gpuid, int):
-            self.gpuid = str(gpuid)
-        elif isinstance(gpuid, list):
-            self.gpuid = [str(x) for x in gpuid]
-        else:
-            self.gpuid = gpuid
+        self.gpuid = format_gpu_to_strlist(gpuid)
 
     def set_op_num(self, op_num):
         self.op_num = op_num
@@ -197,23 +193,20 @@ class Server(object):
         self.use_xpu = True
 
     def _prepare_engine(self, model_config_paths, device, use_encryption_model):
+        self.device = device
         if self.model_toolkit_conf == None:
             self.model_toolkit_conf = []
-        self.device = device
-
-        # Generally, self.gpuid = str[] or str.
-        # such as "0" or ["0"] or ["0,1"] or ["0,1" , "1,2"]
-        if isinstance(self.gpuid, str):
-            self.gpuid = [self.gpuid]
+        # Generally, self.gpuid is a list of str (possibly empty).
         # when len(self.gpuid) means no gpuid is specified.
         # if self.device == "gpu" or self.use_trt:
         # we assume you forget to set gpuid, so set gpuid = ['0'];
-        if len(self.gpuid) == 0:
-            if self.device == "gpu" or self.use_trt:
-                self.gpuid.append("0")
+        if len(self.gpuid) == 0 or self.gpuid == ["-1"]:
+            if self.device == "gpu" or self.use_trt or self.gpu_multi_stream:
+                self.gpuid = ["0"]
+                self.device = "gpu"
             else:
-                self.gpuid.append("-1")
+                self.gpuid = ["-1"]
 
         if isinstance(self.op_num, int):
             self.op_num = [self.op_num]
@@ -254,12 +247,14 @@ class Server(object):
                 for ids in op_gpu_list:
                     engine.gpu_ids.extend([int(ids)])
 
-            if self.device == "gpu" or self.use_trt:
+            if self.device == "gpu" or self.use_trt or self.gpu_multi_stream:
                 engine.use_gpu = True
                 # this is for Mixed use of GPU and CPU
                 # if model-1 use GPU and set the device="gpu"
                 # but gpuid[1] = "-1" which means use CPU in Model-2
                 # so config about GPU should be False.
+                # op_gpu_list = gpuid[index].split(","), i.e. the gpu ids
+                # assigned to this particular engine.
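+                # e.g. gpuid = ["0,1", "-1"]: engine 0 runs on GPUs 0 and 1,
+                # while engine 1 falls back to CPU (use_gpu = False below).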
                 if len(op_gpu_list) == 1:
                     if int(op_gpu_list[0]) == -1:
                         engine.use_gpu = False
@@ -500,10 +495,17 @@ class Server(object):
     def prepare_server(self,
                        workdir=None,
                        port=9292,
-                       device="cpu",
+                       device=None,
                        use_encryption_model=False,
                        cube_conf=None):
-        self.device = device
+        # if `device` is not given, fall back to self.device, which may
+        # have been changed earlier by set_device.
+        if device == None:
+            device = self.device
+        # if `device` is given, it overrides self.device.
+        else:
+            self.device = device
         if workdir == None:
             workdir = "./tmp"
             os.system("mkdir -p {}".format(workdir))
@@ -602,6 +604,7 @@ class MultiLangServer(object):
         self.body_size_ = 64 * 1024 * 1024
         self.concurrency_ = 100000
         self.is_multi_model_ = False  # for model ensemble, which is not useful right now.
+        self.device = "cpu"  # the default value of `device` for MultiLangServer.
 
     def set_max_concurrency(self, concurrency):
         self.concurrency_ = concurrency
@@ -609,6 +612,7 @@ class MultiLangServer(object):
 
     def set_device(self, device="cpu"):
         self.device = device
+        self.bserver_.set_device(device)
 
     def set_num_threads(self, threads):
         self.worker_num_ = threads
@@ -727,10 +731,18 @@ class MultiLangServer(object):
     def prepare_server(self,
                        workdir=None,
                        port=9292,
-                       device="cpu",
+                       device=None,
                        use_encryption_model=False,
                        cube_conf=None):
-        self.device = device
+        # if `device` is not given, fall back to self.device, which may
+        # have been changed earlier by set_device.
+        if device == None:
+            device = self.device
+        # if `device` is given, it overrides self.device.
+        else:
+            self.device = device
+
         if not self._port_is_available(port):
             raise SystemExit("Port {} is already used".format(port))
         default_port = 12000
diff --git a/python/paddle_serving_server/web_service.py b/python/paddle_serving_server/web_service.py
index de7cec92e7430593ae53a570b07c15126b3c1d6e..b2ef5979c2ffd821936655ff2ed5182020b34eb1 100755
--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -26,6 +26,7 @@ import numpy as np
 import os
 from paddle_serving_server import pipeline
 from paddle_serving_server.pipeline import Op
+from paddle_serving_server.serve import format_gpu_to_strlist
 
 
 def port_is_available(port):
@@ -44,7 +45,7 @@ class WebService(object):
 
         # pipeline
         self._server = pipeline.PipelineServer(self.name)
-        self.gpus = []  # deprecated
+        self.gpus = ["-1"]  # deprecated
         self.rpc_service_list = []  # deprecated
 
     def get_pipeline_response(self, read_op):
@@ -103,19 +104,24 @@ class WebService(object):
         if client_config_path == None:
             self.client_config_path = file_path_list
 
+    # after this call, self.gpus is a list of str such as ["-1"] or ["0,1"].
    def set_gpus(self, gpus):
         print("This API will be deprecated later. Please do not use it")
-        if isinstance(gpus, int):
-            self.gpus = str(gpus)
-        elif isinstance(gpus, list):
-            self.gpus = [str(x) for x in gpus]
-        else:
-            self.gpus = gpus
+        self.gpus = format_gpu_to_strlist(gpus)
+
+# this function can be called by the user directly
+# or by the function create_rpc_config.
+# if called by the user, gpus can come from set_gpus or from the `gpus`
+# argument. `gpus` == None means it was not passed at all, and in that
+# case we fall back to self.gpus. otherwise `gpus` takes priority:
+# when both set_gpus and `gpus` are given, `gpus` wins.
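+# e.g. after set_gpus("0"), calling default_rpc_service(..., gpus="1")
+# starts the service on GPU 1, not GPU 0.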
     def default_rpc_service(self,
                             workdir,
                             port=9292,
-                            gpus=-1,
+                            gpus=None,
                             thread_num=2,
                             mem_optim=True,
                             use_lite=False,
@@ -127,16 +133,25 @@ class WebService(object):
                             gpu_multi_stream=False,
                             op_num=None,
                             op_max_batch=None):
-        device = "gpu"
+
+        device = "cpu"
         server = Server()
+        # only when gpus == None, i.e. it was not passed at all,
+        # do we fall back to self.gpus.
+        if gpus == None:
+            gpus = self.gpus
+
+        gpus = format_gpu_to_strlist(gpus)
+        server.set_gpuid(gpus)
 
-        if gpus == -1 or gpus == "-1":
+        if len(gpus) == 0 or gpus == ["-1"]:
             if use_lite:
                 device = "arm"
             else:
                 device = "cpu"
         else:
-            server.set_gpuid(gpus)
+            device = "gpu"
+
 
         op_maker = OpMaker()
         op_seq_maker = OpSeqMaker()
@@ -190,45 +205,31 @@ class WebService(object):
     def _launch_rpc_service(self, service_idx):
         self.rpc_service_list[service_idx].run_server()
 
+    # before calling this function, self.gpus should already be set;
+    # if not, the default value self.gpus = ["-1"] is used.
+    # either way, we always pass gpus=self.gpus here.
     def create_rpc_config(self):
-        if len(self.gpus) == 0:
-            # init cpu service
-            self.rpc_service_list.append(
-                self.default_rpc_service(
-                    self.workdir,
-                    self.port_list[0],
-                    -1,
-                    thread_num=self.thread_num,
-                    mem_optim=self.mem_optim,
-                    use_lite=self.use_lite,
-                    use_xpu=self.use_xpu,
-                    ir_optim=self.ir_optim,
-                    precision=self.precision,
-                    use_calib=self.use_calib,
-                    op_num=self.op_num,
-                    op_max_batch=self.op_max_batch))
-        else:
-            self.rpc_service_list.append(
-                self.default_rpc_service(
-                    self.workdir,
-                    self.port_list[0],
-                    self.gpus,
-                    thread_num=self.thread_num,
-                    mem_optim=self.mem_optim,
-                    use_lite=self.use_lite,
-                    use_xpu=self.use_xpu,
-                    ir_optim=self.ir_optim,
-                    precision=self.precision,
-                    use_calib=self.use_calib,
-                    use_trt=self.use_trt,
-                    gpu_multi_stream=self.gpu_multi_stream,
-                    op_num=self.op_num,
-                    op_max_batch=self.op_max_batch))
+        self.rpc_service_list.append(
+            self.default_rpc_service(
+                self.workdir,
+                self.port_list[0],
+                self.gpus,
+                thread_num=self.thread_num,
+                mem_optim=self.mem_optim,
+                use_lite=self.use_lite,
+                use_xpu=self.use_xpu,
+                ir_optim=self.ir_optim,
+                precision=self.precision,
+                use_calib=self.use_calib,
+                use_trt=self.use_trt,
+                gpu_multi_stream=self.gpu_multi_stream,
+                op_num=self.op_num,
+                op_max_batch=self.op_max_batch))
 
     def prepare_server(self,
                        workdir,
                        port=9393,
-                       device="gpu",
+                       device="cpu",
                        precision="fp32",
                        use_calib=False,
                        use_lite=False,
@@ -240,12 +241,13 @@ class WebService(object):
                        gpu_multi_stream=False,
                        op_num=None,
                        op_max_batch=None,
-                       gpuid=-1):
+                       gpuid=None):
         print("This API will be deprecated later. Please do not use it")
         self.workdir = workdir
         self.port = port
         self.thread_num = thread_num
-        self.device = device
+        # self.device is not used at all; the actual device is
+        # derived from gpuid.
         self.precision = precision
         self.use_calib = use_calib
         self.use_lite = use_lite
@@ -257,12 +259,14 @@ class WebService(object):
         self.gpu_multi_stream = gpu_multi_stream
         self.op_num = op_num
         self.op_max_batch = op_max_batch
-        if isinstance(gpuid, int):
-            self.gpus = str(gpuid)
-        elif isinstance(gpuid, list):
-            self.gpus = [str(x) for x in gpuid]
+
+        # if gpuid != None, it takes priority; otherwise keep self.gpus
+        # unchanged (it may have been set earlier via set_gpus).
+        if gpuid != None:
+            self.gpus = format_gpu_to_strlist(gpuid)
         else:
-            self.gpus = gpuid
+            pass
 
         default_port = 12000
         for i in range(1000):
@@ -359,8 +363,8 @@ class WebService(object):
         if gpu:
             # if user forget to call function `set_gpus` to set self.gpus.
             # default self.gpus = [0].
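+            # note: self.gpus now defaults to ["-1"], so both [] and
+            # ["-1"] mean "no gpu was set".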
- if len(self.gpus) == 0: - self.gpus.append(0) + if len(self.gpus) == 0 or self.gpus == ["-1"]: + self.gpus = ["0"] # right now, local Predictor only support 1 card. # no matter how many gpu_id is in gpus, we only use the first one. gpu_id = (self.gpus[0].split(","))[0]
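
For reference, a minimal sketch of how the new gpu-id normalization in serve.py behaves. It assumes CUDA_VISIBLE_DEVICES is unset (otherwise format_gpu_to_strlist additionally enforces the visibility check); the file name check_gpu_format.py is hypothetical and not part of this patch:

    # check_gpu_format.py -- hypothetical sanity check, run against the
    # patched python/paddle_serving_server/serve.py with
    # CUDA_VISIBLE_DEVICES unset.
    from paddle_serving_server.serve import format_gpu_to_strlist, is_gpu_mode

    # every accepted input shape is normalized to a list of str;
    # "-1" is the sentinel for "run this op on CPU".
    assert format_gpu_to_strlist(0) == ["0"]            # int
    assert format_gpu_to_strlist("0,1") == ["0,1"]      # one op on two cards
    assert format_gpu_to_strlist([0, 1]) == ["0", "1"]  # one entry per op
    assert format_gpu_to_strlist(None) == ["-1"]        # unset -> CPU
    assert format_gpu_to_strlist("") == ["-1"]          # empty str -> CPU
    assert format_gpu_to_strlist([""]) == ["-1"]        # [""] -> CPU

    # is_gpu_mode() is True as soon as any op requests a real card;
    # mixing a GPU op and a CPU op across different ops is allowed.
    assert is_gpu_mode(None) is False
    assert is_gpu_mode(["-1"]) is False
    assert is_gpu_mode(["0,1", "-1"]) is True

This is also why start_multi_card can simply pass is_gpu_mode(args.gpu_ids) to start_gpu_card_model instead of re-implementing the CUDA_VISIBLE_DEVICES check inline.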